From fa38da78384e40e9fc208261efd5b2e4e2c1bedc Mon Sep 17 00:00:00 2001
From: cryptotavares <joao.tavares@consensys.net>
Date: Wed, 15 Apr 2026 22:19:16 +0100
Subject: [PATCH 01/36] refactor: decompose mcp-server into modular
 architecture with CLI entry point

Restructure monolithic src/mcp-server/ into focused modules:
- src/server/: HTTP server, daemon state, port allocation, request queue
- src/tools/: all tool implementations and type definitions
- src/knowledge-store/: knowledge store and tokenization
- src/validation/: schema validation
- src/utils/: shared logger, errors, time utilities
- src/cli/: new CLI entry point (mm)

Add SKILL.md and update package.json, README, and vitest config.
---
 README.md                                     | 1755 ++++------------
 SKILL.md                                      |  368 ++++
 package.json                                  |   12 +-
 src/capabilities/context.test.ts              |    4 -
 src/capabilities/context.ts                   |    2 -
 src/cli/mm.test.ts                            | 1756 +++++++++++++++++
 src/cli/mm.ts                                 | 1029 ++++++++++
 src/index.ts                                  |   73 +-
 .../knowledge-store.test.ts                   |  430 ++--
 .../knowledge-store.ts                        |   58 +-
 .../tokenization.test.ts                      |    2 +-
 .../tokenization.ts                           |    0
 .../utils/redaction.test.ts                   |    0
 .../utils/redaction.ts                        |    0
 src/launcher/console-error-buffer.test.ts     |    4 +-
 src/launcher/retry.test.ts                    |    2 +-
 src/mcp-server/server.test.ts                 |  677 -------
 src/mcp-server/server.ts                      |  237 ---
 src/mcp-server/session-manager.test.ts        |  105 -
 src/mcp-server/test-utils/flush-promises.ts   |    8 -
 src/mcp-server/tools/batch.test.ts            |  428 ----
 src/mcp-server/tools/batch.ts                 |  286 ---
 src/mcp-server/tools/build.test.ts            |  211 --
 src/mcp-server/tools/build.ts                 |  100 -
 src/mcp-server/tools/cleanup.test.ts          |  161 --
 src/mcp-server/tools/cleanup.ts               |   32 -
 src/mcp-server/tools/clipboard.test.ts        |  325 ---
 src/mcp-server/tools/clipboard.ts             |  117 --
 src/mcp-server/tools/context.test.ts          |  221 ---
 src/mcp-server/tools/context.ts               |   94 -
 src/mcp-server/tools/definitions.test.ts      |  759 -------
 src/mcp-server/tools/definitions.ts           |  638 ------
 src/mcp-server/tools/discovery-tools.ts       |  223 ---
 src/mcp-server/tools/helpers.test.ts          |  745 -------
 src/mcp-server/tools/helpers.ts               |  313 ---
 src/mcp-server/tools/index.ts                 |   10 -
 src/mcp-server/tools/interaction.test.ts      |  822 --------
 src/mcp-server/tools/interaction.ts           |  296 ---
 src/mcp-server/tools/knowledge.ts             |  212 --
 src/mcp-server/tools/launch.test.ts           |  384 ----
 src/mcp-server/tools/launch.ts                |   93 -
 src/mcp-server/tools/navigation.test.ts       |  787 --------
 src/mcp-server/tools/navigation.ts            |  329 ---
 src/mcp-server/tools/registry.test.ts         |  156 --
 src/mcp-server/tools/registry.ts              |   10 -
 src/mcp-server/tools/run-tool.test.ts         |  958 ---------
 src/mcp-server/tools/run-tool.ts              |  220 ---
 src/mcp-server/tools/screenshot.test.ts       |  307 ---
 src/mcp-server/tools/screenshot.ts            |   67 -
 src/mcp-server/tools/seeding.test.ts          |  552 ------
 src/mcp-server/tools/seeding.ts               |  327 ---
 src/mcp-server/tools/state.test.ts            |  358 ----
 src/mcp-server/tools/state.ts                 |  102 -
 src/mcp-server/types/responses.ts             |   27 -
 src/mcp-server/utils/index.ts                 |   14 -
 src/mcp-server/utils/response.ts              |   57 -
 src/server/create-server.test.ts              |  697 +++++++
 src/server/create-server.ts                   |  648 ++++++
 src/server/daemon-state.test.ts               |  232 +++
 src/server/daemon-state.ts                    |  209 ++
 src/server/port-allocator.test.ts             |   48 +
 src/server/port-allocator.ts                  |   27 +
 src/server/request-queue.test.ts              |   92 +
 src/server/request-queue.ts                   |   64 +
 src/{mcp-server => server}/session-manager.ts |   65 +-
 src/tools/batch.test.ts                       |  458 +++++
 src/tools/batch.ts                            |  160 ++
 src/tools/build.test.ts                       |  192 ++
 src/tools/build.ts                            |   52 +
 src/tools/cleanup.test.ts                     |   74 +
 src/tools/cleanup.ts                          |   19 +
 src/tools/clipboard.test.ts                   |  215 ++
 src/tools/clipboard.ts                        |   82 +
 src/tools/context.test.ts                     |  176 ++
 src/tools/context.ts                          |   55 +
 .../tools/discovery-tools.test.ts             |  334 ++--
 src/tools/discovery-tools.ts                  |  155 ++
 .../tools/error-classification.test.ts        |    4 +-
 .../tools/error-classification.ts             |    2 +-
 src/tools/index.ts                            |   15 +
 src/tools/interaction.test.ts                 |  660 +++++++
 src/tools/interaction.ts                      |  197 ++
 src/{mcp-server => }/tools/knowledge.test.ts  |  430 ++--
 src/tools/knowledge.ts                        |  164 ++
 src/tools/launch.test.ts                      |  252 +++
 src/tools/launch.ts                           |   72 +
 src/tools/navigation.test.ts                  |  471 +++++
 src/tools/navigation.ts                       |  247 +++
 src/tools/registry.test.ts                    |   48 +
 src/tools/registry.ts                         |   67 +
 src/tools/screenshot.test.ts                  |  242 +++
 src/tools/screenshot.ts                       |   49 +
 src/tools/seeding.test.ts                     |  346 ++++
 src/tools/seeding.ts                          |  187 ++
 src/tools/state.test.ts                       |  319 +++
 src/tools/state.ts                            |   88 +
 src/{mcp-server => tools}/test-utils/index.ts |    2 -
 .../test-utils/mock-factories.test.ts         |    2 +-
 .../test-utils/mock-factories.ts              |    8 +-
 .../test-utils/mock-playwright.ts             |    0
 src/{mcp-server => tools}/types/discovery.ts  |    0
 src/{mcp-server => tools}/types/errors.ts     |    0
 src/{mcp-server => tools}/types/index.ts      |    1 -
 src/{mcp-server => tools}/types/knowledge.ts  |    0
 src/{mcp-server => tools}/types/seeding.ts    |    0
 src/{mcp-server => tools}/types/session.ts    |    0
 .../types/step-record.ts                      |    0
 .../types/tool-inputs.ts                      |   15 +-
 .../types/tool-outputs.ts                     |   16 +
 src/tools/utils.ts                            |   53 +
 src/{mcp-server => tools/utils}/constants.ts  |    2 +-
 .../utils}/discovery.test.ts                  |    0
 src/{mcp-server => tools/utils}/discovery.ts  |    6 +-
 src/{mcp-server => tools}/utils/targets.ts    |    0
 .../utils/type-guards.test.ts                 |    0
 .../utils/type-guards.ts                      |    0
 src/types/http.ts                             |  119 ++
 src/{mcp-server => }/utils/errors.ts          |    0
 src/utils/index.ts                            |    3 +
 src/{mcp-server => }/utils/logger.test.ts     |   20 +-
 src/{mcp-server => }/utils/logger.ts          |   12 +-
 src/{mcp-server => }/utils/time.test.ts       |    0
 src/{mcp-server => }/utils/time.ts            |    0
 .../schemas.test.ts                           |    0
 src/{mcp-server => validation}/schemas.ts     |   12 +-
 src/version.ts                                |    2 +
 vitest.config.mts                             |    2 +-
 yarn.lock                                     |  283 +--
 128 files changed, 11743 insertions(+), 13992 deletions(-)
 create mode 100644 SKILL.md
 create mode 100644 src/cli/mm.test.ts
 create mode 100644 src/cli/mm.ts
 rename src/{mcp-server => knowledge-store}/knowledge-store.test.ts (91%)
 rename src/{mcp-server => knowledge-store}/knowledge-store.ts (97%)
 rename src/{mcp-server => knowledge-store}/tokenization.test.ts (99%)
 rename src/{mcp-server => knowledge-store}/tokenization.ts (100%)
 rename src/{mcp-server => knowledge-store}/utils/redaction.test.ts (100%)
 rename src/{mcp-server => knowledge-store}/utils/redaction.ts (100%)
 delete mode 100644 src/mcp-server/server.test.ts
 delete mode 100644 src/mcp-server/server.ts
 delete mode 100644 src/mcp-server/session-manager.test.ts
 delete mode 100644 src/mcp-server/test-utils/flush-promises.ts
 delete mode 100644 src/mcp-server/tools/batch.test.ts
 delete mode 100644 src/mcp-server/tools/batch.ts
 delete mode 100644 src/mcp-server/tools/build.test.ts
 delete mode 100644 src/mcp-server/tools/build.ts
 delete mode 100644 src/mcp-server/tools/cleanup.test.ts
 delete mode 100644 src/mcp-server/tools/cleanup.ts
 delete mode 100644 src/mcp-server/tools/clipboard.test.ts
 delete mode 100644 src/mcp-server/tools/clipboard.ts
 delete mode 100644 src/mcp-server/tools/context.test.ts
 delete mode 100644 src/mcp-server/tools/context.ts
 delete mode 100644 src/mcp-server/tools/definitions.test.ts
 delete mode 100644 src/mcp-server/tools/definitions.ts
 delete mode 100644 src/mcp-server/tools/discovery-tools.ts
 delete mode 100644 src/mcp-server/tools/helpers.test.ts
 delete mode 100644 src/mcp-server/tools/helpers.ts
 delete mode 100644 src/mcp-server/tools/index.ts
 delete mode 100644 src/mcp-server/tools/interaction.test.ts
 delete mode 100644 src/mcp-server/tools/interaction.ts
 delete mode 100644 src/mcp-server/tools/knowledge.ts
 delete mode 100644 src/mcp-server/tools/launch.test.ts
 delete mode 100644 src/mcp-server/tools/launch.ts
 delete mode 100644 src/mcp-server/tools/navigation.test.ts
 delete mode 100644 src/mcp-server/tools/navigation.ts
 delete mode 100644 src/mcp-server/tools/registry.test.ts
 delete mode 100644 src/mcp-server/tools/registry.ts
 delete mode 100644 src/mcp-server/tools/run-tool.test.ts
 delete mode 100644 src/mcp-server/tools/run-tool.ts
 delete mode 100644 src/mcp-server/tools/screenshot.test.ts
 delete mode 100644 src/mcp-server/tools/screenshot.ts
 delete mode 100644 src/mcp-server/tools/seeding.test.ts
 delete mode 100644 src/mcp-server/tools/seeding.ts
 delete mode 100644 src/mcp-server/tools/state.test.ts
 delete mode 100644 src/mcp-server/tools/state.ts
 delete mode 100644 src/mcp-server/types/responses.ts
 delete mode 100644 src/mcp-server/utils/index.ts
 delete mode 100644 src/mcp-server/utils/response.ts
 create mode 100644 src/server/create-server.test.ts
 create mode 100644 src/server/create-server.ts
 create mode 100644 src/server/daemon-state.test.ts
 create mode 100644 src/server/daemon-state.ts
 create mode 100644 src/server/port-allocator.test.ts
 create mode 100644 src/server/port-allocator.ts
 create mode 100644 src/server/request-queue.test.ts
 create mode 100644 src/server/request-queue.ts
 rename src/{mcp-server => server}/session-manager.ts (85%)
 create mode 100644 src/tools/batch.test.ts
 create mode 100644 src/tools/batch.ts
 create mode 100644 src/tools/build.test.ts
 create mode 100644 src/tools/build.ts
 create mode 100644 src/tools/cleanup.test.ts
 create mode 100644 src/tools/cleanup.ts
 create mode 100644 src/tools/clipboard.test.ts
 create mode 100644 src/tools/clipboard.ts
 create mode 100644 src/tools/context.test.ts
 create mode 100644 src/tools/context.ts
 rename src/{mcp-server => }/tools/discovery-tools.test.ts (58%)
 create mode 100644 src/tools/discovery-tools.ts
 rename src/{mcp-server => }/tools/error-classification.test.ts (99%)
 rename src/{mcp-server => }/tools/error-classification.ts (99%)
 create mode 100644 src/tools/index.ts
 create mode 100644 src/tools/interaction.test.ts
 create mode 100644 src/tools/interaction.ts
 rename src/{mcp-server => }/tools/knowledge.test.ts (53%)
 create mode 100644 src/tools/knowledge.ts
 create mode 100644 src/tools/launch.test.ts
 create mode 100644 src/tools/launch.ts
 create mode 100644 src/tools/navigation.test.ts
 create mode 100644 src/tools/navigation.ts
 create mode 100644 src/tools/registry.test.ts
 create mode 100644 src/tools/registry.ts
 create mode 100644 src/tools/screenshot.test.ts
 create mode 100644 src/tools/screenshot.ts
 create mode 100644 src/tools/seeding.test.ts
 create mode 100644 src/tools/seeding.ts
 create mode 100644 src/tools/state.test.ts
 create mode 100644 src/tools/state.ts
 rename src/{mcp-server => tools}/test-utils/index.ts (86%)
 rename src/{mcp-server => tools}/test-utils/mock-factories.test.ts (99%)
 rename src/{mcp-server => tools}/test-utils/mock-factories.ts (96%)
 rename src/{mcp-server => tools}/test-utils/mock-playwright.ts (100%)
 rename src/{mcp-server => tools}/types/discovery.ts (100%)
 rename src/{mcp-server => tools}/types/errors.ts (100%)
 rename src/{mcp-server => tools}/types/index.ts (88%)
 rename src/{mcp-server => tools}/types/knowledge.ts (100%)
 rename src/{mcp-server => tools}/types/seeding.ts (100%)
 rename src/{mcp-server => tools}/types/session.ts (100%)
 rename src/{mcp-server => tools}/types/step-record.ts (100%)
 rename src/{mcp-server => tools}/types/tool-inputs.ts (92%)
 rename src/{mcp-server => tools}/types/tool-outputs.ts (87%)
 create mode 100644 src/tools/utils.ts
 rename src/{mcp-server => tools/utils}/constants.ts (94%)
 rename src/{mcp-server => tools/utils}/discovery.test.ts (100%)
 rename src/{mcp-server => tools/utils}/discovery.ts (99%)
 rename src/{mcp-server => tools}/utils/targets.ts (100%)
 rename src/{mcp-server => tools}/utils/type-guards.test.ts (100%)
 rename src/{mcp-server => tools}/utils/type-guards.ts (100%)
 create mode 100644 src/types/http.ts
 rename src/{mcp-server => }/utils/errors.ts (100%)
 rename src/{mcp-server => }/utils/logger.test.ts (82%)
 rename src/{mcp-server => }/utils/logger.ts (56%)
 rename src/{mcp-server => }/utils/time.test.ts (100%)
 rename src/{mcp-server => }/utils/time.ts (100%)
 rename src/{mcp-server => validation}/schemas.test.ts (100%)
 rename src/{mcp-server => validation}/schemas.ts (97%)
 create mode 100644 src/version.ts

diff --git a/README.md b/README.md
index ef9fbea..75bc819 100644
--- a/README.md
+++ b/README.md
@@ -1,219 +1,295 @@
 # @metamask/client-mcp-core
 
-MCP (Model Context Protocol) server for MetaMask Extension visual testing with LLM agents.
+HTTP daemon and CLI architecture for agent-driven browser extension testing with Playwright.
 
 ## Overview
 
-This package provides the core MCP server infrastructure for enabling LLM agents to interact with the MetaMask browser extension through Playwright.
+This package provides the core infrastructure for enabling LLM agents to interact with browser extensions through Playwright. It ships a persistent HTTP daemon that manages browser lifecycle and a unified `mm` CLI that agents (and developers) use to drive sessions.
+
+The design is **consumer-agnostic**: the core handles protocol, tooling, and knowledge — consumers provide extension-specific logic by implementing the `ISessionManager` interface and injecting capabilities.
+
+```
+                         ┌─────────────────────────────────┐
+                         │         LLM Agent / Dev         │
+                         └────────────┬────────────────────┘
+                                      │  mm CLI commands
+                                      ▼
+                         ┌─────────────────────────────────┐
+                         │     mm CLI  (src/cli/mm.ts)     │
+                         │  discover / auto-start daemon   │
+                         └────────────┬────────────────────┘
+                                      │  HTTP (127.0.0.1)
+                                      ▼
+  ┌───────────────────────────────────────────────────────────────────┐
+  │                    HTTP Daemon (createServer)                     │
+  │                                                                   │
+  │  ┌──────────┐  ┌──────────────┐  ┌────────────┐  ┌────────────┐ │
+  │  │  Routes   │  │ RequestQueue │  │   Tool     │  │ Knowledge  │ │
+  │  │ /health   │  │ (async mutex)│  │  Registry  │  │   Store    │ │
+  │  │ /status   │  │              │  │  25+ tools │  │            │ │
+  │  │ /launch   │  └──────────────┘  └─────┬──────┘  └────────────┘ │
+  │  │ /cleanup  │                          │                         │
+  │  │ /tool/:n  │                          ▼                         │
+  │  └──────────┘               ┌──────────────────┐                 │
+  │                             │   ToolContext     │                 │
+  │                             │  sessionManager   │                 │
+  │                             │  page / refMap    │                 │
+  │                             │  workflowContext  │                 │
+  │                             │  knowledgeStore   │                 │
+  │                             └────────┬─────────┘                 │
+  └──────────────────────────────────────┼───────────────────────────┘
+                                         │
+                   ┌─────────────────────┼─────────────────────┐
+                   │          ISessionManager                   │
+                   │       (consumer implementation)            │
+                   │                                            │
+                   │  Session lifecycle   Page management       │
+                   │  Extension state     A11y reference map    │
+                   │  Navigation          Screenshots           │
+                   │  Capabilities (opt)  Environment config    │
+                   └─────────────────────┬─────────────────────┘
+                                         │
+                   ┌─────────────────────┼─────────────────────┐
+                   │          WorkflowContext                   │
+                   │                                            │
+                   │  build?            fixture?                │
+                   │  chain?            contractSeeding?        │
+                   │  stateSnapshot?    mockServer?             │
+                   │  config: EnvironmentConfig                 │
+                   └─────────────────────┬─────────────────────┘
+                                         │
+                                         ▼
+                   ┌───────────────────────────────────────────┐
+                   │        Playwright  →  Chrome Browser      │
+                   │            Browser Extension               │
+                   └───────────────────────────────────────────┘
+```
 
 ## Requirements
 
-- **Node.js ^20 || ^22 || >=24** (required)
-- **TypeScript >=5.0** (for consumer type definitions)
-- Playwright `^1.49.0` (peer dependency)
+- **Node.js** `^20 || ^22 || >=24`
+- **TypeScript** `>=5.0` (for consumer type definitions)
+- **Playwright** `^1.49.0` (peer dependency)
 
 ## Installation
 
+As a project dependency (the CLI is available via `npx mm` or `yarn mm`):
+
 ```bash
 yarn add @metamask/client-mcp-core
 ```
 
-## Architecture
-
-### High-Level Overview
+As a global CLI (puts `mm` directly on your PATH — recommended for LLM agents):
 
+```bash
+npm install -g @metamask/client-mcp-core
 ```
-┌─────────────────────────────────────────────────────────────────────────┐
-│                           LLM Agent                                     │
-│                    (Claude, GPT, etc.)                                  │
-└─────────────────────────────────────────────────────────────────────────┘
-                                  │
-                                  │ MCP Protocol (stdio)
-                                  ▼
-┌─────────────────────────────────────────────────────────────────────────┐
-│                    @metamask/client-mcp-core                     │
-│                                                                         │
-│  Core MCP Server + Generic Tools                                        │
-│  - Session management                                                   │
-│  - Element interaction (click, type, wait)                              │
-│  - Discovery (testIds, accessibility tree)                              │
-│  - Screenshots                                                          │
-│  - Knowledge store (cross-session learning)                             │
-└─────────────────────────────────────────────────────────────────────────┘
-                                  │
-                                  │ Capability Injection
-                                  ▼
-┌─────────────────────────────────────────────────────────────────────────┐
-│                   MetaMask Extension Provider                           │
-│                                                                         │
-│  - Build capability (yarn build:test)                                   │
-│  - Fixture/state management                                             │
-│  - Anvil blockchain integration                                         │
-│  - Contract seeding                                                     │
-└─────────────────────────────────────────────────────────────────────────┘
-                                  │
-                                  │ Playwright
-                                  ▼
-┌─────────────────────────────────────────────────────────────────────────┐
-│                    Headed Chrome Browser                                │
-│                    + MetaMask Extension                                 │
-└─────────────────────────────────────────────────────────────────────────┘
+
+The global CLI can target any project via `--project` or `MM_PROJECT` (see [Project Targeting](#project-targeting)).
+
+## Getting Started
+
+Consuming this package requires two things: a **daemon entry point** and a **`package.json` configuration**.
+
+### 1. Create a daemon entry point
+
+```typescript
+// daemon.ts
+import { createServer } from '@metamask/client-mcp-core';
+import { MySessionManager } from './my-session-manager';
+import { createMyContext } from './my-context';
+
+const server = createServer({
+  sessionManager: new MySessionManager(),
+  contextFactory: (options) => createMyContext({ ports: options.ports }),
+});
+
+server.start().then((state) => {
+  console.error(`Daemon started on port ${state.port}`);
+});
 ```
 
-### Detailed Architecture
+### 2. Configure `package.json`
+
+```json
+{
+  "mm": {
+    "daemon": "path/to/daemon.ts",
+    "runtime": "tsx"
+  },
+  "scripts": {
+    "mm:serve": "tsx path/to/daemon.ts"
+  }
+}
+```
 
-The package follows a **capability-based dependency injection** pattern that separates concerns between:
+The `mm.daemon` field tells the CLI where the daemon entry point lives. The `mm.runtime` field specifies the TypeScript runner (defaults to `tsx`).
 
-1. **Core MCP Server** - Protocol handling, tool routing, and generic browser interactions
-2. **Session Manager Interface** - Abstract contract for extension-specific session management
-3. **Capabilities** - Optional features injected by consumer implementations
+### 3. Use the CLI
 
+```bash
+mm launch              # auto-starts daemon, opens browser session
+mm describe-screen     # get element references
+mm click e3            # interact using a11y refs
+mm cleanup --shutdown  # stop browser and daemon
 ```
-┌─────────────────────────────────────────────────────────────────────────┐
-│                         createMcpServer()                               │
-│                                                                         │
-│  ┌─────────────────────┐    ┌─────────────────────────────────────┐    │
-│  │   Tool Definitions  │───▶│         Tool Handlers               │    │
-│  │   (mm_click, etc.)  │    │   (registry.ts + individual tools)  │    │
-│  └─────────────────────┘    └──────────────┬──────────────────────┘    │
-│                                            │                            │
-│                                            ▼                            │
-│  ┌─────────────────────────────────────────────────────────────────┐   │
-│  │                    ISessionManager Interface                     │   │
-│  │  ┌─────────────┐ ┌─────────────┐ ┌─────────────┐ ┌───────────┐  │   │
-│  │  │ Page Mgmt   │ │ Navigation  │ │ Screenshots │ │ A11y Refs │  │   │
-│  │  └─────────────┘ └─────────────┘ └─────────────┘ └───────────┘  │   │
-│  │  ┌──────────────────────────────────────────────────────────┐   │   │
-│  │  │              Optional Capabilities                        │   │   │
-│  │  │  • BuildCapability      • FixtureCapability              │   │   │
-│  │  │  • ChainCapability      • ContractSeedingCapability      │   │   │
-│  │  │  • StateSnapshotCapability                               │   │   │
-│  │  └──────────────────────────────────────────────────────────┘   │   │
-│  └─────────────────────────────────────────────────────────────────┘   │
-└─────────────────────────────────────────────────────────────────────────┘
-                                  │
-                                  │ setSessionManager()
-                                  ▼
-┌─────────────────────────────────────────────────────────────────────────┐
-│              Consumer Implementation (e.g., MetaMask)                   │
-│                                                                         │
-│  class MetaMaskSessionManager implements ISessionManager {              │
-│    // Browser context, page tracking, extension-specific logic          │
-│    // Capability implementations for build, fixtures, chain, etc.       │
-│  }                                                                      │
-└─────────────────────────────────────────────────────────────────────────┘
+
+If running from outside the project directory (e.g., a parent folder containing multiple repos):
+
+```bash
+mm --project ./my-extension launch
+mm --project ./my-extension describe-screen
+
+# Or set once via environment variable
+export MM_PROJECT=/path/to/my-extension
+mm launch
 ```
 
-### Core Components
+## Core Concepts
 
-| Component             | Description                                                    |
-| --------------------- | -------------------------------------------------------------- |
-| `createMcpServer()`   | Factory function that creates the MCP server instance          |
-| `ISessionManager`     | Interface that consumers must implement for session management |
-| `setSessionManager()` | Injects the consumer's session manager into the core           |
-| `WorkflowContext`     | Container for browser capability and optional capabilities     |
-| `EnvironmentConfig`   | Configuration discriminated by `'e2e'` or `'prod'` mode        |
+### Daemon Model
 
-### Capability System
+The architecture relies on a persistent background HTTP daemon that manages the browser lifecycle:
 
-The package defines several capabilities that consumers can provide.
+- **Worktree Isolation**: Each git worktree runs its own daemon instance, tracked via a `.mm-server` state file in the project root. This allows parallel work across branches.
+- **Port Allocation**: The daemon automatically allocates ports for the HTTP server and test infrastructure (Anvil, fixture server, mock server) to avoid conflicts.
+- **Auto-Start**: The daemon starts automatically on `mm launch` if not already running, and shuts down after a period of inactivity (default: 30 minutes).
+- **Request Serialization**: A `RequestQueue` (async mutex) ensures only one tool executes at a time, preventing race conditions on shared browser state.
+- **Health Checks**: Each daemon generates a unique nonce on startup. The CLI verifies daemon identity via `GET /health` to detect stale `.mm-server` files from crashed processes.
+- **Logs**: Daemon activity is logged to `.mm-daemon.log`.
 
-#### BuildCapability (Optional)
+### Session Manager Interface
 
-Enables the `mm_build` tool. Implement this to allow LLM agents to build the extension from source.
+`ISessionManager` is the core abstraction boundary between this package and consumer implementations. Consumers must implement this interface to provide extension-specific browser control.
 
 ```typescript
-type BuildCapability = {
-  // Build the extension (e.g., yarn build:test)
-  build(options?: BuildOptions): Promise<BuildResult>;
+type ISessionManager = {
+  // Session Lifecycle
+  hasActiveSession(): boolean;
+  getSessionId(): string | undefined;
+  launch(input: SessionLaunchInput): Promise<SessionLaunchResult>;
+  cleanup(): Promise<boolean>;
 
-  // Get path to built extension directory
-  getExtensionPath(): string;
+  // Page Management
+  getPage(): Page;
+  setActivePage(page: Page): void;
+  getTrackedPages(): TrackedPage[];
+  classifyPageRole(page: Page, extensionId?: string): TabRole;
+  getContext(): BrowserContext;
 
-  // Check if extension is already built
-  isBuilt(): Promise<boolean>;
-};
+  // Extension State
+  getExtensionState(): Promise<ExtensionState>;
+
+  // A11y Reference Map
+  setRefMap(map: Map<string, string>): void;
+  getRefMap(): Map<string, string>;
+  resolveA11yRef(ref: string): string | undefined;
+
+  // Navigation
+  navigateToHome(): Promise<void>;
+  navigateToSettings(): Promise<void>;
+  navigateToUrl(url: string): Promise<Page>;
+  navigateToNotification(): Promise<Page>;
+  waitForNotificationPage(timeoutMs: number): Promise<Page>;
+
+  // Screenshots
+  screenshot(options: SessionScreenshotOptions): Promise<ScreenshotResult>;
 
-type BuildOptions = {
-  buildType?: string; // e.g., "build:test"
-  force?: boolean; // Force rebuild even if exists
+  // Capabilities (optional, extension-specific)
+  getBuildCapability(): BuildCapability | undefined;
+  getFixtureCapability(): FixtureCapability | undefined;
+  getChainCapability(): ChainCapability | undefined;
+  getContractSeedingCapability(): ContractSeedingCapability | undefined;
+  getStateSnapshotCapability(): StateSnapshotCapability | undefined;
+
+  // Environment
+  getEnvironmentMode(): EnvironmentMode;
+  setContext(context: 'e2e' | 'prod', options?: Record<string, unknown>): void;
+  getContextInfo(): { currentContext: 'e2e' | 'prod'; ... };
 };
+```
+
+### Workflow Context & Capabilities
 
-type BuildResult = {
-  success: boolean;
-  extensionPath: string;
-  durationMs: number;
-  error?: string;
+The `WorkflowContext` aggregates optional capabilities that consumers inject through the `contextFactory`. The tool system checks for capabilities at runtime — tools that depend on missing capabilities return clear errors.
+
+```typescript
+type WorkflowContext = {
+  build?: BuildCapability;
+  fixture?: FixtureCapability;
+  chain?: ChainCapability;
+  contractSeeding?: ContractSeedingCapability;
+  stateSnapshot?: StateSnapshotCapability;
+  mockServer?: MockServerCapability;
+  config: EnvironmentConfig;
 };
 ```
 
----
+Capabilities are created by the consumer's `contextFactory` function, which receives allocated port numbers:
+
+```typescript
+function createMyContext(options: {
+  ports: { anvil: number; fixture: number; mock: number };
+}): WorkflowContext {
+  return {
+    build: new MyBuildCapability(),
+    fixture: new MyFixtureCapability(options.ports.fixture),
+    chain: new MyChainCapability(options.ports.anvil),
+    config: {
+      environment: 'e2e',
+      extensionName: 'MyExtension',
+      defaultPassword: 'test-password',
+      artifactsDir: './test-artifacts',
+      defaultChainId: 1337,
+      ports: {
+        anvil: options.ports.anvil,
+        fixtureServer: options.ports.fixture,
+      },
+    },
+  };
+}
+```
+
+### Capability Reference
 
-#### FixtureCapability (Optional)
+| Capability                  | Purpose                                 | Enables Tools                                                               |
+| --------------------------- | --------------------------------------- | --------------------------------------------------------------------------- |
+| `BuildCapability`           | Build extension from source             | `build`                                                                     |
+| `FixtureCapability`         | Manage wallet state via fixtures        | `launch` (state modes)                                                      |
+| `ChainCapability`           | Local blockchain (Anvil) lifecycle      | Chain interactions                                                          |
+| `ContractSeedingCapability` | Deploy smart contracts to Anvil         | `seed_contract`, `seed_contracts`, `get_contract_address`, `list_contracts` |
+| `StateSnapshotCapability`   | Read extension state and detect screens | `get_state`                                                                 |
+| `MockServerCapability`      | HTTP mock server for API stubbing       | Mock-dependent tests                                                        |
 
-Enables wallet state management through fixtures. Essential for E2E testing where you need reproducible wallet states.
+Each capability interface is defined in `src/capabilities/types.ts`:
 
 ```typescript
+type BuildCapability = {
+  build(options?: BuildOptions): Promise<BuildResult>;
+  getExtensionPath(): string;
+  isBuilt(): Promise<boolean>;
+};
+
 type FixtureCapability = {
-  // Start fixture server with given wallet state
   start(state: WalletState): Promise<void>;
-
-  // Stop fixture server
   stop(): Promise<void>;
-
-  // Get default pre-onboarded wallet state (25 ETH, unlocked)
   getDefaultState(): WalletState;
-
-  // Get fresh onboarding state (no wallet configured)
   getOnboardingState(): WalletState;
-
-  // Resolve a named preset to fixture data
   resolvePreset(presetName: string): WalletState;
 };
 
-type WalletState = {
-  data: Record<string, unknown>; // Extension storage state
-  meta?: { version: number };
-};
-```
-
----
-
-#### ChainCapability (Optional)
-
-Manages local blockchain (Anvil) for E2E testing. Required for contract interactions.
-
-```typescript
 type ChainCapability = {
-  // Start the local Anvil node
   start(): Promise<void>;
-
-  // Stop the Anvil node
   stop(): Promise<void>;
-
-  // Check if Anvil is running
   isRunning(): boolean;
-
-  // Set the port for the Anvil node
   setPort(port: number): void;
 };
-```
-
----
-
-#### ContractSeedingCapability (Optional)
-
-Enables smart contract deployment tools (`mm_seed_contract`, `mm_seed_contracts`, etc.).
 
-```typescript
 type ContractSeedingCapability = {
-  // Deploy a single contract
   deployContract(
     name: string,
     options?: DeployOptions,
   ): Promise<ContractDeployment>;
-
-  // Deploy multiple contracts in sequence
   deployContracts(
     names: string[],
     options?: DeployOptions,
@@ -221,1264 +297,297 @@ type ContractSeedingCapability = {
     deployed: ContractDeployment[];
     failed: { name: string; error: string }[];
   }>;
-
-  // Get deployed contract address by name
   getContractAddress(name: string): string | null;
-
-  // List all deployed contracts in this session
   listDeployedContracts(): ContractInfo[];
-
-  // Get available contract names
   getAvailableContracts(): string[];
-
-  // Clear the deployment registry
   clearRegistry(): void;
-
-  // Initialize the capability (called during session launch)
   initialize(): void;
 };
 
-type DeployOptions = {
-  hardfork?: string; // EVM hardfork (default: "prague")
-  deployerOptions?: {
-    fromAddress?: string; // Impersonate address
-    fromPrivateKey?: string; // Deploy from specific key
-  };
-};
-```
-
----
-
-#### StateSnapshotCapability (Optional)
-
-```typescript
 type StateSnapshotCapability = {
-  // Get detailed state snapshot
   getState(page: Page, options: StateOptions): Promise<StateSnapshot>;
-
-  // Detect current screen from page content
   detectCurrentScreen(page: Page): Promise<string>;
 };
 
-type StateOptions = {
-  extensionId?: string;
-  chainId?: number;
-};
-```
-
----
-
-#### MockServerCapability (Optional)
-
-Enables mock server for API testing scenarios.
-
-```typescript
 type MockServerCapability = {
-  // Start the mock server
   start(): Promise<void>;
-
-  // Stop the mock server
   stop(): Promise<void>;
-
-  // Check if mock server is running
   isRunning(): boolean;
-
-  // Get the server instance
   getServer(): unknown;
-
-  // Get the port the server is running on
   getPort(): number;
 };
 ```
 
-## Client Integration
-
-### How to Consume the Package
-
-Consumers must:
+### Tool System
 
-1. **Implement `ISessionManager`** - The core interface for session management
-2. **Inject the session manager** - Call `setSessionManager()` before starting the server
-3. **Start the MCP server** - Call `server.start()`
-
-### McpServerConfig
-
-The `createMcpServer()` function accepts a configuration object:
-
-```typescript
-export type McpServerConfig = {
-  name: string;
-  version: string;
-  onCleanup?: () => Promise<void>;
-  logger?: (message: string) => void;
-};
-```
-
-### Minimal Integration Example
+Tools are standalone functions registered in a central `toolRegistry`. Each tool receives a `ToolContext` and returns a `ToolResponse`.
 
 ```typescript
-import {
-  createMcpServer,
-  setSessionManager,
-  ISessionManager,
-  type McpServerConfig,
-} from '@metamask/client-mcp-core';
-
-// 1. Implement the ISessionManager interface
-class MyExtensionSessionManager implements ISessionManager {
-  // ... implement all required methods
-  // See ISessionManager interface for full contract
-}
+type ToolFunction<TParams, TResult> = (
+  params: TParams,
+  context: ToolContext,
+) => Promise<ToolResponse<TResult>>;
 
-// 2. Create and inject your session manager
-const sessionManager = new MyExtensionSessionManager();
-setSessionManager(sessionManager);
-
-// 3. Create and start the MCP server
-const config: McpServerConfig = {
-  name: 'my-extension-mcp',
-  version: '1.0.0',
-  onCleanup: async () => {
-    // Optional cleanup logic
-  },
+type ToolContext = {
+  sessionManager: ISessionManager;
+  page: Page;
+  refMap: Map<string, string>;
+  workflowContext: WorkflowContext;
+  knowledgeStore: KnowledgeStore;
 };
-
-const server = createMcpServer(config);
-await server.start();
 ```
 
-### Full Integration Example
-
-```typescript
-import {
-  createMcpServer,
-  setSessionManager,
-  ISessionManager,
-  SessionLaunchInput,
-  SessionLaunchResult,
-  TrackedPage,
-  type ExtensionState,
-  type BuildCapability,
-  type FixtureCapability,
-  type ChainCapability,
-  type ContractSeedingCapability,
-  type EnvironmentMode,
-} from '@metamask/client-mcp-core';
-import type { Page, BrowserContext } from '@playwright/test';
-
-class MetaMaskSessionManager implements ISessionManager {
-  private context?: BrowserContext;
-  private activePage?: Page;
-  private extensionId?: string;
-  private sessionId?: string;
-  private refMap = new Map<string, string>();
-
-  // Capabilities (inject via constructor or lazy-load)
-  private buildCapability?: BuildCapability;
-  private fixtureCapability?: FixtureCapability;
-  private chainCapability?: ChainCapability;
-  private contractSeedingCapability?: ContractSeedingCapability;
-
-  // Session Lifecycle
-  hasActiveSession(): boolean {
-    return this.context !== undefined;
-  }
-
-  getSessionId(): string | undefined {
-    return this.sessionId;
-  }
-
-  async launch(input: SessionLaunchInput): Promise<SessionLaunchResult> {
-    // 1. Start local chain if needed
-    if (this.chainCapability) {
-      await this.chainCapability.start();
-    }
-
-    // 2. Start fixture server if needed
-    if (this.fixtureCapability && input.stateMode !== 'onboarding') {
-      const fixture = input.fixture ?? this.fixtureCapability.getDefaultState();
-      await this.fixtureCapability.start(fixture);
-    }
-
-    // 3. Launch browser with extension
-    // ... Playwright browser launch logic
-
-    // 4. Return session info
-    return {
-      sessionId: this.sessionId!,
-      extensionId: this.extensionId!,
-      state: await this.getExtensionState(),
-    };
-  }
-
-  async cleanup(): Promise<boolean> {
-    if (!this.hasActiveSession()) return false;
-
-    // Close browser, stop services
-    await this.context?.close();
-    await this.chainCapability?.stop();
-    await this.fixtureCapability?.stop();
-
-    this.context = undefined;
-    this.activePage = undefined;
-    return true;
-  }
-
-  // Page Management
-  getPage(): Page {
-    if (!this.activePage) throw new Error('No active session');
-    return this.activePage;
-  }
-
-  setActivePage(page: Page): void {
-    this.activePage = page;
-  }
-
-  getTrackedPages(): TrackedPage[] {
-    // Return all tracked pages with roles
-    return [];
-  }
-
-  getContext(): BrowserContext {
-    if (!this.context) throw new Error('No active session');
-    return this.context;
-  }
-
-  // Extension State
-  async getExtensionState(): Promise<ExtensionState> {
-    // Query extension for current state
-    return {
-      isLoaded: true,
-      currentUrl: this.activePage?.url() ?? '',
-      extensionId: this.extensionId ?? '',
-      isUnlocked: false,
-      currentScreen: 'unknown',
-      accountAddress: null,
-      networkName: null,
-      chainId: null,
-      balance: null,
-    };
-  }
-
-  // A11y Reference Map
-  setRefMap(map: Map<string, string>): void {
-    this.refMap = map;
-  }
-
-  getRefMap(): Map<string, string> {
-    return this.refMap;
-  }
-
-  clearRefMap(): void {
-    this.refMap.clear();
-  }
-
-  resolveA11yRef(ref: string): string | undefined {
-    return this.refMap.get(ref);
-  }
-
-  // Navigation
-  async navigateToHome(): Promise<void> {
-    // Navigate to extension home page
-  }
+The daemon routes `POST /tool/:name` requests through the registry, applies Zod validation on inputs, executes the tool through the request queue, and captures observations (extension state, test IDs, a11y snapshot) after each execution.
 
-  async navigateToSettings(): Promise<void> {
-    // Navigate to extension settings page
-  }
-
-  async navigateToUrl(url: string): Promise<Page> {
-    // Open URL in new tab and return the page
-    return this.activePage!;
-  }
-
-  async navigateToNotification(): Promise<Page> {
-    // Navigate to notification page
-    return this.activePage!;
-  }
+**Registered tools:**
 
-  async waitForNotificationPage(timeoutMs: number): Promise<Page> {
-    // Wait for notification popup to appear
-    return this.activePage!;
-  }
+| Tool                     | Description                                                                                                                                                                                         |
+| ------------------------ | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| **Lifecycle**            |                                                                                                                                                                                                     |
+| `build`                  | Triggers an extension build using the configured `BuildCapability`. Accepts build type and force options.                                                                                           |
+| `launch`                 | Launches a new browser session with the configured extension. Supports state modes (`default`, `onboarding`, `custom`), fixture presets, goal/tag metadata, and optional contract seeding on start. |
+| `cleanup`                | Tears down the active browser session and cleans up all resources (browser, services, fixtures).                                                                                                    |
+| **Interaction**          |                                                                                                                                                                                                     |
+| `click`                  | Clicks an element identified by a11y ref, test ID, or CSS selector. Waits for the element to be visible before clicking.                                                                            |
+| `type`                   | Types text into an input element identified by a11y ref, test ID, or CSS selector. Uses Playwright's `fill()` for reliable input.                                                                   |
+| `wait_for`               | Waits for an element to become visible on the page within a configurable timeout.                                                                                                                   |
+| `clipboard`              | Reads from or writes to the system clipboard via Chrome DevTools Protocol. Useful for pasting seed phrases or copying addresses.                                                                    |
+| **Navigation**           |                                                                                                                                                                                                     |
+| `navigate`               | Navigates the browser to a named screen (`home`, `settings`, `notification`) or an arbitrary URL.                                                                                                   |
+| `switch_to_tab`          | Switches the active page to a tab matching a given role (e.g., `extension`, `dapp`) or URL prefix.                                                                                                  |
+| `close_tab`              | Closes a browser tab matching a given role or URL. Falls back to the extension tab if the active tab is closed.                                                                                     |
+| `wait_for_notification`  | Waits for the extension notification popup to appear within a timeout. Returns the notification page URL.                                                                                           |
+| **Discovery**            |                                                                                                                                                                                                     |
+| `describe_screen`        | Captures a comprehensive screen snapshot: extension state, visible test IDs, trimmed a11y tree with refs, optional screenshot, and prior knowledge from historical sessions.                        |
+| `accessibility_snapshot` | Captures a trimmed accessibility tree of the current page with deterministic refs (`e1`, `e2`, ...). Supports scoping to a root CSS selector.                                                       |
+| `list_testids`           | Collects all visible `data-testid` attributes from the current page with text previews and visibility status.                                                                                       |
+| **State**                |                                                                                                                                                                                                     |
+| `get_state`              | Retrieves the current extension state (URL, screen, network, balance, account) and tracked tab information.                                                                                         |
+| `get_context`            | Returns the current environment context (`e2e` or `prod`), session status, available capabilities, and whether context switching is allowed.                                                        |
+| `set_context`            | Switches the session environment between `e2e` and `prod` modes. Blocked while a session is active.                                                                                                 |
+| **Screenshots**          |                                                                                                                                                                                                     |
+| `screenshot`             | Captures a screenshot of the current page. Supports naming, full-page capture, scoping to a CSS selector, and optional base64 output.                                                               |
+| **Knowledge**            |                                                                                                                                                                                                     |
+| `knowledge_last`         | Retrieves the N most recent step records from the knowledge store, with optional scope and filter parameters.                                                                                       |
+| `knowledge_search`       | Searches step records by query string with token-based matching and synonym expansion. Scores results by relevance to screen, URL, test IDs, and a11y nodes.                                        |
+| `knowledge_summarize`    | Generates a recipe-style summary of a session's tool invocations, showing the step sequence with targets and outcomes.                                                                              |
+| `knowledge_sessions`     | Lists available knowledge sessions with metadata (goal, flow tags, timestamps), with optional filtering.                                                                                            |
+| **Contracts**            |                                                                                                                                                                                                     |
+| `seed_contract`          | Deploys a single smart contract to the local Anvil chain by name. Requires `ContractSeedingCapability`.                                                                                             |
+| `seed_contracts`         | Deploys multiple smart contracts in sequence. Returns both successful deployments and individual failures.                                                                                          |
+| `get_contract_address`   | Looks up the deployed address of a contract by name from the session's deployment registry.                                                                                                         |
+| `list_contracts`         | Lists all contracts deployed in the current session with addresses and deployment timestamps.                                                                                                       |
+| **Batching**             |                                                                                                                                                                                                     |
+| `run_steps`              | Executes a batch of tool invocations sequentially. Supports `stopOnError` to halt on first failure. Returns per-step results with timing.                                                           |
 
-  // Screenshots
-  async screenshot(options: { name: string; fullPage?: boolean }) {
-    // ... screenshot logic
-    return { path: '', base64: '', width: 0, height: 0 };
-  }
+### Accessibility References
 
-  // Capabilities
-  getBuildCapability() {
-    return this.buildCapability;
-  }
-  getFixtureCapability() {
-    return this.fixtureCapability;
-  }
-  getChainCapability() {
-    return this.chainCapability;
-  }
-  getContractSeedingCapability() {
-    return this.contractSeedingCapability;
-  }
-  getStateSnapshotCapability() {
-    return undefined;
-  }
+The core uses Playwright's `ariaSnapshot()` to build a deterministic reference map of interactive elements. Each element gets a short ref like `e1`, `e2`, etc., mapped to an ARIA selector.
 
-  // Environment
-  getEnvironmentMode(): EnvironmentMode {
-    return 'e2e';
-  }
+Agents call `describe_screen` to get the current reference map, then use refs for interaction:
 
-  // Required by interface but implementation-specific
-  classifyPageRole(
-    page: Page,
-  ): 'extension' | 'notification' | 'dapp' | 'other' {
-    return 'extension';
-  }
-  getSessionState() {
-    return undefined;
-  }
-  getSessionMetadata() {
-    return undefined;
-  }
+```
+mm describe-screen    → { ..., a11y: [{ ref: "e1", role: "button", name: "Submit" }, ...] }
+mm click e1           → clicks the "Submit" button
+mm type e3 "hello"    → types into the element mapped to e3
+```
 
-  // Context Management
-  setContext(context: 'e2e' | 'prod', options?: Record<string, unknown>): void {
-    if (this.hasActiveSession()) {
-      throw new Error('Cannot switch context while session is active');
-    }
-    // Switch environment context and apply optional context-specific config
-    void options;
-  }
+This accessibility-first approach provides reliable element targeting that survives minor UI changes.
 
-  getContextInfo() {
-    return {
-      currentContext: this.getEnvironmentMode(),
-      hasActiveSession: this.hasActiveSession(),
-      sessionId: this.sessionId ?? null,
-      capabilities: {
-        available: [
-          this.buildCapability && 'build',
-          this.fixtureCapability && 'fixture',
-          this.chainCapability && 'chain',
-          this.contractSeedingCapability && 'contractSeeding',
-        ].filter(Boolean) as string[],
-      },
-      canSwitchContext: !this.hasActiveSession(),
-    };
-  }
-}
+### Knowledge Store
 
-// Bootstrap the server
-async function main() {
-  const sessionManager = new MetaMaskSessionManager();
-  setSessionManager(sessionManager);
+The `KnowledgeStore` provides cross-session learning by recording every tool execution as a structured step record:
 
-  const server = createMcpServer({
-    name: 'metamask-mcp',
-    version: '1.0.0',
-  });
+- **Step Recording**: Each tool invocation captures the tool name, input, outcome, observation (extension state, visible test IDs, a11y nodes), and timing.
+- **Session Metadata**: Sessions are tagged with goals, flow tags, and free-form tags for filtering.
+- **Prior Knowledge**: Before tool execution, the store can generate context from historical sessions — similar steps, suggested actions, and patterns to avoid — based on the current screen state.
+- **Search**: Token-based search with synonym expansion across sessions, scored by relevance to screen, URL, test IDs, and a11y nodes.
+- **Sensitive Data Handling**: Input text for password fields and other sensitive inputs is automatically redacted.
 
-  await server.start();
-}
+Knowledge artifacts are stored on disk at `test-artifacts/llm-knowledge/` organized by session ID.
 
-main().catch(console.error);
-```
+### Environment Modes
 
-### Environment Configuration
+The package supports two environment modes via discriminated union configuration:
 
-The package supports two environment modes:
+**E2E Testing** — Full test infrastructure with local chain, fixtures, and contract seeding:
 
 ```typescript
-// E2E Testing Environment
 const e2eConfig: E2EEnvironmentConfig = {
   environment: 'e2e',
   extensionName: 'MetaMask',
   defaultPassword: 'password123',
-  toolPrefix: 'mm',
   artifactsDir: './test-artifacts',
   defaultChainId: 1337,
-  ports: {
-    anvil: 8545,
-    fixtureServer: 12345,
-  },
-};
-
-// Production-like Environment
-const prodConfig: ProdEnvironmentConfig = {
-  environment: 'prod',
-  extensionName: 'MetaMask',
-  toolPrefix: 'mm',
+  ports: { anvil: 8545, fixtureServer: 12345 },
 };
 ```
 
-### Context Switching Options
-
-`mm_set_context` supports an optional `options` payload that is forwarded to the session manager's `setContext(context, options)` implementation.
+**Production-like** — Minimal configuration without test infrastructure:
 
 ```typescript
-type SetContextInput = {
-  context: 'e2e' | 'prod';
-  options?: Record<string, unknown>;
+const prodConfig: ProdEnvironmentConfig = {
+  environment: 'prod',
+  extensionName: 'MetaMask',
 };
-
-// Example: switch to e2e and pass context-specific overrides
-await handleSetContext({
-  context: 'e2e',
-  options: {
-    mockServer: {
-      enabled: true,
-      port: 18000,
-    },
-  },
-});
-```
-
-Use `options` only for context-specific configuration your `ISessionManager` implementation understands.
-
-### Custom Tool Definitions
-
-The package provides a fixed set of tools prefixed with `mm_`. Custom tool injection is currently not supported. You can inspect the available tool definitions using `getToolDefinitions()`:
-
-```typescript
-import { getToolDefinitions } from '@metamask/client-mcp-core';
-
-const tools = getToolDefinitions();
-console.log(`Available tools: ${tools.map((t) => t.name).join(', ')}`);
 ```
 
-### Registering Custom Tool Handlers
-
-Custom tool handlers are not supported. The server uses a fixed set of handlers for the provided tools.
-
-## Available Tools
-
-All tools are prefixed with `mm_` and return a standardized response format:
-
-```typescript
-type ToolResponse<Result> =
-  | {
-      ok: true;
-      meta: {
-        timestamp: string; // ISO timestamp
-        sessionId?: string; // Current session ID
-        durationMs: number; // Operation duration
-      };
-      result: Result; // Success payload
-    }
-  | {
-      ok: false;
-      meta: {
-        timestamp: string;
-        sessionId?: string;
-        durationMs: number;
-      };
-      error: {
-        code: string;
-        message: string;
-        details?: Record<string, unknown>;
-      };
-    };
-```
-
----
-
-### Session Management Tools
+Use `set_context` / `get_context` tools to switch between modes at runtime (requires no active session).
 
-#### `mm_build`
+## Server Configuration
 
-Build the extension from source. Requires `BuildCapability`.
-
-**Input:**
-| Parameter | Type | Default | Description |
-|-----------|------|---------|-------------|
-| `buildType` | `"build:test"` | `"build:test"` | Build script to run |
-| `force` | `boolean` | `false` | Force rebuild even if build exists |
-
-**Output:**
+The `createServer()` function accepts a `ServerConfig` object:
 
 ```typescript
-{
-  buildType: 'build:test';
-  extensionPathResolved: string; // Absolute path to built extension
-}
-```
-
-**Example:**
+type ServerConfig = {
+  /** Session manager instance (required) */
+  sessionManager: ISessionManager;
+  /** Factory function to create workflow context (required) */
+  contextFactory: (options: ContextFactoryOptions) => WorkflowContext;
+  /** Idle timeout in milliseconds (optional, defaults to 30000) */
+  idleTimeoutMs?: number;
+  /** Path to log file (optional) */
+  logFilePath?: string;
+};
 
-```json
-{ "buildType": "build:test", "force": true }
+type ContextFactoryOptions = {
+  ports: {
+    anvil: number;
+    fixture: number;
+    mock: number;
+  };
+};
 ```
 
----
-
-#### `mm_launch`
+The returned `ServerInstance` exposes:
 
-Launch a headed Chrome browser with the extension loaded. This is typically the first tool called.
+- `start(): Promise<DaemonState>` — Allocates ports, starts HTTP server, writes `.mm-server` state, sets up idle timeout and signal handlers.
+- `stop(): Promise<void>` — Stops accepting connections, cleans up session, removes `.mm-server` state.
 
-**Input:**
-| Parameter | Type | Default | Description |
-|-----------|------|---------|-------------|
-| `autoBuild` | `boolean` | `true` | Auto-build if extension not found |
-| `stateMode` | `"default" \| "onboarding" \| "custom"` | `"default"` | Wallet initialization mode |
-| `fixturePreset` | `string` | - | Named preset when `stateMode="custom"` |
-| `fixture` | `object` | - | Direct fixture object when `stateMode="custom"` |
-| `ports.anvil` | `number` | `8545` | Anvil RPC port |
-| `ports.fixtureServer` | `number` | `12345` | Fixture server port |
-| `slowMo` | `number` | `0` | Slow down actions (ms) for debugging |
-| `extensionPath` | `string` | - | Custom extension directory path |
-| `goal` | `string` | - | Session goal for knowledge store |
-| `flowTags` | `string[]` | - | Flow categorization tags |
-| `tags` | `string[]` | - | Free-form tags |
-| `seedContracts` | `string[]` | - | Contracts to deploy on launch |
+## HTTP API
 
-**State Modes:**
+The daemon exposes the following endpoints on `127.0.0.1`:
 
-- `default` - Pre-onboarded wallet with 25 ETH, ready to use
-- `onboarding` - Fresh state, requires wallet setup flow
-- `custom` - Use provided fixture or preset
+| Method | Path          | Description                                  |
+| ------ | ------------- | -------------------------------------------- |
+| `GET`  | `/health`     | Health check with nonce verification         |
+| `GET`  | `/status`     | Daemon status (PID, port, uptime, sub-ports) |
+| `POST` | `/launch`     | Start a browser session                      |
+| `POST` | `/cleanup`    | Stop the current browser session             |
+| `POST` | `/tool/:name` | Execute a registered tool with JSON body     |
 
-**Output:**
+All responses follow a consistent shape:
 
 ```typescript
-{
-  sessionId: string;        // Unique session identifier
-  extensionId: string;      // Extension's Chrome ID
-  state: ExtensionState;    // Initial extension state
-  prerequisites?: [{        // Steps taken before launch
-    step: string;
-    description: string;
-  }];
-}
-```
-
-**Example:**
+// Success
+{ ok: true, result: T, observations?: { state, testIds, a11y } }
 
-```json
-{
-  "stateMode": "default",
-  "goal": "Test send flow",
-  "flowTags": ["send"],
-  "seedContracts": ["hst"]
-}
-```
-
----
-
-#### `mm_cleanup`
-
-Stop the browser and all services (Anvil, fixture server). Always call when done.
-
-**Input:**
-| Parameter | Type | Default | Description |
-|-----------|------|---------|-------------|
-| `sessionId` | `string` | - | Optional session ID to clean up |
-
-**Output:**
-
-```typescript
-{
-  cleanedUp: boolean; // Whether cleanup was performed
-}
+// Error
+{ ok: false, error: { code: string, message: string } }
 ```
 
----
-
-### Discovery Tools
-
-#### `mm_get_state`
-
-Get current extension state including screen, balance, network, and account.
+## CLI Reference
 
-**Input:** None
+The `mm` CLI provides a unified interface for agents and developers. All commands communicate with the daemon over HTTP — the daemon is auto-started on `mm launch` if not already running.
 
-**Output:**
+### Global Options
 
-```typescript
-{
-  state: {
-    isLoaded: boolean;
-    currentUrl: string;
-    extensionId: string;
-    isUnlocked: boolean;
-    currentScreen: ScreenName;
-    accountAddress: string | null;
-    networkName: string | null;
-    chainId: number | null;
-    balance: string | null;
-  };
-  tabs?: {
-    active: { role: TabRole; url: string };
-    tracked: { role: TabRole; url: string }[];
-  };
-}
-```
+| Option             | Description                                                                                                 |
+| ------------------ | ----------------------------------------------------------------------------------------------------------- |
+| `--project <path>` | Target a specific project directory (absolute or relative). Overrides `MM_PROJECT` and git-based discovery. |
 
----
+| Environment Variable | Description                                                                                              |
+| -------------------- | -------------------------------------------------------------------------------------------------------- |
+| `MM_PROJECT`         | Default project directory when `--project` is not provided. Falls back to the current git worktree root. |
 
-#### `mm_list_testids`
+### Project Targeting
 
-List all visible `data-testid` attributes on the current page. Use to discover interaction targets.
+By default, the CLI resolves the target project from the current git worktree. This works when running from inside the project directory. For other scenarios, the resolution order is:
 
-**Input:**
-| Parameter | Type | Default | Description |
-|-----------|------|---------|-------------|
-| `limit` | `number` | `150` | Maximum items to return (1-500) |
+1. **`--project <path>`** — Explicit flag, highest priority. Accepts absolute or relative paths.
+2. **`MM_PROJECT`** — Environment variable. Useful for setting once in agent config or shell profile.
+3. **Git worktree** — `git rev-parse --show-toplevel` from the current working directory (existing behavior).
 
-**Output:**
-
-```typescript
-{
-  items: [{
-    testId: string;    // The data-testid value
-    tag: string;       // HTML tag (button, input, div, etc.)
-    text?: string;     // Visible text content
-    visible: boolean;  // Whether element is visible
-  }];
-}
-```
+```bash
+# From inside the project (unchanged)
+mm launch
+
+# From a parent folder containing multiple repos
+mm --project ./metamask-extension launch
+
+# Via environment variable
+export MM_PROJECT=/path/to/metamask-extension
+mm describe-screen
+```
+
+### Lifecycle
+
+| Command                                                                               | Description                                                                                                                                                                                                                                                                                                           |
+| ------------------------------------------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `mm launch [--state default\|onboarding\|custom] [--extension-path <path>] [--force]` | Auto-starts the daemon if needed, then launches a headed Chrome session with the configured extension. Use `--state` to control wallet initialization (pre-configured, onboarding flow, or custom fixture). Use `--extension-path` to override the extension directory. Use `--force` to replace an existing session. |
+| `mm cleanup [--shutdown]`                                                             | Stops the browser, tears down test services (fixture server, Anvil, mock server), and releases session resources. Add `--shutdown` to also terminate the daemon process.                                                                                                                                              |
+| `mm status`                                                                           | Displays the daemon's current status: PID, port, uptime, allocated sub-ports, and whether a browser session is active.                                                                                                                                                                                                |
+| `mm serve [--background]`                                                             | Manually starts the HTTP daemon without launching a browser session. Use `--background` to detach the process. Fails if a daemon is already running for this worktree.                                                                                                                                                |
+
+### Interaction
+
+| Command                              | Description                                                                                                                                                                                                                                                                |
+| ------------------------------------ | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `mm click <ref>`                     | Clicks an element by its accessibility reference (e.g., `e3`). The ref comes from a prior `describe-screen` call. Waits for the element to be visible before clicking.                                                                                                     |
+| `mm type <ref> <text>`               | Types text into an input element identified by its accessibility reference. Replaces any existing content in the field.                                                                                                                                                    |
+| `mm describe-screen`                 | Captures the full screen state: extension info, visible test IDs, a trimmed accessibility tree with deterministic refs (`e1`, `e2`, ...), and prior knowledge from historical sessions. This is the primary command for understanding what's on screen before interacting. |
+| `mm screenshot [--name <name>]`      | Takes a full-page screenshot of the current page. Saves to the artifacts directory. Use `--name` to set a descriptive filename.                                                                                                                                            |
+| `mm wait-for <ref> [--timeout <ms>]` | Blocks until an element identified by its accessibility reference becomes visible, or the timeout expires. Default timeout is 5 seconds.                                                                                                                                   |
+
+### Navigation
+
+| Command                | Description                                                                                       |
+| ---------------------- | ------------------------------------------------------------------------------------------------- |
+| `mm navigate <url>`    | Opens a new tab and navigates to the given URL. Useful for navigating to dApps or external pages. |
+| `mm navigate-home`     | Navigates the extension tab to the wallet home screen.                                            |
+| `mm navigate-settings` | Navigates the extension tab to the settings page.                                                 |
+
+### State & Knowledge
+
+| Command                       | Description                                                                                                                                                         |
+| ----------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `mm get-state`                | Returns the current extension state: loaded status, current URL, screen name, network, chain ID, account address, and balance. Also lists all tracked browser tabs. |
+| `mm knowledge-search <query>` | Searches the knowledge store for past tool invocations matching the query. Results are scored by relevance to screen, URL, test IDs, and a11y nodes.                |
+| `mm knowledge-last`           | Retrieves the most recent step records from the current session's knowledge store.                                                                                  |
+| `mm knowledge-sessions`       | Lists recent knowledge sessions with metadata (goal, flow tags, timestamps).                                                                                        |
+| `mm run-steps <json>`         | Executes a batch of tool invocations sequentially from a JSON definition. Each step specifies a tool name and arguments.                                            |
+
+For the full agent-facing reference and workflow guidelines, see [SKILL.md](./SKILL.md).
+
+## Error Classification
+
+Tool errors are classified into specific error codes for structured handling:
+
+| Code                        | Meaning                                       |
+| --------------------------- | --------------------------------------------- |
+| `MM_TARGET_NOT_FOUND`       | Element not found by ref, testId, or selector |
+| `MM_WAIT_TIMEOUT`           | Timeout waiting for element or condition      |
+| `MM_CLICK_FAILED`           | Click operation failed                        |
+| `MM_TYPE_FAILED`            | Type operation failed                         |
+| `MM_NAVIGATION_FAILED`      | Navigation error or network failure           |
+| `MM_PAGE_CLOSED`            | Browser page was closed unexpectedly          |
+| `MM_NOTIFICATION_TIMEOUT`   | Notification popup did not appear             |
+| `MM_TAB_NOT_FOUND`          | Tab not found by role or URL                  |
+| `MM_DISCOVERY_FAILED`       | Discovery tool failure                        |
+| `MM_SCREENSHOT_FAILED`      | Screenshot capture failure                    |
+| `MM_CONTRACT_NOT_FOUND`     | Unknown contract name                         |
+| `MM_SEED_FAILED`            | Contract deployment failure                   |
+| `MM_CONTEXT_SWITCH_BLOCKED` | Context switch while session is active        |
 
-**Example Output:**
-
-```json
-{
-  "items": [
-    {
-      "testId": "account-menu-icon",
-      "tag": "button",
-      "text": "",
-      "visible": true
-    },
-    {
-      "testId": "eth-overview-send",
-      "tag": "button",
-      "text": "Send",
-      "visible": true
-    },
-    {
-      "testId": "token-balance",
-      "tag": "span",
-      "text": "25 ETH",
-      "visible": true
-    }
-  ]
-}
-```
-
----
-
-#### `mm_accessibility_snapshot`
-
-Get a trimmed accessibility tree with deterministic refs (e1, e2, ...). Refs can be used with `mm_click` and `mm_type`.
-
-**Input:**
-| Parameter | Type | Default | Description |
-|-----------|------|---------|-------------|
-| `rootSelector` | `string` | - | CSS selector to scope the snapshot |
-
-**Included Roles:**
-
-- **Actionable:** button, link, checkbox, radio, switch, textbox, combobox, menuitem
-- **Important:** dialog, alert, status, heading
-
-**Output:**
-
-```typescript
-{
-  nodes: [{
-    ref: string;       // Deterministic ref (e1, e2, e3, ...)
-    role: string;      // ARIA role
-    name: string;      // Accessible name
-    disabled?: boolean;
-    checked?: boolean;
-    expanded?: boolean;
-    path: string[];    // Ancestor path for context
-  }];
-}
-```
-
-**Example Output:**
-
-```json
-{
-  "nodes": [
-    { "ref": "e1", "role": "button", "name": "Send", "path": ["main", "div"] },
-    { "ref": "e2", "role": "button", "name": "Swap", "path": ["main", "div"] },
-    { "ref": "e3", "role": "textbox", "name": "Amount", "path": ["form"] }
-  ]
-}
-```
-
----
-
-#### `mm_describe_screen`
-
-Comprehensive screen state combining extension state, testIds, and accessibility snapshot. Optionally includes screenshot.
-
-**Input:**
-| Parameter | Type | Default | Description |
-|-----------|------|---------|-------------|
-| `includeScreenshot` | `boolean` | `false` | Capture screenshot |
-| `screenshotName` | `string` | - | Screenshot filename |
-| `includeScreenshotBase64` | `boolean` | `false` | Include base64 in response |
-
-**Output:**
-
-```typescript
-{
-  state: ExtensionState;
-  testIds: { items: TestIdItem[] };
-  a11y: { nodes: A11yNodeTrimmed[] };
-  screenshot: {
-    path: string;
-    width: number;
-    height: number;
-    base64?: string;
-  } | null;
-  priorKnowledge?: PriorKnowledgeV1;  // Past session hints
-}
-```
-
----
-
-### Interaction Tools
-
-#### `mm_click`
-
-Click an element. Specify exactly ONE of: `a11yRef`, `testId`, or `selector`.
-
-**Input:**
-| Parameter | Type | Default | Description |
-|-----------|------|---------|-------------|
-| `a11yRef` | `string` | - | Accessibility ref from `mm_accessibility_snapshot` (e.g., "e5") |
-| `testId` | `string` | - | `data-testid` attribute value |
-| `selector` | `string` | - | CSS selector |
-| `timeoutMs` | `number` | `15000` | Max wait time (0-60000) |
-
-**Output:**
-
-```typescript
-{
-  clicked: boolean;
-  target: string;                 // Resolved selector
-  pageClosedAfterClick?: boolean; // True if click caused page close
-}
-```
-
-**Examples:**
-
-```json
-{ "a11yRef": "e5" }
-{ "testId": "confirm-btn" }
-{ "selector": "button.primary" }
-```
-
----
-
-#### `mm_type`
-
-Type text into an input element. Specify exactly ONE of: `a11yRef`, `testId`, or `selector`.
-
-**Input:**
-| Parameter | Type | Default | Description |
-|-----------|------|---------|-------------|
-| `a11yRef` | `string` | - | Accessibility ref |
-| `testId` | `string` | - | `data-testid` value |
-| `selector` | `string` | - | CSS selector |
-| `text` | `string` | **required** | Text to type |
-| `timeoutMs` | `number` | `15000` | Max wait time |
-
-**Output:**
-
-```typescript
-{
-  typed: boolean;
-  target: string;
-  textLength: number;
-}
-```
-
-**Example:**
-
-```json
-{ "testId": "amount-input", "text": "0.5" }
-```
-
----
-
-#### `mm_wait_for`
-
-Wait for an element to become visible. Specify exactly ONE of: `a11yRef`, `testId`, or `selector`.
-
-**Input:**
-| Parameter | Type | Default | Description |
-|-----------|------|---------|-------------|
-| `a11yRef` | `string` | - | Accessibility ref |
-| `testId` | `string` | - | `data-testid` value |
-| `selector` | `string` | - | CSS selector |
-| `timeoutMs` | `number` | `15000` | Max wait time (100-120000) |
-
-**Output:**
-
-```typescript
-{
-  found: boolean;
-  target: string;
-}
-```
-
----
-
-#### `mm_clipboard`
-
-Read from or write to the browser clipboard. Useful for pasting content (e.g., Secret Recovery Phrase) into components that support paste functionality.
-
-**Input:**
-| Parameter | Type | Default | Description |
-|-----------|------|---------|-------------|
-| `action` | `"write" \| "read"` | **required** | Clipboard action |
-| `text` | `string` | - | Text to write (required when `action="write"`) |
-
-**Output:**
-
-```typescript
-{
-  action: "write" | "read";
-  success: boolean;
-  text?: string;  // Present when action="read" and successful
-}
-```
-
-**Examples:**
-
-```json
-{ "action": "write", "text": "word1 word2 word3 word4 word5 word6 word7 word8 word9 word10 word11 word12" }
-{ "action": "read" }
-```
-
-**Use Case - Fast SRP Entry:**
-
-```
-1. mm_clipboard { "action": "write", "text": "abandon abandon ... about" }
-2. mm_click { "testId": "srp-input-import__paste-button" }
-→ All 12 words populated instantly via paste
-```
-
----
-
-#### `mm_navigate`
-
-Navigate to a specific screen in the extension.
-
-**Input:**
-| Parameter | Type | Default | Description |
-|-----------|------|---------|-------------|
-| `screen` | `"home" \| "settings" \| "notification" \| "url"` | **required** | Target screen |
-| `url` | `string` | - | Required when `screen="url"` |
-
-**Output:**
-
-```typescript
-{
-  navigated: boolean;
-  currentUrl: string;
-}
-```
-
-**Examples:**
-
-```json
-{ "screen": "home" }
-{ "screen": "settings" }
-{ "screen": "url", "url": "https://app.uniswap.org" }
-```
-
----
-
-### Multi-Tab Tools
-
-#### `mm_wait_for_notification`
-
-Wait for a notification popup to appear (e.g., after dApp interaction). Sets the notification page as active.
-
-**Input:**
-| Parameter | Type | Default | Description |
-|-----------|------|---------|-------------|
-| `timeoutMs` | `number` | `15000` | Max wait time (1000-60000) |
-
-**Output:**
-
-```typescript
-{
-  found: boolean;
-  pageUrl: string;
-}
-```
-
----
-
-#### `mm_switch_to_tab`
-
-Switch the active page for subsequent interactions.
-
-**Input:**
-| Parameter | Type | Default | Description |
-|-----------|------|---------|-------------|
-| `role` | `"extension" \| "notification" \| "dapp" \| "other"` | - | Tab role to switch to |
-| `url` | `string` | - | URL prefix to match |
-
-**Output:**
-
-```typescript
-{
-  switched: boolean;
-  activeTab: {
-    role: TabRole;
-    url: string;
-  }
-}
-```
-
-**Example:**
-
-```json
-{ "role": "dapp" }
-{ "url": "https://app.uniswap.org" }
-```
-
----
-
-#### `mm_close_tab`
-
-Close a specific tab. Cannot close the extension home page.
-
-**Input:**
-| Parameter | Type | Default | Description |
-|-----------|------|---------|-------------|
-| `role` | `"notification" \| "dapp" \| "other"` | - | Tab role to close |
-| `url` | `string` | - | URL prefix to match |
-
-**Output:**
-
-```typescript
-{
-  closed: boolean;
-  closedUrl: string;
-}
-```
-
----
-
-### Screenshot Tools
-
-#### `mm_screenshot`
-
-Capture a screenshot and save to `test-artifacts/screenshots/`.
-
-**Input:**
-| Parameter | Type | Default | Description |
-|-----------|------|---------|-------------|
-| `name` | `string` | **required** | Filename (without extension) |
-| `fullPage` | `boolean` | `true` | Capture full page |
-| `selector` | `string` | - | Capture specific element only |
-| `includeBase64` | `boolean` | `false` | Include base64 in response |
-
-**Output:**
-
-```typescript
-{
-  path: string;      // File path
-  width: number;
-  height: number;
-  base64?: string;   // If includeBase64=true
-}
-```
-
----
-
-### Smart Contract Tools
-
-#### `mm_seed_contract`
-
-Deploy a smart contract to the local Anvil node. Requires `ContractSeedingCapability`.
-
-**Input:**
-| Parameter | Type | Default | Description |
-|-----------|------|---------|-------------|
-| `contractName` | `string` | **required** | Contract to deploy (see list below) |
-| `hardfork` | `string` | `"prague"` | EVM hardfork |
-| `deployerOptions.fromAddress` | `string` | - | Impersonate address |
-| `deployerOptions.fromPrivateKey` | `string` | - | Deploy from specific key |
-
-**Available Contracts:**
-| Name | Description |
-|------|-------------|
-| `hst` | ERC-20 TST token |
-| `nfts` | ERC-721 NFT collection |
-| `erc1155` | ERC-1155 multi-token |
-| `piggybank` | Simple ETH storage |
-| `failing` | Always reverts (error testing) |
-| `multisig` | Multi-signature wallet |
-| `entrypoint` | ERC-4337 EntryPoint |
-| `simpleAccountFactory` | ERC-4337 account factory |
-| `verifyingPaymaster` | ERC-4337 paymaster |
-
-**Output:**
-
-```typescript
-{
-  contractName: string;
-  contractAddress: string;
-  deployedAt: string; // ISO timestamp
-}
-```
-
----
-
-#### `mm_seed_contracts`
-
-Deploy multiple contracts in sequence.
-
-**Input:**
-| Parameter | Type | Default | Description |
-|-----------|------|---------|-------------|
-| `contracts` | `string[]` | **required** | Contracts to deploy (1-9) |
-| `hardfork` | `string` | `"prague"` | EVM hardfork |
-
-**Output:**
-
-```typescript
-{
-  deployed: [{ contractName, contractAddress, deployedAt }];
-  failed: [{ contractName, error }];
-}
-```
-
----
-
-#### `mm_get_contract_address`
-
-Get the deployed address of a contract.
-
-**Input:**
-| Parameter | Type | Description |
-|-----------|------|-------------|
-| `contractName` | `string` | Contract name to look up |
-
-**Output:**
-
-```typescript
-{
-  contractName: string;
-  contractAddress: string | null;
-}
-```
-
----
-
-#### `mm_list_contracts`
-
-List all contracts deployed in this session.
-
-**Input:** None
-
-**Output:**
-
-```typescript
-{
-  contracts: [{
-    contractName: string;
-    contractAddress: string;
-    deployedAt: string;
-  }];
-}
-```
-
----
-
-### Knowledge Store Tools
-
-The knowledge store enables cross-session learning by recording tool invocations and their context.
-
-#### `mm_knowledge_last`
-
-Get the last N step records from the knowledge store.
-
-**Input:**
-| Parameter | Type | Default | Description |
-|-----------|------|---------|-------------|
-| `n` | `number` | `20` | Number of steps (1-200) |
-| `scope` | `"current" \| "all" \| { sessionId }` | `"current"` | Which sessions to query |
-| `filters.flowTag` | `string` | - | Filter by flow tag |
-| `filters.tag` | `string` | - | Filter by tag |
-| `filters.screen` | `string` | - | Filter by screen |
-| `filters.sinceHours` | `number` | - | Only steps from last N hours |
-
-**Output:**
-
-```typescript
-{
-  steps: [{
-    timestamp: string;
-    tool: string;
-    screen: ScreenName;
-    snippet: string;      // Human-readable summary
-    sessionId?: string;
-    matchedFields?: string[];
-    sessionGoal?: string;
-  }];
-}
-```
-
----
-
-#### `mm_knowledge_search`
-
-Search step records by tool name, screen, testId, or accessibility names.
-
-**Input:**
-| Parameter | Type | Default | Description |
-|-----------|------|---------|-------------|
-| `query` | `string` | **required** | Search query (1-200 chars) |
-| `limit` | `number` | `20` | Max results (1-100) |
-| `scope` | `"current" \| "all" \| { sessionId }` | `"all"` | Which sessions to search |
-| `filters` | `KnowledgeFilters` | - | Additional filters |
-
-**Output:**
-
-```typescript
-{
-  matches: KnowledgeStepSummary[];
-  query: string;
-}
-```
-
----
-
-#### `mm_knowledge_summarize`
-
-Generate a recipe-like summary of steps taken in a session.
-
-**Input:**
-| Parameter | Type | Default | Description |
-|-----------|------|---------|-------------|
-| `scope` | `"current" \| { sessionId }` | `"current"` | Session to summarize |
-
-**Output:**
-
-```typescript
-{
-  sessionId: string;
-  stepCount: number;
-  recipe: [{
-    stepNumber: number;
-    tool: string;
-    notes: string;
-  }];
-}
-```
-
----
-
-#### `mm_knowledge_sessions`
-
-List recent sessions with metadata.
-
-**Input:**
-| Parameter | Type | Default | Description |
-|-----------|------|---------|-------------|
-| `limit` | `number` | `10` | Max sessions (1-50) |
-| `filters` | `KnowledgeFilters` | - | Filter options |
-
-**Output:**
-
-```typescript
-{
-  sessions: [{
-    sessionId: string;
-    createdAt: string;
-    goal?: string;
-    flowTags: string[];
-    tags: string[];
-  }];
-}
-```
-
----
-
-### Batching Tools
-
-#### `mm_run_steps`
-
-Execute multiple tools in sequence. Reduces round trips for multi-step flows.
-
-**Input:**
-| Parameter | Type | Default | Description |
-|-----------|------|---------|-------------|
-| `steps` | `array` | **required** | Tool calls to execute (1-50) |
-| `steps[].tool` | `string` | **required** | Tool name (e.g., `mm_click`) |
-| `steps[].args` | `object` | `{}` | Tool arguments |
-| `stopOnError` | `boolean` | `false` | Stop on first error |
-| `includeObservations` | `"none" \| "failures" \| "all"` | `"all"` | When to include state observations |
-
-**Output:**
-
-```typescript
-{
-  steps: [{
-    tool: string;
-    ok: boolean;
-    result?: unknown;
-    error?: { code: string; message: string; details?: unknown };
-    meta: { durationMs: number; timestamp: string };
-  }];
-  summary: {
-    ok: boolean;      // All steps succeeded
-    total: number;
-    succeeded: number;
-    failed: number;
-    durationMs: number;
-  };
-}
-```
-
-**Example:**
-
-```json
-{
-  "steps": [
-    { "tool": "mm_click", "args": { "testId": "send-button" } },
-    { "tool": "mm_type", "args": { "testId": "amount-input", "text": "0.1" } },
-    { "tool": "mm_click", "args": { "testId": "confirm-button" } }
-  ],
-  "stopOnError": true
-}
-```
-
-## Development
-
-### Building
-
-```bash
-yarn build
-```
-
-### Testing
-
-```bash
-yarn test
-```
-
-### Local Development with yalc
+## Development
 
 ```bash
-# In this repo
-yarn build && yalc publish
-
-# In consumer repo
-yalc add @metamask/client-mcp-core
+yarn build        # Build the package
+yarn test         # Run tests and type checks
+yarn lint         # Lint everything
+yarn lint:fix     # Auto-fix lint issues
 ```
 
 ## License
 
-MIT
+(MIT OR Apache-2.0)
diff --git a/SKILL.md b/SKILL.md
new file mode 100644
index 0000000..a93f8c8
--- /dev/null
+++ b/SKILL.md
@@ -0,0 +1,368 @@
+# mm CLI — Agent Reference
+
+You control a browser extension through the `mm` CLI. Every command talks to a local HTTP daemon that manages Playwright and the extension lifecycle. The daemon auto-starts when you run `mm launch`.
+
+If you are running outside the target project directory, use `--project <path>` or set the `MM_PROJECT` environment variable to point at the project root. All commands accept `--project` before the command name (e.g., `mm --project ../metamask-extension launch`).
+
+## Core Loop
+
+```
+mm launch                  # 1. Start browser + extension
+mm describe-screen         # 2. See what's on screen (ALWAYS do this before interacting)
+mm click <ref>             # 3. Interact using refs from describe-screen
+mm describe-screen         # 4. Re-describe after every action to get fresh refs
+mm cleanup --shutdown      # 5. Clean up when done
+```
+
+**Critical rules:**
+
+- **Always `describe-screen` before interacting.** Refs like `e1`, `e2` are ephemeral — they change after every action.
+- **Always `describe-screen` after interacting.** The screen state changed; your old refs are stale.
+- **One target per command.** Specify exactly ONE of: a11y ref (`e5`), testId, or CSS selector.
+- **Errors are structured.** Check the `error.code` field to decide recovery strategy (see Error Codes below).
+
+## Commands
+
+### Session Lifecycle
+
+#### `mm launch`
+
+Starts the daemon (if not running) and launches a headed Chrome session with the extension.
+
+```
+mm launch [--state default|onboarding|custom] [--extension-path <path>] [--force]
+```
+
+| Flag                      | Description                                                     |
+| ------------------------- | --------------------------------------------------------------- |
+| `--state default`         | Pre-onboarded wallet with 25 ETH on local Anvil chain (default) |
+| `--state onboarding`      | Fresh wallet requiring manual onboarding setup                  |
+| `--state custom`          | Use a custom fixture for wallet state                           |
+| `--extension-path <path>` | Override the extension build directory                          |
+| `--force`                 | Replace an existing active session                              |
+
+Returns: `sessionId`, `extensionId`, `state` (current extension state).
+
+#### `mm cleanup`
+
+Stops the browser, tears down test services, and releases session resources.
+
+```
+mm cleanup [--shutdown]
+```
+
+| Flag         | Description                       |
+| ------------ | --------------------------------- |
+| `--shutdown` | Also terminate the daemon process |
+
+Without `--shutdown`, the daemon stays running for the next `mm launch`.
+
+#### `mm status`
+
+Shows daemon status: PID, port, uptime, allocated sub-ports.
+
+```
+mm status
+```
+
+#### `mm serve`
+
+Manually starts the daemon without launching a browser. Useful for debugging.
+
+```
+mm serve [--background]
+```
+
+### Screen Discovery
+
+#### `mm describe-screen`
+
+**Your primary observation tool.** Returns the complete screen state:
+
+- **Extension state**: current URL, screen name, network, account, balance
+- **Test IDs**: visible `data-testid` attributes with text previews
+- **A11y tree**: interactive elements with deterministic refs (`e1`, `e2`, ...)
+- **Prior knowledge**: suggested actions from past sessions on this screen
+
+```
+mm describe-screen
+```
+
+The a11y tree only includes actionable roles: `button`, `link`, `checkbox`, `radio`, `switch`, `textbox`, `combobox`, `menuitem`, and important roles: `dialog`, `alert`, `status`, `heading`.
+
+Each node looks like:
+
+```json
+{
+  "ref": "e3",
+  "role": "button",
+  "name": "Confirm",
+  "path": ["dialog:Transaction"]
+}
+```
+
+Use the `ref` value (`e3`) for click/type/wait-for commands.
+
+#### `mm get-state`
+
+Returns extension state and tracked tabs without the full a11y tree.
+
+```
+mm get-state
+```
+
+Returns: `state` (extension state) and `tabs` (active + tracked tabs with roles and URLs).
+
+#### `mm screenshot`
+
+Captures a screenshot of the current page.
+
+```
+mm screenshot [--name <name>]
+```
+
+Returns: file path, dimensions.
+
+### Element Interaction
+
+All interaction commands accept an element reference from `describe-screen`.
+
+#### `mm click <ref>`
+
+Clicks an element. Waits up to 15s for it to become visible.
+
+```
+mm click e3
+```
+
+If the page closes after clicking (e.g., confirmation popup), the response includes `pageClosedAfterClick: true` — this is normal, not an error.
+
+#### `mm type <ref> <text>`
+
+Types text into an input field. Replaces existing content (uses `fill()`).
+
+```
+mm type e5 "0x1234abcd..."
+```
+
+#### `mm wait-for <ref>`
+
+Blocks until an element becomes visible. Default timeout: 15s.
+
+```
+mm wait-for e7 [--timeout <ms>]
+```
+
+### Navigation
+
+#### `mm navigate <url>`
+
+Opens a new tab and navigates to the given URL.
+
+```
+mm navigate https://app.uniswap.org
+```
+
+#### `mm navigate-home`
+
+Navigates the extension tab to the wallet home screen.
+
+```
+mm navigate-home
+```
+
+#### `mm navigate-settings`
+
+Navigates the extension tab to the settings page.
+
+```
+mm navigate-settings
+```
+
+### Knowledge Store
+
+The knowledge store records every tool invocation and uses past sessions to suggest actions.
+
+#### `mm knowledge-search <query>`
+
+Searches past sessions for steps matching the query. Matches against tool names, screen names, test IDs, and a11y node names.
+
+```
+mm knowledge-search "confirm transaction"
+```
+
+#### `mm knowledge-last`
+
+Gets the most recent step records from the current session.
+
+```
+mm knowledge-last
+```
+
+#### `mm knowledge-sessions`
+
+Lists recent sessions with metadata (goal, flow tags, timestamps).
+
+```
+mm knowledge-sessions
+```
+
+### Batch Execution
+
+#### `mm run-steps <json>`
+
+Executes multiple tool invocations in sequence from a JSON array. Each step specifies a tool name and arguments.
+
+```
+mm run-steps '{"steps":[{"tool":"click","args":{"a11yRef":"e3"}},{"tool":"wait_for","args":{"a11yRef":"e5"}}]}'
+```
+
+Supports `stopOnError` (halt on first failure) and returns per-step results with timing.
+
+## Element Targeting
+
+Every interaction command (`click`, `type`, `wait-for`) needs a target. You must provide exactly ONE of:
+
+| Method           | Format              | Stability                       | When to use                                          |
+| ---------------- | ------------------- | ------------------------------- | ---------------------------------------------------- |
+| **a11y ref**     | `e1`, `e2`, ...     | Ephemeral (per describe-screen) | Default — use refs from the latest `describe-screen` |
+| **testId**       | `data-testid` value | Stable across sessions          | When you know the testId from prior knowledge        |
+| **CSS selector** | Any CSS selector    | Fragile                         | Last resort fallback                                 |
+
+**Prefer a11y refs.** They come directly from the accessibility tree and map to ARIA selectors, making them the most reliable for the current screen state.
+
+## Prior Knowledge
+
+When you call `describe-screen`, the response may include a `priorKnowledge` section with:
+
+- **`similarSteps`**: Past tool invocations on the same screen with confidence scores
+- **`suggestedNextActions`**: Ranked actions based on historical success (e.g., "click confirm button")
+- **`avoid`**: Targets that frequently fail on this screen — skip these
+
+Use prior knowledge to guide your actions, but always verify against the current a11y tree.
+
+## Error Codes
+
+When a command fails, the response includes `error.code`. Use this to decide what to do:
+
+| Code                          | Meaning                                      | Recovery                                                  |
+| ----------------------------- | -------------------------------------------- | --------------------------------------------------------- |
+| `MM_NO_ACTIVE_SESSION`        | No browser session running                   | Run `mm launch` first                                     |
+| `MM_SESSION_ALREADY_RUNNING`  | Session already exists                       | Run `mm cleanup` first, or use `--force`                  |
+| `MM_TARGET_NOT_FOUND`         | Element ref/testId/selector not found        | Run `mm describe-screen` to get fresh refs                |
+| `MM_WAIT_TIMEOUT`             | Element didn't appear in time                | Increase timeout or verify you're on the right screen     |
+| `MM_CLICK_FAILED`             | Click failed after finding element           | Element may be obscured; try waiting or scrolling         |
+| `MM_TYPE_FAILED`              | Type failed after finding element            | Element may not be an input; verify with describe-screen  |
+| `MM_PAGE_CLOSED`              | Page was closed unexpectedly                 | Normal after some confirmations; run describe-screen      |
+| `MM_NAVIGATION_FAILED`        | Navigation error or network failure          | Check URL validity; retry once                            |
+| `MM_NOTIFICATION_TIMEOUT`     | Extension notification popup didn't appear   | Action may not have triggered a notification; check state |
+| `MM_TAB_NOT_FOUND`            | Tab role/URL not found                       | Run `mm get-state` to see available tabs                  |
+| `MM_CAPABILITY_NOT_AVAILABLE` | Feature requires a capability not configured | Check environment mode (e2e vs prod)                      |
+| `MM_CONTEXT_SWITCH_BLOCKED`   | Can't switch context with active session     | Run `mm cleanup` first                                    |
+| `MM_INVALID_INPUT`            | Bad parameters                               | Fix input and retry                                       |
+| `MM_CONTRACT_NOT_FOUND`       | Unknown contract name for seeding            | See available contracts below                             |
+
+## Available Contracts (E2E only)
+
+These contracts can be deployed to the local Anvil chain via `seed_contract` / `seed_contracts`:
+
+| Name                   | Type                                                |
+| ---------------------- | --------------------------------------------------- |
+| `hst`                  | ERC-20 token                                        |
+| `nfts`                 | ERC-721 NFT                                         |
+| `erc1155`              | ERC-1155 multi-token                                |
+| `piggybank`            | Simple deposit contract                             |
+| `failing`              | Contract that always reverts (for testing failures) |
+| `multisig`             | Multi-signature wallet                              |
+| `entrypoint`           | ERC-4337 EntryPoint                                 |
+| `simpleAccountFactory` | ERC-4337 account factory                            |
+| `verifyingPaymaster`   | ERC-4337 paymaster                                  |
+
+## Flow Tags
+
+When launching, tag your session with flow tags for cross-session knowledge:
+
+| Tag               | Use for                        |
+| ----------------- | ------------------------------ |
+| `send`            | Token send flows               |
+| `swap`            | Token swap flows               |
+| `connect`         | dApp connection flows          |
+| `sign`            | Message/transaction signing    |
+| `onboarding`      | Wallet setup/onboarding        |
+| `settings`        | Settings configuration         |
+| `tx-confirmation` | Transaction confirmation flows |
+
+## Daemon Model
+
+- Daemon runs per project, state tracked in `.mm-server` at the project root
+- Auto-starts on `mm launch` if not running
+- Shuts down after 30 minutes of inactivity
+- Logs to `.mm-daemon.log`
+- One tool executes at a time (requests are queued)
+- Project resolution: `--project` flag → `MM_PROJECT` env var → current git worktree
+
+## Workflow Examples
+
+### Basic Interaction
+
+```bash
+mm launch --state default
+mm describe-screen
+# Response includes a11y nodes: [{ ref: "e1", role: "button", name: "Send" }, ...]
+mm click e1
+mm describe-screen
+# Now on send screen — get new refs
+mm type e3 "0.01"
+mm click e5
+mm cleanup --shutdown
+```
+
+### Transaction with Notification
+
+```bash
+mm launch --state default
+mm navigate https://app.uniswap.org
+mm describe-screen
+# Interact with dApp...
+mm click e4                    # triggers wallet popup
+mm wait-for e2 --timeout 10000 # wait for confirm button in notification
+mm click e2                    # confirm
+mm describe-screen             # check result
+mm cleanup --shutdown
+```
+
+### Running From a Parent Folder
+
+```bash
+# Set once — all subsequent mm commands target this project
+export MM_PROJECT=/path/to/metamask-extension
+
+mm launch --state default
+mm describe-screen
+mm click e1
+mm cleanup --shutdown
+
+# Or use --project per command
+mm --project ../metamask-extension launch
+mm --project ../metamask-extension describe-screen
+```
+
+### Using Prior Knowledge
+
+```bash
+mm launch --state default --goal "Test send flow" --flow-tags send
+mm describe-screen
+# Response includes priorKnowledge.suggestedNextActions:
+# [{ action: "click", preferredTarget: { type: "testId", value: "send-button" }, confidence: 0.85 }]
+# Use the suggestion but verify the target exists in the current a11y tree
+mm click e3
+mm cleanup --shutdown
+```
+
+## Project-Specific Commands
+
+<!-- Consumer repos extend this section -->
+
+## Project-Specific Workflow Examples
+
+<!-- Consumer repos add examples here -->
diff --git a/package.json b/package.json
index e7a0560..9c7e225 100644
--- a/package.json
+++ b/package.json
@@ -1,13 +1,13 @@
 {
   "name": "@metamask/client-mcp-core",
   "version": "0.1.1",
-  "description": "MCP server for MetaMask Extension visual testing with LLM agents",
+  "description": "HTTP daemon and CLI for agent-driven browser extension testing with Playwright",
   "keywords": [
-    "mcp",
     "playwright",
     "llm",
     "visual-testing",
-    "browser-extension"
+    "browser-extension",
+    "cli"
   ],
   "homepage": "https://github.com/MetaMask/client-mcp-core#readme",
   "bugs": {
@@ -35,6 +35,9 @@
   "main": "./dist/index.cjs",
   "module": "./dist/index.mjs",
   "types": "./dist/index.d.cts",
+  "bin": {
+    "mm": "./dist/cli/mm.cjs"
+  },
   "files": [
     "dist"
   ],
@@ -57,7 +60,7 @@
     "@isaacs/brace-expansion": "5.0.1"
   },
   "dependencies": {
-    "@modelcontextprotocol/sdk": "^1.26.0",
+    "express": "^5.2.1",
     "zod": "^4.3.5"
   },
   "devDependencies": {
@@ -71,6 +74,7 @@
     "@metamask/eslint-config-vitest": "^15.0.0",
     "@playwright/test": "^1.49.0",
     "@ts-bridge/cli": "^0.6.3",
+    "@types/express": "^5.0.6",
     "@types/node": "^20.0.0",
     "@typescript-eslint/utils": "^8.6.0",
     "@vitest/coverage-istanbul": "^3.0.7",
diff --git a/src/capabilities/context.test.ts b/src/capabilities/context.test.ts
index 25410cc..772c25a 100644
--- a/src/capabilities/context.test.ts
+++ b/src/capabilities/context.test.ts
@@ -20,7 +20,6 @@ describe('isE2EConfig', () => {
       environment: 'e2e',
       extensionName: 'MetaMask',
       defaultPassword: 'password123',
-      toolPrefix: 'mm',
       artifactsDir: './test-artifacts',
       defaultChainId: 1337,
       ports: {
@@ -54,7 +53,6 @@ describe('isE2EConfig', () => {
       environment: 'prod',
       extensionName: 'MetaMask',
       defaultPassword: 'password123',
-      toolPrefix: 'mm',
       defaultChainId: 1,
     };
 
@@ -93,7 +91,6 @@ describe('isProdConfig', () => {
       environment: 'prod',
       extensionName: 'MetaMask',
       defaultPassword: 'password123',
-      toolPrefix: 'mm',
       artifactsDir: './artifacts',
       defaultChainId: 1,
     };
@@ -119,7 +116,6 @@ describe('isProdConfig', () => {
       environment: 'e2e',
       extensionName: 'MetaMask',
       defaultPassword: 'password123',
-      toolPrefix: 'mm',
       artifactsDir: './test-artifacts',
       defaultChainId: 1337,
       ports: {
diff --git a/src/capabilities/context.ts b/src/capabilities/context.ts
index a88fa1b..e5f381a 100644
--- a/src/capabilities/context.ts
+++ b/src/capabilities/context.ts
@@ -22,8 +22,6 @@ export type BaseEnvironmentConfig = {
   extensionName: string;
   /** Default password for wallet unlock operations */
   defaultPassword?: string;
-  /** Prefix for MCP tool names (e.g., "mm" -> "mm_build", "mm_launch") */
-  toolPrefix?: string;
   /** Directory for storing screenshots and other artifacts */
   artifactsDir?: string;
 };
diff --git a/src/cli/mm.test.ts b/src/cli/mm.test.ts
new file mode 100644
index 0000000..ccc92e1
--- /dev/null
+++ b/src/cli/mm.test.ts
@@ -0,0 +1,1756 @@
+/* eslint-disable n/no-unsupported-features/node-builtins */
+/* eslint-disable n/no-process-env */
+/* eslint-disable n/no-sync */
+/* eslint-disable require-atomic-updates */
+import { existsSync } from 'node:fs';
+import * as fs from 'node:fs/promises';
+import * as path from 'node:path';
+import { describe, it, expect, vi, beforeEach, afterEach } from 'vitest';
+import type { MockInstance } from 'vitest';
+
+import {
+  extractProjectFlag,
+  resolveTargetFromArgs,
+  getPositionalTarget,
+  isTransientError,
+  parseIntFlag,
+  parseStringFlag,
+  parseLaunchArgs,
+  printHelp,
+  resolveRuntime,
+  sendRequest,
+  routeCommand,
+  resolveWorktreeRoot,
+  readDaemonConfig,
+  shutdownDaemon,
+  waitForDaemon,
+  discoverDaemon,
+  autoStartDaemon,
+  handleServe,
+  sleep,
+  main,
+} from './mm.js';
+
+vi.mock('node:child_process', () => ({
+  execSync: vi.fn(() => Buffer.from('/mock/worktree\n')),
+  spawn: vi.fn(() => {
+    const child = {
+      unref: vi.fn(),
+      on: vi.fn(
+        (event: string, handler: (code: number | null) => void) =>
+          event === 'exit' && setTimeout(() => handler(0), 10),
+      ),
+    };
+    return child;
+  }),
+}));
+
+vi.mock('node:fs', async (importOriginal) => {
+  const actual = await importOriginal<typeof import('node:fs')>();
+  return { ...actual, existsSync: vi.fn(() => true) };
+});
+
+vi.mock('node:fs/promises', async (importOriginal) => {
+  const actual = await importOriginal<typeof import('node:fs/promises')>();
+  return {
+    ...actual,
+    realpath: vi.fn(async (p: string) => p),
+    stat: vi.fn(async () => ({ isDirectory: () => true })),
+    readFile: vi.fn(async () =>
+      JSON.stringify({ mm: { daemon: './daemon.ts', runtime: 'tsx' } }),
+    ),
+  };
+});
+
+vi.mock('../server/daemon-state.js', () => ({
+  readDaemonState: vi.fn(async () => null),
+  isDaemonAlive: vi.fn(async () => false),
+  isDaemonVersionMatch: vi.fn(() => true),
+  removeDaemonState: vi.fn(async () => {}),
+  acquireStartupLock: vi.fn(async () => true),
+  releaseStartupLock: vi.fn(async () => {}),
+}));
+
+let exitSpy: MockInstance;
+let stderrSpy: MockInstance;
+let stdoutSpy: MockInstance;
+
+// eslint-disable-next-line vitest/require-top-level-describe
+beforeEach(() => {
+  vi.clearAllMocks();
+  exitSpy = vi.spyOn(process, 'exit').mockImplementation((() => {
+    throw new Error('process.exit');
+  }) as never);
+  stderrSpy = vi.spyOn(process.stderr, 'write').mockReturnValue(true);
+  stdoutSpy = vi.spyOn(process.stdout, 'write').mockReturnValue(true);
+});
+
+// eslint-disable-next-line vitest/require-top-level-describe
+afterEach(() => {
+  exitSpy.mockRestore();
+  stderrSpy.mockRestore();
+  stdoutSpy.mockRestore();
+  vi.restoreAllMocks();
+});
+
+describe('extractProjectFlag', () => {
+  it('returns args unchanged when no --project flag', () => {
+    const result = extractProjectFlag(['launch', '--force']);
+    expect(result).toStrictEqual({
+      args: ['launch', '--force'],
+      projectPath: undefined,
+    });
+  });
+
+  it('extracts project path and removes flag from args', () => {
+    const result = extractProjectFlag([
+      '--project',
+      '/path/to/project',
+      'launch',
+    ]);
+    expect(result).toStrictEqual({
+      args: ['launch'],
+      projectPath: '/path/to/project',
+    });
+  });
+
+  it('handles --project in the middle of args', () => {
+    const result = extractProjectFlag([
+      'launch',
+      '--project',
+      '/my/path',
+      '--force',
+    ]);
+    expect(result).toStrictEqual({
+      args: ['launch', '--force'],
+      projectPath: '/my/path',
+    });
+  });
+
+  it('exits when --project has no value', () => {
+    expect(() => extractProjectFlag(['--project'])).toThrowError(
+      'process.exit',
+    );
+    expect(stderrSpy).toHaveBeenCalledWith(
+      'Error: --project requires a path value\n',
+    );
+  });
+
+  it('exits when --project value starts with --', () => {
+    expect(() => extractProjectFlag(['--project', '--force'])).toThrowError(
+      'process.exit',
+    );
+    expect(stderrSpy).toHaveBeenCalledWith(
+      'Error: --project requires a path value\n',
+    );
+  });
+});
+
+describe('resolveTargetFromArgs', () => {
+  it('returns selector for --selector flag', () => {
+    expect(resolveTargetFromArgs(['--selector', '.my-button'])).toStrictEqual({
+      selector: '.my-button',
+    });
+  });
+
+  it('returns testId for --testid flag', () => {
+    expect(resolveTargetFromArgs(['--testid', 'my-btn'])).toStrictEqual({
+      testId: 'my-btn',
+    });
+  });
+
+  it('returns a11yRef for e-number patterns', () => {
+    expect(resolveTargetFromArgs(['e3'])).toStrictEqual({ a11yRef: 'e3' });
+    expect(resolveTargetFromArgs(['e123'])).toStrictEqual({ a11yRef: 'e123' });
+  });
+
+  it('returns testId for non-e-number strings', () => {
+    expect(resolveTargetFromArgs(['submit-button'])).toStrictEqual({
+      testId: 'submit-button',
+    });
+    expect(resolveTargetFromArgs(['eabc'])).toStrictEqual({
+      testId: 'eabc',
+    });
+  });
+
+  it('exits when --selector has no value', () => {
+    expect(() => resolveTargetFromArgs(['--selector'])).toThrowError(
+      'process.exit',
+    );
+    expect(stderrSpy).toHaveBeenCalledWith(
+      'Error: --selector requires a value\n',
+    );
+  });
+
+  it('exits when --selector value starts with --', () => {
+    expect(() => resolveTargetFromArgs(['--selector', '--other'])).toThrowError(
+      'process.exit',
+    );
+  });
+
+  it('exits when --testid has no value', () => {
+    expect(() => resolveTargetFromArgs(['--testid'])).toThrowError(
+      'process.exit',
+    );
+    expect(stderrSpy).toHaveBeenCalledWith(
+      'Error: --testid requires a value\n',
+    );
+  });
+
+  it('exits when no target provided', () => {
+    expect(() => resolveTargetFromArgs([])).toThrowError('process.exit');
+    expect(stderrSpy).toHaveBeenCalledWith(
+      'Error: element target is required\n',
+    );
+  });
+});
+
+describe('getPositionalTarget', () => {
+  it('returns first non-flag argument', () => {
+    expect(getPositionalTarget(['e1', '--timeout', '5000'])).toBe('e1');
+  });
+
+  it('skips flag-value pairs', () => {
+    expect(getPositionalTarget(['--timeout', '5000', 'e1'])).toBe('e1');
+  });
+
+  it('returns undefined for empty args', () => {
+    expect(getPositionalTarget([])).toBeUndefined();
+  });
+
+  it('returns undefined when only flags present', () => {
+    expect(getPositionalTarget(['--timeout', '5000'])).toBeUndefined();
+  });
+});
+
+describe('isTransientError', () => {
+  it('returns true for ECONNREFUSED', () => {
+    expect(isTransientError(new Error('ECONNREFUSED'))).toBe(true);
+  });
+
+  it('returns true for ECONNRESET', () => {
+    expect(isTransientError(new Error('ECONNRESET'))).toBe(true);
+  });
+
+  it('returns true for EPIPE', () => {
+    expect(isTransientError(new Error('EPIPE'))).toBe(true);
+  });
+
+  it('returns true for UND_ERR_SOCKET', () => {
+    expect(isTransientError(new Error('UND_ERR_SOCKET'))).toBe(true);
+  });
+
+  it('returns true for fetch failed', () => {
+    expect(isTransientError(new Error('fetch failed'))).toBe(true);
+  });
+
+  it('returns false for other errors', () => {
+    expect(isTransientError(new Error('timeout'))).toBe(false);
+    expect(isTransientError(new Error('404 not found'))).toBe(false);
+  });
+});
+
+describe('parseIntFlag', () => {
+  it('returns parsed integer value', () => {
+    expect(parseIntFlag(['--timeout', '5000'], '--timeout')).toBe(5000);
+  });
+
+  it('returns undefined when flag is absent', () => {
+    expect(parseIntFlag(['--other', '5000'], '--timeout')).toBeUndefined();
+  });
+
+  it('returns undefined for NaN values', () => {
+    expect(parseIntFlag(['--timeout', 'abc'], '--timeout')).toBeUndefined();
+  });
+
+  it('returns undefined when no value follows flag', () => {
+    expect(parseIntFlag(['--timeout'], '--timeout')).toBeUndefined();
+  });
+});
+
+describe('parseStringFlag', () => {
+  it('returns string value', () => {
+    expect(parseStringFlag(['--role', 'extension'], '--role')).toBe(
+      'extension',
+    );
+  });
+
+  it('returns undefined when flag is absent', () => {
+    expect(parseStringFlag(['--other', 'val'], '--role')).toBeUndefined();
+  });
+
+  it('returns undefined when value starts with --', () => {
+    expect(parseStringFlag(['--role', '--other'], '--role')).toBeUndefined();
+  });
+
+  it('returns undefined when no value follows', () => {
+    expect(parseStringFlag(['--role'], '--role')).toBeUndefined();
+  });
+});
+
+describe('parseLaunchArgs', () => {
+  it('returns empty object for no args', () => {
+    expect(parseLaunchArgs([])).toStrictEqual({});
+  });
+
+  it('parses --force flag', () => {
+    expect(parseLaunchArgs(['--force'])).toStrictEqual({ force: true });
+  });
+
+  it('parses --state value', () => {
+    expect(parseLaunchArgs(['--state', 'onboarding'])).toStrictEqual({
+      stateMode: 'onboarding',
+    });
+  });
+
+  it('parses --extension-path value', () => {
+    expect(parseLaunchArgs(['--extension-path', '/ext'])).toStrictEqual({
+      extensionPath: '/ext',
+    });
+  });
+
+  it('parses --goal value', () => {
+    expect(parseLaunchArgs(['--goal', 'test swap'])).toStrictEqual({
+      goal: 'test swap',
+    });
+  });
+
+  it('parses --flow-tags as comma-separated array', () => {
+    expect(parseLaunchArgs(['--flow-tags', 'send, swap'])).toStrictEqual({
+      flowTags: ['send', 'swap'],
+    });
+  });
+
+  it('parses multiple flags together', () => {
+    expect(
+      parseLaunchArgs(['--state', 'default', '--force', '--goal', 'test it']),
+    ).toStrictEqual({
+      stateMode: 'default',
+      force: true,
+      goal: 'test it',
+    });
+  });
+
+  it('exits for --state without value', () => {
+    expect(() => parseLaunchArgs(['--state'])).toThrowError('process.exit');
+    expect(stderrSpy).toHaveBeenCalledWith(
+      'Error: --state requires a value (default|onboarding|custom)\n',
+    );
+  });
+
+  it('exits for --state with flag as value', () => {
+    expect(() => parseLaunchArgs(['--state', '--force'])).toThrowError(
+      'process.exit',
+    );
+  });
+
+  it('exits for --extension-path without value', () => {
+    expect(() => parseLaunchArgs(['--extension-path'])).toThrowError(
+      'process.exit',
+    );
+    expect(stderrSpy).toHaveBeenCalledWith(
+      'Error: --extension-path requires a value\n',
+    );
+  });
+
+  it('exits for --goal without value', () => {
+    expect(() => parseLaunchArgs(['--goal'])).toThrowError('process.exit');
+    expect(stderrSpy).toHaveBeenCalledWith('Error: --goal requires a value\n');
+  });
+
+  it('exits for --flow-tags without value', () => {
+    expect(() => parseLaunchArgs(['--flow-tags'])).toThrowError('process.exit');
+    expect(stderrSpy).toHaveBeenCalledWith(
+      'Error: --flow-tags requires a comma-separated value\n',
+    );
+  });
+
+  it('writes warning for unknown flags', () => {
+    parseLaunchArgs(['--unknown']);
+    expect(stderrSpy).toHaveBeenCalledWith(
+      "Warning: unknown launch flag '--unknown'\n",
+    );
+  });
+});
+
+describe('printHelp', () => {
+  it('writes help text to stdout', () => {
+    printHelp();
+    expect(stdoutSpy).toHaveBeenCalledTimes(1);
+    const output = (stdoutSpy.mock.calls[0] as string[])[0];
+    expect(output).toContain('mm — MetaMask CLI');
+    expect(output).toContain('Usage:');
+    expect(output).toContain('mm launch');
+  });
+});
+
+describe('resolveRuntime', () => {
+  it('returns node for node runtime', () => {
+    expect(resolveRuntime('/root', 'node')).toBe('node');
+  });
+
+  it('returns bin path when runtime exists', () => {
+    vi.mocked(existsSync).mockReturnValue(true);
+    const result = resolveRuntime('/root', 'tsx');
+    expect(result).toBe(path.join('/root', 'node_modules', '.bin', 'tsx'));
+  });
+
+  it('exits when runtime binary not found', () => {
+    vi.mocked(existsSync).mockReturnValue(false);
+    expect(() => resolveRuntime('/root', 'tsx')).toThrowError('process.exit');
+    expect(stderrSpy).toHaveBeenCalledWith(
+      expect.stringContaining("Runtime 'tsx' not found"),
+    );
+  });
+});
+
+describe('sleep', () => {
+  it('resolves after delay', async () => {
+    vi.useFakeTimers();
+    const promise = sleep(100);
+    vi.advanceTimersByTime(100);
+    expect(await promise).toBeUndefined();
+    vi.useRealTimers();
+  });
+});
+
+describe('shutdownDaemon', () => {
+  it('sends SIGTERM and removes state', async () => {
+    const { removeDaemonState } = await import('../server/daemon-state.js');
+    const killSpy = vi
+      .spyOn(process, 'kill')
+      .mockImplementation(vi.fn() as unknown as typeof process.kill);
+
+    await shutdownDaemon('/root', {
+      port: 3000,
+      pid: 12345,
+      nonce: 'abc',
+      startedAt: '2024-01-01',
+      subPorts: { anvil: 8545, fixture: 8546, mock: 8547 },
+    });
+
+    expect(killSpy).toHaveBeenCalledWith(12345, 'SIGTERM');
+    expect(removeDaemonState).toHaveBeenCalledWith('/root');
+    killSpy.mockRestore();
+  });
+
+  it('ignores kill errors for dead processes', async () => {
+    const killSpy = vi.spyOn(process, 'kill').mockImplementation((() => {
+      throw new Error('ESRCH');
+    }) as unknown as typeof process.kill);
+
+    await shutdownDaemon('/root', {
+      port: 3000,
+      pid: 12345,
+      nonce: 'abc',
+      startedAt: '2024-01-01',
+      subPorts: { anvil: 8545, fixture: 8546, mock: 8547 },
+    });
+
+    expect(killSpy).toHaveBeenCalled();
+    killSpy.mockRestore();
+  });
+
+  it('skips kill when pid is falsy', async () => {
+    const killSpy = vi
+      .spyOn(process, 'kill')
+      .mockImplementation(vi.fn() as unknown as typeof process.kill);
+
+    await shutdownDaemon('/root', {
+      port: 3000,
+      pid: 0,
+      nonce: 'abc',
+      startedAt: '2024-01-01',
+      subPorts: { anvil: 8545, fixture: 8546, mock: 8547 },
+    });
+
+    expect(killSpy).not.toHaveBeenCalled();
+    killSpy.mockRestore();
+  });
+});
+
+describe('readDaemonConfig', () => {
+  it('reads and parses mm config from package.json', async () => {
+    vi.mocked(fs.readFile).mockResolvedValueOnce(
+      JSON.stringify({
+        mm: { daemon: './my-daemon.ts', runtime: 'tsx' },
+      }),
+    );
+
+    const result = await readDaemonConfig('/project');
+
+    expect(result).toStrictEqual({
+      daemonPath: './my-daemon.ts',
+      runtime: 'tsx',
+    });
+  });
+
+  it('defaults runtime to tsx when not specified', async () => {
+    vi.mocked(fs.readFile).mockResolvedValueOnce(
+      JSON.stringify({ mm: { daemon: './d.ts' } }),
+    );
+
+    const result = await readDaemonConfig('/project');
+
+    expect(result.runtime).toBe('tsx');
+  });
+
+  it('exits when package.json cannot be read', async () => {
+    vi.mocked(fs.readFile).mockRejectedValueOnce(new Error('ENOENT'));
+
+    await expect(readDaemonConfig('/project')).rejects.toThrowError(
+      'process.exit',
+    );
+    expect(stderrSpy).toHaveBeenCalledWith(
+      expect.stringContaining('Cannot read package.json'),
+    );
+  });
+
+  it('exits when mm.daemon is not configured', async () => {
+    vi.mocked(fs.readFile).mockResolvedValueOnce(JSON.stringify({}));
+
+    await expect(readDaemonConfig('/project')).rejects.toThrowError(
+      'process.exit',
+    );
+    expect(stderrSpy).toHaveBeenCalledWith(
+      expect.stringContaining('No daemon entry point configured'),
+    );
+  });
+});
+
+describe('resolveWorktreeRoot', () => {
+  it('resolves path from --project flag', async () => {
+    vi.mocked(fs.realpath).mockResolvedValueOnce('/resolved/path');
+    vi.mocked(fs.stat).mockResolvedValueOnce({
+      isDirectory: () => true,
+    } as any);
+
+    const result = await resolveWorktreeRoot('/some/path');
+    expect(result).toBe('/resolved/path');
+  });
+
+  it('resolves path from MM_PROJECT env when no flag', async () => {
+    const origEnv = process.env.MM_PROJECT;
+    process.env.MM_PROJECT = '/env/path';
+
+    vi.mocked(fs.realpath).mockResolvedValueOnce('/env/path');
+    vi.mocked(fs.stat).mockResolvedValueOnce({
+      isDirectory: () => true,
+    } as any);
+
+    const result = await resolveWorktreeRoot(undefined);
+    expect(result).toBe('/env/path');
+
+    process.env.MM_PROJECT = origEnv;
+  });
+
+  it('exits when path does not exist', async () => {
+    vi.mocked(fs.realpath).mockRejectedValueOnce(new Error('ENOENT'));
+
+    await expect(resolveWorktreeRoot('/bad/path')).rejects.toThrowError(
+      'process.exit',
+    );
+    expect(stderrSpy).toHaveBeenCalledWith(
+      expect.stringContaining('project path does not exist'),
+    );
+  });
+
+  it('exits when path is not a directory', async () => {
+    vi.mocked(fs.realpath).mockResolvedValueOnce('/some/file.txt');
+    vi.mocked(fs.stat).mockResolvedValueOnce({
+      isDirectory: () => false,
+    } as any);
+
+    await expect(resolveWorktreeRoot('/some/file.txt')).rejects.toThrowError(
+      'process.exit',
+    );
+    expect(stderrSpy).toHaveBeenCalledWith(
+      expect.stringContaining('project path is not a directory'),
+    );
+  });
+
+  it('exits when stat fails', async () => {
+    vi.mocked(fs.realpath).mockResolvedValueOnce('/some/path');
+    vi.mocked(fs.stat).mockRejectedValueOnce(new Error('EACCES'));
+
+    await expect(resolveWorktreeRoot('/some/path')).rejects.toThrowError(
+      'process.exit',
+    );
+    expect(stderrSpy).toHaveBeenCalledWith(
+      expect.stringContaining('cannot access project path'),
+    );
+  });
+
+  it('falls back to git worktree when no explicit path', async () => {
+    const origEnv = process.env.MM_PROJECT;
+    delete process.env.MM_PROJECT;
+
+    const { execSync } = await import('node:child_process');
+    vi.mocked(execSync).mockReturnValueOnce(Buffer.from('/git/root\n'));
+
+    const result = await resolveWorktreeRoot(undefined);
+    expect(result).toBe('/git/root');
+
+    process.env.MM_PROJECT = origEnv;
+  });
+
+  it('exits when not in a git repository', async () => {
+    const origEnv = process.env.MM_PROJECT;
+    delete process.env.MM_PROJECT;
+
+    const { execSync } = await import('node:child_process');
+    vi.mocked(execSync).mockImplementation(() => {
+      throw new Error('not a git repo');
+    });
+
+    await expect(resolveWorktreeRoot(undefined)).rejects.toThrowError(
+      'process.exit',
+    );
+    expect(stderrSpy).toHaveBeenCalledWith(
+      expect.stringContaining('not in a git repository'),
+    );
+
+    process.env.MM_PROJECT = origEnv;
+  });
+});
+
+describe('sendRequest', () => {
+  const originalFetch = globalThis.fetch;
+
+  afterEach(() => {
+    globalThis.fetch = originalFetch;
+  });
+
+  it('sends GET request and prints JSON result', async () => {
+    vi.spyOn(globalThis, 'fetch').mockResolvedValue({
+      ok: true,
+      json: async () => ({ ok: true, result: { status: 'running' } }),
+    } as Response);
+
+    await sendRequest(3000, 'GET', '/status', null);
+
+    expect(globalThis.fetch).toHaveBeenCalledWith(
+      'http://127.0.0.1:3000/status',
+      expect.objectContaining({ method: 'GET' }),
+    );
+    expect(stdoutSpy).toHaveBeenCalled();
+  });
+
+  it('sends POST request with JSON body', async () => {
+    vi.spyOn(globalThis, 'fetch').mockResolvedValue({
+      ok: true,
+      json: async () => ({ ok: true, result: 'launched' }),
+    } as Response);
+
+    await sendRequest(3000, 'POST', '/launch', { state: 'default' });
+
+    expect(globalThis.fetch).toHaveBeenCalledWith(
+      'http://127.0.0.1:3000/launch',
+      expect.objectContaining({
+        method: 'POST',
+        body: '{"state":"default"}',
+        headers: { 'Content-Type': 'application/json' },
+      }),
+    );
+  });
+
+  it('prints string results directly', async () => {
+    vi.spyOn(globalThis, 'fetch').mockResolvedValue({
+      ok: true,
+      json: async () => ({ ok: true, result: 'simple string' }),
+    } as Response);
+
+    await sendRequest(3000, 'GET', '/status', null);
+
+    expect(stdoutSpy).toHaveBeenCalledWith('simple string\n');
+  });
+
+  it('exits on error response', async () => {
+    vi.spyOn(globalThis, 'fetch').mockResolvedValue({
+      ok: false,
+      json: async () => ({
+        ok: false,
+        error: { message: 'No session' },
+      }),
+    } as Response);
+
+    await expect(
+      sendRequest(3000, 'POST', '/tool/click', {}),
+    ).rejects.toThrowError('process.exit');
+    expect(stderrSpy).toHaveBeenCalledWith('Error: No session\n');
+  });
+
+  it('exits on ok:false in response body', async () => {
+    vi.spyOn(globalThis, 'fetch').mockResolvedValue({
+      ok: true,
+      json: async () => ({
+        ok: false,
+        error: { message: 'Tool failed' },
+      }),
+    } as Response);
+
+    await expect(
+      sendRequest(3000, 'POST', '/tool/click', {}),
+    ).rejects.toThrowError('process.exit');
+    expect(stderrSpy).toHaveBeenCalledWith('Error: Tool failed\n');
+  });
+
+  it('retries transient errors', async () => {
+    let attempts = 0;
+    vi.spyOn(globalThis, 'fetch').mockImplementation(async () => {
+      attempts += 1;
+      if (attempts <= 2) {
+        throw new Error('ECONNREFUSED');
+      }
+      return {
+        ok: true,
+        json: async () => ({ ok: true, result: 'ok' }),
+      } as Response;
+    });
+
+    await sendRequest(3000, 'GET', '/health', null);
+
+    expect(attempts).toBe(3);
+  });
+
+  it('exits after max retries for transient errors', async () => {
+    vi.spyOn(globalThis, 'fetch').mockRejectedValue(new Error('ECONNREFUSED'));
+
+    await expect(
+      sendRequest(3000, 'GET', '/health', null),
+    ).rejects.toThrowError('process.exit');
+    expect(stderrSpy).toHaveBeenCalledWith(
+      expect.stringContaining('ECONNREFUSED'),
+    );
+  });
+
+  it('exits immediately for non-transient errors', async () => {
+    vi.spyOn(globalThis, 'fetch').mockRejectedValue(
+      new Error('some other error'),
+    );
+
+    await expect(
+      sendRequest(3000, 'GET', '/health', null),
+    ).rejects.toThrowError('process.exit');
+    expect(stderrSpy).toHaveBeenCalledWith(
+      expect.stringContaining('some other error'),
+    );
+  });
+
+  it('exits on request timeout (AbortError)', async () => {
+    const abortError = new Error('The operation was aborted');
+    abortError.name = 'AbortError';
+    vi.spyOn(globalThis, 'fetch').mockRejectedValue(abortError);
+
+    await expect(sendRequest(3000, 'POST', '/launch', {})).rejects.toThrowError(
+      'process.exit',
+    );
+    expect(stderrSpy).toHaveBeenCalledWith(
+      expect.stringContaining('request timed out'),
+    );
+  });
+
+  it('falls back to data when no result key', async () => {
+    vi.spyOn(globalThis, 'fetch').mockResolvedValue({
+      ok: true,
+      json: async () => ({ ok: true, status: 'running' }),
+    } as Response);
+
+    await sendRequest(3000, 'GET', '/status', null);
+
+    expect(stdoutSpy).toHaveBeenCalled();
+    const output = (stdoutSpy.mock.calls[0] as string[])[0];
+    expect(output).toContain('running');
+  });
+
+  it('falls back to "Request failed" when error has no message', async () => {
+    vi.spyOn(globalThis, 'fetch').mockResolvedValue({
+      ok: false,
+      json: async () => ({ ok: false }),
+    } as Response);
+
+    await expect(sendRequest(3000, 'POST', '/tool/x', {})).rejects.toThrowError(
+      'process.exit',
+    );
+    expect(stderrSpy).toHaveBeenCalledWith('Error: Request failed\n');
+  });
+});
+
+describe('routeCommand', () => {
+  const originalFetch = globalThis.fetch;
+
+  beforeEach(() => {
+    vi.spyOn(globalThis, 'fetch').mockResolvedValue({
+      ok: true,
+      json: async () => ({ ok: true, result: {} }),
+    } as Response);
+  });
+
+  afterEach(() => {
+    globalThis.fetch = originalFetch;
+  });
+
+  it('routes status to GET /status', async () => {
+    await routeCommand('status', [], 3000);
+    expect(globalThis.fetch).toHaveBeenCalledWith(
+      'http://127.0.0.1:3000/status',
+      expect.objectContaining({ method: 'GET' }),
+    );
+  });
+
+  it('routes click with a11y ref', async () => {
+    await routeCommand('click', ['e3'], 3000);
+    expect(globalThis.fetch).toHaveBeenCalledWith(
+      'http://127.0.0.1:3000/tool/click',
+      expect.objectContaining({
+        body: JSON.stringify({ a11yRef: 'e3' }),
+      }),
+    );
+  });
+
+  it('routes click with --selector', async () => {
+    await routeCommand('click', ['--selector', '.btn'], 3000);
+    expect(globalThis.fetch).toHaveBeenCalledWith(
+      'http://127.0.0.1:3000/tool/click',
+      expect.objectContaining({
+        body: JSON.stringify({ selector: '.btn' }),
+      }),
+    );
+  });
+
+  it('exits when click has no target', async () => {
+    await expect(routeCommand('click', [], 3000)).rejects.toThrowError(
+      'process.exit',
+    );
+    expect(stderrSpy).toHaveBeenCalledWith(
+      expect.stringContaining('Usage: mm click'),
+    );
+  });
+
+  it('routes type with ref and text', async () => {
+    await routeCommand('type', ['e1', 'hello'], 3000);
+    expect(globalThis.fetch).toHaveBeenCalledWith(
+      'http://127.0.0.1:3000/tool/type',
+      expect.objectContaining({
+        body: JSON.stringify({ a11yRef: 'e1', text: 'hello' }),
+      }),
+    );
+  });
+
+  it('routes type with --testid', async () => {
+    await routeCommand('type', ['--testid', 'input', 'text'], 3000);
+    expect(globalThis.fetch).toHaveBeenCalledWith(
+      'http://127.0.0.1:3000/tool/type',
+      expect.objectContaining({
+        body: JSON.stringify({ testId: 'input', text: 'text' }),
+      }),
+    );
+  });
+
+  it('exits when type has no target', async () => {
+    await expect(routeCommand('type', [], 3000)).rejects.toThrowError(
+      'process.exit',
+    );
+  });
+
+  it('exits when type has no text', async () => {
+    await expect(routeCommand('type', ['e1'], 3000)).rejects.toThrowError(
+      'process.exit',
+    );
+    expect(stderrSpy).toHaveBeenCalledWith('Usage: mm type <ref> <text>\n');
+  });
+
+  it('routes describe-screen', async () => {
+    await routeCommand('describe-screen', [], 3000);
+    expect(globalThis.fetch).toHaveBeenCalledWith(
+      'http://127.0.0.1:3000/tool/describe_screen',
+      expect.anything(),
+    );
+  });
+
+  it('routes screenshot', async () => {
+    await routeCommand('screenshot', [], 3000);
+    expect(globalThis.fetch).toHaveBeenCalledWith(
+      'http://127.0.0.1:3000/tool/screenshot',
+      expect.objectContaining({ body: JSON.stringify({}) }),
+    );
+  });
+
+  it('routes screenshot with --name', async () => {
+    await routeCommand('screenshot', ['--name', 'my-shot'], 3000);
+    expect(globalThis.fetch).toHaveBeenCalledWith(
+      'http://127.0.0.1:3000/tool/screenshot',
+      expect.objectContaining({
+        body: JSON.stringify({ name: 'my-shot' }),
+      }),
+    );
+  });
+
+  it('routes wait-for with ref', async () => {
+    await routeCommand('wait-for', ['e5'], 3000);
+    expect(globalThis.fetch).toHaveBeenCalledWith(
+      'http://127.0.0.1:3000/tool/wait_for',
+      expect.objectContaining({
+        body: JSON.stringify({ a11yRef: 'e5' }),
+      }),
+    );
+  });
+
+  it('routes wait-for with --timeout', async () => {
+    await routeCommand('wait-for', ['e5', '--timeout', '10000'], 3000);
+    expect(globalThis.fetch).toHaveBeenCalledWith(
+      'http://127.0.0.1:3000/tool/wait_for',
+      expect.objectContaining({
+        body: JSON.stringify({ a11yRef: 'e5', timeoutMs: 10000 }),
+      }),
+    );
+  });
+
+  it('exits when wait-for has no target', async () => {
+    await expect(routeCommand('wait-for', [], 3000)).rejects.toThrowError(
+      'process.exit',
+    );
+  });
+
+  it('routes navigate with url', async () => {
+    await routeCommand('navigate', ['http://example.com'], 3000);
+    expect(globalThis.fetch).toHaveBeenCalledWith(
+      'http://127.0.0.1:3000/tool/navigate',
+      expect.objectContaining({
+        body: JSON.stringify({ screen: 'url', url: 'http://example.com' }),
+      }),
+    );
+  });
+
+  it('exits when navigate has no url', async () => {
+    await expect(routeCommand('navigate', [], 3000)).rejects.toThrowError(
+      'process.exit',
+    );
+  });
+
+  it('routes navigate-home', async () => {
+    await routeCommand('navigate-home', [], 3000);
+    expect(globalThis.fetch).toHaveBeenCalledWith(
+      'http://127.0.0.1:3000/tool/navigate',
+      expect.objectContaining({
+        body: JSON.stringify({ screen: 'home' }),
+      }),
+    );
+  });
+
+  it('routes navigate-settings', async () => {
+    await routeCommand('navigate-settings', [], 3000);
+    expect(globalThis.fetch).toHaveBeenCalledWith(
+      'http://127.0.0.1:3000/tool/navigate',
+      expect.objectContaining({
+        body: JSON.stringify({ screen: 'settings' }),
+      }),
+    );
+  });
+
+  it('routes get-state', async () => {
+    await routeCommand('get-state', [], 3000);
+    expect(globalThis.fetch).toHaveBeenCalledWith(
+      'http://127.0.0.1:3000/tool/get_state',
+      expect.anything(),
+    );
+  });
+
+  it('routes get-context', async () => {
+    await routeCommand('get-context', [], 3000);
+    expect(globalThis.fetch).toHaveBeenCalledWith(
+      'http://127.0.0.1:3000/tool/get_context',
+      expect.anything(),
+    );
+  });
+
+  it('routes set-context with e2e', async () => {
+    await routeCommand('set-context', ['e2e'], 3000);
+    expect(globalThis.fetch).toHaveBeenCalledWith(
+      'http://127.0.0.1:3000/tool/set_context',
+      expect.objectContaining({
+        body: JSON.stringify({ context: 'e2e' }),
+      }),
+    );
+  });
+
+  it('routes set-context with prod', async () => {
+    await routeCommand('set-context', ['prod'], 3000);
+    expect(globalThis.fetch).toHaveBeenCalledWith(
+      'http://127.0.0.1:3000/tool/set_context',
+      expect.objectContaining({
+        body: JSON.stringify({ context: 'prod' }),
+      }),
+    );
+  });
+
+  it('exits when set-context has invalid value', async () => {
+    await expect(
+      routeCommand('set-context', ['other'], 3000),
+    ).rejects.toThrowError('process.exit');
+    expect(stderrSpy).toHaveBeenCalledWith(
+      'Usage: mm set-context <e2e|prod>\n',
+    );
+  });
+
+  it('exits when set-context has no value', async () => {
+    await expect(routeCommand('set-context', [], 3000)).rejects.toThrowError(
+      'process.exit',
+    );
+  });
+
+  it('routes build', async () => {
+    await routeCommand('build', [], 3000);
+    expect(globalThis.fetch).toHaveBeenCalledWith(
+      'http://127.0.0.1:3000/tool/build',
+      expect.objectContaining({ body: JSON.stringify({}) }),
+    );
+  });
+
+  it('routes build with --force', async () => {
+    await routeCommand('build', ['--force'], 3000);
+    expect(globalThis.fetch).toHaveBeenCalledWith(
+      'http://127.0.0.1:3000/tool/build',
+      expect.objectContaining({
+        body: JSON.stringify({ force: true }),
+      }),
+    );
+  });
+
+  it('routes wait-for-notification', async () => {
+    await routeCommand('wait-for-notification', [], 3000);
+    expect(globalThis.fetch).toHaveBeenCalledWith(
+      'http://127.0.0.1:3000/tool/wait_for_notification',
+      expect.objectContaining({ body: JSON.stringify({}) }),
+    );
+  });
+
+  it('routes wait-for-notification with --timeout', async () => {
+    await routeCommand('wait-for-notification', ['--timeout', '5000'], 3000);
+    expect(globalThis.fetch).toHaveBeenCalledWith(
+      'http://127.0.0.1:3000/tool/wait_for_notification',
+      expect.objectContaining({
+        body: JSON.stringify({ timeoutMs: 5000 }),
+      }),
+    );
+  });
+
+  it('routes switch-to-tab with --role', async () => {
+    await routeCommand('switch-to-tab', ['--role', 'extension'], 3000);
+    expect(globalThis.fetch).toHaveBeenCalledWith(
+      'http://127.0.0.1:3000/tool/switch_to_tab',
+      expect.objectContaining({
+        body: JSON.stringify({ role: 'extension' }),
+      }),
+    );
+  });
+
+  it('routes switch-to-tab with --url', async () => {
+    await routeCommand('switch-to-tab', ['--url', 'http://dapp.io'], 3000);
+    expect(globalThis.fetch).toHaveBeenCalledWith(
+      'http://127.0.0.1:3000/tool/switch_to_tab',
+      expect.objectContaining({
+        body: JSON.stringify({ url: 'http://dapp.io' }),
+      }),
+    );
+  });
+
+  it('exits when switch-to-tab has no flags', async () => {
+    await expect(routeCommand('switch-to-tab', [], 3000)).rejects.toThrowError(
+      'process.exit',
+    );
+  });
+
+  it('routes close-tab with --role', async () => {
+    await routeCommand('close-tab', ['--role', 'dapp'], 3000);
+    expect(globalThis.fetch).toHaveBeenCalledWith(
+      'http://127.0.0.1:3000/tool/close_tab',
+      expect.objectContaining({
+        body: JSON.stringify({ role: 'dapp' }),
+      }),
+    );
+  });
+
+  it('routes close-tab with --url', async () => {
+    await routeCommand('close-tab', ['--url', 'http://x.io'], 3000);
+    expect(globalThis.fetch).toHaveBeenCalledWith(
+      'http://127.0.0.1:3000/tool/close_tab',
+      expect.objectContaining({
+        body: JSON.stringify({ url: 'http://x.io' }),
+      }),
+    );
+  });
+
+  it('exits when close-tab has no flags', async () => {
+    await expect(routeCommand('close-tab', [], 3000)).rejects.toThrowError(
+      'process.exit',
+    );
+  });
+
+  it('routes clipboard read', async () => {
+    await routeCommand('clipboard', ['read'], 3000);
+    expect(globalThis.fetch).toHaveBeenCalledWith(
+      'http://127.0.0.1:3000/tool/clipboard',
+      expect.objectContaining({
+        body: JSON.stringify({ action: 'read' }),
+      }),
+    );
+  });
+
+  it('routes clipboard write with text', async () => {
+    await routeCommand('clipboard', ['write', 'hello'], 3000);
+    expect(globalThis.fetch).toHaveBeenCalledWith(
+      'http://127.0.0.1:3000/tool/clipboard',
+      expect.objectContaining({
+        body: JSON.stringify({ action: 'write', text: 'hello' }),
+      }),
+    );
+  });
+
+  it('exits when clipboard has invalid action', async () => {
+    await expect(
+      routeCommand('clipboard', ['invalid'], 3000),
+    ).rejects.toThrowError('process.exit');
+  });
+
+  it('exits when clipboard has no action', async () => {
+    await expect(routeCommand('clipboard', [], 3000)).rejects.toThrowError(
+      'process.exit',
+    );
+  });
+
+  it('exits when clipboard write has no text', async () => {
+    await expect(
+      routeCommand('clipboard', ['write'], 3000),
+    ).rejects.toThrowError('process.exit');
+  });
+
+  it('routes seed-contract', async () => {
+    await routeCommand('seed-contract', ['hst'], 3000);
+    expect(globalThis.fetch).toHaveBeenCalledWith(
+      'http://127.0.0.1:3000/tool/seed_contract',
+      expect.objectContaining({
+        body: JSON.stringify({ contractName: 'hst' }),
+      }),
+    );
+  });
+
+  it('routes seed-contract with --hardfork', async () => {
+    await routeCommand(
+      'seed-contract',
+      ['hst', '--hardfork', 'shanghai'],
+      3000,
+    );
+    expect(globalThis.fetch).toHaveBeenCalledWith(
+      'http://127.0.0.1:3000/tool/seed_contract',
+      expect.objectContaining({
+        body: JSON.stringify({ contractName: 'hst', hardfork: 'shanghai' }),
+      }),
+    );
+  });
+
+  it('exits when seed-contract has no name', async () => {
+    await expect(routeCommand('seed-contract', [], 3000)).rejects.toThrowError(
+      'process.exit',
+    );
+  });
+
+  it('routes seed-contracts with multiple names', async () => {
+    await routeCommand('seed-contracts', ['hst', 'nfts'], 3000);
+    expect(globalThis.fetch).toHaveBeenCalledWith(
+      'http://127.0.0.1:3000/tool/seed_contracts',
+      expect.objectContaining({
+        body: JSON.stringify({ contracts: ['hst', 'nfts'] }),
+      }),
+    );
+  });
+
+  it('routes seed-contracts with --hardfork', async () => {
+    await routeCommand(
+      'seed-contracts',
+      ['hst', '--hardfork', 'shanghai'],
+      3000,
+    );
+    expect(globalThis.fetch).toHaveBeenCalledWith(
+      'http://127.0.0.1:3000/tool/seed_contracts',
+      expect.objectContaining({
+        body: expect.stringContaining('"hardfork":"shanghai"'),
+      }),
+    );
+  });
+
+  it('exits when seed-contracts has no names', async () => {
+    await expect(routeCommand('seed-contracts', [], 3000)).rejects.toThrowError(
+      'process.exit',
+    );
+  });
+
+  it('routes get-contract-address', async () => {
+    await routeCommand('get-contract-address', ['hst'], 3000);
+    expect(globalThis.fetch).toHaveBeenCalledWith(
+      'http://127.0.0.1:3000/tool/get_contract_address',
+      expect.objectContaining({
+        body: JSON.stringify({ contractName: 'hst' }),
+      }),
+    );
+  });
+
+  it('exits when get-contract-address has no name', async () => {
+    await expect(
+      routeCommand('get-contract-address', [], 3000),
+    ).rejects.toThrowError('process.exit');
+  });
+
+  it('routes list-contracts', async () => {
+    await routeCommand('list-contracts', [], 3000);
+    expect(globalThis.fetch).toHaveBeenCalledWith(
+      'http://127.0.0.1:3000/tool/list_contracts',
+      expect.anything(),
+    );
+  });
+
+  it('routes list-testids', async () => {
+    await routeCommand('list-testids', [], 3000);
+    expect(globalThis.fetch).toHaveBeenCalledWith(
+      'http://127.0.0.1:3000/tool/list_testids',
+      expect.objectContaining({ body: JSON.stringify({}) }),
+    );
+  });
+
+  it('routes list-testids with --limit', async () => {
+    await routeCommand('list-testids', ['--limit', '50'], 3000);
+    expect(globalThis.fetch).toHaveBeenCalledWith(
+      'http://127.0.0.1:3000/tool/list_testids',
+      expect.objectContaining({
+        body: JSON.stringify({ limit: 50 }),
+      }),
+    );
+  });
+
+  it('routes accessibility-snapshot', async () => {
+    await routeCommand('accessibility-snapshot', [], 3000);
+    expect(globalThis.fetch).toHaveBeenCalledWith(
+      'http://127.0.0.1:3000/tool/accessibility_snapshot',
+      expect.objectContaining({ body: JSON.stringify({}) }),
+    );
+  });
+
+  it('routes accessibility-snapshot with --root', async () => {
+    await routeCommand('accessibility-snapshot', ['--root', '#main'], 3000);
+    expect(globalThis.fetch).toHaveBeenCalledWith(
+      'http://127.0.0.1:3000/tool/accessibility_snapshot',
+      expect.objectContaining({
+        body: JSON.stringify({ rootSelector: '#main' }),
+      }),
+    );
+  });
+
+  it('routes knowledge-search', async () => {
+    await routeCommand('knowledge-search', ['send flow'], 3000);
+    expect(globalThis.fetch).toHaveBeenCalledWith(
+      'http://127.0.0.1:3000/tool/knowledge_search',
+      expect.objectContaining({
+        body: JSON.stringify({ query: 'send flow' }),
+      }),
+    );
+  });
+
+  it('exits when knowledge-search has no query', async () => {
+    await expect(
+      routeCommand('knowledge-search', [], 3000),
+    ).rejects.toThrowError('process.exit');
+  });
+
+  it('routes knowledge-last', async () => {
+    await routeCommand('knowledge-last', [], 3000);
+    expect(globalThis.fetch).toHaveBeenCalledWith(
+      'http://127.0.0.1:3000/tool/knowledge_last',
+      expect.anything(),
+    );
+  });
+
+  it('routes knowledge-sessions', async () => {
+    await routeCommand('knowledge-sessions', [], 3000);
+    expect(globalThis.fetch).toHaveBeenCalledWith(
+      'http://127.0.0.1:3000/tool/knowledge_sessions',
+      expect.anything(),
+    );
+  });
+
+  it('routes knowledge-summarize', async () => {
+    await routeCommand('knowledge-summarize', [], 3000);
+    expect(globalThis.fetch).toHaveBeenCalledWith(
+      'http://127.0.0.1:3000/tool/knowledge_summarize',
+      expect.objectContaining({ body: JSON.stringify({}) }),
+    );
+  });
+
+  it('routes knowledge-summarize with --session', async () => {
+    await routeCommand('knowledge-summarize', ['--session', 'sid'], 3000);
+    expect(globalThis.fetch).toHaveBeenCalledWith(
+      'http://127.0.0.1:3000/tool/knowledge_summarize',
+      expect.objectContaining({
+        body: JSON.stringify({ scope: { sessionId: 'sid' } }),
+      }),
+    );
+  });
+
+  it('routes run-steps with JSON input', async () => {
+    const input = JSON.stringify({
+      steps: [{ tool: 'click', args: { a11yRef: 'e1' } }],
+    });
+    await routeCommand('run-steps', [input], 3000);
+    expect(globalThis.fetch).toHaveBeenCalledWith(
+      'http://127.0.0.1:3000/tool/run_steps',
+      expect.objectContaining({ body: input }),
+    );
+  });
+
+  it('exits when run-steps has no input', async () => {
+    await expect(routeCommand('run-steps', [], 3000)).rejects.toThrowError(
+      'process.exit',
+    );
+  });
+
+  it('exits when run-steps has invalid JSON', async () => {
+    await expect(
+      routeCommand('run-steps', ['{bad json}'], 3000),
+    ).rejects.toThrowError('process.exit');
+    expect(stderrSpy).toHaveBeenCalledWith(
+      expect.stringContaining('invalid JSON'),
+    );
+  });
+
+  it('exits for unknown command', async () => {
+    await expect(routeCommand('unknown-cmd', [], 3000)).rejects.toThrowError(
+      'process.exit',
+    );
+    expect(stderrSpy).toHaveBeenCalledWith(
+      expect.stringContaining("unknown command 'unknown-cmd'"),
+    );
+  });
+});
+
+describe('discoverDaemon', () => {
+  it('returns existing alive daemon with matching version', async () => {
+    const { readDaemonState, isDaemonAlive, isDaemonVersionMatch } =
+      await import('../server/daemon-state.js');
+    const mockState = {
+      port: 3000,
+      pid: 123,
+      nonce: 'abc',
+      startedAt: '2024-01-01',
+      version: '1.0.0',
+      subPorts: { anvil: 8545, fixture: 8546, mock: 8547 },
+    };
+    vi.mocked(readDaemonState).mockResolvedValueOnce(mockState);
+    vi.mocked(isDaemonAlive).mockResolvedValueOnce(true);
+    vi.mocked(isDaemonVersionMatch).mockReturnValueOnce(true);
+
+    const result = await discoverDaemon('/root', 'click');
+    expect(result).toStrictEqual(mockState);
+  });
+
+  it('restarts daemon on version mismatch', async () => {
+    const {
+      readDaemonState,
+      isDaemonAlive,
+      isDaemonVersionMatch,
+      removeDaemonState,
+    } = await import('../server/daemon-state.js');
+
+    const killSpy = vi
+      .spyOn(process, 'kill')
+      .mockImplementation(vi.fn() as unknown as typeof process.kill);
+
+    const oldState = {
+      port: 3000,
+      pid: 123,
+      nonce: 'abc',
+      startedAt: '2024-01-01',
+      version: '0.0.1',
+      subPorts: { anvil: 8545, fixture: 8546, mock: 8547 },
+    };
+    vi.mocked(readDaemonState).mockResolvedValueOnce(oldState);
+    vi.mocked(isDaemonAlive).mockResolvedValueOnce(true);
+    vi.mocked(isDaemonVersionMatch).mockReturnValueOnce(false);
+
+    await expect(discoverDaemon('/root', 'click')).rejects.toThrowError(
+      'process.exit',
+    );
+
+    expect(removeDaemonState).toHaveBeenCalledWith('/root');
+    expect(stderrSpy).toHaveBeenCalledWith(
+      expect.stringContaining('Daemon version mismatch'),
+    );
+
+    killSpy.mockRestore();
+  });
+
+  it('auto-starts daemon for launch command when no daemon running', async () => {
+    const {
+      readDaemonState,
+      isDaemonAlive,
+      acquireStartupLock,
+      releaseStartupLock,
+    } = await import('../server/daemon-state.js');
+
+    vi.mocked(readDaemonState).mockResolvedValueOnce(null);
+
+    vi.mocked(acquireStartupLock).mockResolvedValueOnce(true);
+
+    const mockState = {
+      port: 3000,
+      pid: 123,
+      nonce: 'abc',
+      startedAt: '2024-01-01',
+      subPorts: { anvil: 8545, fixture: 8546, mock: 8547 },
+    };
+    vi.mocked(readDaemonState).mockResolvedValueOnce(mockState);
+    vi.mocked(isDaemonAlive).mockResolvedValueOnce(true);
+
+    const result = await discoverDaemon('/root', 'launch');
+
+    expect(result).toStrictEqual(mockState);
+    expect(releaseStartupLock).toHaveBeenCalledWith('/root');
+  });
+
+  it('removes stale daemon state when not alive', async () => {
+    const { readDaemonState, isDaemonAlive, removeDaemonState } =
+      await import('../server/daemon-state.js');
+    const mockState = {
+      port: 3000,
+      pid: 123,
+      nonce: 'abc',
+      startedAt: '2024-01-01',
+      subPorts: { anvil: 8545, fixture: 8546, mock: 8547 },
+    };
+    vi.mocked(readDaemonState).mockResolvedValueOnce(mockState);
+    vi.mocked(isDaemonAlive).mockResolvedValueOnce(false);
+
+    await expect(discoverDaemon('/root', 'click')).rejects.toThrowError(
+      'process.exit',
+    );
+
+    expect(removeDaemonState).toHaveBeenCalledWith('/root');
+    expect(stderrSpy).toHaveBeenCalledWith(
+      expect.stringContaining('no daemon running'),
+    );
+  });
+
+  it('exits for non-auto-start commands when no daemon', async () => {
+    const { readDaemonState } = await import('../server/daemon-state.js');
+    vi.mocked(readDaemonState).mockResolvedValueOnce(null);
+
+    await expect(discoverDaemon('/root', 'status')).rejects.toThrowError(
+      'process.exit',
+    );
+    expect(stderrSpy).toHaveBeenCalledWith(
+      expect.stringContaining('no daemon running'),
+    );
+  });
+});
+
+describe('waitForDaemon', () => {
+  it('returns daemon state when daemon becomes alive', async () => {
+    const { readDaemonState, isDaemonAlive } =
+      await import('../server/daemon-state.js');
+    const mockState = {
+      port: 3000,
+      pid: 123,
+      nonce: 'abc',
+      startedAt: '2024-01-01',
+      subPorts: { anvil: 8545, fixture: 8546, mock: 8547 },
+    };
+    vi.mocked(readDaemonState)
+      .mockResolvedValueOnce(null)
+      .mockResolvedValueOnce(mockState);
+    vi.mocked(isDaemonAlive).mockResolvedValueOnce(true);
+
+    vi.useFakeTimers();
+    const promise = waitForDaemon('/root');
+    for (let i = 0; i < 3; i++) {
+      await vi.advanceTimersByTimeAsync(200);
+    }
+    const result = await promise;
+    vi.useRealTimers();
+
+    expect(result).toStrictEqual(mockState);
+  });
+
+  it('throws when daemon fails to start within timeout', async () => {
+    const { readDaemonState } = await import('../server/daemon-state.js');
+    vi.mocked(readDaemonState).mockResolvedValue(null);
+
+    vi.useFakeTimers();
+    const promise = waitForDaemon('/root').catch((error: Error) => error);
+    for (let i = 0; i < 55; i++) {
+      await vi.advanceTimersByTimeAsync(200);
+    }
+    const result = await promise;
+    expect(result).toBeInstanceOf(Error);
+    expect((result as Error).message).toContain('Daemon failed to start');
+    vi.useRealTimers();
+  });
+});
+
+describe('main', () => {
+  it('prints help when no args', async () => {
+    const origArgv = process.argv;
+    process.argv = ['node', 'mm'];
+
+    await expect(main()).rejects.toThrowError('process.exit');
+    expect(stdoutSpy).toHaveBeenCalledWith(expect.stringContaining('mm —'));
+
+    process.argv = origArgv;
+  });
+
+  it('prints help for --help flag', async () => {
+    const origArgv = process.argv;
+    process.argv = ['node', 'mm', '--help'];
+
+    await expect(main()).rejects.toThrowError('process.exit');
+    expect(stdoutSpy).toHaveBeenCalledWith(expect.stringContaining('Usage:'));
+
+    process.argv = origArgv;
+  });
+
+  it('prints help for -h flag', async () => {
+    const origArgv = process.argv;
+    process.argv = ['node', 'mm', '-h'];
+
+    await expect(main()).rejects.toThrowError('process.exit');
+
+    process.argv = origArgv;
+  });
+});
+
+describe('type command --selector/--testid text resolution', () => {
+  const originalFetch = globalThis.fetch;
+
+  beforeEach(() => {
+    vi.spyOn(globalThis, 'fetch').mockResolvedValue({
+      ok: true,
+      json: async () => ({ ok: true, result: {} }),
+    } as Response);
+  });
+
+  afterEach(() => {
+    globalThis.fetch = originalFetch;
+  });
+
+  it('routes type with --selector and text after selector', async () => {
+    await routeCommand('type', ['--selector', '.input', 'hello world'], 3000);
+    expect(globalThis.fetch).toHaveBeenCalledWith(
+      'http://127.0.0.1:3000/tool/type',
+      expect.objectContaining({
+        body: JSON.stringify({
+          selector: '.input',
+          text: 'hello world',
+        }),
+      }),
+    );
+  });
+});
+
+describe('handleServe', () => {
+  it('exits when daemon is already running', async () => {
+    const { readDaemonState, isDaemonAlive } =
+      await import('../server/daemon-state.js');
+    vi.mocked(readDaemonState).mockResolvedValueOnce({
+      port: 3000,
+      pid: 123,
+      nonce: 'abc',
+      startedAt: '2024-01-01',
+      subPorts: { anvil: 8545, fixture: 8546, mock: 8547 },
+    });
+    vi.mocked(isDaemonAlive).mockResolvedValueOnce(true);
+
+    await expect(handleServe('/root', false)).rejects.toThrowError(
+      'process.exit',
+    );
+    expect(stderrSpy).toHaveBeenCalledWith(
+      expect.stringContaining('daemon already running'),
+    );
+  });
+
+  it('starts daemon in background mode', async () => {
+    const { readDaemonState, isDaemonAlive } =
+      await import('../server/daemon-state.js');
+    const { spawn } = await import('node:child_process');
+
+    vi.mocked(readDaemonState).mockResolvedValueOnce(null);
+
+    vi.mocked(existsSync).mockReturnValue(true);
+    vi.mocked(fs.readFile).mockResolvedValue(
+      JSON.stringify({ mm: { daemon: './daemon.ts', runtime: 'node' } }),
+    );
+
+    const mockState = {
+      port: 4000,
+      pid: 456,
+      nonce: 'xyz',
+      startedAt: '2024-01-01',
+      subPorts: { anvil: 8545, fixture: 8546, mock: 8547 },
+    };
+    vi.mocked(readDaemonState)
+      .mockResolvedValueOnce(null)
+      .mockResolvedValueOnce(mockState);
+    vi.mocked(isDaemonAlive).mockResolvedValueOnce(true);
+
+    vi.useFakeTimers();
+    const promise = handleServe('/root', true);
+    for (let i = 0; i < 3; i++) {
+      await vi.advanceTimersByTimeAsync(200);
+    }
+    await promise;
+    vi.useRealTimers();
+
+    expect(spawn).toHaveBeenCalledWith('node', ['./daemon.ts'], {
+      detached: true,
+      stdio: ['ignore', 'ignore', 'ignore'],
+      cwd: '/root',
+    });
+    expect(stdoutSpy).toHaveBeenCalledWith(
+      'Daemon started on port 4000 (PID 456)\n',
+    );
+  });
+
+  it('cleans stale state before starting', async () => {
+    const { readDaemonState, isDaemonAlive, removeDaemonState } =
+      await import('../server/daemon-state.js');
+    const staleState = {
+      port: 3000,
+      pid: 123,
+      nonce: 'abc',
+      startedAt: '2024-01-01',
+      subPorts: { anvil: 8545, fixture: 8546, mock: 8547 },
+    };
+    vi.mocked(readDaemonState).mockResolvedValueOnce(staleState);
+    vi.mocked(isDaemonAlive).mockResolvedValueOnce(false);
+
+    vi.mocked(existsSync).mockReturnValue(true);
+    vi.mocked(fs.readFile).mockResolvedValue(
+      JSON.stringify({ mm: { daemon: './d.ts', runtime: 'node' } }),
+    );
+
+    const { spawn } = await import('node:child_process');
+    vi.mocked(spawn).mockReturnValue({
+      stdio: 'inherit',
+      on: vi.fn((event: string, handler: (code: number | null) => void) => {
+        if (event === 'exit') {
+          setTimeout(() => handler(0), 10);
+        }
+      }),
+    } as any);
+
+    const promise = handleServe('/root', false);
+    await new Promise((resolve) => setTimeout(resolve, 50));
+    await promise;
+
+    expect(removeDaemonState).toHaveBeenCalledWith('/root');
+  });
+});
+
+describe('autoStartDaemon', () => {
+  it('returns existing daemon if one appeared after locking', async () => {
+    const {
+      acquireStartupLock,
+      readDaemonState,
+      isDaemonAlive,
+      releaseStartupLock,
+    } = await import('../server/daemon-state.js');
+
+    vi.mocked(acquireStartupLock).mockResolvedValueOnce(true);
+
+    const mockState = {
+      port: 3000,
+      pid: 123,
+      nonce: 'abc',
+      startedAt: '2024-01-01',
+      subPorts: { anvil: 8545, fixture: 8546, mock: 8547 },
+    };
+    vi.mocked(readDaemonState).mockResolvedValueOnce(mockState);
+    vi.mocked(isDaemonAlive).mockResolvedValueOnce(true);
+
+    const result = await autoStartDaemon('/root');
+
+    expect(result).toStrictEqual(mockState);
+    expect(releaseStartupLock).toHaveBeenCalledWith('/root');
+  });
+
+  it('spawns daemon when no existing daemon is found', async () => {
+    const {
+      acquireStartupLock,
+      readDaemonState,
+      isDaemonAlive,
+      releaseStartupLock,
+    } = await import('../server/daemon-state.js');
+    const { spawn } = await import('node:child_process');
+
+    vi.mocked(acquireStartupLock).mockResolvedValueOnce(true);
+    vi.mocked(readDaemonState).mockResolvedValueOnce(null);
+
+    vi.mocked(existsSync).mockReturnValue(true);
+    vi.mocked(fs.readFile).mockResolvedValue(
+      JSON.stringify({ mm: { daemon: './daemon.ts', runtime: 'node' } }),
+    );
+
+    const mockState = {
+      port: 3000,
+      pid: 123,
+      nonce: 'abc',
+      startedAt: '2024-01-01',
+      subPorts: { anvil: 8545, fixture: 8546, mock: 8547 },
+    };
+    vi.mocked(readDaemonState)
+      .mockResolvedValueOnce(null)
+      .mockResolvedValueOnce(mockState);
+    vi.mocked(isDaemonAlive).mockResolvedValueOnce(true);
+
+    vi.useFakeTimers();
+    const promise = autoStartDaemon('/root');
+    for (let i = 0; i < 3; i++) {
+      await vi.advanceTimersByTimeAsync(200);
+    }
+    const result = await promise;
+    vi.useRealTimers();
+
+    expect(spawn).toHaveBeenCalledWith('node', ['./daemon.ts'], {
+      detached: true,
+      stdio: ['ignore', 'ignore', 'ignore'],
+      cwd: '/root',
+    });
+    expect(releaseStartupLock).toHaveBeenCalledWith('/root');
+    expect(result).toStrictEqual(mockState);
+  });
+
+  it('waits when lock is held by another process', async () => {
+    const { acquireStartupLock, readDaemonState, isDaemonAlive } =
+      await import('../server/daemon-state.js');
+
+    vi.mocked(acquireStartupLock).mockResolvedValueOnce(false);
+
+    const mockState = {
+      port: 3000,
+      pid: 123,
+      nonce: 'abc',
+      startedAt: '2024-01-01',
+      subPorts: { anvil: 8545, fixture: 8546, mock: 8547 },
+    };
+    vi.mocked(readDaemonState)
+      .mockResolvedValueOnce(null)
+      .mockResolvedValueOnce(mockState);
+    vi.mocked(isDaemonAlive).mockResolvedValueOnce(true);
+
+    vi.useFakeTimers();
+    const promise = autoStartDaemon('/root');
+    for (let i = 0; i < 3; i++) {
+      await vi.advanceTimersByTimeAsync(200);
+    }
+    const result = await promise;
+    vi.useRealTimers();
+
+    expect(result).toStrictEqual(mockState);
+  });
+});
diff --git a/src/cli/mm.ts b/src/cli/mm.ts
new file mode 100644
index 0000000..eeadcbd
--- /dev/null
+++ b/src/cli/mm.ts
@@ -0,0 +1,1029 @@
+#!/usr/bin/env node
+import { execSync, spawn } from 'node:child_process';
+import { existsSync } from 'node:fs';
+import * as fs from 'node:fs/promises';
+import * as path from 'node:path';
+
+import {
+  acquireStartupLock,
+  isDaemonAlive,
+  isDaemonVersionMatch,
+  readDaemonState,
+  releaseStartupLock,
+  removeDaemonState,
+} from '../server/daemon-state.js';
+import type { DaemonState } from '../types/http.js';
+import { PACKAGE_VERSION } from '../version.js';
+
+const COMMAND_TIMEOUTS_MS: Record<string, number> = {
+  launch: 120_000,
+  cleanup: 30_000,
+  default: 30_000,
+};
+
+const AUTO_START_COMMANDS = new Set(['launch', 'serve']);
+
+const DAEMON_POLL_INTERVAL_MS = 200;
+const DAEMON_POLL_MAX_ATTEMPTS = 50; // 50 * 200ms = 10s
+const SEND_MAX_RETRIES = 3;
+const SEND_RETRY_BASE_DELAY_MS = 200;
+
+type DaemonConfig = {
+  daemonPath: string;
+  runtime: string;
+};
+
+/**
+ * Extracts and consumes the `--project <path>` flag from argv, returning
+ * the remaining args and the extracted project path (if any).
+ *
+ * @param argv - Raw CLI arguments (after the node/script entries).
+ * @returns The remaining arguments and the optional project path.
+ */
+export function extractProjectFlag(argv: string[]): {
+  args: string[];
+  projectPath: string | undefined;
+} {
+  const idx = argv.indexOf('--project');
+  if (idx < 0) {
+    return { args: argv, projectPath: undefined };
+  }
+  const value = argv[idx + 1];
+  if (!value || value.startsWith('--')) {
+    process.stderr.write('Error: --project requires a path value\n');
+    process.exit(1);
+  }
+  const remaining = [...argv.slice(0, idx), ...argv.slice(idx + 2)];
+  return { args: remaining, projectPath: value };
+}
+
+/**
+ * Resolves the target project root directory using the following precedence:
+ *   1. `--project <path>` CLI flag
+ *   2. `MM_PROJECT` environment variable
+ *   3. `git rev-parse --show-toplevel` (current working directory)
+ *
+ * Both explicit sources accept absolute or relative paths (resolved from cwd).
+ * The resolved path is normalized via `fs.realpath` to handle symlinks.
+ *
+ * @param projectFlag - The value of `--project`, if provided.
+ * @returns The absolute, real path to the project root.
+ */
+export async function resolveWorktreeRoot(
+  projectFlag: string | undefined,
+): Promise<string> {
+  const explicit = projectFlag ?? process.env.MM_PROJECT;
+
+  if (explicit) {
+    const resolved = path.resolve(process.cwd(), explicit);
+    let real: string;
+    try {
+      real = await fs.realpath(resolved);
+    } catch {
+      process.stderr.write(`Error: project path does not exist: ${resolved}\n`);
+      process.exit(1);
+    }
+
+    try {
+      const stat = await fs.stat(real);
+      if (!stat.isDirectory()) {
+        process.stderr.write(
+          `Error: project path is not a directory: ${real}\n`,
+        );
+        process.exit(1);
+      }
+    } catch {
+      process.stderr.write(`Error: cannot access project path: ${real}\n`);
+      process.exit(1);
+    }
+
+    return real;
+  }
+
+  try {
+    return execSync('git rev-parse --show-toplevel', {
+      stdio: ['pipe', 'pipe', 'pipe'],
+    })
+      .toString()
+      .trim();
+  } catch {
+    process.stderr.write(
+      'Error: not in a git repository. Use --project <path> or set MM_PROJECT to target a project.\n',
+    );
+    return process.exit(1);
+  }
+}
+
+/**
+ * CLI entry point that parses arguments and routes to the appropriate handler.
+ */
+export async function main(): Promise<void> {
+  const { args: remainingArgs, projectPath } = extractProjectFlag(
+    process.argv.slice(2),
+  );
+
+  if (
+    remainingArgs.length === 0 ||
+    remainingArgs[0] === '--help' ||
+    remainingArgs[0] === '-h'
+  ) {
+    printHelp();
+    process.exit(0);
+  }
+
+  const worktreeRoot = await resolveWorktreeRoot(projectPath);
+  const args = remainingArgs;
+  const command = args[0];
+
+  // mm serve manages daemon lifecycle directly (no discovery needed)
+  if (command === 'serve') {
+    const background = args.includes('--background');
+    await handleServe(worktreeRoot, background);
+    return;
+  }
+
+  // Discover existing daemon or auto-start for launch
+  const daemonState = await discoverDaemon(worktreeRoot, command);
+
+  if (command === 'launch') {
+    const launchArgs = parseLaunchArgs(args.slice(1));
+    await sendRequest(daemonState.port, 'POST', '/launch', launchArgs);
+    return;
+  }
+
+  if (command === 'cleanup') {
+    const shutdown = args.includes('--shutdown');
+    await sendRequest(daemonState.port, 'POST', '/cleanup', {});
+    if (shutdown) {
+      await shutdownDaemon(worktreeRoot, daemonState);
+    }
+    return;
+  }
+
+  await routeCommand(command, args.slice(1), daemonState.port);
+}
+
+/**
+ * Resolves element targeting from CLI arguments. Supports three targeting modes:
+ * --selector <css>  → CSS selector (explicit)
+ * --testid <id>     → data-testid value (explicit)
+ * positional arg    → a11yRef if /^e\d+$/, otherwise testId (auto-detected)
+ *
+ * @param args - The CLI arguments after the command name.
+ * @returns An object with exactly one of `a11yRef`, `testId`, or `selector`.
+ */
+export function resolveTargetFromArgs(
+  args: string[],
+): { a11yRef: string } | { testId: string } | { selector: string } {
+  const selectorIdx = args.indexOf('--selector');
+  if (selectorIdx >= 0) {
+    const val = args[selectorIdx + 1];
+    if (!val || val.startsWith('--')) {
+      process.stderr.write('Error: --selector requires a value\n');
+      process.exit(1);
+    }
+    return { selector: val };
+  }
+
+  const testIdIdx = args.indexOf('--testid');
+  if (testIdIdx >= 0) {
+    const val = args[testIdIdx + 1];
+    if (!val || val.startsWith('--')) {
+      process.stderr.write('Error: --testid requires a value\n');
+      process.exit(1);
+    }
+    return { testId: val };
+  }
+
+  const target = args[0];
+  if (!target) {
+    process.stderr.write('Error: element target is required\n');
+    process.exit(1);
+  }
+  return /^e[0-9]+$/u.test(target) ? { a11yRef: target } : { testId: target };
+}
+
+/**
+ * Returns the positional target argument from a CLI args list,
+ * skipping any --flag/value pairs.
+ *
+ * @param args - The CLI arguments to scan.
+ * @returns The first non-flag argument, or undefined.
+ */
+export function getPositionalTarget(args: string[]): string | undefined {
+  for (let i = 0; i < args.length; i++) {
+    if (args[i].startsWith('--')) {
+      i += 1;
+      continue;
+    }
+    return args[i];
+  }
+  return undefined;
+}
+
+/**
+ * Routes a CLI command to the appropriate daemon HTTP endpoint.
+ *
+ * @param command - The CLI command to route.
+ * @param args - Additional arguments for the command.
+ * @param port - The daemon HTTP server port.
+ */
+export async function routeCommand(
+  command: string,
+  args: string[],
+  port: number,
+): Promise<void> {
+  switch (command) {
+    case 'status':
+      await sendRequest(port, 'GET', '/status', null);
+      break;
+    case 'click': {
+      const target = getPositionalTarget(args);
+      if (
+        !target &&
+        !args.includes('--selector') &&
+        !args.includes('--testid')
+      ) {
+        process.stderr.write(
+          'Usage: mm click <ref> [--selector <css>] [--testid <id>]\n',
+        );
+        process.exit(1);
+      }
+      await sendRequest(
+        port,
+        'POST',
+        '/tool/click',
+        resolveTargetFromArgs(args),
+      );
+      break;
+    }
+    case 'type': {
+      const typeTarget = getPositionalTarget(args);
+      if (
+        !typeTarget &&
+        !args.includes('--selector') &&
+        !args.includes('--testid')
+      ) {
+        process.stderr.write(
+          'Usage: mm type <ref> <text> [--selector <css>] [--testid <id>]\n',
+        );
+        process.exit(1);
+      }
+      let textArgIdx = 1;
+      if (args.includes('--selector')) {
+        textArgIdx = args.indexOf('--selector') + 2;
+      } else if (args.includes('--testid')) {
+        textArgIdx = args.indexOf('--testid') + 2;
+      }
+      const text = args[textArgIdx] ?? args[1];
+      if (text === undefined) {
+        process.stderr.write('Usage: mm type <ref> <text>\n');
+        process.exit(1);
+      }
+      await sendRequest(port, 'POST', '/tool/type', {
+        ...resolveTargetFromArgs(args),
+        text,
+      });
+      break;
+    }
+    case 'describe-screen':
+      await sendRequest(port, 'POST', '/tool/describe_screen', {});
+      break;
+    case 'screenshot': {
+      const nameIdx = args.indexOf('--name');
+      const name = nameIdx >= 0 ? args[nameIdx + 1] : undefined;
+      await sendRequest(port, 'POST', '/tool/screenshot', name ? { name } : {});
+      break;
+    }
+    case 'wait-for': {
+      const waitTarget = getPositionalTarget(args);
+      if (
+        !waitTarget &&
+        !args.includes('--selector') &&
+        !args.includes('--testid')
+      ) {
+        process.stderr.write(
+          'Usage: mm wait-for <ref> [--timeout <ms>] [--selector <css>] [--testid <id>]\n',
+        );
+        process.exit(1);
+      }
+      const timeoutMs = parseIntFlag(args, '--timeout');
+      await sendRequest(port, 'POST', '/tool/wait_for', {
+        ...resolveTargetFromArgs(args),
+        ...(timeoutMs === undefined ? {} : { timeoutMs }),
+      });
+      break;
+    }
+    case 'navigate':
+      if (!args[0]) {
+        process.stderr.write('Usage: mm navigate <url>\n');
+        process.exit(1);
+      }
+      await sendRequest(port, 'POST', '/tool/navigate', {
+        screen: 'url',
+        url: args[0],
+      });
+      break;
+    case 'navigate-home':
+      await sendRequest(port, 'POST', '/tool/navigate', { screen: 'home' });
+      break;
+    case 'navigate-settings':
+      await sendRequest(port, 'POST', '/tool/navigate', {
+        screen: 'settings',
+      });
+      break;
+    case 'get-state':
+      await sendRequest(port, 'POST', '/tool/get_state', {});
+      break;
+    case 'get-context':
+      await sendRequest(port, 'POST', '/tool/get_context', {});
+      break;
+    case 'set-context':
+      if (!args[0] || (args[0] !== 'e2e' && args[0] !== 'prod')) {
+        process.stderr.write('Usage: mm set-context <e2e|prod>\n');
+        process.exit(1);
+      }
+      await sendRequest(port, 'POST', '/tool/set_context', {
+        context: args[0],
+      });
+      break;
+    case 'build': {
+      const buildForce = args.includes('--force');
+      await sendRequest(port, 'POST', '/tool/build', {
+        ...(buildForce ? { force: true } : {}),
+      });
+      break;
+    }
+    case 'wait-for-notification': {
+      const notifTimeout = parseIntFlag(args, '--timeout');
+      await sendRequest(port, 'POST', '/tool/wait_for_notification', {
+        ...(notifTimeout === undefined ? {} : { timeoutMs: notifTimeout }),
+      });
+      break;
+    }
+    case 'switch-to-tab': {
+      const tabRole = parseStringFlag(args, '--role');
+      const tabUrl = parseStringFlag(args, '--url');
+      if (!tabRole && !tabUrl) {
+        process.stderr.write(
+          'Usage: mm switch-to-tab --role <role> | --url <url>\n',
+        );
+        process.exit(1);
+      }
+      await sendRequest(port, 'POST', '/tool/switch_to_tab', {
+        ...(tabRole ? { role: tabRole } : {}),
+        ...(tabUrl ? { url: tabUrl } : {}),
+      });
+      break;
+    }
+    case 'close-tab': {
+      const closeRole = parseStringFlag(args, '--role');
+      const closeUrl = parseStringFlag(args, '--url');
+      if (!closeRole && !closeUrl) {
+        process.stderr.write(
+          'Usage: mm close-tab --role <role> | --url <url>\n',
+        );
+        process.exit(1);
+      }
+      await sendRequest(port, 'POST', '/tool/close_tab', {
+        ...(closeRole ? { role: closeRole } : {}),
+        ...(closeUrl ? { url: closeUrl } : {}),
+      });
+      break;
+    }
+    case 'clipboard': {
+      const clipAction = args[0];
+      if (!clipAction || (clipAction !== 'read' && clipAction !== 'write')) {
+        process.stderr.write('Usage: mm clipboard <read|write> [text]\n');
+        process.exit(1);
+      }
+      if (clipAction === 'write' && !args[1]) {
+        process.stderr.write('Usage: mm clipboard write <text>\n');
+        process.exit(1);
+      }
+      await sendRequest(port, 'POST', '/tool/clipboard', {
+        action: clipAction,
+        ...(clipAction === 'write' ? { text: args[1] } : {}),
+      });
+      break;
+    }
+    case 'seed-contract': {
+      if (!args[0]) {
+        process.stderr.write(
+          'Usage: mm seed-contract <name> [--hardfork <fork>]\n',
+        );
+        process.exit(1);
+      }
+      const hardfork = parseStringFlag(args, '--hardfork');
+      await sendRequest(port, 'POST', '/tool/seed_contract', {
+        contractName: args[0],
+        ...(hardfork ? { hardfork } : {}),
+      });
+      break;
+    }
+    case 'seed-contracts': {
+      const contractNames = args.filter(
+        (a) =>
+          !a.startsWith('--') && args[args.indexOf(a) - 1] !== '--hardfork',
+      );
+      if (contractNames.length === 0) {
+        process.stderr.write(
+          'Usage: mm seed-contracts <name1> <name2> ... [--hardfork <fork>]\n',
+        );
+        process.exit(1);
+      }
+      const seedHardfork = parseStringFlag(args, '--hardfork');
+      await sendRequest(port, 'POST', '/tool/seed_contracts', {
+        contracts: contractNames,
+        ...(seedHardfork ? { hardfork: seedHardfork } : {}),
+      });
+      break;
+    }
+    case 'get-contract-address':
+      if (!args[0]) {
+        process.stderr.write('Usage: mm get-contract-address <name>\n');
+        process.exit(1);
+      }
+      await sendRequest(port, 'POST', '/tool/get_contract_address', {
+        contractName: args[0],
+      });
+      break;
+    case 'list-contracts':
+      await sendRequest(port, 'POST', '/tool/list_contracts', {});
+      break;
+    case 'list-testids': {
+      const testIdLimit = parseIntFlag(args, '--limit');
+      await sendRequest(port, 'POST', '/tool/list_testids', {
+        ...(testIdLimit === undefined ? {} : { limit: testIdLimit }),
+      });
+      break;
+    }
+    case 'accessibility-snapshot': {
+      const rootSelector = parseStringFlag(args, '--root');
+      await sendRequest(port, 'POST', '/tool/accessibility_snapshot', {
+        ...(rootSelector ? { rootSelector } : {}),
+      });
+      break;
+    }
+    case 'knowledge-search':
+      if (!args[0]) {
+        process.stderr.write('Usage: mm knowledge-search <query>\n');
+        process.exit(1);
+      }
+      await sendRequest(port, 'POST', '/tool/knowledge_search', {
+        query: args[0],
+      });
+      break;
+    case 'knowledge-last':
+      await sendRequest(port, 'POST', '/tool/knowledge_last', {});
+      break;
+    case 'knowledge-sessions':
+      await sendRequest(port, 'POST', '/tool/knowledge_sessions', {});
+      break;
+    case 'knowledge-summarize': {
+      const summarizeSession = parseStringFlag(args, '--session');
+      await sendRequest(port, 'POST', '/tool/knowledge_summarize', {
+        ...(summarizeSession ? { scope: { sessionId: summarizeSession } } : {}),
+      });
+      break;
+    }
+    case 'run-steps':
+      if (!args[0]) {
+        process.stderr.write(
+          'Usage: mm run-steps \'{"steps":[{"tool":"click","args":{"a11yRef":"e1"}}]}\'\n',
+        );
+        process.exit(1);
+      }
+      try {
+        await sendRequest(
+          port,
+          'POST',
+          '/tool/run_steps',
+          JSON.parse(args[0]) as Record<string, unknown>,
+        );
+      } catch (error) {
+        if (error instanceof SyntaxError) {
+          process.stderr.write(`Error: invalid JSON — ${error.message}\n`);
+          process.exit(1);
+        }
+        throw error;
+      }
+      break;
+    default:
+      process.stderr.write(
+        `Error: unknown command '${command}'. Run 'mm --help' for usage.\n`,
+      );
+      process.exit(1);
+  }
+}
+
+/**
+ * Checks whether a fetch error is transient and worth retrying.
+ * Only network-level failures are retried — HTTP responses (even errors) are not.
+ *
+ * @param error - The caught error from a fetch attempt.
+ * @returns Whether the error is transient.
+ */
+export function isTransientError(error: unknown): boolean {
+  const message = String(error);
+  return (
+    message.includes('ECONNREFUSED') ||
+    message.includes('ECONNRESET') ||
+    message.includes('EPIPE') ||
+    message.includes('UND_ERR_SOCKET') ||
+    message.includes('fetch failed')
+  );
+}
+
+/**
+ * Sends an HTTP request to the daemon and prints the response.
+ * Retries transient network errors (ECONNREFUSED, ECONNRESET, etc.)
+ * with linear backoff up to SEND_MAX_RETRIES times.
+ *
+ * @param port - The daemon HTTP server port.
+ * @param method - The HTTP method to use.
+ * @param requestPath - The URL path for the request.
+ * @param body - The request body payload, or null for no body.
+ */
+export async function sendRequest(
+  port: number,
+  method: string,
+  requestPath: string,
+  body: unknown,
+): Promise<void> {
+  const commandName = requestPath.split('/').pop() ?? '';
+  const timeout =
+    COMMAND_TIMEOUTS_MS[commandName] ?? COMMAND_TIMEOUTS_MS.default;
+
+  let lastError: unknown;
+
+  for (let attempt = 0; attempt <= SEND_MAX_RETRIES; attempt++) {
+    if (attempt > 0) {
+      await sleep(SEND_RETRY_BASE_DELAY_MS * attempt);
+    }
+
+    const controller = new AbortController();
+    const timer = setTimeout(() => controller.abort(), timeout);
+
+    try {
+      const headers: Record<string, string> = {};
+      if (body !== null) {
+        headers['Content-Type'] = 'application/json';
+      }
+      const options: RequestInit = {
+        method,
+        signal: controller.signal,
+        headers,
+        ...(body === null ? {} : { body: JSON.stringify(body) }),
+      };
+      const response = await fetch(
+        `http://127.0.0.1:${port}${requestPath}`,
+        options,
+      );
+      const data = (await response.json()) as Record<string, unknown>;
+
+      if (!response.ok || data.ok === false) {
+        const errorData = data.error as { message?: string } | undefined;
+        process.stderr.write(
+          `Error: ${errorData?.message ?? 'Request failed'}\n`,
+        );
+        process.exit(1);
+      }
+
+      const result = data.result ?? data;
+      if (typeof result === 'string') {
+        process.stdout.write(`${result}\n`);
+      } else {
+        process.stdout.write(`${JSON.stringify(result, null, 2)}\n`);
+      }
+      return;
+    } catch (error) {
+      if ((error as Error).name === 'AbortError') {
+        process.stderr.write(`Error: request timed out after ${timeout}ms\n`);
+        process.exit(1);
+      }
+
+      if (isTransientError(error) && attempt < SEND_MAX_RETRIES) {
+        lastError = error;
+        continue;
+      }
+
+      process.stderr.write(`Error: ${String(error)}\n`);
+      process.exit(1);
+    } finally {
+      clearTimeout(timer);
+    }
+  }
+
+  process.stderr.write(
+    `Error: request failed after ${SEND_MAX_RETRIES + 1} attempts: ${String(lastError)}\n`,
+  );
+  process.exit(1);
+}
+
+/**
+ * Discovers a running daemon or auto-starts one for eligible commands.
+ *
+ * @param worktreeRoot - The git worktree root directory.
+ * @param command - The CLI command being executed.
+ * @returns The daemon state with connection details.
+ */
+export async function discoverDaemon(
+  worktreeRoot: string,
+  command: string,
+): Promise<DaemonState> {
+  let state = await readDaemonState(worktreeRoot);
+
+  if (state) {
+    const alive = await isDaemonAlive(state);
+    if (alive) {
+      if (isDaemonVersionMatch(state)) {
+        return state;
+      }
+
+      process.stderr.write(
+        `Daemon version mismatch (running: ${state.version ?? 'unknown'}, cli: ${PACKAGE_VERSION}). Restarting...\n`,
+      );
+      await shutdownDaemon(worktreeRoot, state);
+      state = null;
+    } else {
+      await removeDaemonState(worktreeRoot);
+      state = null;
+    }
+  }
+
+  if (!AUTO_START_COMMANDS.has(command)) {
+    process.stderr.write(
+      'Error: no daemon running. Run `mm launch` to start.\n',
+    );
+    process.exit(1);
+  }
+
+  return autoStartDaemon(worktreeRoot);
+}
+
+/**
+ * Spawns a new daemon process and waits for it to become ready.
+ *
+ * @param worktreeRoot - The git worktree root directory.
+ * @returns The daemon state once it is alive.
+ */
+export async function autoStartDaemon(
+  worktreeRoot: string,
+): Promise<DaemonState> {
+  const locked = await acquireStartupLock(worktreeRoot);
+  if (!locked) {
+    return waitForDaemon(worktreeRoot);
+  }
+
+  try {
+    const existingState = await readDaemonState(worktreeRoot);
+    if (existingState && (await isDaemonAlive(existingState))) {
+      return existingState;
+    }
+
+    const config = await readDaemonConfig(worktreeRoot);
+    const runtimeBin = resolveRuntime(worktreeRoot, config.runtime);
+
+    const child = spawn(runtimeBin, [config.daemonPath], {
+      detached: true,
+      stdio: ['ignore', 'ignore', 'ignore'],
+      cwd: worktreeRoot,
+    });
+    child.unref();
+
+    return await waitForDaemon(worktreeRoot);
+  } finally {
+    await releaseStartupLock(worktreeRoot);
+  }
+}
+
+/**
+ * Starts the daemon in foreground or background mode.
+ *
+ * @param worktreeRoot - The git worktree root directory.
+ * @param background - Whether to run the daemon as a detached background process.
+ */
+export async function handleServe(
+  worktreeRoot: string,
+  background: boolean,
+): Promise<void> {
+  const existing = await readDaemonState(worktreeRoot);
+  if (existing && (await isDaemonAlive(existing))) {
+    process.stderr.write(
+      `Error: daemon already running on port ${existing.port} (PID ${existing.pid})\n`,
+    );
+    process.exit(1);
+  }
+
+  if (existing) {
+    await removeDaemonState(worktreeRoot);
+  }
+
+  const config = await readDaemonConfig(worktreeRoot);
+  const runtimeBin = resolveRuntime(worktreeRoot, config.runtime);
+
+  if (background) {
+    const child = spawn(runtimeBin, [config.daemonPath], {
+      detached: true,
+      stdio: ['ignore', 'ignore', 'ignore'],
+      cwd: worktreeRoot,
+    });
+    child.unref();
+
+    const state = await waitForDaemon(worktreeRoot);
+    process.stdout.write(
+      `Daemon started on port ${state.port} (PID ${state.pid})\n`,
+    );
+    return;
+  }
+
+  const child = spawn(runtimeBin, [config.daemonPath], {
+    stdio: 'inherit',
+    cwd: worktreeRoot,
+  });
+
+  await new Promise<void>((resolve) => {
+    child.on('exit', (code) => {
+      process.exitCode = code ?? 0;
+      resolve();
+    });
+  });
+}
+
+/**
+ * Reads the daemon configuration from the worktree package.json.
+ *
+ * @param worktreeRoot - The git worktree root directory.
+ * @returns The daemon path and runtime configuration.
+ */
+export async function readDaemonConfig(
+  worktreeRoot: string,
+): Promise<DaemonConfig> {
+  const pkgPath = path.join(worktreeRoot, 'package.json');
+  let content: string;
+  try {
+    content = await fs.readFile(pkgPath, 'utf-8');
+  } catch {
+    process.stderr.write(`Error: Cannot read package.json at ${pkgPath}\n`);
+    process.exit(1);
+  }
+
+  const pkgJson = JSON.parse(content) as Record<string, unknown>;
+  const mmConfig = pkgJson.mm as
+    | { daemon?: string; runtime?: string }
+    | undefined;
+  if (!mmConfig?.daemon) {
+    process.stderr.write(
+      'Error: No daemon entry point configured. Add `mm.daemon` to package.json.\n',
+    );
+    process.exit(1);
+  }
+
+  return {
+    daemonPath: mmConfig.daemon,
+    runtime: mmConfig.runtime ?? 'tsx',
+  };
+}
+
+/**
+ * Resolves the runtime binary path for spawning the daemon.
+ *
+ * @param worktreeRoot - The git worktree root directory.
+ * @param runtime - The runtime name from configuration.
+ * @returns The absolute path to the runtime binary.
+ */
+export function resolveRuntime(worktreeRoot: string, runtime: string): string {
+  if (runtime === 'node') {
+    return 'node';
+  }
+
+  const binPath = path.join(worktreeRoot, 'node_modules', '.bin', runtime);
+  if (!existsSync(binPath)) {
+    process.stderr.write(
+      `Error: Runtime '${runtime}' not found at ${binPath}. Install it or set "mm.runtime" in package.json.\n`,
+    );
+    process.exit(1);
+  }
+  return binPath;
+}
+
+/**
+ * Polls for daemon state until the daemon is alive or times out.
+ *
+ * @param worktreeRoot - The git worktree root directory.
+ * @returns The daemon state once the daemon is responsive.
+ */
+export async function waitForDaemon(
+  worktreeRoot: string,
+): Promise<DaemonState> {
+  for (let i = 0; i < DAEMON_POLL_MAX_ATTEMPTS; i++) {
+    await sleep(DAEMON_POLL_INTERVAL_MS);
+    const state = await readDaemonState(worktreeRoot);
+    if (state && (await isDaemonAlive(state))) {
+      return state;
+    }
+  }
+  throw new Error('Daemon failed to start within 10 seconds');
+}
+
+/**
+ * Terminates the daemon process and removes its state file.
+ *
+ * @param worktreeRoot - The git worktree root directory.
+ * @param state - The current daemon state containing the PID.
+ */
+export async function shutdownDaemon(
+  worktreeRoot: string,
+  state: DaemonState,
+): Promise<void> {
+  if (state.pid) {
+    try {
+      process.kill(state.pid, 'SIGTERM');
+    } catch {
+      /* already dead */
+    }
+  }
+  await removeDaemonState(worktreeRoot);
+}
+
+/**
+ * Delays execution for the specified duration.
+ *
+ * @param ms - The number of milliseconds to wait.
+ * @returns A promise that resolves after the delay.
+ */
+export async function sleep(ms: number): Promise<void> {
+  return new Promise((resolve) => setTimeout(resolve, ms));
+}
+
+/**
+ * Parses a numeric flag value from a CLI argument list.
+ *
+ * @param args - The raw CLI arguments to search.
+ * @param flag - The flag name to look for (e.g., '--timeout').
+ * @returns The parsed integer value, or undefined if the flag is absent or invalid.
+ */
+export function parseIntFlag(args: string[], flag: string): number | undefined {
+  const idx = args.indexOf(flag);
+  if (idx < 0) {
+    return undefined;
+  }
+  const parsed = parseInt(args[idx + 1], 10);
+  return isNaN(parsed) ? undefined : parsed;
+}
+
+/**
+ * Parses a string flag value from a CLI argument list.
+ *
+ * @param args - The raw CLI arguments to search.
+ * @param flag - The flag name to look for (e.g., '--role').
+ * @returns The string value, or undefined if the flag is absent.
+ */
+export function parseStringFlag(
+  args: string[],
+  flag: string,
+): string | undefined {
+  const idx = args.indexOf(flag);
+  if (idx < 0 || !args[idx + 1] || args[idx + 1].startsWith('--')) {
+    return undefined;
+  }
+  return args[idx + 1];
+}
+
+/**
+ * Parses launch command arguments into a key-value object.
+ *
+ * @param args - The raw CLI arguments after the command name.
+ * @returns The parsed launch options.
+ */
+export function parseLaunchArgs(args: string[]): Record<string, unknown> {
+  const result: Record<string, unknown> = {};
+  const knownFlags = new Set([
+    '--state',
+    '--extension-path',
+    '--goal',
+    '--force',
+    '--flow-tags',
+  ]);
+
+  for (let i = 0; i < args.length; i++) {
+    const arg = args[i];
+    if (arg === '--force') {
+      result.force = true;
+    } else if (arg === '--state') {
+      i += 1;
+      if (!args[i] || args[i].startsWith('--')) {
+        process.stderr.write(
+          'Error: --state requires a value (default|onboarding|custom)\n',
+        );
+        process.exit(1);
+      }
+      result.stateMode = args[i];
+    } else if (arg === '--extension-path') {
+      i += 1;
+      if (!args[i] || args[i].startsWith('--')) {
+        process.stderr.write('Error: --extension-path requires a value\n');
+        process.exit(1);
+      }
+      result.extensionPath = args[i];
+    } else if (arg === '--goal') {
+      i += 1;
+      if (!args[i] || args[i].startsWith('--')) {
+        process.stderr.write('Error: --goal requires a value\n');
+        process.exit(1);
+      }
+      result.goal = args[i];
+    } else if (arg === '--flow-tags') {
+      i += 1;
+      if (!args[i] || args[i].startsWith('--')) {
+        process.stderr.write(
+          'Error: --flow-tags requires a comma-separated value\n',
+        );
+        process.exit(1);
+      }
+      result.flowTags = args[i].split(',').map((tag) => tag.trim());
+    } else if (arg.startsWith('--') && !knownFlags.has(arg)) {
+      process.stderr.write(`Warning: unknown launch flag '${arg}'\n`);
+    }
+  }
+  return result;
+}
+
+/**
+ * Prints CLI usage information to stdout.
+ */
+export function printHelp(): void {
+  process.stdout.write(`mm — MetaMask CLI
+
+Usage: mm [--project <path>] <command> [options]
+
+Global Options:
+  --project <path>    Target a specific project directory (absolute or relative).
+                      Overrides MM_PROJECT and git-based discovery.
+
+Environment Variables:
+  MM_PROJECT          Default project directory when --project is not provided.
+                      Falls back to the current git worktree root.
+
+Lifecycle:
+  mm launch [--state default|onboarding|custom] [--extension-path <path>] [--goal <text>] [--force] [--flow-tags <tags>]
+  mm cleanup [--shutdown]
+  mm status
+  mm serve [--background]
+  mm build [--force]
+
+Interaction:
+  mm click <ref> [--selector <css>] [--testid <id>]
+  mm type <ref> <text> [--selector <css>] [--testid <id>]
+  mm describe-screen
+  mm screenshot [--name <name>]
+  mm wait-for <ref> [--timeout <ms>] [--selector <css>] [--testid <id>]
+  mm wait-for-notification [--timeout <ms>]
+  mm clipboard <read|write> [text]
+
+Navigation:
+  mm navigate <url>
+  mm navigate-home
+  mm navigate-settings
+  mm switch-to-tab --role <role> | --url <url>
+  mm close-tab --role <role> | --url <url>
+
+Discovery:
+  mm list-testids [--limit <n>]
+  mm accessibility-snapshot [--root <selector>]
+
+State & Context:
+  mm get-state
+  mm get-context
+  mm set-context <e2e|prod>
+
+Knowledge:
+  mm knowledge-search <query>
+  mm knowledge-last
+  mm knowledge-sessions
+  mm knowledge-summarize [--session <id>]
+
+Contracts (E2E only):
+  mm seed-contract <name> [--hardfork <fork>]
+  mm seed-contracts <name1> <name2> ... [--hardfork <fork>]
+  mm get-contract-address <name>
+  mm list-contracts
+
+Batching:
+  mm run-steps <json>
+
+Examples:
+  mm launch                                          (from inside project)
+  mm --project ../metamask-extension launch          (from parent folder)
+  MM_PROJECT=/path/to/extension mm describe-screen   (via env var)
+`);
+}
+
+/* istanbul ignore next -- CLI entry point, tested via exported functions */
+if (process.env.VITEST === undefined) {
+  main().catch((error: unknown) => {
+    process.stderr.write(`Fatal: ${String(error)}\n`);
+    process.exit(1);
+  });
+}
diff --git a/src/index.ts b/src/index.ts
index eaefc45..0025aff 100644
--- a/src/index.ts
+++ b/src/index.ts
@@ -2,26 +2,35 @@
 export type * from './capabilities/types.js';
 export * from './capabilities/context.js';
 
-// MCP Server - Session Manager Interface
-export * from './mcp-server/session-manager.js';
-
-// MCP Server - Server
-export * from './mcp-server/server.js';
-
-// MCP Server - Core Components
-export * from './mcp-server/knowledge-store.js';
-export * from './mcp-server/discovery.js';
-export * from './mcp-server/schemas.js';
-export * from './mcp-server/tools/definitions.js';
-export * from './mcp-server/tokenization.js';
-
-// MCP Server - Types
-export * from './mcp-server/types';
-
-// MCP Server - Utils
-export * from './mcp-server/utils';
-
-// Shared utilities
+// Session Manager Interface (transport-agnostic)
+export type {
+  ISessionManager,
+  TrackedPage,
+  SessionLaunchInput,
+  SessionLaunchResult,
+  SessionScreenshotOptions,
+} from './server/session-manager.js';
+
+// Core Components
+export * from './knowledge-store/knowledge-store.js';
+export * from './tools/utils/discovery.js';
+export * from './validation/schemas.js';
+export * from './knowledge-store/tokenization.js';
+
+// Types
+export * from './tools/types';
+
+// HTTP Server Types
+export type * from './types/http.js';
+export * from './tools/registry.js';
+
+// Server utilities
+export * from './server/request-queue.js';
+export * from './server/port-allocator.js';
+export * from './server/daemon-state.js';
+export * from './server/create-server.js';
+
+// Utils
 export * from './utils';
 
 // Launcher utilities
@@ -30,26 +39,8 @@ export * from './launcher/extension-readiness.js';
 export * from './launcher/console-error-buffer.js';
 export * from './launcher/retry.js';
 
-// MCP Server - Tool Handlers
-export * from './mcp-server/tools/build.js';
-export * from './mcp-server/tools/launch.js';
-export * from './mcp-server/tools/cleanup.js';
-export * from './mcp-server/tools/state.js';
-export * from './mcp-server/tools/seeding.js';
-export * from './mcp-server/tools/interaction.js';
-export * from './mcp-server/tools/navigation.js';
-export * from './mcp-server/tools/discovery-tools.js';
-export * from './mcp-server/tools/screenshot.js';
-export * from './mcp-server/tools/knowledge.js';
-export * from './mcp-server/tools/batch.js';
-export * from './mcp-server/tools/context.js';
-export * from './mcp-server/tools/clipboard.js';
-
-// Run tool utility
-export * from './mcp-server/tools/run-tool.js';
-
 // Error classification
-export * from './mcp-server/tools/error-classification.js';
+export * from './tools/error-classification.js';
 
-// Helpers
-export * from './mcp-server/tools/helpers.js';
+// Version
+export * from './version.js';
diff --git a/src/mcp-server/knowledge-store.test.ts b/src/knowledge-store/knowledge-store.test.ts
similarity index 91%
rename from src/mcp-server/knowledge-store.test.ts
rename to src/knowledge-store/knowledge-store.test.ts
index ea984f6..40e75b3 100644
--- a/src/mcp-server/knowledge-store.test.ts
+++ b/src/knowledge-store/knowledge-store.test.ts
@@ -16,12 +16,12 @@ import {
   knowledgeStore,
 } from './knowledge-store.js';
 import type { KnowledgeStoreConfig } from './knowledge-store.js';
+import type { ExtensionState } from '../capabilities/types.js';
 import type {
   SessionMetadata,
   StepRecordOutcome,
   StepRecordObservation,
-} from './types';
-import type { ExtensionState } from '../capabilities/types.js';
+} from '../tools/types';
 
 vi.mock('fs', () => ({
   existsSync: vi.fn(),
@@ -101,21 +101,10 @@ describe('core', () => {
       expect(store).toBeDefined();
     });
 
-    it('accepts custom toolPrefix configuration', () => {
-      const config: KnowledgeStoreConfig = {
-        toolPrefix: 'custom',
-      };
-
-      const store = new KnowledgeStore(config);
-
-      expect(store).toBeDefined();
-    });
-
     it('accepts full configuration object', () => {
       const config: KnowledgeStoreConfig = {
         rootDir: '/custom/root',
         sessionIdPrefix: 'test-',
-        toolPrefix: 'test',
       };
 
       const store = new KnowledgeStore(config);
@@ -126,7 +115,9 @@ describe('core', () => {
 
   describe('writeSessionMetadata', () => {
     it('creates session directory and writes metadata file', async () => {
-      const store = new KnowledgeStore({ rootDir: '/test/knowledge' });
+      const store = new KnowledgeStore({
+        rootDir: '/test/knowledge',
+      });
       const metadata: SessionMetadata = {
         schemaVersion: 1,
         sessionId: 'session-001',
@@ -152,7 +143,9 @@ describe('core', () => {
     });
 
     it('includes optional goal in metadata', async () => {
-      const store = new KnowledgeStore({ rootDir: '/test/knowledge' });
+      const store = new KnowledgeStore({
+        rootDir: '/test/knowledge',
+      });
       const metadata: SessionMetadata = {
         schemaVersion: 1,
         sessionId: 'session-003',
@@ -244,13 +237,15 @@ describe('core', () => {
 
   describe('recordStep', () => {
     it('creates steps directory and writes step file', async () => {
-      const store = new KnowledgeStore({ rootDir: '/test/knowledge' });
+      const store = new KnowledgeStore({
+        rootDir: '/test/knowledge',
+      });
       const outcome: StepRecordOutcome = { ok: true };
       const observation = createObservation();
 
       const result = await store.recordStep({
         sessionId: 'session-step-001',
-        toolName: 'mm_click',
+        toolName: 'click',
         input: { testId: 'send-button' },
         outcome,
         observation,
@@ -264,17 +259,19 @@ describe('core', () => {
       expect(fs.writeFile).toHaveBeenCalled();
       expect(result).toContain('session-step-001');
       expect(result).toContain('steps');
-      expect(result).toContain('mm_click.json');
+      expect(result).toContain('click.json');
     });
 
     it('records step with screenshot artifact', async () => {
-      const store = new KnowledgeStore({ rootDir: '/test/knowledge' });
+      const store = new KnowledgeStore({
+        rootDir: '/test/knowledge',
+      });
       const outcome: StepRecordOutcome = { ok: true };
       const observation = createObservation({ currentScreen: 'send' });
 
       await store.recordStep({
         sessionId: 'session-step-002',
-        toolName: 'mm_screenshot',
+        toolName: 'screenshot',
         outcome,
         observation,
         screenshotPath: '/test/screenshots/screenshot-001.png',
@@ -294,7 +291,9 @@ describe('core', () => {
     });
 
     it('sanitizes sensitive input fields', async () => {
-      const store = new KnowledgeStore({ rootDir: '/test/knowledge' });
+      const store = new KnowledgeStore({
+        rootDir: '/test/knowledge',
+      });
       const outcome: StepRecordOutcome = { ok: true };
       const observation = createObservation({
         currentScreen: 'unlock',
@@ -303,7 +302,7 @@ describe('core', () => {
 
       await store.recordStep({
         sessionId: 'session-step-003',
-        toolName: 'mm_type',
+        toolName: 'type',
         input: { testId: 'password-input', text: 'my-secret-password' },
         outcome,
         observation,
@@ -318,13 +317,15 @@ describe('core', () => {
     });
 
     it('records step with target information', async () => {
-      const store = new KnowledgeStore({ rootDir: '/test/knowledge' });
+      const store = new KnowledgeStore({
+        rootDir: '/test/knowledge',
+      });
       const outcome: StepRecordOutcome = { ok: true };
       const observation = createObservation();
 
       await store.recordStep({
         sessionId: 'session-step-004',
-        toolName: 'mm_click',
+        toolName: 'click',
         input: { testId: 'confirm-btn' },
         target: {
           testId: 'confirm-btn',
@@ -344,13 +345,15 @@ describe('core', () => {
     });
 
     it('computes discovery label for discovery tools', async () => {
-      const store = new KnowledgeStore({ rootDir: '/test/knowledge' });
+      const store = new KnowledgeStore({
+        rootDir: '/test/knowledge',
+      });
       const outcome: StepRecordOutcome = { ok: true };
       const observation = createObservation();
 
       await store.recordStep({
         sessionId: 'session-step-005',
-        toolName: 'mm_describe_screen',
+        toolName: 'describe_screen',
         outcome,
         observation,
       });
@@ -362,13 +365,15 @@ describe('core', () => {
     });
 
     it('computes navigation label for navigation tools', async () => {
-      const store = new KnowledgeStore({ rootDir: '/test/knowledge' });
+      const store = new KnowledgeStore({
+        rootDir: '/test/knowledge',
+      });
       const outcome: StepRecordOutcome = { ok: true };
       const observation = createObservation({ currentScreen: 'settings' });
 
       await store.recordStep({
         sessionId: 'session-step-006',
-        toolName: 'mm_navigate',
+        toolName: 'navigate',
         outcome,
         observation,
       });
@@ -380,13 +385,15 @@ describe('core', () => {
     });
 
     it('computes interaction label for interaction tools', async () => {
-      const store = new KnowledgeStore({ rootDir: '/test/knowledge' });
+      const store = new KnowledgeStore({
+        rootDir: '/test/knowledge',
+      });
       const outcome: StepRecordOutcome = { ok: true };
       const observation = createObservation();
 
       await store.recordStep({
         sessionId: 'session-step-007',
-        toolName: 'mm_click',
+        toolName: 'click',
         input: { testId: 'send-button' },
         outcome,
         observation,
@@ -399,7 +406,9 @@ describe('core', () => {
     });
 
     it('computes confirmation label for confirmation-related targets', async () => {
-      const store = new KnowledgeStore({ rootDir: '/test/knowledge' });
+      const store = new KnowledgeStore({
+        rootDir: '/test/knowledge',
+      });
       const outcome: StepRecordOutcome = { ok: true };
       const observation = createObservation({
         currentScreen: 'confirm-transaction',
@@ -407,7 +416,7 @@ describe('core', () => {
 
       await store.recordStep({
         sessionId: 'session-step-008',
-        toolName: 'mm_click',
+        toolName: 'click',
         target: { testId: 'confirm-transaction-btn' },
         outcome,
         observation,
@@ -420,7 +429,9 @@ describe('core', () => {
     });
 
     it('computes error-recovery label for failed outcomes', async () => {
-      const store = new KnowledgeStore({ rootDir: '/test/knowledge' });
+      const store = new KnowledgeStore({
+        rootDir: '/test/knowledge',
+      });
       const outcome: StepRecordOutcome = {
         ok: false,
         error: { code: 'MM_TARGET_NOT_FOUND', message: 'Target not found' },
@@ -429,7 +440,7 @@ describe('core', () => {
 
       await store.recordStep({
         sessionId: 'session-step-009',
-        toolName: 'mm_click',
+        toolName: 'click',
         input: { testId: 'nonexistent-btn' },
         outcome,
         observation,
@@ -442,13 +453,15 @@ describe('core', () => {
     });
 
     it('records step with e2e context', async () => {
-      const store = new KnowledgeStore({ rootDir: '/test/knowledge' });
+      const store = new KnowledgeStore({
+        rootDir: '/test/knowledge',
+      });
       const outcome: StepRecordOutcome = { ok: true };
       const observation = createObservation();
 
       await store.recordStep({
         sessionId: 'session-step-011',
-        toolName: 'mm_click',
+        toolName: 'click',
         outcome,
         observation,
         context: 'e2e',
@@ -461,43 +474,24 @@ describe('core', () => {
     });
 
     it('records step with prod context', async () => {
-      const store = new KnowledgeStore({ rootDir: '/test/knowledge' });
-      const outcome: StepRecordOutcome = { ok: true };
-      const observation = createObservation();
-
-      await store.recordStep({
-        sessionId: 'session-step-012',
-        toolName: 'mm_click',
-        outcome,
-        observation,
-        context: 'prod',
-      });
-
-      const writeCall = vi.mocked(fs.writeFile).mock.calls[0];
-      const writtenData = JSON.parse(writeCall[1] as string);
-
-      expect(writtenData.context).toBe('prod');
-    });
-
-    it('uses custom tool prefix for label computation', async () => {
       const store = new KnowledgeStore({
         rootDir: '/test/knowledge',
-        toolPrefix: 'custom',
       });
       const outcome: StepRecordOutcome = { ok: true };
       const observation = createObservation();
 
       await store.recordStep({
-        sessionId: 'session-step-013',
-        toolName: 'custom_describe_screen',
+        sessionId: 'session-step-012',
+        toolName: 'click',
         outcome,
         observation,
+        context: 'prod',
       });
 
       const writeCall = vi.mocked(fs.writeFile).mock.calls[0];
       const writtenData = JSON.parse(writeCall[1] as string);
 
-      expect(writtenData.labels).toContain('discovery');
+      expect(writtenData.context).toBe('prod');
     });
   });
 
@@ -507,7 +501,9 @@ describe('core', () => {
     }
 
     it('returns empty array when no sessions exist', async () => {
-      const store = new KnowledgeStore({ rootDir: '/test/knowledge' });
+      const store = new KnowledgeStore({
+        rootDir: '/test/knowledge',
+      });
       vi.mocked(fs.readdir).mockResolvedValueOnce([]);
 
       const result = await store.listSessions(10);
@@ -516,7 +512,9 @@ describe('core', () => {
     });
 
     it('returns sessions sorted by createdAt descending', async () => {
-      const store = new KnowledgeStore({ rootDir: '/test/knowledge' });
+      const store = new KnowledgeStore({
+        rootDir: '/test/knowledge',
+      });
       const oldMetadata: SessionMetadata = {
         schemaVersion: 1,
         sessionId: 'mm-session-old',
@@ -549,7 +547,9 @@ describe('core', () => {
     });
 
     it('limits results to specified count', async () => {
-      const store = new KnowledgeStore({ rootDir: '/test/knowledge' });
+      const store = new KnowledgeStore({
+        rootDir: '/test/knowledge',
+      });
 
       vi.mocked(fs.readdir).mockResolvedValueOnce([
         createDirent('mm-session-1'),
@@ -583,7 +583,9 @@ describe('core', () => {
     });
 
     it('filters by flowTag', async () => {
-      const store = new KnowledgeStore({ rootDir: '/test/knowledge' });
+      const store = new KnowledgeStore({
+        rootDir: '/test/knowledge',
+      });
       const sendMetadata: SessionMetadata = {
         schemaVersion: 1,
         sessionId: 'mm-session-send',
@@ -616,7 +618,9 @@ describe('core', () => {
     });
 
     it('filters by tag', async () => {
-      const store = new KnowledgeStore({ rootDir: '/test/knowledge' });
+      const store = new KnowledgeStore({
+        rootDir: '/test/knowledge',
+      });
       const testMetadata: SessionMetadata = {
         schemaVersion: 1,
         sessionId: 'mm-session-test',
@@ -649,7 +653,9 @@ describe('core', () => {
     });
 
     it('filters by sinceHours', async () => {
-      const store = new KnowledgeStore({ rootDir: '/test/knowledge' });
+      const store = new KnowledgeStore({
+        rootDir: '/test/knowledge',
+      });
       const now = new Date();
       const recentDate = new Date(now.getTime() - 12 * 60 * 60 * 1000);
       const oldDate = new Date(now.getTime() - 72 * 60 * 60 * 1000);
@@ -692,7 +698,9 @@ describe('core', () => {
     }
 
     it('returns current session ID for scope "current"', async () => {
-      const store = new KnowledgeStore({ rootDir: '/test/knowledge' });
+      const store = new KnowledgeStore({
+        rootDir: '/test/knowledge',
+      });
 
       const result = await store.resolveSessionIds(
         'current',
@@ -703,7 +711,9 @@ describe('core', () => {
     });
 
     it('returns empty array for scope "current" without current session', async () => {
-      const store = new KnowledgeStore({ rootDir: '/test/knowledge' });
+      const store = new KnowledgeStore({
+        rootDir: '/test/knowledge',
+      });
 
       const result = await store.resolveSessionIds('current', undefined);
 
@@ -711,7 +721,9 @@ describe('core', () => {
     });
 
     it('returns specific session ID for scope object', async () => {
-      const store = new KnowledgeStore({ rootDir: '/test/knowledge' });
+      const store = new KnowledgeStore({
+        rootDir: '/test/knowledge',
+      });
 
       const result = await store.resolveSessionIds(
         { sessionId: 'specific-session-001' },
@@ -722,7 +734,9 @@ describe('core', () => {
     });
 
     it('returns all session IDs for scope "all"', async () => {
-      const store = new KnowledgeStore({ rootDir: '/test/knowledge' });
+      const store = new KnowledgeStore({
+        rootDir: '/test/knowledge',
+      });
       vi.mocked(fs.readdir).mockResolvedValueOnce([
         createDirent('mm-session-1'),
         createDirent('mm-session-2'),
@@ -739,7 +753,9 @@ describe('core', () => {
     });
 
     it('filters session IDs by filters for scope "all"', async () => {
-      const store = new KnowledgeStore({ rootDir: '/test/knowledge' });
+      const store = new KnowledgeStore({
+        rootDir: '/test/knowledge',
+      });
       const sendMetadata: SessionMetadata = {
         schemaVersion: 1,
         sessionId: 'mm-session-send',
@@ -773,7 +789,9 @@ describe('core', () => {
     });
 
     it('includes sessions without metadata when filtering', async () => {
-      const store = new KnowledgeStore({ rootDir: '/test/knowledge' });
+      const store = new KnowledgeStore({
+        rootDir: '/test/knowledge',
+      });
 
       vi.mocked(fs.readdir).mockResolvedValueOnce([
         createDirent('mm-session-with-metadata'),
@@ -803,7 +821,9 @@ describe('core', () => {
 
   describe('extractPathTokens', () => {
     it('extracts tokens from URL hash fragment', async () => {
-      const store = new KnowledgeStore({ rootDir: '/test/knowledge' });
+      const store = new KnowledgeStore({
+        rootDir: '/test/knowledge',
+      });
       const outcome: StepRecordOutcome = { ok: true };
       const observation = createObservation({
         currentScreen: 'confirm-transaction',
@@ -812,7 +832,7 @@ describe('core', () => {
 
       await store.recordStep({
         sessionId: 'session-path-001',
-        toolName: 'mm_click',
+        toolName: 'click',
         outcome,
         observation,
       });
@@ -855,7 +875,7 @@ describe('similarity', () => {
     } = {},
   ) {
     const baseTool = {
-      name: 'mm_click',
+      name: 'click',
       input: { testId: 'test-btn' },
       target: { testId: 'test-btn' },
     };
@@ -904,9 +924,11 @@ describe('similarity', () => {
 
   describe('searchSteps scoring', () => {
     it('scores steps matching tool name in query', async () => {
-      const store = new KnowledgeStore({ rootDir: '/test/knowledge' });
+      const store = new KnowledgeStore({
+        rootDir: '/test/knowledge',
+      });
       const stepRecord = createStepRecord({
-        tool: { name: 'mm_click', input: {} },
+        tool: { name: 'click', input: {} },
       });
 
       vi.mocked(fs.readdir).mockResolvedValueOnce([
@@ -928,11 +950,13 @@ describe('similarity', () => {
       const results = await store.searchSteps('click', 10, 'all', undefined);
 
       expect(results.length).toBeGreaterThan(0);
-      expect(results[0].tool).toBe('mm_click');
+      expect(results[0].tool).toBe('click');
     });
 
     it('scores steps matching screen name in query', async () => {
-      const store = new KnowledgeStore({ rootDir: '/test/knowledge' });
+      const store = new KnowledgeStore({
+        rootDir: '/test/knowledge',
+      });
       const stepRecord = createStepRecord({
         observation: {
           state: { currentScreen: 'send' },
@@ -964,10 +988,12 @@ describe('similarity', () => {
     });
 
     it('scores steps matching target testId in query', async () => {
-      const store = new KnowledgeStore({ rootDir: '/test/knowledge' });
+      const store = new KnowledgeStore({
+        rootDir: '/test/knowledge',
+      });
       const stepRecord = createStepRecord({
         tool: {
-          name: 'mm_click',
+          name: 'click',
           input: { testId: 'confirm-button' },
           target: { testId: 'confirm-button' },
         },
@@ -995,7 +1021,9 @@ describe('similarity', () => {
     });
 
     it('scores steps matching labels in query', async () => {
-      const store = new KnowledgeStore({ rootDir: '/test/knowledge' });
+      const store = new KnowledgeStore({
+        rootDir: '/test/knowledge',
+      });
       const stepRecord = createStepRecord({
         labels: ['navigation', 'confirmation'],
       });
@@ -1027,7 +1055,9 @@ describe('similarity', () => {
     });
 
     it('scores steps matching observed testIds in query', async () => {
-      const store = new KnowledgeStore({ rootDir: '/test/knowledge' });
+      const store = new KnowledgeStore({
+        rootDir: '/test/knowledge',
+      });
       const stepRecord = createStepRecord({
         observation: {
           state: { currentScreen: 'home' },
@@ -1061,7 +1091,9 @@ describe('similarity', () => {
     });
 
     it('scores steps matching a11y node names in query', async () => {
-      const store = new KnowledgeStore({ rootDir: '/test/knowledge' });
+      const store = new KnowledgeStore({
+        rootDir: '/test/knowledge',
+      });
       const stepRecord = createStepRecord({
         observation: {
           state: { currentScreen: 'home' },
@@ -1097,7 +1129,9 @@ describe('similarity', () => {
     });
 
     it('scores steps matching a11y node roles in query', async () => {
-      const store = new KnowledgeStore({ rootDir: '/test/knowledge' });
+      const store = new KnowledgeStore({
+        rootDir: '/test/knowledge',
+      });
       const stepRecord = createStepRecord({
         observation: {
           state: { currentScreen: 'home' },
@@ -1130,7 +1164,9 @@ describe('similarity', () => {
     });
 
     it('returns empty results for empty query', async () => {
-      const store = new KnowledgeStore({ rootDir: '/test/knowledge' });
+      const store = new KnowledgeStore({
+        rootDir: '/test/knowledge',
+      });
 
       const results = await store.searchSteps('', 10, 'all', undefined);
 
@@ -1138,9 +1174,11 @@ describe('similarity', () => {
     });
 
     it('calculates token coverage ratio bonus', async () => {
-      const store = new KnowledgeStore({ rootDir: '/test/knowledge' });
+      const store = new KnowledgeStore({
+        rootDir: '/test/knowledge',
+      });
       const stepRecord = createStepRecord({
-        tool: { name: 'mm_click', input: {} },
+        tool: { name: 'click', input: {} },
         observation: {
           state: { currentScreen: 'send' },
           testIds: [],
@@ -1177,7 +1215,9 @@ describe('similarity', () => {
 
   describe('session scoring', () => {
     it('scores sessions with matching flowTags higher', async () => {
-      const store = new KnowledgeStore({ rootDir: '/test/knowledge' });
+      const store = new KnowledgeStore({
+        rootDir: '/test/knowledge',
+      });
       const sendMetadata = {
         schemaVersion: 1,
         sessionId: 'mm-session-send',
@@ -1219,7 +1259,9 @@ describe('similarity', () => {
     });
 
     it('scores sessions with matching goal tokens', async () => {
-      const store = new KnowledgeStore({ rootDir: '/test/knowledge' });
+      const store = new KnowledgeStore({
+        rootDir: '/test/knowledge',
+      });
       const metadata = {
         schemaVersion: 1,
         sessionId: 'mm-session-1',
@@ -1245,7 +1287,9 @@ describe('similarity', () => {
     });
 
     it('scores sessions with matching tags', async () => {
-      const store = new KnowledgeStore({ rootDir: '/test/knowledge' });
+      const store = new KnowledgeStore({
+        rootDir: '/test/knowledge',
+      });
       const metadata = {
         schemaVersion: 1,
         sessionId: 'mm-session-1',
@@ -1275,7 +1319,9 @@ describe('similarity', () => {
     });
 
     it('gives recency bonus to recent sessions (< 24 hours)', async () => {
-      const store = new KnowledgeStore({ rootDir: '/test/knowledge' });
+      const store = new KnowledgeStore({
+        rootDir: '/test/knowledge',
+      });
       const recentDate = new Date(
         Date.now() - 12 * 60 * 60 * 1000,
       ).toISOString();
@@ -1296,7 +1342,7 @@ describe('similarity', () => {
       vi.mocked(fs.readFile).mockResolvedValueOnce(
         JSON.stringify(
           createStepRecord({
-            tool: { name: 'mm_click', input: {} },
+            tool: { name: 'click', input: {} },
           }),
         ),
       );
@@ -1307,7 +1353,9 @@ describe('similarity', () => {
     });
 
     it('gives smaller recency bonus to moderately recent sessions (24-72 hours)', async () => {
-      const store = new KnowledgeStore({ rootDir: '/test/knowledge' });
+      const store = new KnowledgeStore({
+        rootDir: '/test/knowledge',
+      });
       const moderateDate = new Date(
         Date.now() - 48 * 60 * 60 * 1000,
       ).toISOString();
@@ -1328,7 +1376,7 @@ describe('similarity', () => {
       vi.mocked(fs.readFile).mockResolvedValueOnce(
         JSON.stringify(
           createStepRecord({
-            tool: { name: 'mm_click', input: {} },
+            tool: { name: 'click', input: {} },
           }),
         ),
       );
@@ -1339,7 +1387,9 @@ describe('similarity', () => {
     });
 
     it('sorts sessions by score then by createdAt', async () => {
-      const store = new KnowledgeStore({ rootDir: '/test/knowledge' });
+      const store = new KnowledgeStore({
+        rootDir: '/test/knowledge',
+      });
       const metadata1 = {
         schemaVersion: 1,
         sessionId: 'mm-session-1',
@@ -1372,7 +1422,7 @@ describe('similarity', () => {
           JSON.stringify(
             createStepRecord({
               sessionId: 'mm-session-1',
-              tool: { name: 'mm_click', input: {} },
+              tool: { name: 'click', input: {} },
             }),
           ),
         )
@@ -1380,7 +1430,7 @@ describe('similarity', () => {
           JSON.stringify(
             createStepRecord({
               sessionId: 'mm-session-2',
-              tool: { name: 'mm_click', input: {} },
+              tool: { name: 'click', input: {} },
             }),
           ),
         );
@@ -1393,7 +1443,9 @@ describe('similarity', () => {
 
   describe('generatePriorKnowledge similarity scoring', () => {
     it('scores steps with same screen higher', async () => {
-      const store = new KnowledgeStore({ rootDir: '/test/knowledge' });
+      const store = new KnowledgeStore({
+        rootDir: '/test/knowledge',
+      });
       const metadata = {
         schemaVersion: 1,
         sessionId: 'mm-session-1',
@@ -1433,7 +1485,9 @@ describe('similarity', () => {
     });
 
     it('scores steps with URL path overlap', async () => {
-      const store = new KnowledgeStore({ rootDir: '/test/knowledge' });
+      const store = new KnowledgeStore({
+        rootDir: '/test/knowledge',
+      });
       const metadata = {
         schemaVersion: 1,
         sessionId: 'mm-session-1',
@@ -1474,7 +1528,9 @@ describe('similarity', () => {
     });
 
     it('scores steps with testId overlap', async () => {
-      const store = new KnowledgeStore({ rootDir: '/test/knowledge' });
+      const store = new KnowledgeStore({
+        rootDir: '/test/knowledge',
+      });
       const metadata = {
         schemaVersion: 1,
         sessionId: 'mm-session-1',
@@ -1518,7 +1574,9 @@ describe('similarity', () => {
     });
 
     it('scores steps with a11y node overlap', async () => {
-      const store = new KnowledgeStore({ rootDir: '/test/knowledge' });
+      const store = new KnowledgeStore({
+        rootDir: '/test/knowledge',
+      });
       const metadata = {
         schemaVersion: 1,
         sessionId: 'mm-session-1',
@@ -1563,7 +1621,9 @@ describe('similarity', () => {
     });
 
     it('scores actionable tools higher than discovery tools', async () => {
-      const store = new KnowledgeStore({ rootDir: '/test/knowledge' });
+      const store = new KnowledgeStore({
+        rootDir: '/test/knowledge',
+      });
       const metadata = {
         schemaVersion: 1,
         sessionId: 'mm-session-1',
@@ -1573,7 +1633,7 @@ describe('similarity', () => {
         launch: { stateMode: 'default' },
       };
       const clickStep = createStepRecord({
-        tool: { name: 'mm_click', input: { testId: 'send-btn' } },
+        tool: { name: 'click', input: { testId: 'send-btn' } },
         observation: {
           state: { currentScreen: 'home' },
           testIds: [{ testId: 'send-btn', tag: 'button', visible: true }],
@@ -1601,12 +1661,14 @@ describe('similarity', () => {
 
       expect(result).toBeDefined();
       if (result?.similarSteps.length) {
-        expect(result.similarSteps[0].tool).toBe('mm_click');
+        expect(result.similarSteps[0].tool).toBe('click');
       }
     });
 
     it('excludes discovery tools from similarity scoring', async () => {
-      const store = new KnowledgeStore({ rootDir: '/test/knowledge' });
+      const store = new KnowledgeStore({
+        rootDir: '/test/knowledge',
+      });
       const metadata = {
         schemaVersion: 1,
         sessionId: 'mm-session-1',
@@ -1616,7 +1678,7 @@ describe('similarity', () => {
         launch: { stateMode: 'default' },
       };
       const discoveryStep = createStepRecord({
-        tool: { name: 'mm_describe_screen', input: {} },
+        tool: { name: 'describe_screen', input: {} },
         observation: {
           state: { currentScreen: 'home' },
           testIds: [{ testId: 'send-btn', tag: 'button', visible: true }],
@@ -1644,14 +1706,16 @@ describe('similarity', () => {
 
       if (result?.similarSteps.length) {
         const hasDiscoveryTool = result.similarSteps.some(
-          (s) => s.tool === 'mm_describe_screen',
+          (s) => s.tool === 'describe_screen',
         );
         expect(hasDiscoveryTool).toBe(false);
       }
     });
 
     it('returns undefined when no candidate sessions exist', async () => {
-      const store = new KnowledgeStore({ rootDir: '/test/knowledge' });
+      const store = new KnowledgeStore({
+        rootDir: '/test/knowledge',
+      });
 
       vi.mocked(fs.readdir).mockResolvedValueOnce([] as any);
 
@@ -1668,7 +1732,9 @@ describe('similarity', () => {
     });
 
     it('excludes current session from candidate sessions', async () => {
-      const store = new KnowledgeStore({ rootDir: '/test/knowledge' });
+      const store = new KnowledgeStore({
+        rootDir: '/test/knowledge',
+      });
       const metadata = {
         schemaVersion: 1,
         sessionId: 'mm-session-current',
@@ -1696,7 +1762,9 @@ describe('similarity', () => {
     });
 
     it('caps testId overlap scoring at 3 items', async () => {
-      const store = new KnowledgeStore({ rootDir: '/test/knowledge' });
+      const store = new KnowledgeStore({
+        rootDir: '/test/knowledge',
+      });
       const metadata = {
         schemaVersion: 1,
         sessionId: 'mm-session-1',
@@ -1745,7 +1813,9 @@ describe('similarity', () => {
     });
 
     it('caps a11y overlap scoring at 2 items', async () => {
-      const store = new KnowledgeStore({ rootDir: '/test/knowledge' });
+      const store = new KnowledgeStore({
+        rootDir: '/test/knowledge',
+      });
       const metadata = {
         schemaVersion: 1,
         sessionId: 'mm-session-1',
@@ -1794,7 +1864,9 @@ describe('similarity', () => {
     });
 
     it('computes confidence as ratio of score to max score', async () => {
-      const store = new KnowledgeStore({ rootDir: '/test/knowledge' });
+      const store = new KnowledgeStore({
+        rootDir: '/test/knowledge',
+      });
       const metadata = {
         schemaVersion: 1,
         sessionId: 'mm-session-1',
@@ -1804,7 +1876,7 @@ describe('similarity', () => {
         launch: { stateMode: 'default' },
       };
       const stepRecord = createStepRecord({
-        tool: { name: 'mm_click', input: { testId: 'send-btn' } },
+        tool: { name: 'click', input: { testId: 'send-btn' } },
         observation: {
           state: { currentScreen: 'send' },
           testIds: [{ testId: 'send-btn', tag: 'button', visible: true }],
@@ -1842,7 +1914,9 @@ describe('similarity', () => {
     });
 
     it('filters steps using flowTag from context', async () => {
-      const store = new KnowledgeStore({ rootDir: '/test/knowledge' });
+      const store = new KnowledgeStore({
+        rootDir: '/test/knowledge',
+      });
       const metadata = {
         schemaVersion: 1,
         sessionId: 'mm-session-1',
@@ -1875,7 +1949,9 @@ describe('similarity', () => {
     });
 
     it('does not award sameScreen bonus for unknown screens', async () => {
-      const store = new KnowledgeStore({ rootDir: '/test/knowledge' });
+      const store = new KnowledgeStore({
+        rootDir: '/test/knowledge',
+      });
       const metadata = {
         schemaVersion: 1,
         sessionId: 'mm-session-1',
@@ -1915,7 +1991,9 @@ describe('similarity', () => {
     });
 
     it('builds avoid list only for targets meeting failure threshold', async () => {
-      const store = new KnowledgeStore({ rootDir: '/test/knowledge' });
+      const store = new KnowledgeStore({
+        rootDir: '/test/knowledge',
+      });
       const metadata = {
         schemaVersion: 1,
         sessionId: 'mm-session-1',
@@ -1931,7 +2009,7 @@ describe('similarity', () => {
       }) => ({
         ...createStepRecord({
           tool: {
-            name: 'mm_click',
+            name: 'click',
             input: { testId: target.testId ?? 'unknown-btn' },
             target,
           },
@@ -1955,7 +2033,7 @@ describe('similarity', () => {
       const failedSelector = makeFailedStep({ selector: '.unstable-target' });
       const successfulStep = createStepRecord({
         tool: {
-          name: 'mm_click',
+          name: 'click',
           input: { testId: 'confirm-btn' },
           target: { testId: 'confirm-btn' },
         },
@@ -2007,7 +2085,9 @@ describe('similarity', () => {
     });
 
     it('skips suggested action when tool is not in action map', async () => {
-      const store = new KnowledgeStore({ rootDir: '/test/knowledge' });
+      const store = new KnowledgeStore({
+        rootDir: '/test/knowledge',
+      });
       const metadata = {
         schemaVersion: 1,
         sessionId: 'mm-session-1',
@@ -2018,7 +2098,7 @@ describe('similarity', () => {
       };
       const unknownToolStep = createStepRecord({
         tool: {
-          name: 'mm_unknown_tool',
+          name: 'unknown_tool',
           input: { testId: 'send-btn' },
           target: { testId: 'send-btn' },
         },
@@ -2056,7 +2136,9 @@ describe('similarity', () => {
     });
 
     it('includes a11y fallback target when testId text matches visible a11y name', async () => {
-      const store = new KnowledgeStore({ rootDir: '/test/knowledge' });
+      const store = new KnowledgeStore({
+        rootDir: '/test/knowledge',
+      });
       const metadata = {
         schemaVersion: 1,
         sessionId: 'mm-session-1',
@@ -2067,7 +2149,7 @@ describe('similarity', () => {
       };
       const actionableStep = createStepRecord({
         tool: {
-          name: 'mm_click',
+          name: 'click',
           input: { testId: 'send-button' },
           target: { testId: 'send-button' },
         },
@@ -2151,7 +2233,7 @@ describe('session', () => {
       schemaVersion: 1,
       sessionId,
       timestamp,
-      tool: { name: 'mm_click', input: { testId: 'test-btn' } },
+      tool: { name: 'click', input: { testId: 'test-btn' } },
       observation: {
         state: {
           isLoaded: true,
@@ -2176,7 +2258,9 @@ describe('session', () => {
 
   describe('getAllSessionIds', () => {
     it('returns session IDs from directories starting with mm-', async () => {
-      const store = new KnowledgeStore({ rootDir: '/test/knowledge' });
+      const store = new KnowledgeStore({
+        rootDir: '/test/knowledge',
+      });
 
       vi.mocked(fs.readdir).mockResolvedValueOnce([
         createDirent('mm-session-1'),
@@ -2208,7 +2292,9 @@ describe('session', () => {
     });
 
     it('returns empty array when directory read fails', async () => {
-      const store = new KnowledgeStore({ rootDir: '/test/knowledge' });
+      const store = new KnowledgeStore({
+        rootDir: '/test/knowledge',
+      });
 
       vi.mocked(fs.readdir).mockRejectedValueOnce(new Error('ENOENT'));
 
@@ -2218,7 +2304,9 @@ describe('session', () => {
     });
 
     it('returns empty array for empty directory', async () => {
-      const store = new KnowledgeStore({ rootDir: '/test/knowledge' });
+      const store = new KnowledgeStore({
+        rootDir: '/test/knowledge',
+      });
 
       vi.mocked(fs.readdir).mockResolvedValueOnce([] as any);
 
@@ -2230,7 +2318,9 @@ describe('session', () => {
 
   describe('session scanning limits', () => {
     it('limits sessions scanned to maxSessionsToScan (20)', async () => {
-      const store = new KnowledgeStore({ rootDir: '/test/knowledge' });
+      const store = new KnowledgeStore({
+        rootDir: '/test/knowledge',
+      });
 
       // Create 25 sessions
       const sessions = Array.from({ length: 25 }, (_, i) =>
@@ -2267,7 +2357,9 @@ describe('session', () => {
     });
 
     it('limits steps per session to maxStepsPerSession (500)', async () => {
-      const store = new KnowledgeStore({ rootDir: '/test/knowledge' });
+      const store = new KnowledgeStore({
+        rootDir: '/test/knowledge',
+      });
 
       // Create 600 steps for one session
       const stepFiles = Array.from({ length: 600 }, (_, i) => `step-${i}.json`);
@@ -2300,7 +2392,9 @@ describe('session', () => {
     });
 
     it('stops scanning when maxTotalSteps (2000) is reached', async () => {
-      const store = new KnowledgeStore({ rootDir: '/test/knowledge' });
+      const store = new KnowledgeStore({
+        rootDir: '/test/knowledge',
+      });
 
       // Create 5 sessions with 500 steps each = 2500 total
       const sessions = Array.from({ length: 5 }, (_, i) =>
@@ -2353,7 +2447,9 @@ describe('session', () => {
 
   describe('filter parameters', () => {
     it('filters sessions by flowTag', async () => {
-      const store = new KnowledgeStore({ rootDir: '/test/knowledge' });
+      const store = new KnowledgeStore({
+        rootDir: '/test/knowledge',
+      });
 
       vi.mocked(fs.readdir).mockResolvedValueOnce([
         createDirent('mm-session-send'),
@@ -2385,7 +2481,9 @@ describe('session', () => {
     });
 
     it('filters sessions by tag', async () => {
-      const store = new KnowledgeStore({ rootDir: '/test/knowledge' });
+      const store = new KnowledgeStore({
+        rootDir: '/test/knowledge',
+      });
 
       vi.mocked(fs.readdir).mockResolvedValueOnce([
         createDirent('mm-session-e2e'),
@@ -2417,7 +2515,9 @@ describe('session', () => {
     });
 
     it('filters sessions by sinceHours', async () => {
-      const store = new KnowledgeStore({ rootDir: '/test/knowledge' });
+      const store = new KnowledgeStore({
+        rootDir: '/test/knowledge',
+      });
       const now = new Date();
       const recentDate = new Date(now.getTime() - 6 * 60 * 60 * 1000);
       const oldDate = new Date(now.getTime() - 48 * 60 * 60 * 1000);
@@ -2452,7 +2552,9 @@ describe('session', () => {
     });
 
     it('combines multiple filters', async () => {
-      const store = new KnowledgeStore({ rootDir: '/test/knowledge' });
+      const store = new KnowledgeStore({
+        rootDir: '/test/knowledge',
+      });
       const recentDate = new Date(Date.now() - 6 * 60 * 60 * 1000);
       const oldDate = new Date(Date.now() - 48 * 60 * 60 * 1000);
 
@@ -2503,7 +2605,9 @@ describe('session', () => {
 
   describe('corrupted session file handling', () => {
     it('skips corrupted session metadata files', async () => {
-      const store = new KnowledgeStore({ rootDir: '/test/knowledge' });
+      const store = new KnowledgeStore({
+        rootDir: '/test/knowledge',
+      });
 
       vi.mocked(fs.readdir).mockResolvedValueOnce([
         createDirent('mm-session-valid'),
@@ -2527,7 +2631,9 @@ describe('session', () => {
     });
 
     it('skips corrupted step files during search', async () => {
-      const store = new KnowledgeStore({ rootDir: '/test/knowledge' });
+      const store = new KnowledgeStore({
+        rootDir: '/test/knowledge',
+      });
 
       vi.mocked(fs.readdir)
         .mockResolvedValueOnce([createDirent('mm-session-1')] as any)
@@ -2559,7 +2665,9 @@ describe('session', () => {
     });
 
     it('handles missing step files gracefully', async () => {
-      const store = new KnowledgeStore({ rootDir: '/test/knowledge' });
+      const store = new KnowledgeStore({
+        rootDir: '/test/knowledge',
+      });
 
       vi.mocked(fs.readdir)
         .mockResolvedValueOnce([createDirent('mm-session-1')] as any)
@@ -2582,7 +2690,9 @@ describe('session', () => {
     });
 
     it('handles steps directory not existing', async () => {
-      const store = new KnowledgeStore({ rootDir: '/test/knowledge' });
+      const store = new KnowledgeStore({
+        rootDir: '/test/knowledge',
+      });
 
       vi.mocked(fs.readdir)
         .mockResolvedValueOnce([createDirent('mm-session-1')] as any)
@@ -2605,7 +2715,9 @@ describe('session', () => {
 
   describe('empty session directory', () => {
     it('returns empty results for empty knowledge root', async () => {
-      const store = new KnowledgeStore({ rootDir: '/test/knowledge' });
+      const store = new KnowledgeStore({
+        rootDir: '/test/knowledge',
+      });
 
       vi.mocked(fs.readdir).mockResolvedValueOnce([] as any);
 
@@ -2615,7 +2727,9 @@ describe('session', () => {
     });
 
     it('returns empty search results for empty knowledge root', async () => {
-      const store = new KnowledgeStore({ rootDir: '/test/knowledge' });
+      const store = new KnowledgeStore({
+        rootDir: '/test/knowledge',
+      });
 
       vi.mocked(fs.readdir).mockResolvedValueOnce([] as any);
 
@@ -2625,7 +2739,9 @@ describe('session', () => {
     });
 
     it('returns empty getLastSteps for session with no steps', async () => {
-      const store = new KnowledgeStore({ rootDir: '/test/knowledge' });
+      const store = new KnowledgeStore({
+        rootDir: '/test/knowledge',
+      });
 
       vi.mocked(fs.readdir)
         .mockResolvedValueOnce([createDirent('mm-session-1')] as any)
@@ -2641,7 +2757,9 @@ describe('session', () => {
     });
 
     it('returns empty summarizeSession for session with no steps', async () => {
-      const store = new KnowledgeStore({ rootDir: '/test/knowledge' });
+      const store = new KnowledgeStore({
+        rootDir: '/test/knowledge',
+      });
 
       vi.mocked(fs.readdir).mockResolvedValueOnce([] as any);
 
@@ -2654,7 +2772,9 @@ describe('session', () => {
 
   describe('resolveSessionIds with filters', () => {
     it('includes sessions without metadata when filtering', async () => {
-      const store = new KnowledgeStore({ rootDir: '/test/knowledge' });
+      const store = new KnowledgeStore({
+        rootDir: '/test/knowledge',
+      });
 
       vi.mocked(fs.readdir).mockResolvedValueOnce([
         createDirent('mm-session-with-metadata'),
@@ -2682,7 +2802,9 @@ describe('session', () => {
     });
 
     it('returns empty array for scope current without sessionId', async () => {
-      const store = new KnowledgeStore({ rootDir: '/test/knowledge' });
+      const store = new KnowledgeStore({
+        rootDir: '/test/knowledge',
+      });
 
       const result = await store.resolveSessionIds('current', undefined);
 
@@ -2690,7 +2812,9 @@ describe('session', () => {
     });
 
     it('returns specific sessionId for scope object', async () => {
-      const store = new KnowledgeStore({ rootDir: '/test/knowledge' });
+      const store = new KnowledgeStore({
+        rootDir: '/test/knowledge',
+      });
 
       const result = await store.resolveSessionIds(
         { sessionId: 'specific-session' },
@@ -2715,7 +2839,9 @@ describe('session', () => {
     });
 
     it('returns true when knowledge store is initialized', () => {
-      const store = new KnowledgeStore({ rootDir: '/test/knowledge' });
+      const store = new KnowledgeStore({
+        rootDir: '/test/knowledge',
+      });
       setKnowledgeStore(store);
 
       const result = hasKnowledgeStore();
@@ -2728,7 +2854,9 @@ describe('session', () => {
     let store: KnowledgeStore;
 
     beforeEach(() => {
-      store = new KnowledgeStore({ rootDir: '/test/knowledge' });
+      store = new KnowledgeStore({
+        rootDir: '/test/knowledge',
+      });
       setKnowledgeStore(store);
     });
 
@@ -2739,7 +2867,7 @@ describe('session', () => {
     it('recordStep delegates to underlying KnowledgeStore instance', async () => {
       const params = {
         sessionId: 'test-session',
-        toolName: 'mm_click',
+        toolName: 'click',
         observation: createObservation(),
         outcome: { ok: true } as StepRecordOutcome,
       };
@@ -2757,7 +2885,7 @@ describe('session', () => {
 
       const params = {
         sessionId: 'test-session',
-        toolName: 'mm_click',
+        toolName: 'click',
         observation: createObservation(),
         outcome: { ok: true } as StepRecordOutcome,
       };
@@ -2771,7 +2899,7 @@ describe('session', () => {
       const mockSteps = [
         {
           timestamp: '2024-01-15T10:30:00.000Z',
-          tool: 'mm_click',
+          tool: 'click',
           screen: 'home' as const,
           snippet: 'Clicked button',
         },
@@ -2809,7 +2937,7 @@ describe('session', () => {
       const mockResults = [
         {
           timestamp: '2024-01-15T10:30:00.000Z',
-          tool: 'mm_click',
+          tool: 'click',
           screen: 'home' as const,
           snippet: 'Clicked send button',
         },
@@ -2845,7 +2973,7 @@ describe('session', () => {
       const mockSummary = {
         sessionId: 'test-session',
         stepCount: 5,
-        recipe: [{ stepNumber: 1, tool: 'mm_click', notes: 'Clicked send' }],
+        recipe: [{ stepNumber: 1, tool: 'click', notes: 'Clicked send' }],
       };
 
       vi.spyOn(store, 'summarizeSession').mockResolvedValueOnce(mockSummary);
diff --git a/src/mcp-server/knowledge-store.ts b/src/knowledge-store/knowledge-store.ts
similarity index 97%
rename from src/mcp-server/knowledge-store.ts
rename to src/knowledge-store/knowledge-store.ts
index fbf1ec5..fecbd8c 100644
--- a/src/mcp-server/knowledge-store.ts
+++ b/src/knowledge-store/knowledge-store.ts
@@ -8,6 +8,7 @@ import {
   tokenize,
   tokenizeIdentifier,
 } from './tokenization.js';
+import type { ExtensionState } from '../capabilities/types.js';
 import type {
   StepRecord,
   StepRecordTool,
@@ -28,14 +29,12 @@ import type {
   PriorKnowledgeAvoid,
   PriorKnowledgeRelatedSession,
   PriorKnowledgeTarget,
-} from './types';
+} from '../tools/types';
+import { generateFilesafeTimestamp, debugWarn } from '../utils';
 import {
-  generateFilesafeTimestamp,
   isSensitiveField,
   SENSITIVE_FIELD_PATTERNS,
-  debugWarn,
-} from './utils';
-import type { ExtensionState } from '../capabilities/types.js';
+} from './utils/redaction.js';
 
 const KNOWLEDGE_ROOT = 'test-artifacts/llm-knowledge';
 const SCHEMA_VERSION = 1;
@@ -96,10 +95,6 @@ export type KnowledgeStoreConfig = {
    * Prefix for session IDs (default: 'mm-')
    */
   sessionIdPrefix?: string;
-  /**
-   * Prefix for tool names (default: 'mm')
-   */
-  toolPrefix?: string;
 };
 
 /**
@@ -128,8 +123,6 @@ export class KnowledgeStore {
 
   readonly #sessionIdPrefix: string;
 
-  readonly #toolPrefix: string;
-
   readonly #sessionMetadataCache: Map<string, SessionMetadata | null> =
     new Map();
 
@@ -151,30 +144,28 @@ export class KnowledgeStore {
     this.#knowledgeRoot =
       config.rootDir ?? path.join(process.cwd(), KNOWLEDGE_ROOT);
     this.#sessionIdPrefix = config.sessionIdPrefix ?? 'mm-';
-    this.#toolPrefix = config.toolPrefix ?? 'mm';
 
-    const prefix = this.#toolPrefix;
     this.#actionableTools = [
-      `${prefix}_click`,
-      `${prefix}_type`,
-      `${prefix}_wait_for`,
-      `${prefix}_navigate`,
-      `${prefix}_wait_for_notification`,
+      'click',
+      'type',
+      'wait_for',
+      'navigate',
+      'wait_for_notification',
     ];
 
     this.#toolActionMap = {
-      [`${prefix}_click`]: 'click',
-      [`${prefix}_type`]: 'type',
-      [`${prefix}_wait_for`]: 'wait_for',
-      [`${prefix}_navigate`]: 'navigate',
-      [`${prefix}_wait_for_notification`]: 'wait_for_notification',
+      click: 'click',
+      type: 'type',
+      wait_for: 'wait_for',
+      navigate: 'navigate',
+      wait_for_notification: 'wait_for_notification',
     };
 
     this.#discoveryTools = [
-      `${prefix}_describe_screen`,
-      `${prefix}_list_testids`,
-      `${prefix}_accessibility_snapshot`,
-      `${prefix}_get_state`,
+      'describe_screen',
+      'list_testids',
+      'accessibility_snapshot',
+      'get_state',
     ];
   }
 
@@ -448,15 +439,8 @@ export class KnowledgeStore {
   ): string[] {
     const labels: string[] = [];
 
-    const navigationTools = [
-      `${this.#toolPrefix}_navigate`,
-      `${this.#toolPrefix}_wait_for_notification`,
-    ];
-    const interactionTools = [
-      `${this.#toolPrefix}_click`,
-      `${this.#toolPrefix}_type`,
-      `${this.#toolPrefix}_wait_for`,
-    ];
+    const navigationTools = ['navigate', 'wait_for_notification'];
+    const interactionTools = ['click', 'type', 'wait_for'];
 
     if (this.#discoveryTools.includes(toolName)) {
       labels.push('discovery');
@@ -825,7 +809,7 @@ export class KnowledgeStore {
     let textRedacted = false;
     let textLength: number | undefined;
 
-    const typeToolName = `${this.#toolPrefix}_type`;
+    const typeToolName = 'type';
 
     for (const [key, value] of Object.entries(input)) {
       if (toolName === typeToolName && key === 'text') {
diff --git a/src/mcp-server/tokenization.test.ts b/src/knowledge-store/tokenization.test.ts
similarity index 99%
rename from src/mcp-server/tokenization.test.ts
rename to src/knowledge-store/tokenization.test.ts
index 64c33d5..939ac1c 100644
--- a/src/mcp-server/tokenization.test.ts
+++ b/src/knowledge-store/tokenization.test.ts
@@ -56,7 +56,7 @@ describe('tokenization', () => {
       expect(buttonCount).toBe(1);
     });
 
-    it('handles special MCP/extension stopwords', () => {
+    it('handles special tool/extension stopwords', () => {
       const tokens = tokenize('mm mcp lw test flow');
       expect(tokens).not.toContain('mm');
       expect(tokens).not.toContain('mcp');
diff --git a/src/mcp-server/tokenization.ts b/src/knowledge-store/tokenization.ts
similarity index 100%
rename from src/mcp-server/tokenization.ts
rename to src/knowledge-store/tokenization.ts
diff --git a/src/mcp-server/utils/redaction.test.ts b/src/knowledge-store/utils/redaction.test.ts
similarity index 100%
rename from src/mcp-server/utils/redaction.test.ts
rename to src/knowledge-store/utils/redaction.test.ts
diff --git a/src/mcp-server/utils/redaction.ts b/src/knowledge-store/utils/redaction.ts
similarity index 100%
rename from src/mcp-server/utils/redaction.ts
rename to src/knowledge-store/utils/redaction.ts
diff --git a/src/launcher/console-error-buffer.test.ts b/src/launcher/console-error-buffer.test.ts
index c34747b..b824862 100644
--- a/src/launcher/console-error-buffer.test.ts
+++ b/src/launcher/console-error-buffer.test.ts
@@ -1,7 +1,7 @@
 import { describe, it, expect } from 'vitest';
 
-import { ConsoleErrorBuffer } from './console-error-buffer';
-import type { ConsoleErrorEntry } from './console-error-buffer';
+import { ConsoleErrorBuffer } from './console-error-buffer.js';
+import type { ConsoleErrorEntry } from './console-error-buffer.js';
 
 describe('ConsoleErrorBuffer', () => {
   describe('constructor', () => {
diff --git a/src/launcher/retry.test.ts b/src/launcher/retry.test.ts
index 06f7ef6..4cf4dbf 100644
--- a/src/launcher/retry.test.ts
+++ b/src/launcher/retry.test.ts
@@ -1,6 +1,6 @@
 import { describe, it, expect, vi, beforeEach, afterEach } from 'vitest';
 
-import { delay, retryUntil } from './retry';
+import { delay, retryUntil } from './retry.js';
 
 describe('retry', () => {
   beforeEach(() => {
diff --git a/src/mcp-server/server.test.ts b/src/mcp-server/server.test.ts
deleted file mode 100644
index f6ff8fa..0000000
--- a/src/mcp-server/server.test.ts
+++ /dev/null
@@ -1,677 +0,0 @@
-/* eslint-disable @typescript-eslint/naming-convention */
-import { Server } from '@modelcontextprotocol/sdk/server/index.js';
-import { StdioServerTransport } from '@modelcontextprotocol/sdk/server/stdio.js';
-import { describe, it, expect, vi, beforeEach, afterEach } from 'vitest';
-import type { MockInstance } from 'vitest';
-
-import { createMcpServer } from './server.js';
-import type { McpServerConfig } from './server.js';
-import * as sessionManagerModule from './session-manager.js';
-import { flushPromises } from './test-utils';
-import * as batchModule from './tools/batch.js';
-import * as definitionsModule from './tools/definitions.js';
-import { ErrorCodes } from './types';
-
-vi.mock('@modelcontextprotocol/sdk/server/index.js');
-vi.mock('@modelcontextprotocol/sdk/server/stdio.js');
-vi.mock('./session-manager.js');
-vi.mock('./tools/definitions.js');
-vi.mock('./tools/batch.js');
-
-describe('createMcpServer', () => {
-  let processExitSpy: MockInstance;
-  let processOnSpy: MockInstance;
-  let consoleErrorSpy: MockInstance;
-  let signalHandlers: Map<string, () => void>;
-  let mockSetRequestHandler: ReturnType<typeof vi.fn>;
-  let mockConnect: ReturnType<typeof vi.fn>;
-  let mockClose: ReturnType<typeof vi.fn>;
-
-  const mockToolDefinitions = [
-    { name: 'mm_click', description: 'Click element', inputSchema: {} },
-    { name: 'mm_type', description: 'Type text', inputSchema: {} },
-  ];
-
-  const mockToolHandlers = {
-    mm_click: vi
-      .fn()
-      .mockResolvedValue({ ok: true, result: { clicked: true } }),
-    mm_type: vi.fn().mockResolvedValue({ ok: true, result: { typed: true } }),
-  };
-
-  beforeEach(() => {
-    vi.clearAllMocks();
-
-    mockSetRequestHandler = vi.fn();
-    mockConnect = vi.fn().mockResolvedValue(undefined);
-    mockClose = vi.fn().mockResolvedValue(undefined);
-
-    vi.mocked(Server).mockImplementation(
-      () =>
-        ({
-          setRequestHandler: mockSetRequestHandler,
-          connect: mockConnect,
-          close: mockClose,
-        }) as unknown as InstanceType<typeof Server>,
-    );
-
-    vi.mocked(StdioServerTransport).mockImplementation(
-      () =>
-        ({
-          type: 'stdio',
-        }) as unknown as InstanceType<typeof StdioServerTransport>,
-    );
-
-    vi.mocked(sessionManagerModule.getSessionManager).mockReturnValue({
-      getSessionId: vi.fn().mockReturnValue('test-session-123'),
-      cleanup: vi.fn().mockResolvedValue(true),
-    } as unknown as ReturnType<typeof sessionManagerModule.getSessionManager>);
-    vi.mocked(sessionManagerModule.hasSessionManager).mockReturnValue(true);
-
-    vi.mocked(definitionsModule.getToolDefinitions).mockReturnValue(
-      mockToolDefinitions,
-    );
-    vi.mocked(definitionsModule.buildToolHandlersRecord).mockReturnValue(
-      mockToolHandlers,
-    );
-    vi.mocked(definitionsModule.getToolHandler).mockReturnValue(
-      vi.fn().mockResolvedValue({ ok: true, result: {} }),
-    );
-    vi.mocked(definitionsModule.safeValidateToolInput).mockReturnValue({
-      success: true,
-      data: {},
-    });
-    (definitionsModule as { TOOL_PREFIX: string }).TOOL_PREFIX = 'mm';
-
-    vi.mocked(batchModule.setToolRegistry).mockImplementation(() => {});
-
-    signalHandlers = new Map();
-    processOnSpy = vi
-      .spyOn(process, 'on')
-      .mockImplementation(
-        (event: string | symbol, handler: (...args: unknown[]) => void) => {
-          signalHandlers.set(String(event), handler as () => void);
-          return process;
-        },
-      );
-
-    processExitSpy = vi
-      .spyOn(process, 'exit')
-      .mockImplementation(
-        (_code?: string | number | null | undefined): never => {
-          throw new Error(`process.exit(${_code})`);
-        },
-      );
-
-    consoleErrorSpy = vi.spyOn(console, 'error').mockImplementation(() => {});
-  });
-
-  afterEach(() => {
-    vi.restoreAllMocks();
-  });
-
-  describe('factory function', () => {
-    it('creates server with required config', () => {
-      const config: McpServerConfig = {
-        name: 'test-server',
-        version: '1.0.0',
-      };
-
-      const server = createMcpServer(config);
-
-      expect(server).toBeDefined();
-      expect(server.start).toBeInstanceOf(Function);
-      expect(server.stop).toBeInstanceOf(Function);
-      expect(server.getServer).toBeInstanceOf(Function);
-      expect(server.getToolDefinitions).toBeInstanceOf(Function);
-      expect(server.getToolPrefix).toBeInstanceOf(Function);
-    });
-
-    it('creates Server with name and version', () => {
-      const config: McpServerConfig = {
-        name: 'my-extension',
-        version: '2.0.0',
-      };
-
-      createMcpServer(config);
-
-      expect(Server).toHaveBeenCalledWith(
-        { name: 'my-extension', version: '2.0.0' },
-        { capabilities: { tools: {} } },
-      );
-    });
-
-    it('registers ListTools and CallTool request handlers', () => {
-      createMcpServer({
-        name: 'test-server',
-        version: '1.0.0',
-      });
-
-      expect(mockSetRequestHandler).toHaveBeenCalledTimes(2);
-    });
-
-    it('registers signal handlers for SIGINT and SIGTERM', () => {
-      createMcpServer({
-        name: 'test-server',
-        version: '1.0.0',
-      });
-
-      expect(processOnSpy).toHaveBeenCalledWith('SIGINT', expect.any(Function));
-      expect(processOnSpy).toHaveBeenCalledWith(
-        'SIGTERM',
-        expect.any(Function),
-      );
-    });
-  });
-
-  describe('getServer()', () => {
-    it('returns the underlying MCP Server instance', () => {
-      const server = createMcpServer({ name: 'test', version: '1.0.0' });
-
-      const mcpServer = server.getServer();
-
-      expect(mcpServer).toBeDefined();
-      expect(mcpServer.setRequestHandler).toBeInstanceOf(Function);
-      expect(mcpServer.connect).toBeInstanceOf(Function);
-      expect(mcpServer.close).toBeInstanceOf(Function);
-    });
-  });
-
-  describe('getToolDefinitions()', () => {
-    it('returns all tool definitions', () => {
-      const server = createMcpServer({ name: 'test', version: '1.0.0' });
-
-      const toolDefs = server.getToolDefinitions();
-
-      expect(toolDefs).toStrictEqual(mockToolDefinitions);
-    });
-  });
-
-  describe('getToolPrefix()', () => {
-    it('returns the tool prefix', () => {
-      const server = createMcpServer({ name: 'test', version: '1.0.0' });
-
-      const prefix = server.getToolPrefix();
-
-      expect(prefix).toBe('mm');
-    });
-  });
-
-  describe('start()', () => {
-    it('creates StdioServerTransport and connects', async () => {
-      const server = createMcpServer({ name: 'test', version: '1.0.0' });
-
-      await server.start();
-
-      expect(StdioServerTransport).toHaveBeenCalled();
-      expect(mockConnect).toHaveBeenCalled();
-    });
-
-    it('logs server startup message', async () => {
-      const customLogger = vi.fn();
-      const server = createMcpServer({
-        name: 'my-server',
-        version: '2.0.0',
-        logger: customLogger,
-      });
-
-      await server.start();
-
-      expect(customLogger).toHaveBeenCalledWith(
-        'my-server MCP Server v2.0.0 running on stdio',
-      );
-    });
-
-    it('uses console.error as default logger', async () => {
-      const server = createMcpServer({
-        name: 'test-server',
-        version: '1.0.0',
-      });
-
-      await server.start();
-
-      expect(consoleErrorSpy).toHaveBeenCalledWith(
-        'test-server MCP Server v1.0.0 running on stdio',
-      );
-    });
-  });
-
-  describe('stop()', () => {
-    it('closes server when transport exists', async () => {
-      const server = createMcpServer({ name: 'test', version: '1.0.0' });
-      await server.start();
-
-      await server.stop();
-
-      expect(mockClose).toHaveBeenCalled();
-    });
-
-    it('does nothing when transport does not exist', async () => {
-      const server = createMcpServer({ name: 'test', version: '1.0.0' });
-
-      await server.stop();
-
-      expect(mockClose).not.toHaveBeenCalled();
-    });
-  });
-
-  describe('ListToolsRequestSchema handler', () => {
-    it('returns tool definitions', async () => {
-      createMcpServer({ name: 'test', version: '1.0.0' });
-
-      const listToolsHandler = mockSetRequestHandler.mock.calls[0][1];
-
-      const result = await listToolsHandler();
-
-      expect(result).toStrictEqual({
-        tools: mockToolDefinitions,
-      });
-    });
-  });
-
-  describe('CallToolRequestSchema handler', () => {
-    let callToolHandler: (
-      request: {
-        params: { name: string; arguments?: Record<string, unknown> };
-      },
-      extra?: { signal?: AbortSignal },
-    ) => Promise<unknown>;
-
-    beforeEach(() => {
-      createMcpServer({ name: 'test', version: '1.0.0' });
-      callToolHandler = mockSetRequestHandler.mock.calls[1][1];
-    });
-
-    it('returns error for unknown tool', async () => {
-      const result = await callToolHandler({
-        params: { name: 'mm_unknown', arguments: {} },
-      });
-
-      expect(result).toMatchObject({
-        content: [{ type: 'text' }],
-        isError: true,
-      });
-
-      const responseText = JSON.parse(
-        (result as { content: [{ text: string }] }).content[0].text,
-      );
-      expect(responseText.error.code).toBe(ErrorCodes.MM_INVALID_INPUT);
-      expect(responseText.error.message).toContain('Unknown tool: mm_unknown');
-    });
-
-    it('returns error for invalid input', async () => {
-      vi.mocked(definitionsModule.safeValidateToolInput).mockReturnValueOnce({
-        success: false,
-        error: 'name: Required',
-      });
-
-      const result = await callToolHandler({
-        params: { name: 'mm_click', arguments: {} },
-      });
-
-      expect(result).toMatchObject({
-        content: [{ type: 'text' }],
-        isError: true,
-      });
-
-      const responseText = JSON.parse(
-        (result as { content: [{ text: string }] }).content[0].text,
-      );
-      expect(responseText.error.code).toBe(ErrorCodes.MM_INVALID_INPUT);
-      expect(responseText.error.message).toContain(
-        'Invalid input: name: Required',
-      );
-    });
-
-    it('returns error when no handler registered', async () => {
-      vi.mocked(definitionsModule.getToolHandler).mockReturnValueOnce(
-        undefined,
-      );
-
-      const result = await callToolHandler({
-        params: { name: 'mm_click', arguments: {} },
-      });
-
-      expect(result).toMatchObject({
-        content: [{ type: 'text' }],
-        isError: true,
-      });
-
-      const responseText = JSON.parse(
-        (result as { content: [{ text: string }] }).content[0].text,
-      );
-      expect(responseText.error.code).toBe(ErrorCodes.MM_INVALID_INPUT);
-      expect(responseText.error.message).toContain(
-        'No handler registered for tool: mm_click',
-      );
-    });
-
-    it('executes handler and returns success response', async () => {
-      const mockHandler = vi
-        .fn()
-        .mockResolvedValue({ ok: true, result: { clicked: true } });
-      vi.mocked(definitionsModule.getToolHandler).mockReturnValueOnce(
-        mockHandler,
-      );
-
-      const result = await callToolHandler({
-        params: { name: 'mm_click', arguments: { testId: 'btn' } },
-      });
-
-      expect(result).toMatchObject({
-        content: [{ type: 'text' }],
-        isError: false,
-      });
-
-      const responseText = JSON.parse(
-        (result as { content: [{ text: string }] }).content[0].text,
-      );
-      expect(responseText.ok).toBe(true);
-      expect(responseText.result.clicked).toBe(true);
-    });
-
-    it('passes signal to handler', async () => {
-      const mockHandler = vi.fn().mockResolvedValue({ ok: true, result: {} });
-      vi.mocked(definitionsModule.getToolHandler).mockReturnValueOnce(
-        mockHandler,
-      );
-      const mockSignal = new AbortController().signal;
-
-      await callToolHandler(
-        { params: { name: 'mm_click', arguments: {} } },
-        { signal: mockSignal },
-      );
-
-      expect(mockHandler).toHaveBeenCalledWith(
-        expect.any(Object),
-        expect.objectContaining({ signal: mockSignal }),
-      );
-    });
-
-    it('returns isError: true when handler returns ok: false', async () => {
-      const mockHandler = vi.fn().mockResolvedValue({
-        ok: false,
-        error: { code: 'MM_CLICK_FAILED', message: 'Click failed' },
-      });
-      vi.mocked(definitionsModule.getToolHandler).mockReturnValueOnce(
-        mockHandler,
-      );
-
-      const result = await callToolHandler({
-        params: { name: 'mm_click', arguments: {} },
-      });
-
-      expect(result).toMatchObject({
-        isError: true,
-      });
-    });
-
-    it('includes sessionId in error response when session manager available', async () => {
-      vi.mocked(sessionManagerModule.hasSessionManager).mockReturnValue(true);
-      vi.mocked(sessionManagerModule.getSessionManager).mockReturnValue({
-        getSessionId: vi.fn().mockReturnValue('session-abc'),
-        cleanup: vi.fn(),
-      } as unknown as ReturnType<
-        typeof sessionManagerModule.getSessionManager
-      >);
-
-      const result = await callToolHandler({
-        params: { name: 'mm_unknown', arguments: {} },
-      });
-
-      const responseText = JSON.parse(
-        (result as { content: [{ text: string }] }).content[0].text,
-      );
-      expect(responseText.meta.sessionId).toBe('session-abc');
-    });
-
-    it('does not include sessionId when no session manager', async () => {
-      vi.mocked(sessionManagerModule.hasSessionManager).mockReturnValue(false);
-
-      const result = await callToolHandler({
-        params: { name: 'mm_unknown', arguments: {} },
-      });
-
-      const responseText = JSON.parse(
-        (result as { content: [{ text: string }] }).content[0].text,
-      );
-      expect(responseText.meta.sessionId).toBeUndefined();
-    });
-  });
-
-  describe('signal handlers', () => {
-    it('calls cleanup on SIGINT', async () => {
-      const onCleanup = vi.fn().mockResolvedValue(undefined);
-      createMcpServer({
-        name: 'test',
-        version: '1.0.0',
-        onCleanup,
-      });
-
-      const sigintHandler = signalHandlers.get('SIGINT');
-      expect(sigintHandler).toBeDefined();
-
-      try {
-        sigintHandler?.();
-        await flushPromises();
-      } catch (e) {
-        expect((e as Error).message).toBe('process.exit(0)');
-      }
-
-      expect(onCleanup).toHaveBeenCalled();
-    });
-
-    it('calls cleanup on SIGTERM', async () => {
-      const onCleanup = vi.fn().mockResolvedValue(undefined);
-      createMcpServer({
-        name: 'test',
-        version: '1.0.0',
-        onCleanup,
-      });
-
-      const sigtermHandler = signalHandlers.get('SIGTERM');
-      expect(sigtermHandler).toBeDefined();
-
-      try {
-        sigtermHandler?.();
-        await flushPromises();
-      } catch (e) {
-        expect((e as Error).message).toBe('process.exit(0)');
-      }
-
-      expect(onCleanup).toHaveBeenCalled();
-    });
-
-    it('cleans up session manager if available', async () => {
-      const mockCleanup = vi.fn().mockResolvedValue(true);
-      vi.mocked(sessionManagerModule.hasSessionManager).mockReturnValue(true);
-      vi.mocked(sessionManagerModule.getSessionManager).mockReturnValue({
-        getSessionId: vi.fn().mockReturnValue('session-abc'),
-        cleanup: mockCleanup,
-      } as unknown as ReturnType<
-        typeof sessionManagerModule.getSessionManager
-      >);
-
-      createMcpServer({
-        name: 'test',
-        version: '1.0.0',
-      });
-
-      const sigintHandler = signalHandlers.get('SIGINT');
-
-      sigintHandler?.();
-      await flushPromises();
-
-      expect(mockCleanup).toHaveBeenCalled();
-    });
-
-    it('does not call session cleanup when no session manager', async () => {
-      const mockCleanup = vi.fn();
-      vi.mocked(sessionManagerModule.hasSessionManager).mockReturnValue(false);
-
-      createMcpServer({
-        name: 'test',
-        version: '1.0.0',
-      });
-
-      const sigintHandler = signalHandlers.get('SIGINT');
-
-      sigintHandler?.();
-      await flushPromises();
-
-      expect(mockCleanup).not.toHaveBeenCalled();
-    });
-
-    it('prevents duplicate cleanup calls', async () => {
-      const onCleanup = vi.fn().mockResolvedValue(undefined);
-      createMcpServer({
-        name: 'test',
-        version: '1.0.0',
-        onCleanup,
-      });
-
-      const sigintHandler = signalHandlers.get('SIGINT');
-
-      sigintHandler?.();
-      sigintHandler?.();
-      await flushPromises();
-
-      expect(onCleanup).toHaveBeenCalledTimes(1);
-    });
-
-    it('logs cleanup message', async () => {
-      const customLogger = vi.fn();
-      createMcpServer({
-        name: 'test',
-        version: '1.0.0',
-        logger: customLogger,
-      });
-
-      const sigintHandler = signalHandlers.get('SIGINT');
-
-      sigintHandler?.();
-      await flushPromises();
-
-      expect(customLogger).toHaveBeenCalledWith(
-        'Received SIGINT, cleaning up...',
-      );
-    });
-
-    it('logs cleanup errors', async () => {
-      const customLogger = vi.fn();
-      const onCleanup = vi.fn().mockRejectedValue(new Error('Cleanup failed'));
-      createMcpServer({
-        name: 'test',
-        version: '1.0.0',
-        onCleanup,
-        logger: customLogger,
-      });
-
-      const sigintHandler = signalHandlers.get('SIGINT');
-
-      sigintHandler?.();
-      await flushPromises();
-
-      expect(customLogger).toHaveBeenCalledWith(
-        expect.stringContaining('Cleanup error:'),
-      );
-    });
-
-    it('exits with code 0 after cleanup', async () => {
-      createMcpServer({
-        name: 'test',
-        version: '1.0.0',
-      });
-
-      const sigintHandler = signalHandlers.get('SIGINT');
-
-      try {
-        sigintHandler?.();
-        await flushPromises();
-      } catch (e) {
-        expect((e as Error).message).toBe('process.exit(0)');
-      }
-
-      expect(processExitSpy).toHaveBeenCalledWith(0);
-    });
-
-    it('handles signal error gracefully', async () => {
-      const customLogger = vi.fn();
-      const onCleanup = vi.fn().mockImplementation(() => {
-        throw new Error('Sync error');
-      });
-      createMcpServer({
-        name: 'test',
-        version: '1.0.0',
-        onCleanup,
-        logger: customLogger,
-      });
-
-      const sigintHandler = signalHandlers.get('SIGINT');
-
-      sigintHandler?.();
-      await flushPromises();
-
-      expect(customLogger).toHaveBeenCalledWith(
-        expect.stringContaining('Cleanup error:'),
-      );
-    });
-  });
-
-  describe('tool registry', () => {
-    it('sets tool registry with handlers', () => {
-      createMcpServer({ name: 'test', version: '1.0.0' });
-
-      expect(batchModule.setToolRegistry).toHaveBeenCalledWith(
-        mockToolHandlers,
-      );
-    });
-  });
-
-  describe('createToolErrorResponse helper', () => {
-    it('formats error with sessionId from session manager', async () => {
-      vi.mocked(sessionManagerModule.hasSessionManager).mockReturnValue(true);
-      vi.mocked(sessionManagerModule.getSessionManager).mockReturnValue({
-        getSessionId: vi.fn().mockReturnValue('my-session'),
-        cleanup: vi.fn(),
-      } as unknown as ReturnType<
-        typeof sessionManagerModule.getSessionManager
-      >);
-
-      createMcpServer({ name: 'test', version: '1.0.0' });
-      const callToolHandler = mockSetRequestHandler.mock.calls[1][1];
-
-      const result = await callToolHandler({
-        params: { name: 'mm_invalid', arguments: {} },
-      });
-
-      const responseText = JSON.parse(
-        (result as { content: [{ text: string }] }).content[0].text,
-      );
-      expect(responseText.meta.sessionId).toBe('my-session');
-      expect(responseText.meta.timestamp).toBeDefined();
-      expect(responseText.meta.durationMs).toBeGreaterThanOrEqual(0);
-    });
-
-    it('includes error details when provided', async () => {
-      vi.mocked(definitionsModule.safeValidateToolInput).mockReturnValueOnce({
-        success: false,
-        error: 'validation error',
-      });
-
-      createMcpServer({ name: 'test', version: '1.0.0' });
-      const callToolHandler = mockSetRequestHandler.mock.calls[1][1];
-
-      const result = await callToolHandler({
-        params: { name: 'mm_click', arguments: { invalid: 'arg' } },
-      });
-
-      const responseText = JSON.parse(
-        (result as { content: [{ text: string }] }).content[0].text,
-      );
-      expect(responseText.error.details).toStrictEqual({
-        providedArgs: { invalid: 'arg' },
-      });
-    });
-  });
-});
diff --git a/src/mcp-server/server.ts b/src/mcp-server/server.ts
deleted file mode 100644
index 1c3411c..0000000
--- a/src/mcp-server/server.ts
+++ /dev/null
@@ -1,237 +0,0 @@
-#!/usr/bin/env node
-/* eslint-disable @typescript-eslint/explicit-function-return-type */
-import { Server } from '@modelcontextprotocol/sdk/server/index.js';
-import { StdioServerTransport } from '@modelcontextprotocol/sdk/server/stdio.js';
-import {
-  CallToolRequestSchema,
-  ListToolsRequestSchema,
-} from '@modelcontextprotocol/sdk/types.js';
-
-import { getSessionManager, hasSessionManager } from './session-manager.js';
-import { setToolRegistry } from './tools/batch.js';
-import {
-  getToolDefinitions,
-  getToolHandler,
-  safeValidateToolInput,
-  buildToolHandlersRecord,
-  TOOL_PREFIX,
-} from './tools/definitions.js';
-import type { ToolDefinition } from './tools/definitions.js';
-import { ErrorCodes } from './types';
-import { createErrorResponse } from './utils';
-
-export type McpServerConfig = {
-  name: string;
-  version: string;
-  onCleanup?: () => Promise<void>;
-  logger?: (message: string) => void;
-};
-
-/**
- * Create a standardized error response for tool execution failures.
- *
- * @param code The error code from ErrorCodes enum
- * @param message Human-readable error message
- * @param details Optional error details object
- * @param startTime Timestamp when the operation started
- * @returns MCP-formatted error response object
- */
-function createToolErrorResponse(
-  code: (typeof ErrorCodes)[keyof typeof ErrorCodes],
-  message: string,
-  details: Record<string, unknown> | undefined,
-  startTime: number,
-) {
-  const sessionId = hasSessionManager()
-    ? getSessionManager().getSessionId()
-    : undefined;
-
-  const response = createErrorResponse(
-    code,
-    message,
-    details,
-    sessionId,
-    startTime,
-  );
-
-  return {
-    content: [
-      {
-        type: 'text' as const,
-        text: JSON.stringify(response),
-      },
-    ],
-    isError: true,
-  };
-}
-
-export type McpServer = {
-  start(): Promise<void>;
-  stop(): Promise<void>;
-  getServer(): Server;
-  getToolDefinitions(): ToolDefinition[];
-  getToolPrefix(): string;
-};
-
-/**
- * Create and configure an MCP server instance.
- *
- * @param config Server configuration including name, version, and optional cleanup handler
- * @returns McpServer instance with start/stop methods and tool definitions
- */
-export function createMcpServer(config: McpServerConfig): McpServer {
-  const { name, version, onCleanup, logger = console.error } = config;
-
-  const toolDefinitions = getToolDefinitions();
-  const toolHandlers = buildToolHandlersRecord();
-
-  setToolRegistry(toolHandlers);
-
-  const validToolNames = new Set(toolDefinitions.map((tool) => tool.name));
-
-  const server = new Server({ name, version }, { capabilities: { tools: {} } });
-
-  let isCleaningUp = false;
-
-  server.setRequestHandler(ListToolsRequestSchema, async () => ({
-    tools: toolDefinitions,
-  }));
-
-  server.setRequestHandler(CallToolRequestSchema, async (request, extra) => {
-    const { name: toolName, arguments: args } = request.params;
-    const startTime = Date.now();
-    const signal = extra?.signal;
-
-    if (!validToolNames.has(toolName)) {
-      return createToolErrorResponse(
-        ErrorCodes.MM_INVALID_INPUT,
-        `Unknown tool: ${toolName}`,
-        undefined,
-        startTime,
-      );
-    }
-
-    const validation = safeValidateToolInput(toolName, args);
-    if (!validation.success) {
-      return createToolErrorResponse(
-        ErrorCodes.MM_INVALID_INPUT,
-        `Invalid input: ${validation.error}`,
-        { providedArgs: args },
-        startTime,
-      );
-    }
-
-    const handler = getToolHandler(toolName);
-
-    if (!handler) {
-      return createToolErrorResponse(
-        ErrorCodes.MM_INVALID_INPUT,
-        `No handler registered for tool: ${toolName}`,
-        undefined,
-        startTime,
-      );
-    }
-
-    const response = await handler(validation.data as Record<string, unknown>, {
-      signal,
-    });
-
-    return {
-      content: [
-        {
-          type: 'text' as const,
-          text: JSON.stringify(response),
-        },
-      ],
-      isError: !response.ok,
-    };
-  });
-
-  /**
-   * Handle process signals (SIGINT, SIGTERM) and perform cleanup.
-   *
-   * @param signal The signal name received (e.g., 'SIGINT', 'SIGTERM')
-   */
-  const handleSignal = async (signal: string) => {
-    if (isCleaningUp) {
-      return;
-    }
-    isCleaningUp = true;
-
-    logger(`Received ${signal}, cleaning up...`);
-
-    try {
-      if (onCleanup) {
-        await onCleanup();
-      }
-
-      if (hasSessionManager()) {
-        await getSessionManager().cleanup();
-      }
-    } catch (error) {
-      logger(`Cleanup error: ${JSON.stringify(error)}`);
-    }
-
-    process.exit(0);
-  };
-
-  process.on('SIGINT', () => {
-    handleSignal('SIGINT').catch((error) => logger(`SIGINT error: ${error}`));
-  });
-  process.on('SIGTERM', () => {
-    handleSignal('SIGTERM').catch((error) => logger(`SIGTERM error: ${error}`));
-  });
-
-  let transport: StdioServerTransport | undefined;
-
-  return {
-    /**
-     * Start the MCP server and connect to stdio transport.
-     *
-     * @returns Promise that resolves when server is running
-     */
-    async start() {
-      transport = new StdioServerTransport();
-      await server.connect(transport);
-      logger(`${name} MCP Server v${version} running on stdio`);
-    },
-
-    /**
-     * Stop the MCP server and close the transport.
-     *
-     * @returns Promise that resolves when server is stopped
-     */
-    async stop() {
-      if (transport) {
-        await server.close();
-      }
-    },
-
-    /**
-     * Get the underlying MCP Server instance.
-     *
-     * @returns The MCP Server instance
-     */
-    getServer() {
-      return server;
-    },
-
-    /**
-     * Get all available tool definitions.
-     *
-     * @returns Array of tool definitions
-     */
-    getToolDefinitions() {
-      return toolDefinitions;
-    },
-
-    /**
-     * Get the tool name prefix (e.g., 'mm_').
-     *
-     * @returns The tool prefix string
-     */
-    getToolPrefix() {
-      return TOOL_PREFIX;
-    },
-  };
-}
diff --git a/src/mcp-server/session-manager.test.ts b/src/mcp-server/session-manager.test.ts
deleted file mode 100644
index b41b7ca..0000000
--- a/src/mcp-server/session-manager.test.ts
+++ /dev/null
@@ -1,105 +0,0 @@
-import { describe, it, expect, beforeEach } from 'vitest';
-
-import {
-  setSessionManager,
-  getSessionManager,
-  hasSessionManager,
-} from './session-manager.js';
-import type { ISessionManager } from './session-manager.js';
-import { createMockSessionManager } from './test-utils/mock-factories.js';
-
-describe('session-manager', () => {
-  beforeEach(() => {
-    setSessionManager(undefined as unknown as ISessionManager);
-  });
-
-  describe('setSessionManager', () => {
-    it('sets the session manager instance', () => {
-      const mockManager = createMockSessionManager();
-      setSessionManager(mockManager);
-
-      expect(hasSessionManager()).toBe(true);
-    });
-
-    it('replaces the existing session manager', () => {
-      const mockManager1 = createMockSessionManager();
-      const mockManager2 = createMockSessionManager();
-
-      setSessionManager(mockManager1);
-      setSessionManager(mockManager2);
-
-      expect(getSessionManager()).toBe(mockManager2);
-    });
-  });
-
-  describe('getSessionManager', () => {
-    it('returns the session manager when set', () => {
-      const mockManager = createMockSessionManager();
-      setSessionManager(mockManager);
-
-      expect(getSessionManager()).toBe(mockManager);
-    });
-
-    it('throws error when session manager is not set', () => {
-      expect(() => getSessionManager()).toThrowError(
-        'Session manager not initialized. Call setSessionManager() first.',
-      );
-    });
-  });
-
-  describe('hasSessionManager', () => {
-    it('returns false when no session manager is set', () => {
-      expect(hasSessionManager()).toBe(false);
-    });
-
-    it('returns true when session manager is set', () => {
-      const mockManager = createMockSessionManager();
-      setSessionManager(mockManager);
-
-      expect(hasSessionManager()).toBe(true);
-    });
-  });
-
-  describe('ISessionManager interface compliance', () => {
-    let manager: ISessionManager;
-
-    beforeEach(() => {
-      manager = createMockSessionManager();
-      setSessionManager(manager);
-    });
-
-    it('can call hasActiveSession', () => {
-      const result = getSessionManager().hasActiveSession();
-      expect(typeof result).toBe('boolean');
-    });
-
-    it('can call getSessionId', () => {
-      const result = getSessionManager().getSessionId();
-      expect(result).toBeUndefined();
-    });
-
-    it('can call launch', async () => {
-      const result = await getSessionManager().launch({});
-      expect(result.sessionId).toBe('test-session-123');
-    });
-
-    it('can call cleanup', async () => {
-      const result = await getSessionManager().cleanup();
-      expect(result).toBe(true);
-    });
-
-    it('can call screenshot', async () => {
-      const result = await getSessionManager().screenshot({ name: 'test' });
-      expect(result.path).toBeDefined();
-    });
-
-    it('can access capability methods', () => {
-      expect(getSessionManager().getBuildCapability()).toBeUndefined();
-      expect(getSessionManager().getFixtureCapability()).toBeUndefined();
-      expect(getSessionManager().getChainCapability()).toBeUndefined();
-      expect(
-        getSessionManager().getContractSeedingCapability(),
-      ).toBeUndefined();
-    });
-  });
-});
diff --git a/src/mcp-server/test-utils/flush-promises.ts b/src/mcp-server/test-utils/flush-promises.ts
deleted file mode 100644
index eb3403f..0000000
--- a/src/mcp-server/test-utils/flush-promises.ts
+++ /dev/null
@@ -1,8 +0,0 @@
-const scheduler =
-  typeof setImmediate === 'function' ? setImmediate : setTimeout;
-
-export async function flushPromises() {
-  return new Promise((resolve) => {
-    scheduler(resolve, 0);
-  });
-}
diff --git a/src/mcp-server/tools/batch.test.ts b/src/mcp-server/tools/batch.test.ts
deleted file mode 100644
index 2a84d2b..0000000
--- a/src/mcp-server/tools/batch.test.ts
+++ /dev/null
@@ -1,428 +0,0 @@
-import { describe, it, expect, beforeEach, vi } from 'vitest';
-
-import {
-  setToolRegistry,
-  getToolRegistry,
-  hasToolRegistry,
-  setToolValidator,
-  getToolValidator,
-  handleRunSteps,
-} from './batch.js';
-import type { ToolRegistry, ToolHandler, ToolValidator } from './batch.js';
-import { setSessionManager } from '../session-manager.js';
-import { createMockSessionManager } from '../test-utils/mock-factories.js';
-
-/**
- * Clears the tool validator by resetting it to undefined.
- */
-function clearToolValidator(): void {
-  setToolValidator((() => ({ success: true })) as ToolValidator);
-  setToolValidator(undefined as unknown as ToolValidator);
-}
-
-describe('batch', () => {
-  beforeEach(() => {
-    setToolRegistry({});
-    clearToolValidator();
-  });
-
-  describe('setToolRegistry / getToolRegistry', () => {
-    it('sets and gets tool registry', () => {
-      const mockHandler: ToolHandler = vi.fn().mockResolvedValue({ ok: true });
-      const registry: ToolRegistry = {
-        mm_click: mockHandler,
-      };
-
-      setToolRegistry(registry);
-
-      expect(getToolRegistry()).toBe(registry);
-      expect(getToolRegistry().mm_click).toBe(mockHandler);
-    });
-
-    it('replaces existing registry', () => {
-      const registry1: ToolRegistry = { tool1: vi.fn() };
-      const registry2: ToolRegistry = { tool2: vi.fn() };
-
-      setToolRegistry(registry1);
-      setToolRegistry(registry2);
-
-      expect(getToolRegistry()).toBe(registry2);
-      expect(getToolRegistry().tool1).toBeUndefined();
-      expect(getToolRegistry().tool2).toBeDefined();
-    });
-  });
-
-  describe('hasToolRegistry', () => {
-    it('returns false for empty registry', () => {
-      setToolRegistry({});
-      expect(hasToolRegistry()).toBe(false);
-    });
-
-    it('returns true when registry has handlers', () => {
-      setToolRegistry({ mm_click: vi.fn() });
-      expect(hasToolRegistry()).toBe(true);
-    });
-  });
-
-  describe('setToolValidator / getToolValidator', () => {
-    it('sets and gets tool validator', () => {
-      const validator: ToolValidator = vi
-        .fn()
-        .mockReturnValue({ success: true });
-      setToolValidator(validator);
-
-      expect(getToolValidator()).toBe(validator);
-    });
-
-    it('returns undefined when not set', () => {
-      expect(getToolValidator()).toBeUndefined();
-    });
-  });
-
-  describe('handleRunSteps', () => {
-    beforeEach(() => {
-      setSessionManager(createMockSessionManager({ hasActive: true }));
-    });
-
-    it('returns error when no active session', async () => {
-      setSessionManager(createMockSessionManager({ hasActive: false }));
-
-      const result = await handleRunSteps({
-        steps: [{ tool: 'mm_click', args: { testId: 'button' } }],
-      });
-
-      expect(result.ok).toBe(false);
-      if (!result.ok) {
-        expect(result.error?.code).toBe('MM_NO_ACTIVE_SESSION');
-      }
-    });
-
-    it('executes steps in sequence', async () => {
-      const executionOrder: string[] = [];
-      const clickHandler = vi.fn().mockImplementation(async () => {
-        executionOrder.push('click');
-        return { ok: true, result: 'clicked' };
-      });
-      const typeHandler = vi.fn().mockImplementation(async () => {
-        executionOrder.push('type');
-        return { ok: true, result: 'typed' };
-      });
-
-      setToolRegistry({
-        mm_click: clickHandler,
-        mm_type: typeHandler,
-      });
-
-      const result = await handleRunSteps({
-        steps: [
-          { tool: 'mm_click', args: { testId: 'button' } },
-          { tool: 'mm_type', args: { testId: 'input', text: 'hello' } },
-        ],
-      });
-
-      expect(result.ok).toBe(true);
-      expect(executionOrder).toStrictEqual(['click', 'type']);
-      if (result.ok) {
-        expect(result.result?.summary.total).toBe(2);
-        expect(result.result?.summary.succeeded).toBe(2);
-        expect(result.result?.summary.failed).toBe(0);
-      }
-    });
-
-    it('returns error for unknown tool', async () => {
-      setToolRegistry({});
-
-      const result = await handleRunSteps({
-        steps: [{ tool: 'unknown_tool', args: {} }],
-      });
-
-      expect(result.ok).toBe(true);
-      if (result.ok) {
-        expect(result.result?.steps[0].ok).toBe(false);
-        expect(result.result?.steps[0].error?.code).toBe('MM_UNKNOWN_TOOL');
-        expect(result.result?.summary.failed).toBe(1);
-      }
-    });
-
-    it('stops on error when stopOnError is true', async () => {
-      const clickHandler = vi.fn().mockResolvedValue({
-        ok: false,
-        error: { code: 'ERR', message: 'fail' },
-      });
-      const typeHandler = vi.fn().mockResolvedValue({ ok: true });
-
-      setToolRegistry({
-        mm_click: clickHandler,
-        mm_type: typeHandler,
-      });
-
-      const result = await handleRunSteps({
-        steps: [
-          { tool: 'mm_click', args: {} },
-          { tool: 'mm_type', args: { text: 'hello' } },
-        ],
-        stopOnError: true,
-      });
-
-      expect(clickHandler).toHaveBeenCalledTimes(1);
-      expect(typeHandler).not.toHaveBeenCalled();
-      if (result.ok) {
-        expect(result.result?.steps.length).toBe(1);
-      }
-    });
-
-    it('continues on error when stopOnError is false', async () => {
-      const clickHandler = vi.fn().mockResolvedValue({
-        ok: false,
-        error: { code: 'ERR', message: 'fail' },
-      });
-      const typeHandler = vi
-        .fn()
-        .mockResolvedValue({ ok: true, result: 'typed' });
-
-      setToolRegistry({
-        mm_click: clickHandler,
-        mm_type: typeHandler,
-      });
-
-      const result = await handleRunSteps({
-        steps: [
-          { tool: 'mm_click', args: {} },
-          { tool: 'mm_type', args: { text: 'hello' } },
-        ],
-        stopOnError: false,
-      });
-
-      expect(clickHandler).toHaveBeenCalledTimes(1);
-      expect(typeHandler).toHaveBeenCalledTimes(1);
-      if (result.ok) {
-        expect(result.result?.steps.length).toBe(2);
-        expect(result.result?.summary.failed).toBe(1);
-        expect(result.result?.summary.succeeded).toBe(1);
-      }
-    });
-
-    it('uses tool validator when set', async () => {
-      const clickHandler = vi.fn().mockResolvedValue({ ok: true });
-      setToolRegistry({ mm_click: clickHandler });
-
-      const validator: ToolValidator = vi.fn().mockReturnValue({
-        success: false,
-        error: { message: 'Invalid testId' },
-      });
-      setToolValidator(validator);
-
-      const result = await handleRunSteps({
-        steps: [{ tool: 'mm_click', args: { testId: '' } }],
-      });
-
-      expect(validator).toHaveBeenCalledWith('mm_click', { testId: '' });
-      expect(clickHandler).not.toHaveBeenCalled();
-      if (result.ok) {
-        expect(result.result?.steps[0].ok).toBe(false);
-        expect(result.result?.steps[0].error?.code).toBe('MM_INVALID_INPUT');
-      }
-    });
-
-    it('passes validation when validator returns success', async () => {
-      const clickHandler = vi
-        .fn()
-        .mockResolvedValue({ ok: true, result: 'clicked' });
-      setToolRegistry({ mm_click: clickHandler });
-
-      const validator: ToolValidator = vi
-        .fn()
-        .mockReturnValue({ success: true });
-      setToolValidator(validator);
-
-      const result = await handleRunSteps({
-        steps: [{ tool: 'mm_click', args: { testId: 'btn' } }],
-      });
-
-      expect(clickHandler).toHaveBeenCalled();
-      if (result.ok) {
-        expect(result.result?.steps[0].ok).toBe(true);
-      }
-    });
-
-    it('handles exceptions from tool handlers', async () => {
-      const clickHandler = vi.fn().mockRejectedValue(new Error('Timeout'));
-      setToolRegistry({ mm_click: clickHandler });
-
-      const result = await handleRunSteps({
-        steps: [{ tool: 'mm_click', args: {} }],
-      });
-
-      if (result.ok) {
-        expect(result.result?.steps[0].ok).toBe(false);
-        expect(result.result?.steps[0].error?.code).toBe('MM_INTERNAL_ERROR');
-        expect(result.result?.steps[0].error?.message).toContain('Timeout');
-      }
-    });
-
-    it('includes duration in step results', async () => {
-      vi.useFakeTimers();
-      const clickHandler = vi.fn().mockImplementation(async () => {
-        await new Promise((resolve) => setTimeout(resolve, 100));
-        return { ok: true };
-      });
-      setToolRegistry({ mm_click: clickHandler });
-
-      const resultPromise = handleRunSteps({
-        steps: [{ tool: 'mm_click', args: {} }],
-      });
-
-      await vi.advanceTimersByTimeAsync(100);
-      const result = await resultPromise;
-
-      if (result.ok) {
-        expect(result.result?.steps[0].meta?.durationMs).toBe(100);
-      }
-
-      vi.useRealTimers();
-    });
-
-    it('includes total duration in summary', async () => {
-      const clickHandler = vi.fn().mockResolvedValue({ ok: true });
-      setToolRegistry({ mm_click: clickHandler });
-
-      const result = await handleRunSteps({
-        steps: [
-          { tool: 'mm_click', args: {} },
-          { tool: 'mm_click', args: {} },
-        ],
-      });
-
-      if (result.ok) {
-        expect(result.result?.summary.durationMs).toBeGreaterThanOrEqual(0);
-      }
-    });
-
-    it('defaults args to empty object when not provided', async () => {
-      const clickHandler = vi.fn().mockResolvedValue({ ok: true });
-      setToolRegistry({ mm_click: clickHandler });
-
-      await handleRunSteps({
-        steps: [{ tool: 'mm_click' }],
-      });
-
-      expect(clickHandler).toHaveBeenCalledWith({}, expect.any(Object));
-    });
-
-    it('maps includeObservations "none" to observation policy', async () => {
-      const clickHandler = vi.fn().mockResolvedValue({ ok: true });
-      setToolRegistry({ mm_click: clickHandler });
-
-      const result = await handleRunSteps({
-        steps: [{ tool: 'mm_click', args: {} }],
-        includeObservations: 'none',
-      });
-
-      expect(result.ok).toBe(true);
-      if (result.ok) {
-        expect(result.result?.steps[0].ok).toBe(true);
-      }
-      expect(clickHandler).toHaveBeenCalledWith(
-        {},
-        expect.objectContaining({ observationPolicy: 'none' }),
-      );
-    });
-
-    it('maps includeObservations "failures" to observation policy', async () => {
-      const clickHandler = vi.fn().mockResolvedValue({ ok: true });
-      setToolRegistry({ mm_click: clickHandler });
-
-      const result = await handleRunSteps({
-        steps: [{ tool: 'mm_click', args: {} }],
-        includeObservations: 'failures',
-      });
-
-      expect(result.ok).toBe(true);
-      if (result.ok) {
-        expect(result.result?.steps[0].ok).toBe(true);
-      }
-      expect(clickHandler).toHaveBeenCalledWith(
-        {},
-        expect.objectContaining({ observationPolicy: 'failures' }),
-      );
-    });
-
-    it('stops execution when stopOnError=true and handler not found', async () => {
-      const typeHandler = vi.fn().mockResolvedValue({ ok: true });
-      setToolRegistry({ mm_type: typeHandler });
-
-      const result = await handleRunSteps({
-        steps: [
-          { tool: 'unknown_tool', args: {} },
-          { tool: 'mm_type', args: { text: 'hello' } },
-        ],
-        stopOnError: true,
-      });
-
-      expect(typeHandler).not.toHaveBeenCalled();
-      if (result.ok) {
-        expect(result.result?.steps.length).toBe(1);
-        expect(result.result?.steps[0].ok).toBe(false);
-        expect(result.result?.steps[0].error?.code).toBe('MM_UNKNOWN_TOOL');
-      }
-    });
-
-    it('stops execution when stopOnError=true and validation fails', async () => {
-      const clickHandler = vi.fn().mockResolvedValue({ ok: true });
-      const typeHandler = vi.fn().mockResolvedValue({ ok: true });
-      setToolRegistry({
-        mm_click: clickHandler,
-        mm_type: typeHandler,
-      });
-
-      const validator: ToolValidator = vi.fn().mockImplementation((tool) => {
-        if (tool === 'mm_click') {
-          return { success: false, error: { message: 'Invalid testId' } };
-        }
-        return { success: true };
-      });
-      setToolValidator(validator);
-
-      const result = await handleRunSteps({
-        steps: [
-          { tool: 'mm_click', args: { testId: '' } },
-          { tool: 'mm_type', args: { text: 'hello' } },
-        ],
-        stopOnError: true,
-      });
-
-      expect(clickHandler).not.toHaveBeenCalled();
-      expect(typeHandler).not.toHaveBeenCalled();
-      if (result.ok) {
-        expect(result.result?.steps.length).toBe(1);
-        expect(result.result?.steps[0].ok).toBe(false);
-        expect(result.result?.steps[0].error?.code).toBe('MM_INVALID_INPUT');
-      }
-    });
-
-    it('stops execution when stopOnError=true and handler throws error', async () => {
-      const clickHandler = vi.fn().mockRejectedValue(new Error('Timeout'));
-      const typeHandler = vi.fn().mockResolvedValue({ ok: true });
-      setToolRegistry({
-        mm_click: clickHandler,
-        mm_type: typeHandler,
-      });
-
-      const result = await handleRunSteps({
-        steps: [
-          { tool: 'mm_click', args: {} },
-          { tool: 'mm_type', args: { text: 'hello' } },
-        ],
-        stopOnError: true,
-      });
-
-      expect(clickHandler).toHaveBeenCalledTimes(1);
-      expect(typeHandler).not.toHaveBeenCalled();
-      if (result.ok) {
-        expect(result.result?.steps.length).toBe(1);
-        expect(result.result?.steps[0].ok).toBe(false);
-        expect(result.result?.steps[0].error?.code).toBe('MM_INTERNAL_ERROR');
-      }
-    });
-  });
-});
diff --git a/src/mcp-server/tools/batch.ts b/src/mcp-server/tools/batch.ts
deleted file mode 100644
index b723a69..0000000
--- a/src/mcp-server/tools/batch.ts
+++ /dev/null
@@ -1,286 +0,0 @@
-import { getSessionManager } from '../session-manager.js';
-import type {
-  McpResponse,
-  HandlerOptions,
-  RunStepsInput,
-  RunStepsResult,
-  StepResult,
-  ObservationPolicyOverride,
-} from '../types';
-import { ErrorCodes } from '../types';
-import {
-  createSuccessResponse,
-  createErrorResponse,
-  extractErrorMessage,
-} from '../utils';
-
-/**
- * Maps includeObservations string to observation policy override.
- *
- * @param value The observation policy string ('none', 'failures', 'all', or undefined)
- * @returns The mapped observation policy override
- */
-function mapIncludeObservationsToPolicy(
-  value: 'none' | 'failures' | 'all' | undefined,
-): ObservationPolicyOverride {
-  switch (value) {
-    case 'none':
-      return 'none';
-    case 'failures':
-      return 'failures';
-    case 'all':
-    default:
-      return 'default';
-  }
-}
-
-/**
- * Handler function type for executing MCP tools.
- *
- * @param input Tool arguments as key-value pairs
- * @param options Optional handler configuration
- * @returns Promise resolving to MCP response with tool result
- */
-export type ToolHandler = (
-  input: Record<string, unknown>,
-  options?: HandlerOptions,
-) => Promise<McpResponse<unknown>>;
-
-/**
- * Registry mapping tool names to their handler functions.
- *
- * @returns Record of tool name to handler function mappings
- */
-export type ToolRegistry = Record<string, ToolHandler>;
-
-/**
- * Validator function type for validating tool arguments before execution.
- *
- * @param tool Tool name being validated
- * @param args Tool arguments to validate
- * @returns Validation result with success status and optional error details
- */
-export type ToolValidator = (
-  tool: string,
-  args: Record<string, unknown>,
-) =>
-  | {
-      /**
-       * Validation succeeded
-       */
-      success: true;
-    }
-  | {
-      /**
-       * Validation failed
-       */
-      success: false;
-      /**
-       * Error details when validation fails
-       */
-      error: {
-        /**
-         * Error message describing validation failure
-         */
-        message: string;
-      };
-    };
-
-let _toolRegistry: ToolRegistry = {};
-let _toolValidator: ToolValidator | undefined;
-
-/**
- * Sets the global tool registry for batch execution.
- *
- * @param registry Tool registry mapping names to handlers
- */
-export function setToolRegistry(registry: ToolRegistry): void {
-  _toolRegistry = registry;
-}
-
-/**
- * Gets the current global tool registry.
- *
- * @returns The current tool registry
- */
-export function getToolRegistry(): ToolRegistry {
-  return _toolRegistry;
-}
-
-/**
- * Checks if the tool registry has any registered handlers.
- *
- * @returns True if registry contains handlers, false otherwise
- */
-export function hasToolRegistry(): boolean {
-  return Object.keys(_toolRegistry).length > 0;
-}
-
-/**
- * Sets the global tool validator for batch execution.
- *
- * @param validator Validator function to validate tool arguments
- */
-export function setToolValidator(validator: ToolValidator): void {
-  _toolValidator = validator;
-}
-
-/**
- * Gets the current global tool validator.
- *
- * @returns The current tool validator or undefined if not set
- */
-export function getToolValidator(): ToolValidator | undefined {
-  return _toolValidator;
-}
-
-/**
- * Executes multiple tool steps in sequence with optional validation and error handling.
- *
- * @param input Steps to execute with optional stop-on-error and observation policy
- * @param options Optional handler configuration and observation policy override
- * @returns Promise resolving to MCP response with step results and summary
- */
-export async function handleRunSteps(
-  input: RunStepsInput,
-  options?: HandlerOptions,
-): Promise<McpResponse<RunStepsResult>> {
-  const batchStartTime = Date.now();
-  const sessionManager = getSessionManager();
-  const sessionId = sessionManager.getSessionId();
-
-  if (!sessionManager.hasActiveSession()) {
-    return createErrorResponse(
-      ErrorCodes.MM_NO_ACTIVE_SESSION,
-      'No active session. Call launch first.',
-      { input },
-      undefined,
-      batchStartTime,
-    );
-  }
-
-  const { steps: stepInputs, stopOnError = false, includeObservations } = input;
-  const observationPolicy = mapIncludeObservationsToPolicy(includeObservations);
-  const stepResults: StepResult[] = [];
-  let succeeded = 0;
-  let failed = 0;
-
-  const toolHandlers = getToolRegistry();
-  const toolValidator = getToolValidator();
-
-  for (const stepInput of stepInputs) {
-    const stepStartTime = Date.now();
-    const { tool, args = {} } = stepInput;
-
-    const handler = toolHandlers[tool];
-    if (!handler) {
-      const result: StepResult = {
-        tool,
-        ok: false,
-        error: {
-          code: ErrorCodes.MM_UNKNOWN_TOOL,
-          message: `Unknown tool: ${tool}`,
-        },
-        meta: {
-          durationMs: Date.now() - stepStartTime,
-          timestamp: new Date().toISOString(),
-        },
-      };
-      stepResults.push(result);
-      failed += 1;
-
-      if (stopOnError) {
-        break;
-      }
-      continue;
-    }
-
-    if (toolValidator) {
-      const validation = toolValidator(tool, args);
-      if (!validation.success) {
-        const result: StepResult = {
-          tool,
-          ok: false,
-          error: {
-            code: ErrorCodes.MM_INVALID_INPUT,
-            message: `Invalid input: ${validation.error.message}`,
-          },
-          meta: {
-            durationMs: Date.now() - stepStartTime,
-            timestamp: new Date().toISOString(),
-          },
-        };
-        stepResults.push(result);
-        failed += 1;
-
-        if (stopOnError) {
-          break;
-        }
-        continue;
-      }
-    }
-
-    try {
-      const stepOptions: HandlerOptions = {
-        ...options,
-        observationPolicy,
-      };
-      const response = await handler(args, stepOptions);
-
-      const result: StepResult = {
-        tool,
-        ok: response.ok,
-        result: response.ok ? response.result : undefined,
-        error: response.ok ? undefined : response.error,
-        meta: {
-          durationMs: Date.now() - stepStartTime,
-          timestamp: new Date().toISOString(),
-        },
-      };
-
-      stepResults.push(result);
-
-      if (response.ok) {
-        succeeded += 1;
-      } else {
-        failed += 1;
-        if (stopOnError) {
-          break;
-        }
-      }
-    } catch (error) {
-      const message = extractErrorMessage(error);
-      const result: StepResult = {
-        tool,
-        ok: false,
-        error: {
-          code: ErrorCodes.MM_INTERNAL_ERROR,
-          message: `Unexpected error: ${message}`,
-        },
-        meta: {
-          durationMs: Date.now() - stepStartTime,
-          timestamp: new Date().toISOString(),
-        },
-      };
-      stepResults.push(result);
-      failed += 1;
-
-      if (stopOnError) {
-        break;
-      }
-    }
-  }
-
-  const batchResult: RunStepsResult = {
-    steps: stepResults,
-    summary: {
-      ok: failed === 0,
-      total: stepResults.length,
-      succeeded,
-      failed,
-      durationMs: Date.now() - batchStartTime,
-    },
-  };
-
-  return createSuccessResponse(batchResult, sessionId, batchStartTime);
-}
diff --git a/src/mcp-server/tools/build.test.ts b/src/mcp-server/tools/build.test.ts
deleted file mode 100644
index 4e3721c..0000000
--- a/src/mcp-server/tools/build.test.ts
+++ /dev/null
@@ -1,211 +0,0 @@
-/**
- * Unit tests for build tool handler.
- *
- * Tests the build handler with BuildCapability and legacy build paths,
- * including success/failure scenarios and build options handling.
- */
-
-import { describe, it, expect, vi, beforeEach, afterEach } from 'vitest';
-
-import { handleBuild } from './build.js';
-import type { BuildCapability } from '../../capabilities/types.js';
-import * as knowledgeStoreModule from '../knowledge-store.js';
-import * as sessionManagerModule from '../session-manager.js';
-import { createMockSessionManager } from '../test-utils';
-import { ErrorCodes } from '../types/errors.js';
-
-describe('build', () => {
-  let mockSessionManager: ReturnType<typeof createMockSessionManager>;
-  let mockBuildCapability: BuildCapability;
-
-  beforeEach(() => {
-    mockSessionManager = createMockSessionManager({
-      hasActive: true,
-      sessionId: 'test-session-123',
-      sessionMetadata: {
-        schemaVersion: 1,
-        sessionId: 'test-session-123',
-        createdAt: new Date().toISOString(),
-        flowTags: [],
-        tags: [],
-        launch: { stateMode: 'default' },
-      },
-    });
-    vi.spyOn(sessionManagerModule, 'getSessionManager').mockReturnValue(
-      mockSessionManager,
-    );
-    // Mock knowledge store to prevent "not initialized" errors
-    vi.spyOn(knowledgeStoreModule, 'knowledgeStore', 'get').mockReturnValue({
-      recordStep: vi.fn().mockResolvedValue(undefined),
-      getLastSteps: vi.fn().mockResolvedValue([]),
-      searchSteps: vi.fn().mockResolvedValue([]),
-      summarizeSession: vi
-        .fn()
-        .mockResolvedValue({ sessionId: 'test', stepCount: 0, recipe: [] }),
-      listSessions: vi.fn().mockResolvedValue([]),
-      generatePriorKnowledge: vi.fn().mockResolvedValue(undefined),
-      writeSessionMetadata: vi.fn().mockResolvedValue('test-session'),
-    } as any);
-
-    mockBuildCapability = {
-      build: vi.fn(),
-      getExtensionPath: vi.fn(),
-      isBuilt: vi.fn(),
-    };
-  });
-
-  afterEach(() => {
-    vi.restoreAllMocks();
-  });
-
-  describe('handleBuild with capability', () => {
-    it('builds extension successfully with default buildType', async () => {
-      // Arrange
-      const mockedBuild = vi
-        .spyOn(mockBuildCapability, 'build')
-        .mockResolvedValue({
-          success: true,
-          extensionPath: '/path/to/dist/chrome',
-          durationMs: 5000,
-        });
-
-      // Act
-      const result = await handleBuild(
-        {},
-        { buildCapability: mockBuildCapability },
-      );
-
-      // Assert
-      expect(result.ok).toBe(true);
-      if (result.ok) {
-        expect(result.result.buildType).toBe('build:test');
-        expect(result.result.extensionPathResolved).toBe(
-          '/path/to/dist/chrome',
-        );
-      }
-      expect(mockedBuild).toHaveBeenCalledWith({
-        buildType: undefined,
-        force: undefined,
-      });
-    });
-
-    it('builds extension with explicit buildType', async () => {
-      // Arrange
-      const mockedBuild = vi
-        .spyOn(mockBuildCapability, 'build')
-        .mockResolvedValue({
-          success: true,
-          extensionPath: '/path/to/dist/chrome',
-          durationMs: 5000,
-        });
-
-      // Act
-      const result = await handleBuild(
-        { buildType: 'build:test' },
-        { buildCapability: mockBuildCapability },
-      );
-
-      // Assert
-      expect(result.ok).toBe(true);
-      if (result.ok) {
-        expect(result.result.buildType).toBe('build:test');
-        expect(result.result.extensionPathResolved).toBe(
-          '/path/to/dist/chrome',
-        );
-      }
-      expect(mockedBuild).toHaveBeenCalledWith({
-        buildType: 'build:test',
-        force: undefined,
-      });
-    });
-
-    it('builds extension with force flag', async () => {
-      // Arrange
-      const mockedBuild = vi
-        .spyOn(mockBuildCapability, 'build')
-        .mockResolvedValue({
-          success: true,
-          extensionPath: '/path/to/dist/chrome',
-          durationMs: 5000,
-        });
-
-      // Act
-      const result = await handleBuild(
-        { force: true },
-        { buildCapability: mockBuildCapability },
-      );
-
-      // Assert
-      expect(result.ok).toBe(true);
-      expect(mockedBuild).toHaveBeenCalledWith({
-        buildType: undefined,
-        force: true,
-      });
-    });
-
-    it('returns error when build fails with error message', async () => {
-      // Arrange
-      vi.spyOn(mockBuildCapability, 'build').mockResolvedValue({
-        success: false,
-        extensionPath: '',
-        durationMs: 1000,
-        error: 'Compilation error',
-      });
-
-      // Act
-      const result = await handleBuild(
-        {},
-        { buildCapability: mockBuildCapability },
-      );
-
-      // Assert
-      expect(result.ok).toBe(false);
-      if (!result.ok) {
-        expect(result.error.code).toBe(ErrorCodes.MM_BUILD_FAILED);
-        expect(result.error.message).toContain('Compilation error');
-      }
-    });
-
-    it('returns error when build fails without error message', async () => {
-      // Arrange
-      vi.spyOn(mockBuildCapability, 'build').mockResolvedValue({
-        success: false,
-        extensionPath: '',
-        durationMs: 1000,
-      });
-
-      // Act
-      const result = await handleBuild(
-        {},
-        { buildCapability: mockBuildCapability },
-      );
-
-      // Assert
-      expect(result.ok).toBe(false);
-      if (!result.ok) {
-        expect(result.error.code).toBe(ErrorCodes.MM_BUILD_FAILED);
-        expect(result.error.message).toContain('Unknown error');
-      }
-    });
-
-    it('returns error when build throws exception', async () => {
-      // Arrange
-      vi.spyOn(mockBuildCapability, 'build').mockRejectedValue(
-        new Error('Build process crashed'),
-      );
-
-      // Act
-      const result = await handleBuild(
-        {},
-        { buildCapability: mockBuildCapability },
-      );
-
-      // Assert
-      expect(result.ok).toBe(false);
-      if (!result.ok) {
-        expect(result.error.code).toBe(ErrorCodes.MM_BUILD_FAILED);
-        expect(result.error.message).toContain('Build process crashed');
-      }
-    });
-  });
-});
diff --git a/src/mcp-server/tools/build.ts b/src/mcp-server/tools/build.ts
deleted file mode 100644
index 7d422cf..0000000
--- a/src/mcp-server/tools/build.ts
+++ /dev/null
@@ -1,100 +0,0 @@
-import type { BuildCapability } from '../../capabilities/types.js';
-import type {
-  BuildInput,
-  BuildToolResult,
-  McpResponse,
-  HandlerOptions,
-} from '../types';
-import { ErrorCodes } from '../types';
-import {
-  createSuccessResponse,
-  createErrorResponse,
-  extractErrorMessage,
-} from '../utils';
-
-/**
- * Options for the build tool handler.
- *
- * @returns Handler options with optional build capability
- */
-export type BuildToolOptions = HandlerOptions & {
-  /**
-   * Optional build capability for extension building
-   */
-  buildCapability?: BuildCapability;
-};
-
-/**
- * Handles the build tool request to build the extension.
- *
- * @param input Build configuration with optional buildType and force flag
- * @param options Optional handler options with build capability
- * @returns Promise resolving to MCP response with build result
- */
-export async function handleBuild(
-  input: BuildInput,
-  options?: BuildToolOptions,
-): Promise<McpResponse<BuildToolResult>> {
-  const startTime = Date.now();
-
-  if (options?.buildCapability) {
-    return handleBuildWithCapability(input, options.buildCapability, startTime);
-  }
-
-  return createErrorResponse(
-    ErrorCodes.MM_CAPABILITY_NOT_AVAILABLE,
-    'BuildCapability not available. The mm_build tool requires either: (1) running in e2e mode with the MetaMask extension wrapper, or (2) running directly in the metamask-extension repository with dependencies installed.',
-    { capability: 'BuildCapability' },
-    undefined,
-    startTime,
-  );
-}
-
-/**
- * Handles build using the provided build capability.
- *
- * @param input Build configuration with optional buildType and force flag
- * @param buildCapability Build capability instance for executing the build
- * @param startTime Timestamp when the operation started
- * @returns Promise resolving to MCP response with build result
- */
-async function handleBuildWithCapability(
-  input: BuildInput,
-  buildCapability: BuildCapability,
-  startTime: number,
-): Promise<McpResponse<BuildToolResult>> {
-  try {
-    const result = await buildCapability.build({
-      buildType: input.buildType,
-      force: input.force,
-    });
-
-    if (!result.success) {
-      return createErrorResponse(
-        ErrorCodes.MM_BUILD_FAILED,
-        `Build failed: ${result.error ?? 'Unknown error'}`,
-        { buildType: input.buildType ?? 'build:test' },
-        undefined,
-        startTime,
-      );
-    }
-
-    return createSuccessResponse<BuildToolResult>(
-      {
-        buildType: input.buildType ?? 'build:test',
-        extensionPathResolved: result.extensionPath,
-      },
-      undefined,
-      startTime,
-    );
-  } catch (error) {
-    const message = extractErrorMessage(error);
-    return createErrorResponse(
-      ErrorCodes.MM_BUILD_FAILED,
-      `Build failed: ${message}`,
-      { buildType: input.buildType ?? 'build:test' },
-      undefined,
-      startTime,
-    );
-  }
-}
diff --git a/src/mcp-server/tools/cleanup.test.ts b/src/mcp-server/tools/cleanup.test.ts
deleted file mode 100644
index 7a8193f..0000000
--- a/src/mcp-server/tools/cleanup.test.ts
+++ /dev/null
@@ -1,161 +0,0 @@
-/**
- * Unit tests for cleanup tool handler.
- *
- * Tests session cleanup with various session states.
- */
-
-import { describe, it, expect, vi, beforeEach } from 'vitest';
-
-import { handleCleanup } from './cleanup.js';
-import * as sessionManagerModule from '../session-manager.js';
-import { createMockSessionManager } from '../test-utils/mock-factories.js';
-
-describe('handleCleanup', () => {
-  beforeEach(() => {
-    vi.clearAllMocks();
-  });
-
-  it('cleans up active session successfully', async () => {
-    const mockSessionManager = createMockSessionManager({
-      hasActive: true,
-      sessionId: 'test-session-123',
-    });
-    vi.spyOn(mockSessionManager, 'cleanup').mockResolvedValue(true);
-    vi.spyOn(sessionManagerModule, 'getSessionManager').mockReturnValue(
-      mockSessionManager,
-    );
-
-    const result = await handleCleanup({});
-
-    expect(result.ok).toBe(true);
-    if (result.ok) {
-      expect(result.result.cleanedUp).toBe(true);
-      expect(result.meta.sessionId).toBe('test-session-123');
-    }
-    expect(mockSessionManager.cleanup).toHaveBeenCalled();
-  });
-
-  it('returns false when no session to clean up', async () => {
-    const mockSessionManager = createMockSessionManager({ hasActive: false });
-    vi.spyOn(mockSessionManager, 'cleanup').mockResolvedValue(false);
-    vi.spyOn(sessionManagerModule, 'getSessionManager').mockReturnValue(
-      mockSessionManager,
-    );
-
-    const result = await handleCleanup({});
-
-    expect(result.ok).toBe(true);
-    if (result.ok) {
-      expect(result.result.cleanedUp).toBe(false);
-    }
-  });
-
-  it('uses provided sessionId in input', async () => {
-    const mockSessionManager = createMockSessionManager({
-      hasActive: true,
-      sessionId: 'current-session',
-    });
-    vi.spyOn(mockSessionManager, 'cleanup').mockResolvedValue(true);
-    vi.spyOn(sessionManagerModule, 'getSessionManager').mockReturnValue(
-      mockSessionManager,
-    );
-
-    const result = await handleCleanup({ sessionId: 'custom-session-456' });
-
-    expect(result.ok).toBe(true);
-    if (result.ok) {
-      expect(result.meta.sessionId).toBe('custom-session-456');
-    }
-  });
-
-  it('falls back to current sessionId when input sessionId is undefined', async () => {
-    const mockSessionManager = createMockSessionManager({
-      hasActive: true,
-      sessionId: 'test-session-789',
-    });
-    vi.spyOn(mockSessionManager, 'cleanup').mockResolvedValue(true);
-    vi.spyOn(sessionManagerModule, 'getSessionManager').mockReturnValue(
-      mockSessionManager,
-    );
-
-    const result = await handleCleanup({});
-
-    expect(result.ok).toBe(true);
-    if (result.ok) {
-      expect(result.meta.sessionId).toBe('test-session-789');
-    }
-  });
-
-  it('handles cleanup when sessionId is undefined', async () => {
-    const mockSessionManager = createMockSessionManager({ hasActive: false });
-    vi.spyOn(mockSessionManager, 'cleanup').mockResolvedValue(false);
-    vi.spyOn(sessionManagerModule, 'getSessionManager').mockReturnValue(
-      mockSessionManager,
-    );
-
-    const result = await handleCleanup({});
-
-    expect(result.ok).toBe(true);
-    if (result.ok) {
-      expect(result.result.cleanedUp).toBe(false);
-    }
-  });
-
-  it('includes timestamp in response', async () => {
-    const mockSessionManager = createMockSessionManager({ hasActive: true });
-    vi.spyOn(mockSessionManager, 'cleanup').mockResolvedValue(true);
-    vi.spyOn(sessionManagerModule, 'getSessionManager').mockReturnValue(
-      mockSessionManager,
-    );
-
-    const result = await handleCleanup({});
-
-    expect(result.ok).toBe(true);
-    if (result.ok) {
-      expect(result.meta.timestamp).toBeDefined();
-      expect(typeof result.meta.timestamp).toBe('string');
-      expect(new Date(result.meta.timestamp).getTime()).toBeGreaterThan(0);
-    }
-  });
-
-  it('includes durationMs in response', async () => {
-    const mockSessionManager = createMockSessionManager({ hasActive: true });
-    vi.spyOn(mockSessionManager, 'cleanup').mockResolvedValue(true);
-    vi.spyOn(sessionManagerModule, 'getSessionManager').mockReturnValue(
-      mockSessionManager,
-    );
-
-    const result = await handleCleanup({});
-
-    expect(result.ok).toBe(true);
-    if (result.ok) {
-      expect(result.meta.durationMs).toBeGreaterThanOrEqual(0);
-      expect(typeof result.meta.durationMs).toBe('number');
-    }
-  });
-
-  it('cleans up multiple times without error', async () => {
-    const mockSessionManager = createMockSessionManager({ hasActive: true });
-    vi.spyOn(mockSessionManager, 'cleanup')
-      .mockResolvedValueOnce(true)
-      .mockResolvedValueOnce(false);
-    vi.spyOn(sessionManagerModule, 'getSessionManager').mockReturnValue(
-      mockSessionManager,
-    );
-
-    const result1 = await handleCleanup({});
-    const result2 = await handleCleanup({});
-
-    expect(result1.ok).toBe(true);
-    if (result1.ok) {
-      expect(result1.result.cleanedUp).toBe(true);
-    }
-
-    expect(result2.ok).toBe(true);
-    if (result2.ok) {
-      expect(result2.result.cleanedUp).toBe(false);
-    }
-
-    expect(mockSessionManager.cleanup).toHaveBeenCalledTimes(2);
-  });
-});
diff --git a/src/mcp-server/tools/cleanup.ts b/src/mcp-server/tools/cleanup.ts
deleted file mode 100644
index 9b6f266..0000000
--- a/src/mcp-server/tools/cleanup.ts
+++ /dev/null
@@ -1,32 +0,0 @@
-import { getSessionManager } from '../session-manager.js';
-import type {
-  CleanupInput,
-  CleanupResult,
-  McpResponse,
-  HandlerOptions,
-} from '../types';
-import { createSuccessResponse } from '../utils';
-
-/**
- * Handles the cleanup tool request to stop browser and services.
- *
- * @param input - The cleanup input parameters.
- * @param _options - Handler options (unused).
- * @returns Response indicating if cleanup was performed.
- */
-export async function handleCleanup(
-  input: CleanupInput,
-  _options?: HandlerOptions,
-): Promise<McpResponse<CleanupResult>> {
-  const startTime = Date.now();
-  const sessionManager = getSessionManager();
-  const sessionId = input.sessionId ?? sessionManager.getSessionId();
-
-  const cleanedUp = await sessionManager.cleanup();
-
-  return createSuccessResponse<CleanupResult>(
-    { cleanedUp },
-    sessionId,
-    startTime,
-  );
-}
diff --git a/src/mcp-server/tools/clipboard.test.ts b/src/mcp-server/tools/clipboard.test.ts
deleted file mode 100644
index d52c3f6..0000000
--- a/src/mcp-server/tools/clipboard.test.ts
+++ /dev/null
@@ -1,325 +0,0 @@
-/**
- * Unit tests for clipboard tool handler.
- *
- * Tests CDP-based clipboard operations (read/write) with proper mocking.
- */
-
-import { describe, it, expect, vi, beforeEach } from 'vitest';
-
-import { handleClipboard } from './clipboard.js';
-import * as knowledgeStoreModule from '../knowledge-store.js';
-import * as sessionManagerModule from '../session-manager.js';
-import { createMockSessionManager } from '../test-utils/mock-factories.js';
-import { ErrorCodes } from '../types/errors.js';
-
-describe('handleClipboard', () => {
-  const mockSessionManager = createMockSessionManager({
-    hasActive: true,
-    sessionId: 'test-session-123',
-    sessionMetadata: {
-      schemaVersion: 1,
-      sessionId: 'test-session-123',
-      createdAt: new Date().toISOString(),
-      flowTags: [],
-      tags: [],
-      launch: { stateMode: 'default' },
-    },
-  });
-
-  beforeEach(() => {
-    vi.spyOn(sessionManagerModule, 'getSessionManager').mockReturnValue(
-      mockSessionManager,
-    );
-    vi.spyOn(knowledgeStoreModule, 'knowledgeStore', 'get').mockReturnValue({
-      recordStep: vi.fn().mockResolvedValue(undefined),
-      getLastSteps: vi.fn().mockResolvedValue([]),
-      searchSteps: vi.fn().mockResolvedValue([]),
-      summarizeSession: vi
-        .fn()
-        .mockResolvedValue({ sessionId: 'test', stepCount: 0, recipe: [] }),
-      listSessions: vi.fn().mockResolvedValue([]),
-      generatePriorKnowledge: vi.fn().mockResolvedValue(undefined),
-      writeSessionMetadata: vi.fn().mockResolvedValue('test-session'),
-    } as any);
-  });
-
-  describe('write action', () => {
-    it('writes text to clipboard via CDP', async () => {
-      const mockCdpSession = {
-        send: vi.fn().mockResolvedValue(undefined),
-        detach: vi.fn().mockResolvedValue(undefined),
-      };
-      const mockPage = {
-        context: vi.fn().mockReturnValue({
-          newCDPSession: vi.fn().mockResolvedValue(mockCdpSession),
-        }),
-      };
-      vi.spyOn(mockSessionManager, 'getPage').mockReturnValue(mockPage);
-
-      const result = await handleClipboard({
-        action: 'write',
-        text: 'test content',
-      });
-
-      expect(result.ok).toBe(true);
-      if (result.ok) {
-        expect(result.result.action).toBe('write');
-        expect(result.result.success).toBe(true);
-        expect(result.result.text).toBe('test content');
-      }
-      expect(mockCdpSession.send).toHaveBeenCalledWith('Runtime.evaluate', {
-        expression: 'navigator.clipboard.writeText("test content")',
-        awaitPromise: true,
-        userGesture: true,
-      });
-      expect(mockCdpSession.detach).toHaveBeenCalled();
-    });
-
-    it('detaches CDP session even if write fails', async () => {
-      const mockCdpSession = {
-        send: vi.fn().mockRejectedValue(new Error('Write failed')),
-        detach: vi.fn().mockResolvedValue(undefined),
-      };
-      const mockPage = {
-        context: vi.fn().mockReturnValue({
-          newCDPSession: vi.fn().mockResolvedValue(mockCdpSession),
-        }),
-      };
-      vi.spyOn(mockSessionManager, 'getPage').mockReturnValue(mockPage);
-
-      const result = await handleClipboard({ action: 'write', text: 'test' });
-
-      expect(result.ok).toBe(false);
-      expect(mockCdpSession.detach).toHaveBeenCalled();
-    });
-  });
-
-  describe('read action', () => {
-    it('reads text from clipboard via CDP', async () => {
-      const mockCdpSession = {
-        send: vi.fn().mockResolvedValue({
-          result: { value: 'clipboard content' },
-        }),
-        detach: vi.fn().mockResolvedValue(undefined),
-      };
-      const mockPage = {
-        context: vi.fn().mockReturnValue({
-          newCDPSession: vi.fn().mockResolvedValue(mockCdpSession),
-        }),
-      };
-      vi.spyOn(mockSessionManager, 'getPage').mockReturnValue(mockPage);
-
-      const result = await handleClipboard({ action: 'read' });
-
-      expect(result.ok).toBe(true);
-      if (result.ok) {
-        expect(result.result.action).toBe('read');
-        expect(result.result.success).toBe(true);
-        expect(result.result.text).toBe('clipboard content');
-      }
-      expect(mockCdpSession.send).toHaveBeenCalledWith('Runtime.evaluate', {
-        expression: 'navigator.clipboard.readText()',
-        awaitPromise: true,
-        userGesture: true,
-      });
-    });
-
-    it('uses description when value is missing', async () => {
-      const mockCdpSession = {
-        send: vi.fn().mockResolvedValue({
-          result: { description: 'fallback content' },
-        }),
-        detach: vi.fn().mockResolvedValue(undefined),
-      };
-      const mockPage = {
-        context: vi.fn().mockReturnValue({
-          newCDPSession: vi.fn().mockResolvedValue(mockCdpSession),
-        }),
-      };
-      vi.spyOn(mockSessionManager, 'getPage').mockReturnValue(mockPage);
-
-      const result = await handleClipboard({ action: 'read' });
-
-      expect(result.ok).toBe(true);
-      if (result.ok) {
-        expect(result.result.text).toBe('fallback content');
-      }
-    });
-
-    it('returns empty string when result is missing', async () => {
-      const mockCdpSession = {
-        send: vi.fn().mockResolvedValue({ result: {} }),
-        detach: vi.fn().mockResolvedValue(undefined),
-      };
-      const mockPage = {
-        context: vi.fn().mockReturnValue({
-          newCDPSession: vi.fn().mockResolvedValue(mockCdpSession),
-        }),
-      };
-      vi.spyOn(mockSessionManager, 'getPage').mockReturnValue(mockPage);
-
-      const result = await handleClipboard({ action: 'read' });
-
-      expect(result.ok).toBe(true);
-      if (result.ok) {
-        expect(result.result.text).toBe('');
-      }
-    });
-  });
-
-  describe('error classification', () => {
-    it('classifies permission denied errors', async () => {
-      const mockCdpSession = {
-        send: vi.fn().mockRejectedValue(new Error('permissions denied')),
-        detach: vi.fn().mockResolvedValue(undefined),
-      };
-      const mockPage = {
-        context: vi.fn().mockReturnValue({
-          newCDPSession: vi.fn().mockResolvedValue(mockCdpSession),
-        }),
-      };
-      vi.spyOn(mockSessionManager, 'getPage').mockReturnValue(mockPage);
-
-      const result = await handleClipboard({ action: 'read' });
-
-      expect(result.ok).toBe(false);
-      if (!result.ok) {
-        expect(result.error.code).toBe('MM_CLIPBOARD_PERMISSION_DENIED');
-        expect(result.error.message).toContain('Clipboard permission denied');
-      }
-    });
-
-    it('classifies LavaMoat blocked errors', async () => {
-      const mockCdpSession = {
-        send: vi.fn().mockRejectedValue(new Error('LavaMoat policy violation')),
-        detach: vi.fn().mockResolvedValue(undefined),
-      };
-      const mockPage = {
-        context: vi.fn().mockReturnValue({
-          newCDPSession: vi.fn().mockResolvedValue(mockCdpSession),
-        }),
-      };
-      vi.spyOn(mockSessionManager, 'getPage').mockReturnValue(mockPage);
-
-      const result = await handleClipboard({ action: 'write', text: 'test' });
-
-      expect(result.ok).toBe(false);
-      if (!result.ok) {
-        expect(result.error.code).toBe('MM_CLIPBOARD_LAVAMOAT_BLOCKED');
-        expect(result.error.message).toContain(
-          'Clipboard blocked by LavaMoat policy',
-        );
-      }
-    });
-
-    it('classifies generic clipboard errors', async () => {
-      const mockCdpSession = {
-        send: vi.fn().mockRejectedValue(new Error('Unknown error')),
-        detach: vi.fn().mockResolvedValue(undefined),
-      };
-      const mockPage = {
-        context: vi.fn().mockReturnValue({
-          newCDPSession: vi.fn().mockResolvedValue(mockCdpSession),
-        }),
-      };
-      vi.spyOn(mockSessionManager, 'getPage').mockReturnValue(mockPage);
-
-      const result = await handleClipboard({ action: 'read' });
-
-      expect(result.ok).toBe(false);
-      if (!result.ok) {
-        expect(result.error.code).toBe('MM_CLIPBOARD_FAILED');
-        expect(result.error.message).toContain('Clipboard operation failed');
-      }
-    });
-  });
-
-  describe('input sanitization', () => {
-    it('sanitizes write input for recording', async () => {
-      const mockCdpSession = {
-        send: vi.fn().mockResolvedValue(undefined),
-        detach: vi.fn().mockResolvedValue(undefined),
-      };
-      const mockPage = {
-        context: vi.fn().mockReturnValue({
-          newCDPSession: vi.fn().mockResolvedValue(mockCdpSession),
-        }),
-      };
-      vi.spyOn(mockSessionManager, 'getPage').mockReturnValue(mockPage);
-      const recordStepSpy = vi.fn().mockResolvedValue(undefined);
-      vi.spyOn(knowledgeStoreModule, 'knowledgeStore', 'get').mockReturnValue({
-        recordStep: recordStepSpy,
-        getLastSteps: vi.fn().mockResolvedValue([]),
-        searchSteps: vi.fn().mockResolvedValue([]),
-        summarizeSession: vi
-          .fn()
-          .mockResolvedValue({ sessionId: 'test', stepCount: 0, recipe: [] }),
-        listSessions: vi.fn().mockResolvedValue([]),
-        generatePriorKnowledge: vi.fn().mockResolvedValue(undefined),
-        writeSessionMetadata: vi.fn().mockResolvedValue('test-session'),
-      } as any);
-
-      await handleClipboard({ action: 'write', text: 'sensitive password' });
-
-      expect(recordStepSpy).toHaveBeenCalled();
-      const recordedInput = recordStepSpy.mock.calls[0][0].input;
-      expect(recordedInput).toStrictEqual({
-        action: 'write',
-        textLength: 18,
-      });
-      expect(recordedInput).not.toHaveProperty('text');
-    });
-
-    it('sanitizes read input for recording', async () => {
-      const mockCdpSession = {
-        send: vi
-          .fn()
-          .mockResolvedValue({ result: { value: 'clipboard content' } }),
-        detach: vi.fn().mockResolvedValue(undefined),
-      };
-      const mockPage = {
-        context: vi.fn().mockReturnValue({
-          newCDPSession: vi.fn().mockResolvedValue(mockCdpSession),
-        }),
-      };
-      vi.spyOn(mockSessionManager, 'getPage').mockReturnValue(mockPage);
-      const recordStepSpy = vi.fn().mockResolvedValue(undefined);
-      vi.spyOn(knowledgeStoreModule, 'knowledgeStore', 'get').mockReturnValue({
-        recordStep: recordStepSpy,
-        getLastSteps: vi.fn().mockResolvedValue([]),
-        searchSteps: vi.fn().mockResolvedValue([]),
-        summarizeSession: vi
-          .fn()
-          .mockResolvedValue({ sessionId: 'test', stepCount: 0, recipe: [] }),
-        listSessions: vi.fn().mockResolvedValue([]),
-        generatePriorKnowledge: vi.fn().mockResolvedValue(undefined),
-        writeSessionMetadata: vi.fn().mockResolvedValue('test-session'),
-      } as any);
-
-      await handleClipboard({ action: 'read' });
-
-      expect(recordStepSpy).toHaveBeenCalled();
-      const recordedInput = recordStepSpy.mock.calls[0][0].input;
-      expect(recordedInput).toStrictEqual({
-        action: 'read',
-        textLength: 0,
-      });
-    });
-  });
-
-  describe('session validation', () => {
-    it('returns error when no active session', async () => {
-      const noSessionManager = createMockSessionManager({ hasActive: false });
-      vi.spyOn(sessionManagerModule, 'getSessionManager').mockReturnValue(
-        noSessionManager,
-      );
-
-      const result = await handleClipboard({ action: 'read' });
-
-      expect(result.ok).toBe(false);
-      if (!result.ok) {
-        expect(result.error.code).toBe(ErrorCodes.MM_NO_ACTIVE_SESSION);
-      }
-    });
-  });
-});
diff --git a/src/mcp-server/tools/clipboard.ts b/src/mcp-server/tools/clipboard.ts
deleted file mode 100644
index a9e4fd6..0000000
--- a/src/mcp-server/tools/clipboard.ts
+++ /dev/null
@@ -1,117 +0,0 @@
-import { runTool } from './run-tool.js';
-import type {
-  ClipboardInput,
-  ClipboardResult,
-  McpResponse,
-  HandlerOptions,
-} from '../types';
-
-/**
- * Clipboard handler using CDP (Chrome DevTools Protocol) to bypass LavaMoat restrictions.
- *
- * Why CDP instead of page.evaluate()?
- * - page.evaluate() runs JavaScript inside the page context, which is wrapped by LavaMoat
- * - LavaMoat restricts access to navigator.clipboard in the page context
- * - CDP's Runtime.evaluate runs at the browser/DevTools level, bypassing LavaMoat
- * - userGesture: true simulates a user gesture to satisfy clipboard security requirements
- *
- * @param input Clipboard action ('read' or 'write') with optional text content
- * @param options Optional handler configuration
- * @returns Promise resolving to MCP response with clipboard operation result
- */
-export async function handleClipboard(
-  input: ClipboardInput,
-  options?: HandlerOptions,
-): Promise<McpResponse<ClipboardResult>> {
-  return runTool<ClipboardInput, ClipboardResult>({
-    toolName: 'mm_clipboard',
-    input,
-    options,
-
-    /**
-     * Executes the clipboard operation using CDP.
-     *
-     * @param context Tool execution context with page and session info
-     * @returns Promise resolving to clipboard operation result
-     */
-    execute: async (context) => {
-      const { page } = context;
-      const cdpSession = await page.context().newCDPSession(page);
-
-      try {
-        if (input.action === 'write') {
-          await cdpSession.send('Runtime.evaluate', {
-            expression: `navigator.clipboard.writeText(${JSON.stringify(input.text)})`,
-            awaitPromise: true,
-            userGesture: true,
-          });
-
-          return {
-            action: 'write',
-            success: true,
-            text: input.text,
-          };
-        }
-
-        const result = await cdpSession.send('Runtime.evaluate', {
-          expression: `navigator.clipboard.readText()`,
-          awaitPromise: true,
-          userGesture: true,
-        });
-
-        const clipboardText =
-          result.result?.value ?? result.result?.description ?? '';
-
-        return {
-          action: 'read',
-          success: true,
-          text: clipboardText as string,
-        };
-      } finally {
-        // eslint-disable-next-line no-empty-function
-        await cdpSession.detach().catch(() => {});
-      }
-    },
-
-    /**
-     * Classifies clipboard errors into specific error codes.
-     *
-     * @param error The error to classify
-     * @returns Error classification with code and message
-     */
-    classifyError: (error) => {
-      const message = error instanceof Error ? error.message : String(error);
-
-      if (message.includes('permissions') || message.includes('denied')) {
-        return {
-          code: 'MM_CLIPBOARD_PERMISSION_DENIED',
-          message: `Clipboard permission denied: ${message}`,
-        };
-      }
-
-      if (message.includes('LavaMoat') || message.includes('policy')) {
-        return {
-          code: 'MM_CLIPBOARD_LAVAMOAT_BLOCKED',
-          message: `Clipboard blocked by LavaMoat policy: ${message}`,
-        };
-      }
-
-      return {
-        code: 'MM_CLIPBOARD_FAILED',
-        message: `Clipboard operation failed: ${message}`,
-      };
-    },
-
-    /**
-     * Sanitizes clipboard input for recording (removes sensitive text).
-     *
-     * @param inp The clipboard input to sanitize
-     * @returns Sanitized input with text length instead of actual text
-     */
-    sanitizeInputForRecording: (inp) => ({
-      action: inp.action,
-      // Don't record the actual text content for privacy (could be SRP, passwords, etc.)
-      textLength: inp.text?.length ?? 0,
-    }),
-  });
-}
diff --git a/src/mcp-server/tools/context.test.ts b/src/mcp-server/tools/context.test.ts
deleted file mode 100644
index d4c384c..0000000
--- a/src/mcp-server/tools/context.test.ts
+++ /dev/null
@@ -1,221 +0,0 @@
-/**
- * Unit tests for context tool handlers.
- *
- * Tests context switching (e2e/prod) and context info retrieval.
- */
-
-import { describe, it, expect, vi, beforeEach } from 'vitest';
-
-import { handleSetContext, handleGetContext } from './context.js';
-import * as knowledgeStoreModule from '../knowledge-store.js';
-import * as sessionManagerModule from '../session-manager.js';
-import { createMockSessionManager } from '../test-utils/mock-factories.js';
-import { ErrorCodes } from '../types/errors.js';
-
-describe('handleSetContext', () => {
-  beforeEach(() => {
-    vi.spyOn(knowledgeStoreModule, 'knowledgeStore', 'get').mockReturnValue({
-      recordStep: vi.fn().mockResolvedValue(undefined),
-      getLastSteps: vi.fn().mockResolvedValue([]),
-      searchSteps: vi.fn().mockResolvedValue([]),
-      summarizeSession: vi
-        .fn()
-        .mockResolvedValue({ sessionId: 'test', stepCount: 0, recipe: [] }),
-      listSessions: vi.fn().mockResolvedValue([]),
-      generatePriorKnowledge: vi.fn().mockResolvedValue(undefined),
-      writeSessionMetadata: vi.fn().mockResolvedValue('test-session'),
-    } as any);
-  });
-
-  it('switches context from e2e to prod', async () => {
-    const mockSessionManager = createMockSessionManager({
-      environmentMode: 'e2e',
-    });
-    vi.spyOn(mockSessionManager, 'setContext');
-    // eslint-disable-next-line vitest/prefer-spy-on
-    mockSessionManager.getContextInfo = vi.fn().mockReturnValue({
-      currentContext: 'prod',
-      hasActiveSession: false,
-      sessionId: null,
-      capabilities: { available: ['build', 'fixture'] },
-      canSwitchContext: true,
-    });
-    vi.spyOn(sessionManagerModule, 'getSessionManager').mockReturnValue(
-      mockSessionManager,
-    );
-
-    const result = await handleSetContext({ context: 'prod' });
-
-    expect(result.ok).toBe(true);
-    if (result.ok) {
-      expect(result.result.previousContext).toBe('e2e');
-      expect(result.result.newContext).toBe('prod');
-      expect(result.result.availableCapabilities).toStrictEqual([
-        'build',
-        'fixture',
-      ]);
-    }
-    expect(mockSessionManager.setContext).toHaveBeenCalledWith(
-      'prod',
-      undefined,
-    );
-  });
-
-  it('forwards context options to session manager', async () => {
-    const mockSessionManager = createMockSessionManager({
-      environmentMode: 'e2e',
-    });
-    vi.spyOn(mockSessionManager, 'setContext');
-    // eslint-disable-next-line vitest/prefer-spy-on
-    mockSessionManager.getContextInfo = vi.fn().mockReturnValue({
-      currentContext: 'e2e',
-      hasActiveSession: false,
-      sessionId: null,
-      capabilities: { available: ['build', 'fixture', 'chain'] },
-      canSwitchContext: true,
-    });
-    vi.spyOn(sessionManagerModule, 'getSessionManager').mockReturnValue(
-      mockSessionManager,
-    );
-
-    const contextOptions = {
-      mockServer: {
-        enabled: true,
-        port: 18000,
-      },
-    };
-
-    const result = await handleSetContext({
-      context: 'e2e',
-      options: contextOptions,
-    });
-
-    expect(result.ok).toBe(true);
-    expect(mockSessionManager.setContext).toHaveBeenCalledWith(
-      'e2e',
-      contextOptions,
-    );
-  });
-
-  it('switches context from prod to e2e', async () => {
-    const mockSessionManager = createMockSessionManager({
-      environmentMode: 'prod',
-    });
-    vi.spyOn(mockSessionManager, 'setContext');
-    // eslint-disable-next-line vitest/prefer-spy-on
-    mockSessionManager.getContextInfo = vi.fn().mockReturnValue({
-      currentContext: 'e2e',
-      hasActiveSession: false,
-      sessionId: null,
-      capabilities: { available: ['build', 'fixture', 'chain', 'seeding'] },
-      canSwitchContext: true,
-    });
-    vi.spyOn(sessionManagerModule, 'getSessionManager').mockReturnValue(
-      mockSessionManager,
-    );
-
-    const result = await handleSetContext({ context: 'e2e' });
-
-    expect(result.ok).toBe(true);
-    if (result.ok) {
-      expect(result.result.previousContext).toBe('prod');
-      expect(result.result.newContext).toBe('e2e');
-      expect(result.result.availableCapabilities).toStrictEqual([
-        'build',
-        'fixture',
-        'chain',
-        'seeding',
-      ]);
-    }
-  });
-
-  it('classifies context switch blocked errors', async () => {
-    const mockSessionManager = createMockSessionManager({
-      environmentMode: 'e2e',
-    });
-    vi.spyOn(mockSessionManager, 'setContext').mockImplementation(() => {
-      throw new Error(ErrorCodes.MM_CONTEXT_SWITCH_BLOCKED);
-    });
-    vi.spyOn(sessionManagerModule, 'getSessionManager').mockReturnValue(
-      mockSessionManager,
-    );
-
-    const result = await handleSetContext({ context: 'prod' });
-
-    expect(result.ok).toBe(false);
-    if (!result.ok) {
-      expect(result.error.code).toBe(ErrorCodes.MM_CONTEXT_SWITCH_BLOCKED);
-      expect(result.error.message).toBe(ErrorCodes.MM_CONTEXT_SWITCH_BLOCKED);
-    }
-  });
-
-  it('classifies generic context errors', async () => {
-    const mockSessionManager = createMockSessionManager({
-      environmentMode: 'e2e',
-    });
-    vi.spyOn(mockSessionManager, 'setContext').mockImplementation(() => {
-      throw new Error('Unknown error');
-    });
-    vi.spyOn(sessionManagerModule, 'getSessionManager').mockReturnValue(
-      mockSessionManager,
-    );
-
-    const result = await handleSetContext({ context: 'prod' });
-
-    expect(result.ok).toBe(false);
-    if (!result.ok) {
-      expect(result.error.code).toBe(ErrorCodes.MM_SET_CONTEXT_FAILED);
-      expect(result.error.message).toContain('Context switch failed');
-    }
-  });
-});
-
-describe('handleGetContext', () => {
-  beforeEach(() => {
-    vi.spyOn(knowledgeStoreModule, 'knowledgeStore', 'get').mockReturnValue({
-      recordStep: vi.fn().mockResolvedValue(undefined),
-      getLastSteps: vi.fn().mockResolvedValue([]),
-      searchSteps: vi.fn().mockResolvedValue([]),
-      summarizeSession: vi
-        .fn()
-        .mockResolvedValue({ sessionId: 'test', stepCount: 0, recipe: [] }),
-      listSessions: vi.fn().mockResolvedValue([]),
-      generatePriorKnowledge: vi.fn().mockResolvedValue(undefined),
-      writeSessionMetadata: vi.fn().mockResolvedValue('test-session'),
-    } as any);
-  });
-
-  it('returns context info when getContextInfo is available', async () => {
-    const mockSessionManager = createMockSessionManager({
-      hasActive: true,
-      sessionId: 'test-session-123',
-      environmentMode: 'e2e',
-    });
-    // eslint-disable-next-line vitest/prefer-spy-on
-    mockSessionManager.getContextInfo = vi.fn().mockReturnValue({
-      currentContext: 'e2e',
-      hasActiveSession: true,
-      sessionId: 'test-session-123',
-      capabilities: { available: ['build', 'fixture', 'chain'] },
-      canSwitchContext: false,
-    });
-    vi.spyOn(sessionManagerModule, 'getSessionManager').mockReturnValue(
-      mockSessionManager,
-    );
-
-    const result = await handleGetContext({});
-
-    expect(result.ok).toBe(true);
-    if (result.ok) {
-      expect(result.result.currentContext).toBe('e2e');
-      expect(result.result.hasActiveSession).toBe(true);
-      expect(result.result.sessionId).toBe('test-session-123');
-      expect(result.result.capabilities.available).toStrictEqual([
-        'build',
-        'fixture',
-        'chain',
-      ]);
-      expect(result.result.canSwitchContext).toBe(false);
-    }
-  });
-});
diff --git a/src/mcp-server/tools/context.ts b/src/mcp-server/tools/context.ts
deleted file mode 100644
index cbf2b52..0000000
--- a/src/mcp-server/tools/context.ts
+++ /dev/null
@@ -1,94 +0,0 @@
-import { runTool } from './run-tool.js';
-import { getSessionManager } from '../session-manager.js';
-import { classifyContextError } from './error-classification.js';
-import type { McpResponse, HandlerOptions } from '../types';
-
-export type SetContextInput = {
-  context: 'e2e' | 'prod';
-  options?: Record<string, unknown>;
-};
-export type SetContextResult = {
-  previousContext: 'e2e' | 'prod';
-  newContext: 'e2e' | 'prod';
-  availableCapabilities: string[];
-};
-
-/**
- * Handle setting the workflow context (e2e or prod).
- *
- * @param input The context input containing the desired context mode
- * @param options Optional handler options for the operation
- * @returns Promise resolving to the context change result with previous and new context
- */
-export async function handleSetContext(
-  input: SetContextInput,
-  options?: HandlerOptions,
-): Promise<McpResponse<SetContextResult>> {
-  return runTool<SetContextInput, SetContextResult>({
-    toolName: 'mm_set_context',
-    input,
-    options,
-    requiresSession: false,
-    observationPolicy: 'none',
-
-    /**
-     * Execute the context switch operation.
-     *
-     * @returns The result containing previous context, new context, and available capabilities
-     */
-    execute: async () => {
-      const sessionManager = getSessionManager();
-      const previousContext = sessionManager.getEnvironmentMode();
-      sessionManager.setContext(input.context, input.options);
-      const info = sessionManager.getContextInfo();
-
-      return {
-        previousContext,
-        newContext: input.context,
-        availableCapabilities: info.capabilities.available,
-      };
-    },
-
-    classifyError: classifyContextError,
-  });
-}
-
-export type GetContextResult = {
-  currentContext: 'e2e' | 'prod';
-  hasActiveSession: boolean;
-  sessionId: string | null;
-  capabilities: {
-    available: string[];
-  };
-  canSwitchContext: boolean;
-};
-
-/**
- * Handle getting the current workflow context and capabilities.
- *
- * @param input Empty input object for this operation
- * @param options Optional handler options for the operation
- * @returns Promise resolving to the current context, session state, and available capabilities
- */
-export async function handleGetContext(
-  input: Record<string, never>,
-  options?: HandlerOptions,
-): Promise<McpResponse<GetContextResult>> {
-  return runTool<Record<string, never>, GetContextResult>({
-    toolName: 'mm_get_context',
-    input,
-    options,
-    requiresSession: false,
-    observationPolicy: 'none',
-
-    /**
-     * Execute the get context operation.
-     *
-     * @returns The result containing current context, session state, and capabilities
-     */
-    execute: async () => {
-      const sessionManager = getSessionManager();
-      return sessionManager.getContextInfo();
-    },
-  });
-}
diff --git a/src/mcp-server/tools/definitions.test.ts b/src/mcp-server/tools/definitions.test.ts
deleted file mode 100644
index 2d1634f..0000000
--- a/src/mcp-server/tools/definitions.test.ts
+++ /dev/null
@@ -1,759 +0,0 @@
-/* eslint-disable vitest/require-to-throw-message */
-import { describe, it, expect, beforeAll } from 'vitest';
-
-import {
-  getToolDefinitions,
-  TOOL_PREFIX,
-  extractBaseName,
-  validateToolInput,
-  safeValidateToolInput,
-  getToolNames,
-  getPrefixedToolNames,
-  buildToolHandlersRecord,
-  getToolHandler,
-  hasToolHandler,
-} from './definitions.js';
-import type { ToolDefinition } from './definitions.js';
-
-describe('tool-definitions', () => {
-  describe('getToolDefinitions', () => {
-    it('creates tool definitions with mm_ prefix', () => {
-      const definitions = getToolDefinitions();
-
-      for (const def of definitions) {
-        expect(def.name.startsWith(`${TOOL_PREFIX}_`)).toBe(true);
-      }
-    });
-
-    it('creates 27 tool definitions', () => {
-      const definitions = getToolDefinitions();
-      expect(definitions).toHaveLength(27);
-    });
-
-    it('includes all expected tools', () => {
-      const definitions = getToolDefinitions();
-      const toolNames = definitions.map((d) => d.name);
-
-      const expectedTools = [
-        'mm_build',
-        'mm_launch',
-        'mm_cleanup',
-        'mm_get_state',
-        'mm_navigate',
-        'mm_wait_for_notification',
-        'mm_switch_to_tab',
-        'mm_close_tab',
-        'mm_list_testids',
-        'mm_accessibility_snapshot',
-        'mm_describe_screen',
-        'mm_screenshot',
-        'mm_click',
-        'mm_type',
-        'mm_wait_for',
-        'mm_knowledge_last',
-        'mm_knowledge_search',
-        'mm_knowledge_summarize',
-        'mm_knowledge_sessions',
-        'mm_seed_contract',
-        'mm_seed_contracts',
-        'mm_get_contract_address',
-        'mm_list_contracts',
-        'mm_run_steps',
-        'mm_set_context',
-        'mm_get_context',
-      ];
-
-      for (const expected of expectedTools) {
-        expect(toolNames).toContain(expected);
-      }
-    });
-
-    it('all tools have valid input schema', () => {
-      const definitions = getToolDefinitions();
-
-      for (const def of definitions) {
-        expect(def.inputSchema).toBeDefined();
-        const hasObjectType = def.inputSchema.type === 'object';
-        const hasAllOf = Array.isArray(def.inputSchema.allOf);
-        expect(hasObjectType || hasAllOf).toBe(true);
-      }
-    });
-
-    it('all tools have descriptions', () => {
-      const definitions = getToolDefinitions();
-
-      for (const def of definitions) {
-        expect(def.description).toBeDefined();
-        expect(typeof def.description).toBe('string');
-        expect(def.description.length).toBeGreaterThan(10);
-      }
-    });
-
-    describe('specific tool schemas', () => {
-      let definitions: ToolDefinition[];
-
-      beforeAll(() => {
-        definitions = getToolDefinitions();
-      });
-
-      /**
-       * Find a tool definition by its name.
-       *
-       * @param name The tool name to search for
-       * @returns The matching tool definition or undefined if not found
-       */
-      const findTool = (name: string): ToolDefinition | undefined =>
-        definitions.find((d) => d.name === name);
-
-      /**
-       * Schema object structure for testing.
-       */
-      type SchemaObj = {
-        /**
-         * Object properties mapping
-         */
-        properties?: Record<string, unknown>;
-        /**
-         * Required property names
-         */
-        required?: string[];
-        /**
-         * Array of schemas to combine
-         */
-        allOf?: SchemaObj[];
-      };
-
-      /**
-       * Get all properties from a schema, including those in allOf.
-       *
-       * @param schema The schema object to extract properties from
-       * @returns Combined properties from schema and allOf items
-       */
-      const getAllProperties = (schema: SchemaObj): Record<string, unknown> => {
-        if (schema.properties) {
-          return schema.properties;
-        }
-        if (schema.allOf) {
-          return schema.allOf.reduce(
-            (acc, item) => ({ ...acc, ...getAllProperties(item) }),
-            {},
-          );
-        }
-        return {};
-      };
-
-      /**
-       * Get all required properties from a schema, including those in allOf.
-       *
-       * @param schema The schema object to extract required properties from
-       * @returns Combined required property names from schema and allOf items
-       */
-      const getAllRequired = (schema: SchemaObj): string[] => {
-        const required: string[] = [];
-        if (schema.required) {
-          required.push(...schema.required);
-        }
-        if (schema.allOf) {
-          for (const item of schema.allOf) {
-            required.push(...getAllRequired(item));
-          }
-        }
-        return required;
-      };
-
-      it('mm_click has correct schema', () => {
-        const tool = findTool('mm_click');
-        expect(tool).toBeDefined();
-
-        const props = getAllProperties(tool?.inputSchema as SchemaObj);
-        expect(props.a11yRef).toBeDefined();
-        expect(props.testId).toBeDefined();
-        expect(props.selector).toBeDefined();
-        expect(props.timeoutMs).toBeDefined();
-      });
-
-      it('mm_type has required text property', () => {
-        const tool = findTool('mm_type');
-        expect(tool).toBeDefined();
-
-        const required = getAllRequired(tool?.inputSchema as SchemaObj);
-        expect(required).toContain('text');
-      });
-
-      it('mm_navigate has required screen property', () => {
-        const tool = findTool('mm_navigate');
-        expect(tool).toBeDefined();
-
-        const required = getAllRequired(tool?.inputSchema as SchemaObj);
-        expect(required).toContain('screen');
-
-        const props = getAllProperties(
-          tool?.inputSchema as SchemaObj,
-        ) as Record<
-          string,
-          {
-            /**
-             *
-             */
-            enum?: string[];
-          }
-        >;
-        expect(props.screen?.enum).toStrictEqual([
-          'home',
-          'settings',
-          'notification',
-          'url',
-        ]);
-      });
-
-      it('mm_screenshot has required name property', () => {
-        const tool = findTool('mm_screenshot');
-        expect(tool).toBeDefined();
-
-        const required = getAllRequired(tool?.inputSchema as SchemaObj);
-        expect(required).toContain('name');
-      });
-
-      it('mm_run_steps has required steps property', () => {
-        const tool = findTool('mm_run_steps');
-        expect(tool).toBeDefined();
-
-        const required = getAllRequired(tool?.inputSchema as SchemaObj);
-        expect(required).toContain('steps');
-
-        const props = getAllProperties(
-          tool?.inputSchema as SchemaObj,
-        ) as Record<
-          string,
-          {
-            /**
-             * The JSON schema type
-             */
-            type?: string;
-            /**
-             * Array item schema definition
-             */
-            items?: {
-              /**
-               * The item type
-               */
-              type: string;
-            };
-          }
-        >;
-        expect(props.steps?.type).toBe('array');
-      });
-
-      it('mm_seed_contract has required contractName property', () => {
-        const tool = findTool('mm_seed_contract');
-        expect(tool).toBeDefined();
-
-        const required = getAllRequired(tool?.inputSchema as SchemaObj);
-        expect(required).toContain('contractName');
-
-        const props = getAllProperties(
-          tool?.inputSchema as SchemaObj,
-        ) as Record<
-          string,
-          {
-            /**
-             *
-             */
-            enum?: string[];
-          }
-        >;
-        expect(props.contractName?.enum).toContain('hst');
-        expect(props.contractName?.enum).toContain('nfts');
-      });
-
-      it('mm_launch has stateMode enum', () => {
-        const tool = findTool('mm_launch');
-        expect(tool).toBeDefined();
-
-        const props = getAllProperties(
-          tool?.inputSchema as SchemaObj,
-        ) as Record<
-          string,
-          {
-            /**
-             *
-             */
-            enum?: string[];
-          }
-        >;
-        expect(props.stateMode?.enum).toStrictEqual([
-          'default',
-          'onboarding',
-          'custom',
-        ]);
-      });
-
-      it('mm_switch_to_tab has role enum', () => {
-        const tool = findTool('mm_switch_to_tab');
-        expect(tool).toBeDefined();
-
-        const props = getAllProperties(
-          tool?.inputSchema as SchemaObj,
-        ) as Record<
-          string,
-          {
-            /**
-             *
-             */
-            enum?: string[];
-          }
-        >;
-        expect(props.role?.enum).toStrictEqual([
-          'extension',
-          'notification',
-          'dapp',
-          'other',
-        ]);
-      });
-
-      it('mm_knowledge_search has required query property', () => {
-        const tool = findTool('mm_knowledge_search');
-        expect(tool).toBeDefined();
-
-        const required = getAllRequired(tool?.inputSchema as SchemaObj);
-        expect(required).toContain('query');
-      });
-    });
-
-    it('uses mm_ prefix in descriptions', () => {
-      const definitions = getToolDefinitions();
-
-      const a11yTool = definitions.find(
-        (d) => d.name === 'mm_accessibility_snapshot',
-      );
-      expect(a11yTool?.description).toContain('mm_click');
-      expect(a11yTool?.description).toContain('mm_type');
-    });
-
-    it('all schemas have additionalProperties set to false', () => {
-      const definitions = getToolDefinitions();
-
-      for (const def of definitions) {
-        const schema = def.inputSchema;
-        if (schema.type === 'object') {
-          expect(schema.additionalProperties).toBe(false);
-        }
-      }
-    });
-
-    it('all schemas have properties defined', () => {
-      const definitions = getToolDefinitions();
-
-      for (const def of definitions) {
-        const schema = def.inputSchema;
-        expect(
-          schema.properties ?? schema.allOf ?? schema.anyOf ?? schema.oneOf,
-        ).toBeDefined();
-      }
-    });
-
-    it('all required properties are defined in properties', () => {
-      const definitions = getToolDefinitions();
-
-      for (const def of definitions) {
-        const schema = def.inputSchema;
-        if (Array.isArray(schema.required) && schema.properties) {
-          const props = schema.properties as Record<string, unknown>;
-          for (const req of schema.required) {
-            expect(props[req as string]).toBeDefined();
-          }
-        }
-      }
-    });
-
-    it('processes anyOf arrays in nested properties', () => {
-      const definitions = getToolDefinitions();
-
-      // Find tools with anyOf in properties (e.g., knowledge tools with scope)
-      // This exercises the anyOf handling in removeDefaultsFromRequired (lines 397-400)
-      let foundAnyOf = false;
-      for (const def of definitions) {
-        const schema = def.inputSchema;
-        if (schema.properties && typeof schema.properties === 'object') {
-          const props = schema.properties as Record<string, unknown>;
-          for (const [, prop] of Object.entries(props)) {
-            if (prop && typeof prop === 'object') {
-              const propObj = prop as Record<string, unknown>;
-              if ('anyOf' in propObj) {
-                foundAnyOf = true;
-                expect(Array.isArray(propObj.anyOf)).toBe(true);
-                // Verify anyOf items are properly processed
-                const anyOfArray = propObj.anyOf as unknown[];
-                for (const item of anyOfArray) {
-                  expect(item).toBeDefined();
-                }
-              }
-            }
-          }
-        }
-      }
-      // Verify we found at least one tool with anyOf (knowledge tools)
-      expect(foundAnyOf).toBe(true);
-    });
-
-    it('processes nested object properties recursively', () => {
-      const definitions = getToolDefinitions();
-
-      // Verify that nested object properties are processed correctly
-      // This exercises the recursive property handling in removeDefaultsFromRequired (lines 418-421)
-      for (const def of definitions) {
-        const schema = def.inputSchema;
-        if (schema.properties && typeof schema.properties === 'object') {
-          const props = schema.properties as Record<string, unknown>;
-          for (const [, value] of Object.entries(props)) {
-            if (value && typeof value === 'object') {
-              const propObj = value as Record<string, unknown>;
-              // Nested objects should have proper structure
-              expect(propObj).toBeDefined();
-              // If it has properties, they should be objects
-              if ('properties' in propObj && propObj.properties) {
-                expect(typeof propObj.properties).toBe('object');
-              }
-            }
-          }
-        }
-      }
-    });
-
-    it('sets additionalProperties false on top-level object schemas', () => {
-      const definitions = getToolDefinitions();
-
-      // Verify that additionalProperties is set to false on top-level schemas
-      // This exercises the additionalProperties assignment in zodSchemaToJsonSchema (line 503)
-      for (const def of definitions) {
-        const schema = def.inputSchema;
-        // All tool schemas should be objects with additionalProperties: false
-        if (schema.type === 'object') {
-          expect(schema.additionalProperties).toBe(false);
-        }
-      }
-    });
-  });
-
-  describe('extractBaseName', () => {
-    it('removes mm_ prefix from tool name', () => {
-      const result = extractBaseName('mm_click');
-
-      expect(result).toBe('click');
-    });
-
-    it('returns original name when no prefix', () => {
-      const result = extractBaseName('click');
-
-      expect(result).toBe('click');
-    });
-
-    it('handles multiple underscores correctly', () => {
-      const result = extractBaseName('mm_wait_for_notification');
-
-      expect(result).toBe('wait_for_notification');
-    });
-
-    it('handles empty string', () => {
-      const result = extractBaseName('');
-
-      expect(result).toBe('');
-    });
-
-    it('handles string with only prefix', () => {
-      const result = extractBaseName('mm_');
-
-      expect(result).toBe('');
-    });
-
-    it('handles all tool names from getToolNames', () => {
-      const baseNames = getToolNames();
-
-      for (const baseName of baseNames) {
-        const prefixed = `${TOOL_PREFIX}_${baseName}`;
-        const extracted = extractBaseName(prefixed);
-        expect(extracted).toBe(baseName);
-      }
-    });
-  });
-
-  describe('validateToolInput', () => {
-    it('parses valid input for known tool', () => {
-      const result = validateToolInput('mm_click', { testId: 'button' });
-
-      expect(result).toBeDefined();
-      expect(result).toHaveProperty('testId', 'button');
-    });
-
-    it('throws error for unknown tool', () => {
-      expect(() => {
-        validateToolInput('mm_unknown_tool', {});
-      }).toThrowError('Unknown tool: mm_unknown_tool');
-    });
-
-    it('throws error for invalid input schema', () => {
-      expect(() => {
-        validateToolInput('mm_type', { text: 123 });
-      }).toThrowError();
-    });
-
-    it('accepts input without prefix', () => {
-      const result = validateToolInput('click', { testId: 'button' });
-
-      expect(result).toBeDefined();
-      expect(result).toHaveProperty('testId', 'button');
-    });
-
-    it('parses input with multiple valid properties', () => {
-      const result = validateToolInput('mm_click', {
-        testId: 'button',
-        timeoutMs: 5000,
-      });
-
-      expect(result).toBeDefined();
-      expect(result).toHaveProperty('testId', 'button');
-      expect(result).toHaveProperty('timeoutMs', 5000);
-    });
-  });
-
-  describe('safeValidateToolInput', () => {
-    it('returns success with data for valid input', () => {
-      const result = safeValidateToolInput('mm_click', { testId: 'button' });
-
-      expect(result.success).toBe(true);
-      expect(result).toHaveProperty('data');
-      if (result.success) {
-        expect(result.data).toHaveProperty('testId', 'button');
-      }
-    });
-
-    it('returns failure for unknown tool', () => {
-      const result = safeValidateToolInput('mm_unknown_tool', {});
-
-      expect(result.success).toBe(false);
-      expect(result).toHaveProperty('error');
-      if (!result.success) {
-        expect(result.error).toContain('Unknown tool');
-      }
-    });
-
-    it('returns failure for invalid input', () => {
-      const result = safeValidateToolInput('mm_type', { text: 123 });
-
-      expect(result.success).toBe(false);
-      expect(result).toHaveProperty('error');
-    });
-
-    it('accepts input without prefix', () => {
-      const result = safeValidateToolInput('click', { testId: 'button' });
-
-      expect(result.success).toBe(true);
-      if (result.success) {
-        expect(result.data).toHaveProperty('testId', 'button');
-      }
-    });
-
-    it('returns success with multiple valid properties', () => {
-      const result = safeValidateToolInput('mm_click', {
-        testId: 'button',
-        timeoutMs: 5000,
-      });
-
-      expect(result.success).toBe(true);
-      if (result.success) {
-        expect(result.data).toHaveProperty('testId', 'button');
-        expect(result.data).toHaveProperty('timeoutMs', 5000);
-      }
-    });
-
-    it('includes error message with path for validation errors', () => {
-      const result = safeValidateToolInput('mm_type', { text: 123 });
-
-      expect(result.success).toBe(false);
-      if (!result.success) {
-        expect(result.error).toMatch(/text/u);
-      }
-    });
-  });
-
-  describe('getToolNames', () => {
-    it('returns array of tool base names', () => {
-      const names = getToolNames();
-
-      expect(Array.isArray(names)).toBe(true);
-      expect(names.length).toBeGreaterThan(0);
-    });
-
-    it('includes expected tool names without prefix', () => {
-      const names = getToolNames();
-
-      expect(names).toContain('click');
-      expect(names).toContain('type');
-      expect(names).toContain('launch');
-      expect(names).toContain('cleanup');
-    });
-
-    it('does not include mm_ prefix in names', () => {
-      const names = getToolNames();
-
-      for (const name of names) {
-        expect(name).not.toMatch(/^mm_/u);
-      }
-    });
-
-    it('returns 27 tool names', () => {
-      const names = getToolNames();
-
-      expect(names).toHaveLength(27);
-    });
-
-    it('all names are strings', () => {
-      const names = getToolNames();
-
-      for (const name of names) {
-        expect(typeof name).toBe('string');
-        expect(name.length).toBeGreaterThan(0);
-      }
-    });
-  });
-
-  describe('getPrefixedToolNames', () => {
-    it('returns array of prefixed tool names', () => {
-      const names = getPrefixedToolNames();
-
-      expect(Array.isArray(names)).toBe(true);
-      expect(names.length).toBeGreaterThan(0);
-    });
-
-    it('includes mm_ prefix in all names', () => {
-      const names = getPrefixedToolNames();
-
-      for (const name of names) {
-        expect(name).toMatch(/^mm_/u);
-      }
-    });
-
-    it('includes expected prefixed tool names', () => {
-      const names = getPrefixedToolNames();
-
-      expect(names).toContain('mm_click');
-      expect(names).toContain('mm_type');
-      expect(names).toContain('mm_launch');
-      expect(names).toContain('mm_cleanup');
-    });
-
-    it('has same count as getToolNames', () => {
-      const baseNames = getToolNames();
-      const prefixedNames = getPrefixedToolNames();
-
-      expect(prefixedNames).toHaveLength(baseNames.length);
-    });
-  });
-
-  describe('buildToolHandlersRecord', () => {
-    it('returns record mapping prefixed names to handlers', () => {
-      const handlers = buildToolHandlersRecord();
-
-      expect(typeof handlers).toBe('object');
-      expect(handlers).not.toBeNull();
-    });
-
-    it('includes all prefixed tool names as keys', () => {
-      const handlers = buildToolHandlersRecord();
-      const prefixedNames = getPrefixedToolNames();
-
-      for (const name of prefixedNames) {
-        expect(handlers).toHaveProperty(name);
-      }
-    });
-
-    it('all values are functions', () => {
-      const handlers = buildToolHandlersRecord();
-
-      for (const [, handler] of Object.entries(handlers)) {
-        expect(typeof handler).toBe('function');
-      }
-    });
-
-    it('has same count as getPrefixedToolNames', () => {
-      const handlers = buildToolHandlersRecord();
-      const prefixedNames = getPrefixedToolNames();
-
-      expect(Object.keys(handlers)).toHaveLength(prefixedNames.length);
-    });
-
-    it('does not include base names without prefix', () => {
-      const handlers = buildToolHandlersRecord();
-      const baseNames = getToolNames();
-
-      for (const baseName of baseNames) {
-        expect(handlers).not.toHaveProperty(baseName);
-      }
-    });
-  });
-
-  describe('getToolHandler', () => {
-    it('returns handler for prefixed tool name', () => {
-      const handler = getToolHandler('mm_click');
-
-      expect(handler).toBeDefined();
-      expect(typeof handler).toBe('function');
-    });
-
-    it('returns handler for base tool name', () => {
-      const handler = getToolHandler('click');
-
-      expect(handler).toBeDefined();
-      expect(typeof handler).toBe('function');
-    });
-
-    it('returns undefined for unknown tool', () => {
-      const handler = getToolHandler('mm_unknown_tool');
-
-      expect(handler).toBeUndefined();
-    });
-
-    it('returns same handler for prefixed and base names', () => {
-      const prefixedHandler = getToolHandler('mm_click');
-      const baseHandler = getToolHandler('click');
-
-      expect(prefixedHandler).toBe(baseHandler);
-    });
-  });
-
-  describe('hasToolHandler', () => {
-    it('returns true for existing prefixed tool', () => {
-      const exists = hasToolHandler('mm_click');
-
-      expect(exists).toBe(true);
-    });
-
-    it('returns true for existing base tool name', () => {
-      const exists = hasToolHandler('click');
-
-      expect(exists).toBe(true);
-    });
-
-    it('returns false for unknown tool', () => {
-      const exists = hasToolHandler('mm_unknown_tool');
-
-      expect(exists).toBe(false);
-    });
-
-    it('returns true for all prefixed tool names', () => {
-      const prefixedNames = getPrefixedToolNames();
-
-      for (const name of prefixedNames) {
-        expect(hasToolHandler(name)).toBe(true);
-      }
-    });
-
-    it('returns true for all base tool names', () => {
-      const baseNames = getToolNames();
-
-      for (const name of baseNames) {
-        expect(hasToolHandler(name)).toBe(true);
-      }
-    });
-  });
-});
diff --git a/src/mcp-server/tools/definitions.ts b/src/mcp-server/tools/definitions.ts
deleted file mode 100644
index f6e7fdc..0000000
--- a/src/mcp-server/tools/definitions.ts
+++ /dev/null
@@ -1,638 +0,0 @@
-import type { ZodType } from 'zod';
-
-import {
-  buildInputSchema,
-  launchInputSchema,
-  cleanupInputSchema,
-  getStateInputSchema,
-  navigateInputSchema,
-  waitForNotificationInputSchema,
-  switchToTabInputSchema,
-  closeTabInputSchema,
-  listTestIdsInputSchema,
-  accessibilitySnapshotInputSchema,
-  describeScreenInputSchema,
-  screenshotInputSchema,
-  clickInputSchema,
-  typeInputSchema,
-  waitForInputSchema,
-  knowledgeLastInputSchema,
-  knowledgeSearchInputSchema,
-  knowledgeSummarizeInputSchema,
-  knowledgeSessionsInputSchema,
-  seedContractInputSchema,
-  seedContractsInputSchema,
-  getContractAddressInputSchema,
-  listDeployedContractsInputSchema,
-  runStepsInputSchema,
-  setContextInputSchema,
-  getContextInputSchema,
-  clipboardInputSchema,
-} from '../schemas.js';
-import { getSessionManager } from '../session-manager.js';
-import { handleRunSteps } from './batch.js';
-import type { ToolHandler } from './batch.js';
-import type { BuildToolOptions } from './build.js';
-import { handleBuild } from './build.js';
-import { handleCleanup } from './cleanup.js';
-import { handleClipboard } from './clipboard.js';
-import { handleSetContext, handleGetContext } from './context.js';
-import {
-  handleListTestIds,
-  handleAccessibilitySnapshot,
-  handleDescribeScreen,
-} from './discovery-tools.js';
-import { handleClick, handleType, handleWaitFor } from './interaction.js';
-import {
-  handleKnowledgeLast,
-  handleKnowledgeSearch,
-  handleKnowledgeSummarize,
-  handleKnowledgeSessions,
-} from './knowledge.js';
-import { handleLaunch } from './launch.js';
-import {
-  handleNavigate,
-  handleWaitForNotification,
-  handleSwitchToTab,
-  handleCloseTab,
-} from './navigation.js';
-import { handleScreenshot } from './screenshot.js';
-import {
-  handleSeedContract,
-  handleSeedContracts,
-  handleGetContractAddress,
-  handleListDeployedContracts,
-} from './seeding.js';
-import type { SeedingToolOptions } from './seeding.js';
-import { handleGetState } from './state.js';
-import type { StateToolOptions } from './state.js';
-import type {
-  SeedContractInput,
-  SeedContractsInput,
-  GetContractAddressInput,
-  ListDeployedContractsInput,
-} from '../types';
-
-export const TOOL_PREFIX = 'mm';
-
-export type ToolDefinition = {
-  name: string;
-  description: string;
-  inputSchema: Record<string, unknown>;
-};
-
-type ZodSchema = ZodType<unknown> & { toJSONSchema(): Record<string, unknown> };
-
-type ToolEntry = {
-  schema: ZodSchema;
-  description: string;
-  handler: ToolHandler;
-};
-
-/**
- * Create a handler for the build tool that injects build capability.
- *
- * @returns A tool handler function for building the extension
- */
-function createBuildHandler(): ToolHandler {
-  return async (input, options) => {
-    const sessionManager = getSessionManager();
-    const buildOptions: BuildToolOptions = {
-      ...options,
-      buildCapability: sessionManager.getBuildCapability?.(),
-    };
-    return handleBuild(input, buildOptions);
-  };
-}
-
-/**
- * Create a handler for the state tool that injects state snapshot capability.
- *
- * @returns A tool handler function for getting extension state
- */
-function createStateHandler(): ToolHandler {
-  return async (_, options) => {
-    const sessionManager = getSessionManager();
-    const stateOptions: StateToolOptions = {
-      ...options,
-      stateSnapshotCapability: sessionManager.getStateSnapshotCapability?.(),
-    };
-    return handleGetState(stateOptions);
-  };
-}
-
-/**
- * Create a handler for the seed contract tool that injects seeding capability.
- *
- * @returns A tool handler function for deploying a single contract
- */
-function createSeedContractHandler(): ToolHandler {
-  return async (input, options) => {
-    const sessionManager = getSessionManager();
-    const seedingOptions: SeedingToolOptions = {
-      ...options,
-      seedingCapability: sessionManager.getContractSeedingCapability?.(),
-    };
-    return handleSeedContract(input as SeedContractInput, seedingOptions);
-  };
-}
-
-/**
- * Create a handler for the seed contracts tool that injects seeding capability.
- *
- * @returns A tool handler function for deploying multiple contracts
- */
-function createSeedContractsHandler(): ToolHandler {
-  return async (input, options) => {
-    const sessionManager = getSessionManager();
-    const seedingOptions: SeedingToolOptions = {
-      ...options,
-      seedingCapability: sessionManager.getContractSeedingCapability?.(),
-    };
-    return handleSeedContracts(input as SeedContractsInput, seedingOptions);
-  };
-}
-
-/**
- * Create a handler for the get contract address tool that injects seeding capability.
- *
- * @returns A tool handler function for retrieving a deployed contract address
- */
-function createGetContractAddressHandler(): ToolHandler {
-  return async (input, options) => {
-    const sessionManager = getSessionManager();
-    const seedingOptions: SeedingToolOptions = {
-      ...options,
-      seedingCapability: sessionManager.getContractSeedingCapability?.(),
-    };
-    return handleGetContractAddress(
-      input as GetContractAddressInput,
-      seedingOptions,
-    );
-  };
-}
-
-/**
- * Create a handler for the list contracts tool that injects seeding capability.
- *
- * @returns A tool handler function for listing deployed contracts
- */
-function createListDeployedContractsHandler(): ToolHandler {
-  return async (input, options) => {
-    const sessionManager = getSessionManager();
-    const seedingOptions: SeedingToolOptions = {
-      ...options,
-      seedingCapability: sessionManager.getContractSeedingCapability?.(),
-    };
-    return handleListDeployedContracts(
-      input as ListDeployedContractsInput,
-      seedingOptions,
-    );
-  };
-}
-
-const tools: Record<string, ToolEntry> = {
-  build: {
-    schema: buildInputSchema,
-    description: `Build the extension using yarn build:test. Call before ${TOOL_PREFIX}_launch if extension is not built.`,
-    handler: createBuildHandler(),
-  },
-  launch: {
-    schema: launchInputSchema,
-    description:
-      'Launch extension in a headed Chrome browser with Playwright. Returns session info and initial state.',
-    handler: handleLaunch as ToolHandler,
-  },
-  cleanup: {
-    schema: cleanupInputSchema,
-    description:
-      'Stop the browser, Anvil, and all services. Always call when done.',
-    handler: handleCleanup as ToolHandler,
-  },
-  get_state: {
-    schema: getStateInputSchema,
-    description:
-      'Get current extension state including screen, URL, balance, network, and account address.',
-    handler: createStateHandler(),
-  },
-  navigate: {
-    schema: navigateInputSchema,
-    description: 'Navigate to a specific screen in the extension.',
-    handler: handleNavigate as ToolHandler,
-  },
-  wait_for_notification: {
-    schema: waitForNotificationInputSchema,
-    description:
-      'Wait for notification popup to appear (e.g., after dapp interaction). Sets the notification page as the active page for subsequent interactions.',
-    handler: handleWaitForNotification as ToolHandler,
-  },
-  switch_to_tab: {
-    schema: switchToTabInputSchema,
-    description: `Switch the active page to a different tracked tab. Use this to direct ${TOOL_PREFIX}_click, ${TOOL_PREFIX}_type, and other interaction tools to a specific page.`,
-    handler: handleSwitchToTab as ToolHandler,
-  },
-  close_tab: {
-    schema: closeTabInputSchema,
-    description:
-      'Close a specific tab by role or URL. Cannot close the extension home page. If closing the active tab, automatically switches to extension home.',
-    handler: handleCloseTab as ToolHandler,
-  },
-  list_testids: {
-    schema: listTestIdsInputSchema,
-    description:
-      'List all visible data-testid attributes on the current page. Use to discover available interaction targets.',
-    handler: handleListTestIds as ToolHandler,
-  },
-  accessibility_snapshot: {
-    schema: accessibilitySnapshotInputSchema,
-    description: `Get trimmed accessibility tree with deterministic refs (e1, e2, ...). Use refs with ${TOOL_PREFIX}_click/${TOOL_PREFIX}_type.`,
-    handler: handleAccessibilitySnapshot as ToolHandler,
-  },
-  describe_screen: {
-    schema: describeScreenInputSchema,
-    description:
-      'Get comprehensive screen state: extension state + testIds + accessibility snapshot. Optional screenshot.',
-    handler: handleDescribeScreen as ToolHandler,
-  },
-  screenshot: {
-    schema: screenshotInputSchema,
-    description: 'Take a screenshot and save to test-artifacts/screenshots/',
-    handler: handleScreenshot as ToolHandler,
-  },
-  click: {
-    schema: clickInputSchema,
-    description:
-      'Click an element. Specify exactly one of: a11yRef, testId, or selector.',
-    handler: handleClick as ToolHandler,
-  },
-  type: {
-    schema: typeInputSchema,
-    description:
-      'Type text into an element. Specify exactly one of: a11yRef, testId, or selector.',
-    handler: handleType as ToolHandler,
-  },
-  wait_for: {
-    schema: waitForInputSchema,
-    description:
-      'Wait for an element to become visible. Specify exactly one of: a11yRef, testId, or selector.',
-    handler: handleWaitFor as ToolHandler,
-  },
-  knowledge_last: {
-    schema: knowledgeLastInputSchema,
-    description:
-      'Get the last N step records from the knowledge store for the current session.',
-    handler: handleKnowledgeLast as ToolHandler,
-  },
-  knowledge_search: {
-    schema: knowledgeSearchInputSchema,
-    description:
-      'Search step records by tool name, screen, testId, or accessibility names. Default searches all sessions.',
-    handler: handleKnowledgeSearch as ToolHandler,
-  },
-  knowledge_summarize: {
-    schema: knowledgeSummarizeInputSchema,
-    description: 'Generate a recipe-like summary of steps taken in a session.',
-    handler: handleKnowledgeSummarize as ToolHandler,
-  },
-  knowledge_sessions: {
-    schema: knowledgeSessionsInputSchema,
-    description:
-      'List recent sessions with metadata for cross-session knowledge retrieval.',
-    handler: handleKnowledgeSessions as ToolHandler,
-  },
-  seed_contract: {
-    schema: seedContractInputSchema,
-    description:
-      'Deploy a smart contract to the local Anvil node. Available: hst (ERC20 TST token), nfts (ERC721), erc1155, piggybank, failing (reverts), multisig, entrypoint (ERC-4337), simpleAccountFactory, verifyingPaymaster.',
-    handler: createSeedContractHandler(),
-  },
-  seed_contracts: {
-    schema: seedContractsInputSchema,
-    description: 'Deploy multiple smart contracts in sequence.',
-    handler: createSeedContractsHandler(),
-  },
-  get_contract_address: {
-    schema: getContractAddressInputSchema,
-    description: 'Get the deployed address of a smart contract.',
-    handler: createGetContractAddressHandler(),
-  },
-  list_contracts: {
-    schema: listDeployedContractsInputSchema,
-    description: 'List all smart contracts deployed in this session.',
-    handler: createListDeployedContractsHandler(),
-  },
-  run_steps: {
-    schema: runStepsInputSchema,
-    description:
-      'Execute multiple tools in sequence. Reduces round trips for multi-step flows.',
-    handler: handleRunSteps as ToolHandler,
-  },
-  set_context: {
-    schema: setContextInputSchema,
-    description:
-      'Switch workflow context (e2e or prod). Cannot switch during active session.',
-    handler: handleSetContext as ToolHandler,
-  },
-  get_context: {
-    schema: getContextInputSchema,
-    description:
-      'Get current context, available capabilities, and whether context can be switched.',
-    handler: handleGetContext as ToolHandler,
-  },
-  clipboard: {
-    schema: clipboardInputSchema,
-    description:
-      "Write text to or read text from the browser clipboard. Use action='write' with text parameter to write, or action='read' to read current clipboard content. Useful for pasting SRP or other data into components that have paste handlers.",
-    handler: handleClipboard as ToolHandler,
-  },
-};
-
-/**
- * Zod v4's toJSONSchema() marks properties with defaults as required.
- * This is incorrect for MCP tool input schemas where LLM clients shouldn't
- * be required to provide values that have defaults. This function recursively
- * removes those properties from the required array.
- *
- * @param schema The JSON schema to process
- * @returns The modified schema with defaults removed from required array
- */
-function removeDefaultsFromRequired(
-  schema: Record<string, unknown>,
-): Record<string, unknown> {
-  const result = { ...schema };
-
-  if (Array.isArray(result.allOf)) {
-    result.allOf = result.allOf.map((item: Record<string, unknown>) =>
-      removeDefaultsFromRequired(item),
-    );
-  }
-
-  if (Array.isArray(result.anyOf)) {
-    result.anyOf = result.anyOf.map((item: Record<string, unknown>) =>
-      removeDefaultsFromRequired(item),
-    );
-  }
-
-  if (Array.isArray(result.oneOf)) {
-    result.oneOf = result.oneOf.map((item: Record<string, unknown>) =>
-      removeDefaultsFromRequired(item),
-    );
-  }
-
-  if (
-    result.properties &&
-    typeof result.properties === 'object' &&
-    result.properties !== null
-  ) {
-    const newProperties: Record<string, unknown> = {};
-    for (const [key, value] of Object.entries(
-      result.properties as Record<string, unknown>,
-    )) {
-      if (value && typeof value === 'object') {
-        newProperties[key] = removeDefaultsFromRequired(
-          value as Record<string, unknown>,
-        );
-      } else {
-        newProperties[key] = value;
-      }
-    }
-    result.properties = newProperties;
-  }
-
-  if (
-    Array.isArray(result.required) &&
-    result.properties &&
-    typeof result.properties === 'object'
-  ) {
-    const properties = result.properties as Record<
-      string,
-      Record<string, unknown>
-    >;
-    result.required = result.required.filter((propName: string) => {
-      const prop = properties[propName];
-      return prop && !('default' in prop);
-    });
-
-    if ((result.required as string[]).length === 0) {
-      delete result.required;
-    }
-  }
-
-  return result;
-}
-
-/**
- * MCP protocol doesn't support allOf/oneOf/anyOf at the top level of input schemas.
- * This flattens allOf into a single merged object schema.
- *
- * @param schema The JSON schema to flatten
- * @returns The flattened schema with allOf merged into properties
- */
-function flattenTopLevelAllOf(
-  schema: Record<string, unknown>,
-): Record<string, unknown> {
-  if (!Array.isArray(schema.allOf)) {
-    return schema;
-  }
-
-  const mergedProperties: Record<string, unknown> = {};
-  const mergedRequired: string[] = [];
-
-  for (const subSchema of schema.allOf as Record<string, unknown>[]) {
-    if (subSchema.properties && typeof subSchema.properties === 'object') {
-      Object.assign(mergedProperties, subSchema.properties);
-    }
-    if (Array.isArray(subSchema.required)) {
-      mergedRequired.push(...subSchema.required);
-    }
-  }
-
-  const result: Record<string, unknown> = {
-    type: 'object',
-    properties: mergedProperties,
-    additionalProperties: false,
-  };
-
-  if (mergedRequired.length > 0) {
-    result.required = [...new Set(mergedRequired)];
-  }
-
-  return result;
-}
-
-/**
- * Convert a Zod schema to a JSON schema suitable for MCP tool definitions.
- *
- * @param schema The Zod schema to convert
- * @returns The converted JSON schema with defaults removed and allOf flattened
- */
-function zodSchemaToJsonSchema(schema: ZodSchema): Record<string, unknown> {
-  const jsonSchema = schema.toJSONSchema();
-  const { $schema: _, ...rest } = jsonSchema;
-
-  const flattened = flattenTopLevelAllOf(rest);
-
-  if (flattened.type === 'object' && !('additionalProperties' in flattened)) {
-    flattened.additionalProperties = false;
-  }
-
-  return removeDefaultsFromRequired(flattened);
-}
-
-/**
- * Get all tool definitions with their schemas and descriptions.
- *
- * @returns Array of tool definitions for all available MCP tools
- */
-export function getToolDefinitions(): ToolDefinition[] {
-  return Object.entries(tools).map(([baseName, tool]) => ({
-    name: `${TOOL_PREFIX}_${baseName}`,
-    description: tool.description,
-    inputSchema: zodSchemaToJsonSchema(tool.schema),
-  }));
-}
-
-/**
- * Get the handler function for a specific tool by name.
- *
- * @param name The tool name (with or without mm_ prefix)
- * @returns The tool handler function or undefined if tool not found
- */
-export function getToolHandler(name: string): ToolHandler | undefined {
-  const prefixedMatch = Object.entries(tools).find(
-    ([baseName]) => `${TOOL_PREFIX}_${baseName}` === name,
-  );
-  if (prefixedMatch) {
-    return prefixedMatch[1].handler;
-  }
-
-  const tool = tools[name];
-  return tool?.handler;
-}
-
-/**
- * Check if a tool handler exists for the given tool name.
- *
- * @param name The tool name to check
- * @returns True if a handler exists for the tool, false otherwise
- */
-export function hasToolHandler(name: string): boolean {
-  return getToolHandler(name) !== undefined;
-}
-
-/**
- * Extract the base name from a tool name by removing the mm_ prefix.
- *
- * @param toolName The tool name (with or without mm_ prefix)
- * @returns The base name without the prefix
- */
-export function extractBaseName(toolName: string): string {
-  const prefixWithUnderscore = `${TOOL_PREFIX}_`;
-  if (toolName.startsWith(prefixWithUnderscore)) {
-    return toolName.slice(prefixWithUnderscore.length);
-  }
-  return toolName;
-}
-
-/**
- * Validate tool input against the tool's schema and return parsed data.
- *
- * @param toolName The tool name to validate input for
- * @param input The input data to validate
- * @returns The validated and parsed input data
- */
-export function validateToolInput<Type = unknown>(
-  toolName: string,
-  input: unknown,
-): Type {
-  const baseName = extractBaseName(toolName);
-  const tool = tools[baseName];
-
-  if (!tool) {
-    throw new Error(`Unknown tool: ${toolName}`);
-  }
-
-  return tool.schema.parse(input ?? {}) as Type;
-}
-
-/**
- * Safely validate tool input without throwing errors.
- *
- * @param toolName The tool name to validate input for
- * @param input The input data to validate
- * @returns Object with success flag and either parsed data or error message
- */
-export function safeValidateToolInput(
-  toolName: string,
-  input: unknown,
-):
-  | {
-      /**
-       * Indicates validation succeeded
-       */
-      success: true;
-      /**
-       * The validated and parsed input data
-       */
-      data: unknown;
-    }
-  | {
-      /**
-       * Indicates validation failed
-       */
-      success: false;
-      /**
-       * Error message describing validation failure
-       */
-      error: string;
-    } {
-  const baseName = extractBaseName(toolName);
-  const tool = tools[baseName];
-
-  if (!tool) {
-    return { success: false, error: `Unknown tool: ${toolName}` };
-  }
-
-  const result = tool.schema.safeParse(input ?? {});
-  if (!result.success) {
-    const errorMessage = result.error.issues
-      .map((issue) => `${issue.path.join('.')}: ${issue.message}`)
-      .join('; ');
-    return { success: false, error: errorMessage };
-  }
-
-  return { success: true, data: result.data };
-}
-
-/**
- * Get all available tool base names (without mm_ prefix).
- *
- * @returns Array of tool base names
- */
-export function getToolNames(): string[] {
-  return Object.keys(tools);
-}
-
-/**
- * Get all available tool names with mm_ prefix.
- *
- * @returns Array of prefixed tool names
- */
-export function getPrefixedToolNames(): string[] {
-  return Object.keys(tools).map((name) => `${TOOL_PREFIX}_${name}`);
-}
-
-/**
- * Build a record mapping prefixed tool names to their handler functions.
- *
- * @returns Record of tool name to handler function mappings
- */
-export function buildToolHandlersRecord(): Record<string, ToolHandler> {
-  const handlers: Record<string, ToolHandler> = {};
-  for (const [baseName, tool] of Object.entries(tools)) {
-    handlers[`${TOOL_PREFIX}_${baseName}`] = tool.handler;
-  }
-  return handlers;
-}
-
-export type { ToolEntry };
diff --git a/src/mcp-server/tools/discovery-tools.ts b/src/mcp-server/tools/discovery-tools.ts
deleted file mode 100644
index be5dae6..0000000
--- a/src/mcp-server/tools/discovery-tools.ts
+++ /dev/null
@@ -1,223 +0,0 @@
-import {
-  DEFAULT_TESTID_LIMIT,
-  OBSERVATION_TESTID_LIMIT,
-} from '../constants.js';
-import { collectTestIds, collectTrimmedA11ySnapshot } from '../discovery.js';
-import {
-  knowledgeStore,
-  createDefaultObservation,
-} from '../knowledge-store.js';
-import { getSessionManager } from '../session-manager.js';
-import { classifyDiscoveryError } from './error-classification.js';
-import { runTool } from './run-tool.js';
-import type {
-  ListTestIdsInput,
-  ListTestIdsResult,
-  AccessibilitySnapshotInput,
-  AccessibilitySnapshotResult,
-  DescribeScreenInput,
-  DescribeScreenResult,
-  McpResponse,
-  PriorKnowledgeContext,
-  HandlerOptions,
-} from '../types';
-
-/**
- * Handle listing all visible data-testid attributes on the current page.
- *
- * @param input The input containing optional limit for number of items
- * @param options Optional handler options for the operation
- * @returns Promise resolving to list of visible test IDs with metadata
- */
-export async function handleListTestIds(
-  input: ListTestIdsInput,
-  options?: HandlerOptions,
-): Promise<McpResponse<ListTestIdsResult>> {
-  const limit = input.limit ?? DEFAULT_TESTID_LIMIT;
-
-  return runTool<ListTestIdsInput, ListTestIdsResult>({
-    toolName: 'mm_list_testids',
-    input,
-    options,
-    observationPolicy: 'custom',
-
-    /**
-     * Execute the list test IDs operation.
-     *
-     * @param context The workflow context containing the page
-     * @returns The result with test ID items and observation data
-     */
-    execute: async (context) => {
-      const items = await collectTestIds(context.page, limit);
-      const state = await getSessionManager().getExtensionState();
-      const { nodes, refMap } = await collectTrimmedA11ySnapshot(context.page);
-
-      getSessionManager().setRefMap(refMap);
-
-      return {
-        result: { items },
-        observation: createDefaultObservation(state, items, nodes),
-      };
-    },
-
-    classifyError: classifyDiscoveryError,
-
-    /**
-     * Sanitizes input for recording by extracting only the limit parameter.
-     *
-     * @returns Sanitized input with limit value
-     */
-    sanitizeInputForRecording: () => ({ limit }),
-  });
-}
-
-/**
- * Handle getting a trimmed accessibility tree with deterministic refs.
- *
- * @param input The input containing optional root selector for scoping
- * @param options Optional handler options for the operation
- * @returns Promise resolving to accessibility nodes with deterministic refs
- */
-export async function handleAccessibilitySnapshot(
-  input: AccessibilitySnapshotInput,
-  options?: HandlerOptions,
-): Promise<McpResponse<AccessibilitySnapshotResult>> {
-  return runTool<AccessibilitySnapshotInput, AccessibilitySnapshotResult>({
-    toolName: 'mm_accessibility_snapshot',
-    input,
-    options,
-    observationPolicy: 'custom',
-
-    /**
-     * Execute the accessibility snapshot operation.
-     *
-     * @param context The workflow context containing the page
-     * @returns The result with accessibility nodes and observation data
-     */
-    execute: async (context) => {
-      const { nodes, refMap } = await collectTrimmedA11ySnapshot(
-        context.page,
-        input.rootSelector,
-      );
-
-      getSessionManager().setRefMap(refMap);
-
-      const state = await getSessionManager().getExtensionState();
-      const testIds = await collectTestIds(
-        context.page,
-        OBSERVATION_TESTID_LIMIT,
-      );
-
-      return {
-        result: { nodes },
-        observation: createDefaultObservation(state, testIds, nodes),
-      };
-    },
-
-    classifyError: classifyDiscoveryError,
-
-    /**
-     * Sanitizes input for recording by extracting only the root selector.
-     *
-     * @returns Sanitized input with rootSelector value
-     */
-    sanitizeInputForRecording: () => ({ rootSelector: input.rootSelector }),
-  });
-}
-
-/**
- * Handle getting comprehensive screen state with state, testIds, a11y, and optional screenshot.
- *
- * @param input The input containing screenshot options and selector
- * @param options Optional handler options for the operation
- * @returns Promise resolving to comprehensive screen description with prior knowledge
- */
-export async function handleDescribeScreen(
-  input: DescribeScreenInput,
-  options?: HandlerOptions,
-): Promise<McpResponse<DescribeScreenResult>> {
-  return runTool<DescribeScreenInput, DescribeScreenResult>({
-    toolName: 'mm_describe_screen',
-    input,
-    options,
-    observationPolicy: 'custom',
-
-    /**
-     * Execute the describe screen operation.
-     *
-     * @param context The workflow context containing the page
-     * @returns The result with state, testIds, a11y, screenshot, and prior knowledge
-     */
-    execute: async (context) => {
-      const sessionManager = getSessionManager();
-      const { page } = context;
-
-      const state = await sessionManager.getExtensionState();
-      const testIds = await collectTestIds(page, DEFAULT_TESTID_LIMIT);
-      const { nodes, refMap } = await collectTrimmedA11ySnapshot(page);
-
-      sessionManager.setRefMap(refMap);
-
-      let screenshot: DescribeScreenResult['screenshot'] = null;
-
-      if (input.includeScreenshot) {
-        const screenshotName = input.screenshotName ?? 'describe-screen';
-        const result = await sessionManager.screenshot({
-          name: screenshotName,
-          fullPage: true,
-        });
-
-        screenshot = {
-          path: result.path,
-          width: result.width,
-          height: result.height,
-          base64: input.includeScreenshotBase64 ? result.base64 : null,
-        };
-      }
-
-      const sessionMetadata = sessionManager.getSessionMetadata();
-      const priorKnowledgeContext: PriorKnowledgeContext = {
-        currentScreen: state.currentScreen,
-        currentUrl: state.currentUrl,
-        visibleTestIds: testIds,
-        a11yNodes: nodes,
-        currentSessionFlowTags: sessionMetadata?.flowTags,
-      };
-
-      const priorKnowledge = await knowledgeStore.generatePriorKnowledge(
-        priorKnowledgeContext,
-        context.sessionId,
-      );
-
-      const observation = createDefaultObservation(
-        state,
-        testIds,
-        nodes,
-        priorKnowledge,
-      );
-
-      return {
-        result: {
-          state,
-          testIds: { items: testIds },
-          a11y: { nodes },
-          screenshot,
-          priorKnowledge,
-        },
-        observation,
-      };
-    },
-
-    classifyError: classifyDiscoveryError,
-
-    /**
-     * Sanitizes input for recording by extracting screenshot-related parameters.
-     *
-     * @returns Sanitized input with screenshot options
-     */
-    sanitizeInputForRecording: () => ({
-      includeScreenshot: input.includeScreenshot,
-      screenshotName: input.screenshotName,
-    }),
-  });
-}
diff --git a/src/mcp-server/tools/helpers.test.ts b/src/mcp-server/tools/helpers.test.ts
deleted file mode 100644
index 64e463d..0000000
--- a/src/mcp-server/tools/helpers.test.ts
+++ /dev/null
@@ -1,745 +0,0 @@
-/**
- * Unit tests for tool helper functions.
- *
- * Tests session validation, observation collection, error handling, and step recording.
- */
-
-import type { Page } from '@playwright/test';
-import { describe, it, expect, vi, beforeEach, afterEach } from 'vitest';
-
-import {
-  requireActiveSession,
-  collectObservation,
-  withActiveSession,
-  recordToolStep,
-  collectObservationAndRecord,
-  handleToolError,
-} from './helpers';
-import type { ObservationLevel, RecordStepParams } from './helpers';
-import * as discoveryModule from '../discovery.js';
-import * as knowledgeStoreModule from '../knowledge-store.js';
-import * as sessionManagerModule from '../session-manager.js';
-import { createMockSessionManager } from '../test-utils';
-import { ErrorCodes } from '../types';
-
-describe('helpers', () => {
-  let mockSessionManager: ReturnType<typeof createMockSessionManager>;
-
-  beforeEach(() => {
-    mockSessionManager = createMockSessionManager();
-    vi.spyOn(sessionManagerModule, 'getSessionManager').mockReturnValue(
-      mockSessionManager,
-    );
-  });
-
-  afterEach(() => {
-    vi.restoreAllMocks();
-  });
-
-  describe('requireActiveSession', () => {
-    describe('when no active session exists', () => {
-      it('returns error response with NO_ACTIVE_SESSION code', () => {
-        // Arrange
-        vi.spyOn(mockSessionManager, 'hasActiveSession').mockReturnValue(false);
-        const startTime = Date.now();
-
-        // Act
-        const result = requireActiveSession(startTime);
-
-        // Assert
-        expect(result).toBeDefined();
-        expect(result?.ok).toBe(false);
-        if (result && !result.ok) {
-          expect(result.error.code).toBe(ErrorCodes.MM_NO_ACTIVE_SESSION);
-          expect(result.error.message).toBe(
-            'No active session. Call launch first.',
-          );
-        }
-      });
-
-      it('includes timestamp in error response', () => {
-        // Arrange
-        vi.spyOn(mockSessionManager, 'hasActiveSession').mockReturnValue(false);
-        const startTime = Date.now();
-
-        // Act
-        const result = requireActiveSession(startTime);
-
-        // Assert
-        if (result && !result.ok) {
-          expect(result.meta.timestamp).toBeDefined();
-        }
-      });
-    });
-
-    describe('when active session exists', () => {
-      it('returns undefined', () => {
-        // Arrange
-        vi.spyOn(mockSessionManager, 'hasActiveSession').mockReturnValue(true);
-        const startTime = Date.now();
-
-        // Act
-        const result = requireActiveSession(startTime);
-
-        // Assert
-        expect(result).toBeUndefined();
-      });
-    });
-  });
-
-  describe('collectObservation', () => {
-    describe('when level is "none"', () => {
-      it('returns default observation with empty arrays', async () => {
-        // Arrange
-        const mockPage = {} as Page;
-        const level: ObservationLevel = 'none';
-        vi.spyOn(
-          knowledgeStoreModule,
-          'createDefaultObservation',
-        ).mockReturnValue({
-          state: {} as any,
-          testIds: [],
-          a11y: { nodes: [] },
-        });
-
-        // Act
-        const result = await collectObservation(mockPage, level);
-
-        // Assert
-        expect(result.testIds).toStrictEqual([]);
-        expect(result.a11y.nodes).toStrictEqual([]);
-      });
-
-      it('does not query extension state', async () => {
-        // Arrange
-        const mockPage = {} as Page;
-        const level: ObservationLevel = 'none';
-        vi.spyOn(
-          knowledgeStoreModule,
-          'createDefaultObservation',
-        ).mockReturnValue({
-          state: {} as any,
-          testIds: [],
-          a11y: { nodes: [] },
-        });
-
-        // Act
-        await collectObservation(mockPage, level);
-
-        // Assert
-        expect(mockSessionManager.getExtensionState).not.toHaveBeenCalled();
-      });
-    });
-
-    describe('when level is "minimal"', () => {
-      it('returns observation with state only', async () => {
-        // Arrange
-        const mockPage = {} as Page;
-        const level: ObservationLevel = 'minimal';
-        const mockState = {
-          isLoaded: true,
-          currentUrl: 'chrome-extension://ext-123/home.html',
-          extensionId: 'ext-123',
-          isUnlocked: true,
-          currentScreen: 'home' as const,
-          accountAddress: '0x123',
-          networkName: 'Ethereum Mainnet',
-          chainId: 1,
-          balance: '1.5 ETH',
-        };
-        vi.spyOn(mockSessionManager, 'getExtensionState').mockResolvedValue(
-          mockState,
-        );
-        vi.spyOn(
-          knowledgeStoreModule,
-          'createDefaultObservation',
-        ).mockReturnValue({
-          state: mockState,
-          testIds: [],
-          a11y: { nodes: [] },
-        });
-
-        // Act
-        const result = await collectObservation(mockPage, level);
-
-        // Assert
-        expect(result.state).toStrictEqual(mockState);
-        expect(result.testIds).toStrictEqual([]);
-        expect(result.a11y.nodes).toStrictEqual([]);
-      });
-
-      it('uses preset state when provided', async () => {
-        // Arrange
-        const mockPage = {} as Page;
-        const level: ObservationLevel = 'minimal';
-        const presetState = {
-          isLoaded: true,
-          currentUrl: 'chrome-extension://ext-456/home.html',
-          extensionId: 'ext-456',
-          isUnlocked: false,
-          currentScreen: 'unlock' as const,
-          accountAddress: null,
-          networkName: null,
-          chainId: null,
-          balance: null,
-        };
-        vi.spyOn(
-          knowledgeStoreModule,
-          'createDefaultObservation',
-        ).mockReturnValue({
-          state: presetState,
-          testIds: [],
-          a11y: { nodes: [] },
-        });
-
-        // Act
-        const result = await collectObservation(mockPage, level, presetState);
-
-        // Assert
-        expect(mockSessionManager.getExtensionState).not.toHaveBeenCalled();
-        expect(result.state).toStrictEqual(presetState);
-      });
-    });
-
-    describe('when level is "full"', () => {
-      it('collects state, testIds, and a11y tree', async () => {
-        // Arrange
-        const mockPage = { locator: vi.fn() } as unknown as Page;
-        const level: ObservationLevel = 'full';
-        const mockState = {
-          isLoaded: true,
-          currentUrl: 'chrome-extension://ext-123/home.html',
-          extensionId: 'ext-123',
-          isUnlocked: true,
-          currentScreen: 'home' as const,
-          accountAddress: '0x123',
-          networkName: 'Ethereum Mainnet',
-          chainId: 1,
-          balance: '1.5 ETH',
-        };
-        const mockTestIds = [
-          { testId: 'send-button', tag: 'button', text: 'Send', visible: true },
-        ];
-        const mockA11yNodes = [
-          { ref: 'e1', role: 'button', name: 'Send', path: [] },
-        ];
-        const mockRefMap = new Map([['e1', '[data-testid="send-button"]']]);
-
-        vi.spyOn(mockSessionManager, 'getExtensionState').mockResolvedValue(
-          mockState,
-        );
-        vi.spyOn(discoveryModule, 'collectTestIds').mockResolvedValue(
-          mockTestIds,
-        );
-        vi.spyOn(
-          discoveryModule,
-          'collectTrimmedA11ySnapshot',
-        ).mockResolvedValue({
-          nodes: mockA11yNodes,
-          refMap: mockRefMap,
-        });
-        vi.spyOn(
-          knowledgeStoreModule,
-          'createDefaultObservation',
-        ).mockReturnValue({
-          state: mockState,
-          testIds: mockTestIds,
-          a11y: { nodes: mockA11yNodes },
-        });
-
-        // Act
-        const result = await collectObservation(mockPage, level);
-
-        // Assert
-        expect(result.state).toStrictEqual(mockState);
-        expect(result.testIds).toStrictEqual(mockTestIds);
-        expect(result.a11y.nodes).toStrictEqual(mockA11yNodes);
-        expect(mockSessionManager.setRefMap).toHaveBeenCalledWith(mockRefMap);
-      });
-
-      it('returns default observation when page is undefined', async () => {
-        // Arrange
-        const level: ObservationLevel = 'full';
-        const mockState = {
-          isLoaded: true,
-          currentUrl: 'chrome-extension://ext-123/home.html',
-          extensionId: 'ext-123',
-          isUnlocked: true,
-          currentScreen: 'home' as const,
-          accountAddress: null,
-          networkName: null,
-          chainId: null,
-          balance: null,
-        };
-        vi.spyOn(mockSessionManager, 'getExtensionState').mockResolvedValue(
-          mockState,
-        );
-        vi.spyOn(
-          knowledgeStoreModule,
-          'createDefaultObservation',
-        ).mockReturnValue({
-          state: mockState,
-          testIds: [],
-          a11y: { nodes: [] },
-        });
-
-        // Act
-        const result = await collectObservation(undefined, level);
-
-        // Assert
-        expect(result.testIds).toStrictEqual([]);
-        expect(result.a11y.nodes).toStrictEqual([]);
-      });
-
-      it('returns default observation when discovery throws error', async () => {
-        // Arrange
-        const mockPage = { locator: vi.fn() } as unknown as Page;
-        const level: ObservationLevel = 'full';
-        const mockState = {
-          isLoaded: true,
-          currentUrl: 'chrome-extension://ext-123/home.html',
-          extensionId: 'ext-123',
-          isUnlocked: true,
-          currentScreen: 'home' as const,
-          accountAddress: null,
-          networkName: null,
-          chainId: null,
-          balance: null,
-        };
-        vi.spyOn(mockSessionManager, 'getExtensionState').mockResolvedValue(
-          mockState,
-        );
-        vi.spyOn(discoveryModule, 'collectTestIds').mockRejectedValue(
-          new Error('Page closed'),
-        );
-        vi.spyOn(
-          knowledgeStoreModule,
-          'createDefaultObservation',
-        ).mockReturnValue({
-          state: mockState,
-          testIds: [],
-          a11y: { nodes: [] },
-        });
-
-        // Act
-        const result = await collectObservation(mockPage, level);
-
-        // Assert
-        expect(result.testIds).toStrictEqual([]);
-        expect(result.a11y.nodes).toStrictEqual([]);
-      });
-    });
-  });
-
-  describe('withActiveSession', () => {
-    describe('when no active session exists', () => {
-      it('returns error response without calling handler', async () => {
-        // Arrange
-        vi.spyOn(mockSessionManager, 'hasActiveSession').mockReturnValue(false);
-        const handler = vi.fn();
-        const wrappedHandler = withActiveSession(handler);
-
-        // Act
-        const result = await wrappedHandler({ test: 'input' });
-
-        // Assert
-        expect(result.ok).toBe(false);
-        if (!result.ok) {
-          expect(result.error.code).toBe(ErrorCodes.MM_NO_ACTIVE_SESSION);
-        }
-        expect(handler).not.toHaveBeenCalled();
-      });
-    });
-
-    describe('when session ID is missing', () => {
-      it('returns error response', async () => {
-        // Arrange
-        vi.spyOn(mockSessionManager, 'hasActiveSession').mockReturnValue(true);
-        vi.spyOn(mockSessionManager, 'getSessionId').mockReturnValue(undefined);
-        const handler = vi.fn();
-        const wrappedHandler = withActiveSession(handler);
-
-        // Act
-        const result = await wrappedHandler({ test: 'input' });
-
-        // Assert
-        expect(result.ok).toBe(false);
-        if (!result.ok) {
-          expect(result.error.code).toBe(ErrorCodes.MM_NO_ACTIVE_SESSION);
-          expect(result.error.message).toBe('Session ID not found');
-        }
-        expect(handler).not.toHaveBeenCalled();
-      });
-    });
-
-    describe('when active session exists', () => {
-      it('calls handler with input, context, and startTime', async () => {
-        // Arrange
-        const mockPage = { url: () => 'test-url' } as unknown as Page;
-        const mockRefMap = new Map([['e1', '[data-testid="test"]']]);
-        vi.spyOn(mockSessionManager, 'hasActiveSession').mockReturnValue(true);
-        vi.spyOn(mockSessionManager, 'getSessionId').mockReturnValue(
-          'session-123',
-        );
-        vi.spyOn(mockSessionManager, 'getPage').mockReturnValue(mockPage);
-        vi.spyOn(mockSessionManager, 'getRefMap').mockReturnValue(mockRefMap);
-
-        const handler = vi.fn().mockResolvedValue({
-          ok: true,
-          ts: Date.now(),
-          durationMs: 100,
-          result: { success: true },
-        });
-        const wrappedHandler = withActiveSession(handler);
-        const input = { test: 'input' };
-
-        // Act
-        const result = await wrappedHandler(input);
-
-        // Assert
-        expect(handler).toHaveBeenCalledWith(
-          input,
-          {
-            sessionId: 'session-123',
-            page: mockPage,
-            refMap: mockRefMap,
-          },
-          expect.any(Number),
-        );
-        expect(result.ok).toBe(true);
-      });
-
-      it('passes through handler result', async () => {
-        // Arrange
-        const mockPage = { url: () => 'test-url' } as unknown as Page;
-        vi.spyOn(mockSessionManager, 'hasActiveSession').mockReturnValue(true);
-        vi.spyOn(mockSessionManager, 'getSessionId').mockReturnValue(
-          'session-123',
-        );
-        vi.spyOn(mockSessionManager, 'getPage').mockReturnValue(mockPage);
-        vi.spyOn(mockSessionManager, 'getRefMap').mockReturnValue(new Map());
-
-        const expectedResult = {
-          ok: true,
-          ts: Date.now(),
-          durationMs: 100,
-          result: { data: 'test-data' },
-        };
-        const handler = vi.fn().mockResolvedValue(expectedResult);
-        const wrappedHandler = withActiveSession(handler);
-
-        // Act
-        const result = await wrappedHandler({ test: 'input' });
-
-        // Assert
-        expect(result).toStrictEqual(expectedResult);
-      });
-    });
-  });
-
-  describe('recordToolStep', () => {
-    it('records step with all parameters', async () => {
-      // Arrange
-      vi.spyOn(mockSessionManager, 'getSessionId').mockReturnValue(
-        'session-123',
-      );
-      const mockRecordStep = vi.fn().mockResolvedValue(undefined);
-      vi.spyOn(knowledgeStoreModule, 'knowledgeStore', 'get').mockReturnValue({
-        recordStep: mockRecordStep,
-      } as any);
-
-      const params: RecordStepParams = {
-        toolName: 'mm_click',
-        input: { testId: 'send-button' },
-        startTime: Date.now() - 100,
-        observation: {
-          state: {} as any,
-          testIds: [],
-          a11y: { nodes: [] },
-        },
-        target: { testId: 'send-button' },
-        screenshotPath: '/path/to/screenshot.png',
-        screenshotDimensions: { width: 1280, height: 720 },
-      };
-
-      // Act
-      await recordToolStep(params);
-
-      // Assert
-      expect(mockRecordStep).toHaveBeenCalledWith({
-        sessionId: 'session-123',
-        toolName: 'mm_click',
-        input: { testId: 'send-button' },
-        target: { testId: 'send-button' },
-        outcome: { ok: true },
-        observation: params.observation,
-        durationMs: expect.any(Number),
-        screenshotPath: '/path/to/screenshot.png',
-        screenshotDimensions: { width: 1280, height: 720 },
-      });
-    });
-
-    it('uses empty string when session ID is undefined', async () => {
-      // Arrange
-      vi.spyOn(mockSessionManager, 'getSessionId').mockReturnValue(undefined);
-      const mockRecordStep = vi.fn().mockResolvedValue(undefined);
-      vi.spyOn(knowledgeStoreModule, 'knowledgeStore', 'get').mockReturnValue({
-        recordStep: mockRecordStep,
-      } as any);
-
-      const params: RecordStepParams = {
-        toolName: 'mm_click',
-        input: { testId: 'send-button' },
-        startTime: Date.now(),
-        observation: {
-          state: {} as any,
-          testIds: [],
-          a11y: { nodes: [] },
-        },
-      };
-
-      // Act
-      await recordToolStep(params);
-
-      // Assert
-      expect(mockRecordStep).toHaveBeenCalledWith(
-        expect.objectContaining({
-          sessionId: '',
-        }),
-      );
-    });
-  });
-
-  describe('collectObservationAndRecord', () => {
-    it('collects observation and records step', async () => {
-      // Arrange
-      const mockPage = { locator: vi.fn() } as unknown as Page;
-      const mockObservation = {
-        state: {} as any,
-        testIds: [
-          { testId: 'send-button', tag: 'button', text: 'Send', visible: true },
-        ],
-        a11y: {
-          nodes: [{ ref: 'e1', role: 'button', name: 'Send', path: [] }],
-        },
-      };
-      const mockRecordStep = vi.fn().mockResolvedValue(undefined);
-
-      vi.spyOn(
-        knowledgeStoreModule,
-        'createDefaultObservation',
-      ).mockReturnValue(mockObservation);
-      vi.spyOn(discoveryModule, 'collectTestIds').mockResolvedValue(
-        mockObservation.testIds,
-      );
-      vi.spyOn(discoveryModule, 'collectTrimmedA11ySnapshot').mockResolvedValue(
-        {
-          nodes: mockObservation.a11y.nodes,
-          refMap: new Map(),
-        },
-      );
-      vi.spyOn(knowledgeStoreModule, 'knowledgeStore', 'get').mockReturnValue({
-        recordStep: mockRecordStep,
-      } as any);
-      vi.spyOn(mockSessionManager, 'getSessionId').mockReturnValue(
-        'session-123',
-      );
-
-      // Act
-      const result = await collectObservationAndRecord(
-        mockPage,
-        'mm_click',
-        { testId: 'send-button' },
-        Date.now(),
-        {
-          target: { testId: 'send-button' },
-          screenshotPath: '/path/to/screenshot.png',
-          screenshotDimensions: { width: 1280, height: 720 },
-        },
-      );
-
-      // Assert
-      expect(result).toStrictEqual(mockObservation);
-      expect(mockRecordStep).toHaveBeenCalledWith(
-        expect.objectContaining({
-          toolName: 'mm_click',
-          input: { testId: 'send-button' },
-          observation: mockObservation,
-          target: { testId: 'send-button' },
-          screenshotPath: '/path/to/screenshot.png',
-          screenshotDimensions: { width: 1280, height: 720 },
-        }),
-      );
-    });
-
-    it('works without optional parameters', async () => {
-      // Arrange
-      const mockPage = { locator: vi.fn() } as unknown as Page;
-      const mockObservation = {
-        state: {} as any,
-        testIds: [],
-        a11y: { nodes: [] },
-      };
-      const mockRecordStep = vi.fn().mockResolvedValue(undefined);
-
-      vi.spyOn(
-        knowledgeStoreModule,
-        'createDefaultObservation',
-      ).mockReturnValue(mockObservation);
-      vi.spyOn(discoveryModule, 'collectTestIds').mockResolvedValue([]);
-      vi.spyOn(discoveryModule, 'collectTrimmedA11ySnapshot').mockResolvedValue(
-        {
-          nodes: [],
-          refMap: new Map(),
-        },
-      );
-      vi.spyOn(knowledgeStoreModule, 'knowledgeStore', 'get').mockReturnValue({
-        recordStep: mockRecordStep,
-      } as any);
-      vi.spyOn(mockSessionManager, 'getSessionId').mockReturnValue(
-        'session-123',
-      );
-
-      // Act
-      const result = await collectObservationAndRecord(
-        mockPage,
-        'mm_get_state',
-        {},
-        Date.now(),
-      );
-
-      // Assert
-      expect(result).toStrictEqual(mockObservation);
-      expect(mockRecordStep).toHaveBeenCalledWith(
-        expect.objectContaining({
-          toolName: 'mm_get_state',
-          input: {},
-          observation: mockObservation,
-          target: undefined,
-          screenshotPath: undefined,
-          screenshotDimensions: undefined,
-        }),
-      );
-    });
-  });
-
-  describe('handleToolError', () => {
-    describe('when error contains "Unknown a11yRef"', () => {
-      it('returns TARGET_NOT_FOUND error code', () => {
-        // Arrange
-        const error = new Error('Unknown a11yRef: e99');
-        const startTime = Date.now();
-
-        // Act
-        const result = handleToolError(
-          error,
-          ErrorCodes.MM_CLICK_FAILED,
-          'Click failed',
-          { a11yRef: 'e99' },
-          'session-123',
-          startTime,
-        );
-
-        // Assert
-        expect(result.ok).toBe(false);
-        if (!result.ok) {
-          expect(result.error.code).toBe(ErrorCodes.MM_TARGET_NOT_FOUND);
-          expect(result.error.message).toContain('Unknown a11yRef: e99');
-        }
-      });
-    });
-
-    describe('when error contains "not found"', () => {
-      it('returns TARGET_NOT_FOUND error code', () => {
-        // Arrange
-        const error = new Error('Element not found');
-        const startTime = Date.now();
-
-        // Act
-        const result = handleToolError(
-          error,
-          ErrorCodes.MM_TYPE_FAILED,
-          'Type failed',
-          { testId: 'missing-input' },
-          'session-123',
-          startTime,
-        );
-
-        // Assert
-        expect(result.ok).toBe(false);
-        if (!result.ok) {
-          expect(result.error.code).toBe(ErrorCodes.MM_TARGET_NOT_FOUND);
-          expect(result.error.message).toContain('not found');
-        }
-      });
-    });
-
-    describe('when error does not match special patterns', () => {
-      it('returns default error code with combined message', () => {
-        // Arrange
-        const error = new Error('Timeout exceeded');
-        const startTime = Date.now();
-
-        // Act
-        const result = handleToolError(
-          error,
-          ErrorCodes.MM_CLICK_FAILED,
-          'Click failed',
-          { testId: 'slow-button' },
-          'session-123',
-          startTime,
-        );
-
-        // Assert
-        expect(result.ok).toBe(false);
-        if (!result.ok) {
-          expect(result.error.code).toBe(ErrorCodes.MM_CLICK_FAILED);
-          expect(result.error.message).toBe('Click failed: Timeout exceeded');
-        }
-      });
-
-      it('includes input in error details', () => {
-        // Arrange
-        const error = new Error('Generic error');
-        const input = { testId: 'test-button', timeoutMs: 5000 };
-        const startTime = Date.now();
-
-        // Act
-        const result = handleToolError(
-          error,
-          ErrorCodes.MM_CLICK_FAILED,
-          'Click failed',
-          input,
-          'session-123',
-          startTime,
-        );
-
-        // Assert
-        if (!result.ok) {
-          expect(result.error.details).toStrictEqual({ input });
-        }
-      });
-
-      it('includes session ID in response', () => {
-        // Arrange
-        const error = new Error('Generic error');
-        const startTime = Date.now();
-
-        // Act
-        const result = handleToolError(
-          error,
-          ErrorCodes.MM_CLICK_FAILED,
-          'Click failed',
-          {},
-          'session-456',
-          startTime,
-        );
-
-        // Assert
-        if (!result.ok) {
-          expect(result.meta.sessionId).toBe('session-456');
-        }
-      });
-    });
-  });
-});
diff --git a/src/mcp-server/tools/helpers.ts b/src/mcp-server/tools/helpers.ts
deleted file mode 100644
index cf94f48..0000000
--- a/src/mcp-server/tools/helpers.ts
+++ /dev/null
@@ -1,313 +0,0 @@
-import type { Page } from '@playwright/test';
-
-import type { ExtensionState } from '../../capabilities/types.js';
-import { OBSERVATION_TESTID_LIMIT } from '../constants.js';
-import { collectTestIds, collectTrimmedA11ySnapshot } from '../discovery.js';
-import {
-  knowledgeStore,
-  createDefaultObservation,
-} from '../knowledge-store.js';
-import { getSessionManager } from '../session-manager.js';
-import type {
-  McpResponse,
-  ErrorCode,
-  TestIdItem,
-  StepRecordObservation,
-} from '../types';
-import { ErrorCodes } from '../types';
-import { createErrorResponse, extractErrorMessage, debugWarn } from '../utils';
-
-/**
- * Level of detail to collect for observation data.
- * - "full": Collect state, testIds, and a11y tree
- * - "minimal": Collect state only (no testIds or a11y)
- * - "none": Return empty observation
- */
-export type ObservationLevel = 'full' | 'minimal' | 'none';
-
-/**
- * Parameters for recording a tool step in the knowledge store.
- */
-export type RecordStepParams = {
-  /**
-   * Name of the tool that was executed
-   */
-  toolName: string;
-  /**
-   * Input parameters passed to the tool
-   */
-  input: Record<string, unknown>;
-  /**
-   * Timestamp when the tool execution started
-   */
-  startTime: number;
-  /**
-   * Observation data collected after tool execution
-   */
-  observation: StepRecordObservation;
-  /**
-   * Target element information (selector, testId, etc.)
-   */
-  target?: Record<string, string>;
-  /**
-   * Path to screenshot file if captured
-   */
-  screenshotPath?: string;
-  /**
-   * Screenshot dimensions if captured
-   */
-  screenshotDimensions?: {
-    /**
-     * Screenshot width in pixels
-     */
-    width: number;
-    /**
-     * Screenshot height in pixels
-     */
-    height: number;
-  };
-};
-
-/**
- * Context information for an active session.
- */
-export type ActiveSessionContext = {
-  /**
-   * Unique session identifier
-   */
-  sessionId: string;
-  /**
-   * Current active page instance
-   */
-  page: Page;
-  /**
-   * Map of accessibility references to selectors
-   */
-  refMap: Map<string, string>;
-};
-
-/**
- * Check if an active session exists and return error if not.
- *
- * @param startTime - Timestamp when the operation started
- * @returns Error response if no active session, undefined otherwise
- */
-export function requireActiveSession<Result>(
-  startTime: number,
-): McpResponse<Result> | undefined {
-  const sessionManager = getSessionManager();
-  if (!sessionManager.hasActiveSession()) {
-    return createErrorResponse(
-      ErrorCodes.MM_NO_ACTIVE_SESSION,
-      'No active session. Call launch first.',
-      undefined,
-      undefined,
-      startTime,
-    ) as McpResponse<Result>;
-  }
-  return undefined;
-}
-
-/**
- * Collect observation data from the current page state.
- *
- * @param page - The page to collect observation from
- * @param level - Level of detail to collect (full, minimal, or none)
- * @param presetState - Optional pre-fetched extension state to use instead of querying
- * @returns Observation data with state, testIds, and accessibility tree
- */
-export async function collectObservation(
-  page: Page | undefined,
-  level: ObservationLevel,
-  presetState?: ExtensionState,
-): Promise<StepRecordObservation> {
-  const sessionManager = getSessionManager();
-
-  if (level === 'none') {
-    return createDefaultObservation({} as ExtensionState, [], []);
-  }
-
-  const state = presetState ?? (await sessionManager.getExtensionState());
-
-  if (level === 'minimal') {
-    return createDefaultObservation(state, [], []);
-  }
-
-  if (!page) {
-    debugWarn('collectObservation', 'Page not provided for full observation');
-    return createDefaultObservation(state, [], []);
-  }
-
-  try {
-    const testIds: TestIdItem[] = await collectTestIds(
-      page,
-      OBSERVATION_TESTID_LIMIT,
-    );
-    const { nodes, refMap } = await collectTrimmedA11ySnapshot(page);
-    sessionManager.setRefMap(refMap);
-    return createDefaultObservation(state, testIds, nodes);
-  } catch (error) {
-    debugWarn('collectObservation', error);
-    return createDefaultObservation(state, [], []);
-  }
-}
-
-/**
- * Wrapper that ensures an active session exists before executing a handler.
- *
- * @param handler - Function to execute with active session context
- * @returns Wrapped function that validates session before calling handler
- */
-export function withActiveSession<TInput, TResult>(
-  handler: (
-    input: TInput,
-    ctx: ActiveSessionContext,
-    startTime: number,
-  ) => Promise<McpResponse<TResult>>,
-): (input: TInput) => Promise<McpResponse<TResult>> {
-  return async (input: TInput): Promise<McpResponse<TResult>> => {
-    const startTime = Date.now();
-    const sessionManager = getSessionManager();
-
-    const sessionError = requireActiveSession<TResult>(startTime);
-    if (sessionError) {
-      return sessionError;
-    }
-
-    const sessionId = sessionManager.getSessionId();
-    if (!sessionId) {
-      return createErrorResponse(
-        ErrorCodes.MM_NO_ACTIVE_SESSION,
-        'Session ID not found',
-        undefined,
-        undefined,
-        startTime,
-      ) as McpResponse<TResult>;
-    }
-    const page = sessionManager.getPage();
-    const refMap = sessionManager.getRefMap();
-
-    return handler(input, { sessionId, page, refMap }, startTime);
-  };
-}
-
-/**
- * Record a tool execution step in the knowledge store.
- *
- * @param params - Parameters containing tool name, input, observation, and metadata
- */
-export async function recordToolStep(params: RecordStepParams): Promise<void> {
-  const sessionManager = getSessionManager();
-  const sessionId = sessionManager.getSessionId() ?? '';
-
-  await knowledgeStore.recordStep({
-    sessionId,
-    toolName: params.toolName,
-    input: params.input,
-    target: params.target,
-    outcome: { ok: true },
-    observation: params.observation,
-    durationMs: Date.now() - params.startTime,
-    screenshotPath: params.screenshotPath,
-    screenshotDimensions: params.screenshotDimensions,
-  });
-}
-
-/**
- * Collect observation data and record the tool step in the knowledge store.
- *
- * @param page - The page to collect observation from
- * @param toolName - Name of the tool that was executed
- * @param input - Input parameters passed to the tool
- * @param startTime - Timestamp when the tool execution started
- * @param options - Optional metadata for the step record
- * @param options.target - Target element information
- * @param options.screenshotPath - Path to screenshot file if captured
- * @param options.screenshotDimensions - Screenshot dimensions
- * @param options.screenshotDimensions.width - Screenshot width in pixels
- * @param options.screenshotDimensions.height - Screenshot height in pixels
- * @returns Observation data collected after tool execution
- */
-export async function collectObservationAndRecord(
-  page: Page,
-  toolName: string,
-  input: Record<string, unknown>,
-  startTime: number,
-  options: {
-    /**
-     * Target element information (selector, testId, etc.)
-     */
-    target?: Record<string, string>;
-    /**
-     * Path to screenshot file if captured
-     */
-    screenshotPath?: string;
-    /**
-     * Screenshot dimensions if captured
-     */
-    screenshotDimensions?: {
-      /**
-       * Screenshot width in pixels
-       */
-      width: number;
-      /**
-       * Screenshot height in pixels
-       */
-      height: number;
-    };
-  } = {},
-): Promise<StepRecordObservation> {
-  const observation = await collectObservation(page, 'full');
-
-  await recordToolStep({
-    toolName,
-    input,
-    startTime,
-    observation,
-    target: options.target,
-    screenshotPath: options.screenshotPath,
-    screenshotDimensions: options.screenshotDimensions,
-  });
-
-  return observation;
-}
-
-/**
- * Handle tool execution errors and return appropriate error response.
- *
- * @param error - The error that occurred during tool execution
- * @param defaultCode - Default error code to use if no specific match found
- * @param defaultMessage - Default error message to use
- * @param input - Input parameters that were passed to the tool
- * @param sessionId - Current session ID for error context
- * @param startTime - Timestamp when the tool execution started
- * @returns Error response with appropriate code and message
- */
-export function handleToolError<Result>(
-  error: unknown,
-  defaultCode: ErrorCode,
-  defaultMessage: string,
-  input: unknown,
-  sessionId: string | undefined,
-  startTime: number,
-): McpResponse<Result> {
-  const message = extractErrorMessage(error);
-
-  if (message.includes('Unknown a11yRef') || message.includes('not found')) {
-    return createErrorResponse(
-      ErrorCodes.MM_TARGET_NOT_FOUND,
-      message,
-      { input },
-      sessionId,
-      startTime,
-    ) as McpResponse<Result>;
-  }
-
-  return createErrorResponse(
-    defaultCode,
-    `${defaultMessage}: ${message}`,
-    { input },
-    sessionId,
-    startTime,
-  ) as McpResponse<Result>;
-}
diff --git a/src/mcp-server/tools/index.ts b/src/mcp-server/tools/index.ts
deleted file mode 100644
index 2621238..0000000
--- a/src/mcp-server/tools/index.ts
+++ /dev/null
@@ -1,10 +0,0 @@
-export * from './error-classification.js';
-export * from './run-tool.js';
-export * from './helpers.js';
-export * from './interaction.js';
-export * from './navigation.js';
-export * from './discovery-tools.js';
-export * from './screenshot.js';
-export * from './knowledge.js';
-export * from './batch.js';
-export * from './clipboard.js';
diff --git a/src/mcp-server/tools/interaction.test.ts b/src/mcp-server/tools/interaction.test.ts
deleted file mode 100644
index cd4fea7..0000000
--- a/src/mcp-server/tools/interaction.test.ts
+++ /dev/null
@@ -1,822 +0,0 @@
-/**
- * Unit tests for interaction tool handlers.
- *
- * Tests handleClick, handleType, and handleWaitFor with various target types,
- * error scenarios, and page closure detection.
- */
-
-import { describe, it, expect, vi, beforeEach, afterEach } from 'vitest';
-
-import { handleClick, handleType, handleWaitFor } from './interaction';
-import * as discoveryModule from '../discovery.js';
-import * as knowledgeStoreModule from '../knowledge-store.js';
-import * as sessionManagerModule from '../session-manager.js';
-import {
-  createMockSessionManager,
-  createMockPage,
-  createMockLocator,
-} from '../test-utils';
-import { ErrorCodes } from '../types';
-import * as utilsModule from '../utils';
-
-describe('interaction', () => {
-  let mockSessionManager: ReturnType<typeof createMockSessionManager>;
-
-  beforeEach(() => {
-    mockSessionManager = createMockSessionManager({
-      hasActive: true,
-      sessionId: 'test-session-123',
-    });
-    vi.spyOn(sessionManagerModule, 'getSessionManager').mockReturnValue(
-      mockSessionManager,
-    );
-    // Mock knowledge store to prevent "not initialized" errors
-    vi.spyOn(knowledgeStoreModule, 'knowledgeStore', 'get').mockReturnValue({
-      recordStep: vi.fn().mockResolvedValue(undefined),
-      getLastSteps: vi.fn().mockResolvedValue([]),
-      searchSteps: vi.fn().mockResolvedValue([]),
-      summarizeSession: vi
-        .fn()
-        .mockResolvedValue({ sessionId: 'test', stepCount: 0, recipe: [] }),
-      listSessions: vi.fn().mockResolvedValue([]),
-      generatePriorKnowledge: vi.fn().mockResolvedValue(undefined),
-      writeSessionMetadata: vi.fn().mockResolvedValue('test-session'),
-    } as any);
-  });
-
-  afterEach(() => {
-    vi.restoreAllMocks();
-  });
-
-  describe('handleClick', () => {
-    describe('with testId target', () => {
-      it('clicks element by testId', async () => {
-        // Arrange
-        const mockPage = createMockPage();
-        const mockLocator = createMockLocator();
-        vi.spyOn(mockPage, 'locator').mockReturnValue(mockLocator);
-        vi.spyOn(mockSessionManager, 'getPage').mockReturnValue(mockPage);
-        vi.spyOn(mockSessionManager, 'getRefMap').mockReturnValue(new Map());
-
-        vi.spyOn(discoveryModule, 'waitForTarget').mockResolvedValue(
-          mockLocator as any,
-        );
-
-        // Act
-        const result = await handleClick({ testId: 'my-button' });
-
-        // Assert
-        expect(result.ok).toBe(true);
-        if (result.ok) {
-          expect(result.result.clicked).toBe(true);
-          expect(result.result.target).toBe('testId:my-button');
-        }
-        expect(discoveryModule.waitForTarget).toHaveBeenCalledWith(
-          mockPage,
-          'testId',
-          'my-button',
-          expect.any(Map),
-          15000,
-        );
-        expect(mockLocator.click).toHaveBeenCalled();
-      });
-
-      it('uses custom timeout when provided', async () => {
-        // Arrange
-        const mockPage = createMockPage();
-        const mockLocator = createMockLocator();
-        vi.spyOn(mockSessionManager, 'getPage').mockReturnValue(mockPage);
-        vi.spyOn(mockSessionManager, 'getRefMap').mockReturnValue(new Map());
-
-        vi.spyOn(discoveryModule, 'waitForTarget').mockResolvedValue(
-          mockLocator as any,
-        );
-
-        // Act
-        await handleClick({ testId: 'my-button', timeoutMs: 5000 });
-
-        // Assert
-        expect(discoveryModule.waitForTarget).toHaveBeenCalledWith(
-          mockPage,
-          'testId',
-          'my-button',
-          expect.any(Map),
-          5000,
-        );
-      });
-    });
-
-    describe('with selector target', () => {
-      it('clicks element by CSS selector', async () => {
-        // Arrange
-        const mockPage = createMockPage();
-        const mockLocator = createMockLocator();
-        vi.spyOn(mockSessionManager, 'getPage').mockReturnValue(mockPage);
-        vi.spyOn(mockSessionManager, 'getRefMap').mockReturnValue(new Map());
-
-        vi.spyOn(discoveryModule, 'waitForTarget').mockResolvedValue(
-          mockLocator as any,
-        );
-
-        // Act
-        const result = await handleClick({ selector: 'button.primary' });
-
-        // Assert
-        expect(result.ok).toBe(true);
-        if (result.ok) {
-          expect(result.result.clicked).toBe(true);
-          expect(result.result.target).toBe('selector:button.primary');
-        }
-        expect(discoveryModule.waitForTarget).toHaveBeenCalledWith(
-          mockPage,
-          'selector',
-          'button.primary',
-          expect.any(Map),
-          15000,
-        );
-      });
-    });
-
-    describe('with a11yRef target', () => {
-      it('clicks element by accessibility reference', async () => {
-        // Arrange
-        const mockPage = createMockPage();
-        const mockLocator = createMockLocator();
-        const refMap = new Map([['e5', 'button[aria-label="Submit"]']]);
-        vi.spyOn(mockSessionManager, 'getPage').mockReturnValue(mockPage);
-        vi.spyOn(mockSessionManager, 'getRefMap').mockReturnValue(refMap);
-
-        vi.spyOn(discoveryModule, 'waitForTarget').mockResolvedValue(
-          mockLocator as any,
-        );
-
-        // Act
-        const result = await handleClick({ a11yRef: 'e5' });
-
-        // Assert
-        expect(result.ok).toBe(true);
-        if (result.ok) {
-          expect(result.result.clicked).toBe(true);
-          expect(result.result.target).toBe('a11yRef:e5');
-        }
-        expect(discoveryModule.waitForTarget).toHaveBeenCalledWith(
-          mockPage,
-          'a11yRef',
-          'e5',
-          refMap,
-          15000,
-        );
-      });
-    });
-
-    describe('with invalid target selection', () => {
-      it('returns error when no target specified', async () => {
-        // Act
-        const result = await handleClick({} as any);
-
-        // Assert
-        expect(result.ok).toBe(false);
-        if (!result.ok) {
-          expect(result.error.code).toBe(ErrorCodes.MM_INVALID_INPUT);
-          expect(result.error.message).toContain('Exactly one');
-        }
-      });
-
-      it('returns error when multiple targets specified', async () => {
-        // Act
-        const result = await handleClick({
-          testId: 'button',
-          selector: '.button',
-        } as any);
-
-        // Assert
-        expect(result.ok).toBe(false);
-        if (!result.ok) {
-          expect(result.error.code).toBe(ErrorCodes.MM_INVALID_INPUT);
-          expect(result.error.message).toContain('Exactly one');
-        }
-      });
-
-      it('returns error when validation result is invalid but not caught by isInvalidTargetSelection', async () => {
-        // Arrange
-        vi.spyOn(utilsModule, 'validateTargetSelection').mockReturnValue({
-          valid: true,
-          // Missing type and value properties - will fail isValidTargetSelection
-        } as any);
-
-        // Act
-        const result = await handleClick({ testId: 'button' });
-
-        // Assert
-        expect(result.ok).toBe(false);
-        if (!result.ok) {
-          expect(result.error.code).toBe(ErrorCodes.MM_INVALID_INPUT);
-          expect(result.error.message).toBe('Invalid target selection');
-        }
-      });
-    });
-
-    describe('with page closure after click', () => {
-      it('handles page closure gracefully', async () => {
-        // Arrange
-        const mockPage = createMockPage();
-        const mockLocator = createMockLocator();
-        vi.spyOn(mockLocator, 'click').mockRejectedValue(
-          new Error('Target page, context or browser has been closed'),
-        );
-        vi.spyOn(mockSessionManager, 'getPage').mockReturnValue(mockPage);
-        vi.spyOn(mockSessionManager, 'getRefMap').mockReturnValue(new Map());
-
-        vi.spyOn(discoveryModule, 'waitForTarget').mockResolvedValue(
-          mockLocator as any,
-        );
-
-        // Act
-        const result = await handleClick({ testId: 'close-btn' });
-
-        // Assert
-        expect(result.ok).toBe(true);
-        if (result.ok) {
-          expect(result.result.clicked).toBe(true);
-          expect(result.result.pageClosedAfterClick).toBe(true);
-          expect(result.result.target).toBe('testId:close-btn');
-        }
-      });
-
-      it('handles browser closed error gracefully', async () => {
-        // Arrange
-        const mockPage = createMockPage();
-        const mockLocator = createMockLocator();
-        vi.spyOn(mockLocator, 'click').mockRejectedValue(
-          new Error('browser has been closed'),
-        );
-        vi.spyOn(mockSessionManager, 'getPage').mockReturnValue(mockPage);
-        vi.spyOn(mockSessionManager, 'getRefMap').mockReturnValue(new Map());
-
-        vi.spyOn(discoveryModule, 'waitForTarget').mockResolvedValue(
-          mockLocator as any,
-        );
-
-        // Act
-        const result = await handleClick({ testId: 'close-btn' });
-
-        // Assert
-        expect(result.ok).toBe(true);
-        if (result.ok) {
-          expect(result.result.pageClosedAfterClick).toBe(true);
-        }
-      });
-    });
-
-    describe('with click errors', () => {
-      it('returns error when click fails with non-closure error', async () => {
-        // Arrange
-        const mockPage = createMockPage();
-        const mockLocator = createMockLocator();
-        vi.spyOn(mockLocator, 'click').mockRejectedValue(
-          new Error('Element is not clickable'),
-        );
-        vi.spyOn(mockSessionManager, 'getPage').mockReturnValue(mockPage);
-        vi.spyOn(mockSessionManager, 'getRefMap').mockReturnValue(new Map());
-
-        vi.spyOn(discoveryModule, 'waitForTarget').mockResolvedValue(
-          mockLocator as any,
-        );
-
-        // Act
-        const result = await handleClick({ testId: 'my-button' });
-
-        // Assert
-        expect(result.ok).toBe(false);
-        if (!result.ok) {
-          expect(result.error.code).toBe(ErrorCodes.MM_CLICK_FAILED);
-        }
-      });
-
-      it('returns error when element not found', async () => {
-        // Arrange
-        const mockPage = createMockPage();
-        vi.spyOn(mockSessionManager, 'getPage').mockReturnValue(mockPage);
-        vi.spyOn(mockSessionManager, 'getRefMap').mockReturnValue(new Map());
-
-        vi.spyOn(discoveryModule, 'waitForTarget').mockRejectedValue(
-          new Error('Timeout waiting for element'),
-        );
-
-        // Act
-        const result = await handleClick({ testId: 'nonexistent' });
-
-        // Assert
-        expect(result.ok).toBe(false);
-        if (!result.ok) {
-          expect(result.error.code).toBe(ErrorCodes.MM_WAIT_TIMEOUT);
-        }
-      });
-    });
-
-    describe('without active session', () => {
-      it('returns error when no session active', async () => {
-        // Arrange
-        vi.spyOn(mockSessionManager, 'hasActiveSession').mockReturnValue(false);
-
-        // Act
-        const result = await handleClick({ testId: 'my-button' });
-
-        // Assert
-        expect(result.ok).toBe(false);
-        if (!result.ok) {
-          expect(result.error.code).toBe(ErrorCodes.MM_NO_ACTIVE_SESSION);
-        }
-      });
-    });
-  });
-
-  describe('handleType', () => {
-    describe('with testId target', () => {
-      it('types text into element by testId', async () => {
-        // Arrange
-        const mockPage = createMockPage();
-        const mockLocator = createMockLocator();
-        vi.spyOn(mockPage, 'locator').mockReturnValue(mockLocator);
-        vi.spyOn(mockSessionManager, 'getPage').mockReturnValue(mockPage);
-        vi.spyOn(mockSessionManager, 'getRefMap').mockReturnValue(new Map());
-
-        vi.spyOn(discoveryModule, 'waitForTarget').mockResolvedValue(
-          mockLocator as any,
-        );
-
-        // Act
-        const result = await handleType({
-          testId: 'amount-input',
-          text: '0.5',
-        });
-
-        // Assert
-        expect(result.ok).toBe(true);
-        if (result.ok) {
-          expect(result.result.typed).toBe(true);
-          expect(result.result.target).toBe('testId:amount-input');
-          expect(result.result.textLength).toBe(3);
-        }
-        expect(discoveryModule.waitForTarget).toHaveBeenCalledWith(
-          mockPage,
-          'testId',
-          'amount-input',
-          expect.any(Map),
-          15000,
-        );
-        expect(mockLocator.fill).toHaveBeenCalledWith('0.5');
-      });
-
-      it('uses custom timeout when provided', async () => {
-        // Arrange
-        const mockPage = createMockPage();
-        const mockLocator = createMockLocator();
-        vi.spyOn(mockSessionManager, 'getPage').mockReturnValue(mockPage);
-        vi.spyOn(mockSessionManager, 'getRefMap').mockReturnValue(new Map());
-
-        vi.spyOn(discoveryModule, 'waitForTarget').mockResolvedValue(
-          mockLocator as any,
-        );
-
-        // Act
-        await handleType({ testId: 'input', text: 'test', timeoutMs: 3000 });
-
-        // Assert
-        expect(discoveryModule.waitForTarget).toHaveBeenCalledWith(
-          mockPage,
-          'testId',
-          'input',
-          expect.any(Map),
-          3000,
-        );
-      });
-    });
-
-    describe('with selector target', () => {
-      it('types text into element by CSS selector', async () => {
-        // Arrange
-        const mockPage = createMockPage();
-        const mockLocator = createMockLocator();
-        vi.spyOn(mockSessionManager, 'getPage').mockReturnValue(mockPage);
-        vi.spyOn(mockSessionManager, 'getRefMap').mockReturnValue(new Map());
-
-        vi.spyOn(discoveryModule, 'waitForTarget').mockResolvedValue(
-          mockLocator as any,
-        );
-
-        // Act
-        const result = await handleType({
-          selector: 'input[name="email"]',
-          text: 'test@example.com',
-        });
-
-        // Assert
-        expect(result.ok).toBe(true);
-        if (result.ok) {
-          expect(result.result.typed).toBe(true);
-          expect(result.result.target).toBe('selector:input[name="email"]');
-          expect(result.result.textLength).toBe(16);
-        }
-        expect(mockLocator.fill).toHaveBeenCalledWith('test@example.com');
-      });
-    });
-
-    describe('with a11yRef target', () => {
-      it('types text into element by accessibility reference', async () => {
-        // Arrange
-        const mockPage = createMockPage();
-        const mockLocator = createMockLocator();
-        const refMap = new Map([['e3', 'input[aria-label="Amount"]']]);
-        vi.spyOn(mockSessionManager, 'getPage').mockReturnValue(mockPage);
-        vi.spyOn(mockSessionManager, 'getRefMap').mockReturnValue(refMap);
-
-        vi.spyOn(discoveryModule, 'waitForTarget').mockResolvedValue(
-          mockLocator as any,
-        );
-
-        // Act
-        const result = await handleType({ a11yRef: 'e3', text: '100' });
-
-        // Assert
-        expect(result.ok).toBe(true);
-        if (result.ok) {
-          expect(result.result.typed).toBe(true);
-          expect(result.result.target).toBe('a11yRef:e3');
-          expect(result.result.textLength).toBe(3);
-        }
-        expect(discoveryModule.waitForTarget).toHaveBeenCalledWith(
-          mockPage,
-          'a11yRef',
-          'e3',
-          refMap,
-          15000,
-        );
-      });
-    });
-
-    describe('with empty text', () => {
-      it('types empty string and reports zero length', async () => {
-        // Arrange
-        const mockPage = createMockPage();
-        const mockLocator = createMockLocator();
-        vi.spyOn(mockSessionManager, 'getPage').mockReturnValue(mockPage);
-        vi.spyOn(mockSessionManager, 'getRefMap').mockReturnValue(new Map());
-
-        vi.spyOn(discoveryModule, 'waitForTarget').mockResolvedValue(
-          mockLocator as any,
-        );
-
-        // Act
-        const result = await handleType({ testId: 'input', text: '' });
-
-        // Assert
-        expect(result.ok).toBe(true);
-        if (result.ok) {
-          expect(result.result.typed).toBe(true);
-          expect(result.result.textLength).toBe(0);
-        }
-        expect(mockLocator.fill).toHaveBeenCalledWith('');
-      });
-    });
-
-    describe('with invalid target selection', () => {
-      it('returns error when no target specified', async () => {
-        // Act
-        const result = await handleType({ text: 'test' } as any);
-
-        // Assert
-        expect(result.ok).toBe(false);
-        if (!result.ok) {
-          expect(result.error.code).toBe(ErrorCodes.MM_INVALID_INPUT);
-          expect(result.error.message).toContain('Exactly one');
-        }
-      });
-
-      it('returns error when multiple targets specified', async () => {
-        // Act
-        const result = await handleType({
-          testId: 'input',
-          selector: 'input',
-          text: 'test',
-        } as any);
-
-        // Assert
-        expect(result.ok).toBe(false);
-        if (!result.ok) {
-          expect(result.error.code).toBe(ErrorCodes.MM_INVALID_INPUT);
-          expect(result.error.message).toContain('Exactly one');
-        }
-      });
-
-      it('returns error when validation result is invalid but not caught by isInvalidTargetSelection', async () => {
-        // Arrange
-        vi.spyOn(utilsModule, 'validateTargetSelection').mockReturnValue({
-          valid: true,
-          // Missing type and value properties - will fail isValidTargetSelection
-        } as any);
-
-        // Act
-        const result = await handleType({ testId: 'input', text: 'test' });
-
-        // Assert
-        expect(result.ok).toBe(false);
-        if (!result.ok) {
-          expect(result.error.code).toBe(ErrorCodes.MM_INVALID_INPUT);
-          expect(result.error.message).toBe('Invalid target selection');
-        }
-      });
-    });
-
-    describe('with type errors', () => {
-      it('returns error when fill fails', async () => {
-        // Arrange
-        const mockPage = createMockPage();
-        const mockLocator = createMockLocator();
-        vi.spyOn(mockLocator, 'fill').mockRejectedValue(
-          new Error('Element is not editable'),
-        );
-        vi.spyOn(mockSessionManager, 'getPage').mockReturnValue(mockPage);
-        vi.spyOn(mockSessionManager, 'getRefMap').mockReturnValue(new Map());
-
-        vi.spyOn(discoveryModule, 'waitForTarget').mockResolvedValue(
-          mockLocator as any,
-        );
-
-        // Act
-        const result = await handleType({ testId: 'input', text: 'test' });
-
-        // Assert
-        expect(result.ok).toBe(false);
-        if (!result.ok) {
-          expect(result.error.code).toBe(ErrorCodes.MM_TYPE_FAILED);
-        }
-      });
-
-      it('returns error when element not found', async () => {
-        // Arrange
-        const mockPage = createMockPage();
-        vi.spyOn(mockSessionManager, 'getPage').mockReturnValue(mockPage);
-        vi.spyOn(mockSessionManager, 'getRefMap').mockReturnValue(new Map());
-
-        vi.spyOn(discoveryModule, 'waitForTarget').mockRejectedValue(
-          new Error('Timeout waiting for element'),
-        );
-
-        // Act
-        const result = await handleType({
-          testId: 'nonexistent',
-          text: 'test',
-        });
-
-        // Assert
-        expect(result.ok).toBe(false);
-        if (!result.ok) {
-          expect(result.error.code).toBe(ErrorCodes.MM_WAIT_TIMEOUT);
-        }
-      });
-    });
-
-    describe('without active session', () => {
-      it('returns error when no session active', async () => {
-        // Arrange
-        vi.spyOn(mockSessionManager, 'hasActiveSession').mockReturnValue(false);
-
-        // Act
-        const result = await handleType({ testId: 'input', text: 'test' });
-
-        // Assert
-        expect(result.ok).toBe(false);
-        if (!result.ok) {
-          expect(result.error.code).toBe(ErrorCodes.MM_NO_ACTIVE_SESSION);
-        }
-      });
-    });
-  });
-
-  describe('handleWaitFor', () => {
-    describe('with testId target', () => {
-      it('waits for element by testId', async () => {
-        // Arrange
-        const mockPage = createMockPage();
-        const mockLocator = createMockLocator();
-        vi.spyOn(mockPage, 'locator').mockReturnValue(mockLocator);
-        vi.spyOn(mockSessionManager, 'getPage').mockReturnValue(mockPage);
-        vi.spyOn(mockSessionManager, 'getRefMap').mockReturnValue(new Map());
-
-        vi.spyOn(discoveryModule, 'waitForTarget').mockResolvedValue(
-          mockLocator as any,
-        );
-
-        // Act
-        const result = await handleWaitFor({ testId: 'loading-spinner' });
-
-        // Assert
-        expect(result.ok).toBe(true);
-        if (result.ok) {
-          expect(result.result.found).toBe(true);
-          expect(result.result.target).toBe('testId:loading-spinner');
-        }
-        expect(discoveryModule.waitForTarget).toHaveBeenCalledWith(
-          mockPage,
-          'testId',
-          'loading-spinner',
-          expect.any(Map),
-          15000,
-        );
-      });
-
-      it('uses custom timeout when provided', async () => {
-        // Arrange
-        const mockPage = createMockPage();
-        const mockLocator = createMockLocator();
-        vi.spyOn(mockSessionManager, 'getPage').mockReturnValue(mockPage);
-        vi.spyOn(mockSessionManager, 'getRefMap').mockReturnValue(new Map());
-
-        vi.spyOn(discoveryModule, 'waitForTarget').mockResolvedValue(
-          mockLocator as any,
-        );
-
-        // Act
-        await handleWaitFor({ testId: 'element', timeoutMs: 30000 });
-
-        // Assert
-        expect(discoveryModule.waitForTarget).toHaveBeenCalledWith(
-          mockPage,
-          'testId',
-          'element',
-          expect.any(Map),
-          30000,
-        );
-      });
-    });
-
-    describe('with selector target', () => {
-      it('waits for element by CSS selector', async () => {
-        // Arrange
-        const mockPage = createMockPage();
-        const mockLocator = createMockLocator();
-        vi.spyOn(mockSessionManager, 'getPage').mockReturnValue(mockPage);
-        vi.spyOn(mockSessionManager, 'getRefMap').mockReturnValue(new Map());
-
-        vi.spyOn(discoveryModule, 'waitForTarget').mockResolvedValue(
-          mockLocator as any,
-        );
-
-        // Act
-        const result = await handleWaitFor({ selector: '.success-message' });
-
-        // Assert
-        expect(result.ok).toBe(true);
-        if (result.ok) {
-          expect(result.result.found).toBe(true);
-          expect(result.result.target).toBe('selector:.success-message');
-        }
-        expect(discoveryModule.waitForTarget).toHaveBeenCalledWith(
-          mockPage,
-          'selector',
-          '.success-message',
-          expect.any(Map),
-          15000,
-        );
-      });
-    });
-
-    describe('with a11yRef target', () => {
-      it('waits for element by accessibility reference', async () => {
-        // Arrange
-        const mockPage = createMockPage();
-        const mockLocator = createMockLocator();
-        const refMap = new Map([['e10', 'button[aria-label="Confirm"]']]);
-        vi.spyOn(mockSessionManager, 'getPage').mockReturnValue(mockPage);
-        vi.spyOn(mockSessionManager, 'getRefMap').mockReturnValue(refMap);
-
-        vi.spyOn(discoveryModule, 'waitForTarget').mockResolvedValue(
-          mockLocator as any,
-        );
-
-        // Act
-        const result = await handleWaitFor({ a11yRef: 'e10' });
-
-        // Assert
-        expect(result.ok).toBe(true);
-        if (result.ok) {
-          expect(result.result.found).toBe(true);
-          expect(result.result.target).toBe('a11yRef:e10');
-        }
-        expect(discoveryModule.waitForTarget).toHaveBeenCalledWith(
-          mockPage,
-          'a11yRef',
-          'e10',
-          refMap,
-          15000,
-        );
-      });
-    });
-
-    describe('with invalid target selection', () => {
-      it('returns error when no target specified', async () => {
-        // Act
-        const result = await handleWaitFor({} as any);
-
-        // Assert
-        expect(result.ok).toBe(false);
-        if (!result.ok) {
-          expect(result.error.code).toBe(ErrorCodes.MM_INVALID_INPUT);
-          expect(result.error.message).toContain('Exactly one');
-        }
-      });
-
-      it('returns error when multiple targets specified', async () => {
-        // Act
-        const result = await handleWaitFor({
-          testId: 'element',
-          selector: '.element',
-        } as any);
-
-        // Assert
-        expect(result.ok).toBe(false);
-        if (!result.ok) {
-          expect(result.error.code).toBe(ErrorCodes.MM_INVALID_INPUT);
-          expect(result.error.message).toContain('Exactly one');
-        }
-      });
-
-      it('returns error when validation result is invalid but not caught by isInvalidTargetSelection', async () => {
-        // Arrange
-        vi.spyOn(utilsModule, 'validateTargetSelection').mockReturnValue({
-          valid: true,
-          // Missing type and value properties - will fail isValidTargetSelection
-        } as any);
-
-        // Act
-        const result = await handleWaitFor({ testId: 'element' });
-
-        // Assert
-        expect(result.ok).toBe(false);
-        if (!result.ok) {
-          expect(result.error.code).toBe(ErrorCodes.MM_INVALID_INPUT);
-          expect(result.error.message).toBe('Invalid target selection');
-        }
-      });
-    });
-
-    describe('with timeout errors', () => {
-      it('returns error when element not found within timeout', async () => {
-        // Arrange
-        const mockPage = createMockPage();
-        vi.spyOn(mockSessionManager, 'getPage').mockReturnValue(mockPage);
-        vi.spyOn(mockSessionManager, 'getRefMap').mockReturnValue(new Map());
-
-        vi.spyOn(discoveryModule, 'waitForTarget').mockRejectedValue(
-          new Error('Timeout 15000ms exceeded'),
-        );
-
-        // Act
-        const result = await handleWaitFor({ testId: 'nonexistent' });
-
-        // Assert
-        expect(result.ok).toBe(false);
-        if (!result.ok) {
-          expect(result.error.code).toBe(ErrorCodes.MM_WAIT_TIMEOUT);
-        }
-      });
-
-      it('returns error when page closed during wait', async () => {
-        // Arrange
-        const mockPage = createMockPage();
-        vi.spyOn(mockSessionManager, 'getPage').mockReturnValue(mockPage);
-        vi.spyOn(mockSessionManager, 'getRefMap').mockReturnValue(new Map());
-
-        vi.spyOn(discoveryModule, 'waitForTarget').mockRejectedValue(
-          new Error('Target page has been closed'),
-        );
-
-        // Act
-        const result = await handleWaitFor({ testId: 'element' });
-
-        // Assert
-        expect(result.ok).toBe(false);
-        if (!result.ok) {
-          expect(result.error.code).toBe(ErrorCodes.MM_WAIT_TIMEOUT);
-        }
-      });
-    });
-
-    describe('without active session', () => {
-      it('returns error when no session active', async () => {
-        // Arrange
-        vi.spyOn(mockSessionManager, 'hasActiveSession').mockReturnValue(false);
-
-        // Act
-        const result = await handleWaitFor({ testId: 'element' });
-
-        // Assert
-        expect(result.ok).toBe(false);
-        if (!result.ok) {
-          expect(result.error.code).toBe(ErrorCodes.MM_NO_ACTIVE_SESSION);
-        }
-      });
-    });
-  });
-});
diff --git a/src/mcp-server/tools/interaction.ts b/src/mcp-server/tools/interaction.ts
deleted file mode 100644
index 80c02e1..0000000
--- a/src/mcp-server/tools/interaction.ts
+++ /dev/null
@@ -1,296 +0,0 @@
-import { DEFAULT_INTERACTION_TIMEOUT_MS } from '../constants.js';
-import { waitForTarget } from '../discovery.js';
-import { getSessionManager } from '../session-manager.js';
-import {
-  classifyClickError,
-  classifyTypeError,
-  classifyWaitError,
-  isPageClosedError,
-} from './error-classification.js';
-import { runTool } from './run-tool.js';
-import type {
-  ClickInput,
-  ClickResult,
-  TypeInput,
-  TypeResult,
-  WaitForInput,
-  WaitForResult,
-  McpResponse,
-  HandlerOptions,
-} from '../types';
-import { ErrorCodes } from '../types';
-import {
-  createErrorResponse,
-  validateTargetSelection,
-  isValidTargetSelection,
-  isInvalidTargetSelection,
-} from '../utils';
-
-/**
- * Handles clicking on an element specified by testId, selector, or accessibility reference.
- *
- * @param input The click input containing target selection and timeout options
- * @param options Optional handler configuration
- * @returns Promise resolving to click result with target information
- */
-export async function handleClick(
-  input: ClickInput,
-  options?: HandlerOptions,
-): Promise<McpResponse<ClickResult>> {
-  const startTime = Date.now();
-  const sessionManager = getSessionManager();
-  const sessionId = sessionManager.getSessionId();
-  const timeoutMs = input.timeoutMs ?? DEFAULT_INTERACTION_TIMEOUT_MS;
-
-  const validation = validateTargetSelection(input);
-  if (isInvalidTargetSelection(validation)) {
-    return createErrorResponse(
-      ErrorCodes.MM_INVALID_INPUT,
-      validation.error,
-      { input },
-      sessionId,
-      startTime,
-    );
-  }
-
-  if (!isValidTargetSelection(validation)) {
-    return createErrorResponse(
-      ErrorCodes.MM_INVALID_INPUT,
-      'Invalid target selection',
-      { input },
-      sessionId,
-      startTime,
-    );
-  }
-
-  const { type: targetType, value: targetValue } = validation;
-
-  return runTool<ClickInput, ClickResult>({
-    toolName: 'mm_click',
-    input,
-    options,
-
-    /**
-     * Executes the click action on the target element.
-     *
-     * @param context The tool execution context containing page and reference map
-     * @returns Promise resolving to click result with success status and target info
-     */
-    execute: async (context) => {
-      const locator = await waitForTarget(
-        context.page,
-        targetType,
-        targetValue,
-        context.refMap,
-        timeoutMs,
-      );
-
-      try {
-        await locator.click();
-        return {
-          clicked: true,
-          target: `${targetType}:${targetValue}`,
-        };
-      } catch (clickError) {
-        if (isPageClosedError(clickError)) {
-          return {
-            clicked: true,
-            target: `${targetType}:${targetValue}`,
-            pageClosedAfterClick: true,
-          };
-        }
-        throw clickError;
-      }
-    },
-
-    /**
-     * Returns the target element information for recording.
-     *
-     * @returns Object containing the target type and value
-     */
-    getTarget: () => ({ [targetType]: targetValue }),
-
-    classifyError: classifyClickError,
-
-    /**
-     * Sanitizes input for knowledge store recording.
-     *
-     * @returns Sanitized input object with timeout information
-     */
-    sanitizeInputForRecording: () => ({ timeoutMs }),
-  });
-}
-
-/**
- * Handles typing text into an element specified by testId, selector, or accessibility reference.
- *
- * @param input The type input containing target selection, text, and timeout options
- * @param options Optional handler configuration
- * @returns Promise resolving to type result with target and text length information
- */
-export async function handleType(
-  input: TypeInput,
-  options?: HandlerOptions,
-): Promise<McpResponse<TypeResult>> {
-  const startTime = Date.now();
-  const sessionManager = getSessionManager();
-  const sessionId = sessionManager.getSessionId();
-  const timeoutMs = input.timeoutMs ?? DEFAULT_INTERACTION_TIMEOUT_MS;
-
-  const validation = validateTargetSelection(input);
-  if (isInvalidTargetSelection(validation)) {
-    return createErrorResponse(
-      ErrorCodes.MM_INVALID_INPUT,
-      validation.error,
-      { input },
-      sessionId,
-      startTime,
-    );
-  }
-
-  if (!isValidTargetSelection(validation)) {
-    return createErrorResponse(
-      ErrorCodes.MM_INVALID_INPUT,
-      'Invalid target selection',
-      { input },
-      sessionId,
-      startTime,
-    );
-  }
-
-  const { type: targetType, value: targetValue } = validation;
-
-  return runTool<TypeInput, TypeResult>({
-    toolName: 'mm_type',
-    input,
-    options,
-
-    /**
-     * Executes the type action on the target element.
-     *
-     * @param context The tool execution context containing page and reference map
-     * @returns Promise resolving to type result with success status and text length
-     */
-    execute: async (context) => {
-      const locator = await waitForTarget(
-        context.page,
-        targetType,
-        targetValue,
-        context.refMap,
-        timeoutMs,
-      );
-      await locator.fill(input.text);
-
-      return {
-        typed: true,
-        target: `${targetType}:${targetValue}`,
-        textLength: input.text.length,
-      };
-    },
-
-    /**
-     * Returns the target element information for recording.
-     *
-     * @returns Object containing the target type and value
-     */
-    getTarget: () => ({ [targetType]: targetValue }),
-
-    classifyError: classifyTypeError,
-
-    /**
-     * Sanitizes input for knowledge store recording.
-     *
-     * @returns Sanitized input object with timeout and text information
-     */
-    sanitizeInputForRecording: () => ({
-      timeoutMs,
-      text: input.text,
-      testId: input.testId,
-      selector: input.selector,
-      a11yRef: input.a11yRef,
-    }),
-  });
-}
-
-/**
- * Handles waiting for an element to become visible.
- *
- * @param input The wait input containing target selection and timeout options
- * @param options Optional handler configuration
- * @returns Promise resolving to wait result with target information
- */
-export async function handleWaitFor(
-  input: WaitForInput,
-  options?: HandlerOptions,
-): Promise<McpResponse<WaitForResult>> {
-  const startTime = Date.now();
-  const sessionManager = getSessionManager();
-  const sessionId = sessionManager.getSessionId();
-  const timeoutMs = input.timeoutMs ?? DEFAULT_INTERACTION_TIMEOUT_MS;
-
-  const validation = validateTargetSelection(input);
-  if (isInvalidTargetSelection(validation)) {
-    return createErrorResponse(
-      ErrorCodes.MM_INVALID_INPUT,
-      validation.error,
-      { input },
-      sessionId,
-      startTime,
-    );
-  }
-
-  if (!isValidTargetSelection(validation)) {
-    return createErrorResponse(
-      ErrorCodes.MM_INVALID_INPUT,
-      'Invalid target selection',
-      { input },
-      sessionId,
-      startTime,
-    );
-  }
-
-  const { type: targetType, value: targetValue } = validation;
-
-  return runTool<WaitForInput, WaitForResult>({
-    toolName: 'mm_wait_for',
-    input,
-    options,
-
-    /**
-     * Executes the wait action for the target element.
-     *
-     * @param context The tool execution context containing page and reference map
-     * @returns Promise resolving to wait result with success status and target info
-     */
-    execute: async (context) => {
-      await waitForTarget(
-        context.page,
-        targetType,
-        targetValue,
-        context.refMap,
-        timeoutMs,
-      );
-
-      return {
-        found: true,
-        target: `${targetType}:${targetValue}`,
-      };
-    },
-
-    /**
-     * Returns the target element information for recording.
-     *
-     * @returns Object containing the target type and value
-     */
-    getTarget: () => ({ [targetType]: targetValue }),
-
-    classifyError: classifyWaitError,
-
-    /**
-     * Sanitizes input for knowledge store recording.
-     *
-     * @returns Sanitized input object with timeout information
-     */
-    sanitizeInputForRecording: () => ({ timeoutMs }),
-  });
-}
diff --git a/src/mcp-server/tools/knowledge.ts b/src/mcp-server/tools/knowledge.ts
deleted file mode 100644
index eddbfff..0000000
--- a/src/mcp-server/tools/knowledge.ts
+++ /dev/null
@@ -1,212 +0,0 @@
-import { knowledgeStore } from '../knowledge-store.js';
-import { getSessionManager } from '../session-manager.js';
-import type {
-  KnowledgeLastInput,
-  KnowledgeLastResult,
-  KnowledgeSearchInput,
-  KnowledgeSearchResult,
-  KnowledgeSummarizeInput,
-  KnowledgeSummarizeResult,
-  KnowledgeSessionsInput,
-  KnowledgeSessionsResult,
-  KnowledgeScope,
-  McpResponse,
-  HandlerOptions,
-} from '../types';
-import { ErrorCodes } from '../types';
-import {
-  createSuccessResponse,
-  createErrorResponse,
-  extractErrorMessage,
-} from '../utils';
-
-/**
- * Handles retrieving the last N steps from knowledge store.
- *
- * @param input - Input with number of steps and scope.
- * @param _options - Handler options (unused).
- * @returns Response with step records.
- */
-export async function handleKnowledgeLast(
-  input: KnowledgeLastInput,
-  _options?: HandlerOptions,
-): Promise<McpResponse<KnowledgeLastResult>> {
-  const startTime = Date.now();
-  const sessionManager = getSessionManager();
-  const sessionId = sessionManager.getSessionId();
-  const nSteps = input.n ?? 20;
-  const scope: KnowledgeScope = input.scope ?? 'current';
-
-  try {
-    const steps = await knowledgeStore.getLastSteps(
-      nSteps,
-      scope,
-      sessionId,
-      input.filters,
-    );
-
-    return createSuccessResponse<KnowledgeLastResult>(
-      { steps },
-      sessionId,
-      startTime,
-    );
-  } catch (error) {
-    const message = extractErrorMessage(error);
-    return createErrorResponse(
-      ErrorCodes.MM_KNOWLEDGE_ERROR,
-      `Failed to retrieve steps: ${message}`,
-      { nSteps, scope },
-      sessionId,
-      startTime,
-    );
-  }
-}
-
-/**
- * Handles searching step records in knowledge store.
- *
- * @param input - Input with search query and filters.
- * @param _options - Handler options (unused).
- * @returns Response with matching steps.
- */
-export async function handleKnowledgeSearch(
-  input: KnowledgeSearchInput,
-  _options?: HandlerOptions,
-): Promise<McpResponse<KnowledgeSearchResult>> {
-  const startTime = Date.now();
-  const sessionManager = getSessionManager();
-  const sessionId = sessionManager.getSessionId();
-  const limit = input.limit ?? 20;
-  const scope: KnowledgeScope = input.scope ?? 'all';
-
-  try {
-    const matches = await knowledgeStore.searchSteps(
-      input.query,
-      limit,
-      scope,
-      sessionId,
-      input.filters,
-    );
-
-    return createSuccessResponse<KnowledgeSearchResult>(
-      {
-        matches,
-        query: input.query,
-      },
-      sessionId,
-      startTime,
-    );
-  } catch (error) {
-    const message = extractErrorMessage(error);
-    return createErrorResponse(
-      ErrorCodes.MM_KNOWLEDGE_ERROR,
-      `Search failed: ${message}`,
-      { query: input.query, limit, scope },
-      sessionId,
-      startTime,
-    );
-  }
-}
-
-/**
- * Handles summarizing a session's steps as a recipe.
- *
- * @param input - Input with session scope or ID.
- * @param _options - Handler options (unused).
- * @returns Response with session summary.
- */
-export async function handleKnowledgeSummarize(
-  input: KnowledgeSummarizeInput,
-  _options?: HandlerOptions,
-): Promise<McpResponse<KnowledgeSummarizeResult>> {
-  const startTime = Date.now();
-  const sessionManager = getSessionManager();
-  const currentSessionId = sessionManager.getSessionId();
-
-  let targetSessionId: string | undefined;
-
-  if (input.sessionId) {
-    targetSessionId = input.sessionId;
-  } else if (input.scope) {
-    if (input.scope === 'all') {
-      return createErrorResponse(
-        ErrorCodes.MM_INVALID_INPUT,
-        'Cannot summarize all sessions. Use scope="current" or provide a specific sessionId.',
-        { input },
-        currentSessionId,
-        startTime,
-      );
-    } else if (input.scope === 'current') {
-      targetSessionId = currentSessionId;
-    } else if (typeof input.scope === 'object' && 'sessionId' in input.scope) {
-      targetSessionId = input.scope.sessionId;
-    }
-  } else {
-    targetSessionId = currentSessionId;
-  }
-
-  if (!targetSessionId) {
-    return createErrorResponse(
-      ErrorCodes.MM_INVALID_INPUT,
-      'No sessionId provided and no active session',
-      { input },
-      undefined,
-      startTime,
-    );
-  }
-
-  try {
-    const summary = await knowledgeStore.summarizeSession(targetSessionId);
-
-    return createSuccessResponse<KnowledgeSummarizeResult>(
-      summary,
-      targetSessionId,
-      startTime,
-    );
-  } catch (error) {
-    const message = extractErrorMessage(error);
-    return createErrorResponse(
-      ErrorCodes.MM_KNOWLEDGE_ERROR,
-      `Summarize failed: ${message}`,
-      { sessionId: targetSessionId },
-      targetSessionId,
-      startTime,
-    );
-  }
-}
-
-/**
- * Handles listing recent sessions with metadata.
- *
- * @param input - Input with limit and filters.
- * @param _options - Handler options (unused).
- * @returns Response with session list.
- */
-export async function handleKnowledgeSessions(
-  input: KnowledgeSessionsInput,
-  _options?: HandlerOptions,
-): Promise<McpResponse<KnowledgeSessionsResult>> {
-  const startTime = Date.now();
-  const sessionManager = getSessionManager();
-  const sessionId = sessionManager.getSessionId();
-  const limit = input.limit ?? 10;
-
-  try {
-    const sessions = await knowledgeStore.listSessions(limit, input.filters);
-
-    return createSuccessResponse<KnowledgeSessionsResult>(
-      { sessions },
-      sessionId,
-      startTime,
-    );
-  } catch (error) {
-    const message = extractErrorMessage(error);
-    return createErrorResponse(
-      ErrorCodes.MM_KNOWLEDGE_ERROR,
-      `Failed to list sessions: ${message}`,
-      { limit, filters: input.filters },
-      sessionId,
-      startTime,
-    );
-  }
-}
diff --git a/src/mcp-server/tools/launch.test.ts b/src/mcp-server/tools/launch.test.ts
deleted file mode 100644
index 81cab1b..0000000
--- a/src/mcp-server/tools/launch.test.ts
+++ /dev/null
@@ -1,384 +0,0 @@
-/**
- * Unit tests for launch tool handler.
- *
- * Tests session launch with various states and error scenarios.
- */
-
-import { describe, it, expect, vi, beforeEach } from 'vitest';
-
-import { handleLaunch } from './launch.js';
-import type { ExtensionState } from '../../capabilities/types.js';
-import * as sessionManagerModule from '../session-manager.js';
-import type { SessionLaunchResult } from '../session-manager.js';
-import { createMockSessionManager } from '../test-utils/mock-factories.js';
-import { ErrorCodes } from '../types';
-import type { LaunchInput } from '../types';
-
-describe('handleLaunch', () => {
-  beforeEach(() => {
-    vi.clearAllMocks();
-  });
-
-  describe('successful launch', () => {
-    it('returns session info on successful launch', async () => {
-      const mockState: ExtensionState = {
-        isLoaded: true,
-        currentUrl: 'chrome-extension://ext-123/home.html',
-        extensionId: 'ext-123',
-        isUnlocked: false,
-        currentScreen: 'home',
-        accountAddress: null,
-        networkName: null,
-        chainId: null,
-        balance: null,
-      };
-
-      const mockLaunchResult: SessionLaunchResult = {
-        sessionId: 'test-session-123',
-        extensionId: 'ext-123',
-        state: mockState,
-      };
-
-      const mockSessionManager = createMockSessionManager({
-        hasActive: false,
-        launchResult: mockLaunchResult,
-      });
-      vi.spyOn(sessionManagerModule, 'getSessionManager').mockReturnValue(
-        mockSessionManager,
-      );
-
-      const input: LaunchInput = { stateMode: 'default' };
-
-      const result = await handleLaunch(input);
-
-      expect(result.ok).toBe(true);
-      if (result.ok) {
-        expect(result.result.sessionId).toBe('test-session-123');
-        expect(result.result.extensionId).toBe('ext-123');
-        expect(result.result.state).toStrictEqual(mockState);
-        expect(result.meta.sessionId).toBe('test-session-123');
-      }
-      expect(mockSessionManager.launch).toHaveBeenCalledWith(input);
-    });
-
-    it('includes prerequisites in prod mode', async () => {
-      const mockState: ExtensionState = {
-        isLoaded: true,
-        currentUrl: 'chrome-extension://ext-456/home.html',
-        extensionId: 'ext-456',
-        isUnlocked: true,
-        currentScreen: 'home',
-        accountAddress: '0x1234',
-        networkName: 'Ethereum Mainnet',
-        chainId: 1,
-        balance: '10 ETH',
-      };
-
-      const mockLaunchResult: SessionLaunchResult = {
-        sessionId: 'prod-session-456',
-        extensionId: 'ext-456',
-        state: mockState,
-      };
-
-      const mockSessionManager = createMockSessionManager({
-        hasActive: false,
-        launchResult: mockLaunchResult,
-        environmentMode: 'prod',
-      });
-      vi.spyOn(sessionManagerModule, 'getSessionManager').mockReturnValue(
-        mockSessionManager,
-      );
-
-      const input: LaunchInput = { stateMode: 'default' };
-
-      const result = await handleLaunch(input);
-
-      expect(result.ok).toBe(true);
-      if (result.ok) {
-        expect(result.result.prerequisites).toBeDefined();
-        expect(result.result.prerequisites).toHaveLength(3);
-        expect(result.result.prerequisites?.[0].step).toBe('Unlock Wallet');
-        expect(result.result.prerequisites?.[1].step).toBe('Configure Network');
-        expect(result.result.prerequisites?.[2].step).toBe('Set Up Accounts');
-      }
-    });
-
-    it('does not include prerequisites in e2e mode', async () => {
-      const mockState: ExtensionState = {
-        isLoaded: true,
-        currentUrl: 'chrome-extension://ext-123/home.html',
-        extensionId: 'ext-123',
-        isUnlocked: false,
-        currentScreen: 'home',
-        accountAddress: null,
-        networkName: null,
-        chainId: null,
-        balance: null,
-      };
-
-      const mockLaunchResult: SessionLaunchResult = {
-        sessionId: 'e2e-session-789',
-        extensionId: 'ext-123',
-        state: mockState,
-      };
-
-      const mockSessionManager = createMockSessionManager({
-        hasActive: false,
-        launchResult: mockLaunchResult,
-        environmentMode: 'e2e',
-      });
-      vi.spyOn(sessionManagerModule, 'getSessionManager').mockReturnValue(
-        mockSessionManager,
-      );
-
-      const input: LaunchInput = { stateMode: 'default' };
-
-      const result = await handleLaunch(input);
-
-      expect(result.ok).toBe(true);
-      if (result.ok) {
-        expect(result.result.prerequisites).toBeUndefined();
-      }
-    });
-
-    it('passes through all launch input parameters', async () => {
-      const mockState: ExtensionState = {
-        isLoaded: true,
-        currentUrl: 'chrome-extension://ext-123/home.html',
-        extensionId: 'ext-123',
-        isUnlocked: false,
-        currentScreen: 'home',
-        accountAddress: null,
-        networkName: null,
-        chainId: null,
-        balance: null,
-      };
-
-      const mockLaunchResult: SessionLaunchResult = {
-        sessionId: 'custom-session',
-        extensionId: 'ext-123',
-        state: mockState,
-      };
-
-      const mockSessionManager = createMockSessionManager({
-        hasActive: false,
-        launchResult: mockLaunchResult,
-      });
-      vi.spyOn(sessionManagerModule, 'getSessionManager').mockReturnValue(
-        mockSessionManager,
-      );
-
-      const input: LaunchInput = {
-        stateMode: 'custom',
-        fixturePreset: 'test-preset',
-        autoBuild: false,
-        slowMo: 100,
-        goal: 'Test send flow',
-        flowTags: ['send', 'transaction'],
-        tags: ['smoke-test'],
-        seedContracts: ['hst', 'nfts'],
-        ports: {
-          anvil: 8546,
-          fixtureServer: 12346,
-        },
-      };
-
-      const result = await handleLaunch(input);
-
-      expect(result.ok).toBe(true);
-      expect(mockSessionManager.launch).toHaveBeenCalledWith(input);
-    });
-  });
-
-  describe('session already running', () => {
-    it('returns error when session already active', async () => {
-      const mockSessionManager = createMockSessionManager({
-        hasActive: true,
-        sessionId: 'existing-session-999',
-      });
-      vi.spyOn(sessionManagerModule, 'getSessionManager').mockReturnValue(
-        mockSessionManager,
-      );
-
-      const input: LaunchInput = { stateMode: 'default' };
-
-      const result = await handleLaunch(input);
-
-      expect(result.ok).toBe(false);
-      if (!result.ok) {
-        expect(result.error.code).toBe(ErrorCodes.MM_SESSION_ALREADY_RUNNING);
-        expect(result.error.message).toBe(
-          'A session is already running. Call mm_cleanup first.',
-        );
-        expect(result.error.details).toStrictEqual({
-          currentSessionId: 'existing-session-999',
-        });
-        expect(result.meta.sessionId).toBe('existing-session-999');
-      }
-      expect(mockSessionManager.launch).not.toHaveBeenCalled();
-    });
-  });
-
-  describe('launch failures', () => {
-    it('returns port conflict error for EADDRINUSE', async () => {
-      const mockSessionManager = createMockSessionManager({ hasActive: false });
-      vi.spyOn(mockSessionManager, 'launch').mockRejectedValue(
-        new Error('listen EADDRINUSE: address already in use :::8545'),
-      );
-      vi.spyOn(sessionManagerModule, 'getSessionManager').mockReturnValue(
-        mockSessionManager,
-      );
-
-      const input: LaunchInput = { stateMode: 'default' };
-
-      const result = await handleLaunch(input);
-
-      expect(result.ok).toBe(false);
-      if (!result.ok) {
-        expect(result.error.code).toBe(ErrorCodes.MM_PORT_IN_USE);
-        expect(result.error.message).toContain('Port conflict');
-        expect(result.error.message).toContain('EADDRINUSE');
-        expect(result.error.details).toStrictEqual({ input });
-      }
-    });
-
-    it('returns port conflict error for port keyword in message', async () => {
-      const mockSessionManager = createMockSessionManager({ hasActive: false });
-      vi.spyOn(mockSessionManager, 'launch').mockRejectedValue(
-        new Error('port 8545 is already in use'),
-      );
-      vi.spyOn(sessionManagerModule, 'getSessionManager').mockReturnValue(
-        mockSessionManager,
-      );
-
-      const input: LaunchInput = { stateMode: 'default' };
-
-      const result = await handleLaunch(input);
-
-      expect(result.ok).toBe(false);
-      if (!result.ok) {
-        expect(result.error.code).toBe(ErrorCodes.MM_PORT_IN_USE);
-        expect(result.error.message).toContain('Port conflict');
-      }
-    });
-
-    it('returns generic launch failed error for other errors', async () => {
-      const mockSessionManager = createMockSessionManager({ hasActive: false });
-      vi.spyOn(mockSessionManager, 'launch').mockRejectedValue(
-        new Error('Browser failed to start'),
-      );
-      vi.spyOn(sessionManagerModule, 'getSessionManager').mockReturnValue(
-        mockSessionManager,
-      );
-
-      const input: LaunchInput = { stateMode: 'default' };
-
-      const result = await handleLaunch(input);
-
-      expect(result.ok).toBe(false);
-      if (!result.ok) {
-        expect(result.error.code).toBe(ErrorCodes.MM_LAUNCH_FAILED);
-        expect(result.error.message).toContain('Launch failed');
-        expect(result.error.message).toContain('Browser failed to start');
-        expect(result.error.details).toStrictEqual({ input });
-      }
-    });
-
-    it('handles non-Error exceptions', async () => {
-      const mockSessionManager = createMockSessionManager({ hasActive: false });
-      vi.spyOn(mockSessionManager, 'launch').mockRejectedValue('string error');
-      vi.spyOn(sessionManagerModule, 'getSessionManager').mockReturnValue(
-        mockSessionManager,
-      );
-
-      const input: LaunchInput = { stateMode: 'default' };
-
-      const result = await handleLaunch(input);
-
-      expect(result.ok).toBe(false);
-      if (!result.ok) {
-        expect(result.error.code).toBe(ErrorCodes.MM_LAUNCH_FAILED);
-        expect(result.error.message).toContain('Launch failed');
-      }
-    });
-  });
-
-  describe('response metadata', () => {
-    it('includes timestamp in response', async () => {
-      const mockState: ExtensionState = {
-        isLoaded: true,
-        currentUrl: 'chrome-extension://ext-123/home.html',
-        extensionId: 'ext-123',
-        isUnlocked: false,
-        currentScreen: 'home',
-        accountAddress: null,
-        networkName: null,
-        chainId: null,
-        balance: null,
-      };
-
-      const mockLaunchResult: SessionLaunchResult = {
-        sessionId: 'test-session-123',
-        extensionId: 'ext-123',
-        state: mockState,
-      };
-
-      const mockSessionManager = createMockSessionManager({
-        hasActive: false,
-        launchResult: mockLaunchResult,
-      });
-      vi.spyOn(sessionManagerModule, 'getSessionManager').mockReturnValue(
-        mockSessionManager,
-      );
-
-      const input: LaunchInput = { stateMode: 'default' };
-
-      const result = await handleLaunch(input);
-
-      expect(result.ok).toBe(true);
-      if (result.ok) {
-        expect(result.meta.timestamp).toBeDefined();
-        expect(typeof result.meta.timestamp).toBe('string');
-        expect(new Date(result.meta.timestamp).getTime()).toBeGreaterThan(0);
-      }
-    });
-
-    it('includes durationMs in response', async () => {
-      const mockState: ExtensionState = {
-        isLoaded: true,
-        currentUrl: 'chrome-extension://ext-123/home.html',
-        extensionId: 'ext-123',
-        isUnlocked: false,
-        currentScreen: 'home',
-        accountAddress: null,
-        networkName: null,
-        chainId: null,
-        balance: null,
-      };
-
-      const mockLaunchResult: SessionLaunchResult = {
-        sessionId: 'test-session-123',
-        extensionId: 'ext-123',
-        state: mockState,
-      };
-
-      const mockSessionManager = createMockSessionManager({
-        hasActive: false,
-        launchResult: mockLaunchResult,
-      });
-      vi.spyOn(sessionManagerModule, 'getSessionManager').mockReturnValue(
-        mockSessionManager,
-      );
-
-      const input: LaunchInput = { stateMode: 'default' };
-
-      const result = await handleLaunch(input);
-
-      expect(result.ok).toBe(true);
-      if (result.ok) {
-        expect(result.meta.durationMs).toBeGreaterThanOrEqual(0);
-        expect(typeof result.meta.durationMs).toBe('number');
-      }
-    });
-  });
-});
diff --git a/src/mcp-server/tools/launch.ts b/src/mcp-server/tools/launch.ts
deleted file mode 100644
index 11b5d3c..0000000
--- a/src/mcp-server/tools/launch.ts
+++ /dev/null
@@ -1,93 +0,0 @@
-import { getSessionManager } from '../session-manager.js';
-import type {
-  LaunchInput,
-  LaunchResult,
-  LaunchPrerequisite,
-  McpResponse,
-  HandlerOptions,
-} from '../types';
-import { ErrorCodes } from '../types';
-import {
-  createSuccessResponse,
-  createErrorResponse,
-  extractErrorMessage,
-} from '../utils';
-
-const PROD_MODE_PREREQUISITES: LaunchPrerequisite[] = [
-  {
-    step: 'Unlock Wallet',
-    description:
-      'The wallet must be unlocked before interacting with it. Use the extension UI to enter your password.',
-  },
-  {
-    step: 'Configure Network',
-    description:
-      'Ensure the correct network is selected (e.g., Ethereum Mainnet, Sepolia, or custom network).',
-  },
-  {
-    step: 'Set Up Accounts',
-    description:
-      'Import or create accounts as needed. Ensure the active account has sufficient funds for transactions.',
-  },
-];
-
-/**
- * Handles the launch tool request to start a browser session.
- *
- * @param input - The launch configuration parameters.
- * @param _options - Handler options (unused).
- * @returns Response with session info or error.
- */
-export async function handleLaunch(
-  input: LaunchInput,
-  _options?: HandlerOptions,
-): Promise<McpResponse<LaunchResult>> {
-  const startTime = Date.now();
-  const sessionManager = getSessionManager();
-
-  try {
-    if (sessionManager.hasActiveSession()) {
-      return createErrorResponse(
-        ErrorCodes.MM_SESSION_ALREADY_RUNNING,
-        'A session is already running. Call mm_cleanup first.',
-        { currentSessionId: sessionManager.getSessionId() },
-        sessionManager.getSessionId(),
-        startTime,
-      );
-    }
-
-    const result = await sessionManager.launch(input);
-
-    const isProdMode = sessionManager.getEnvironmentMode() === 'prod';
-    const launchResult: LaunchResult = {
-      ...result,
-      ...(isProdMode && { prerequisites: PROD_MODE_PREREQUISITES }),
-    };
-
-    return createSuccessResponse<LaunchResult>(
-      launchResult,
-      result.sessionId,
-      startTime,
-    );
-  } catch (error) {
-    const message = extractErrorMessage(error);
-
-    if (message.includes('EADDRINUSE') || message.includes('port')) {
-      return createErrorResponse(
-        ErrorCodes.MM_PORT_IN_USE,
-        `Port conflict: ${message}`,
-        { input },
-        undefined,
-        startTime,
-      );
-    }
-
-    return createErrorResponse(
-      ErrorCodes.MM_LAUNCH_FAILED,
-      `Launch failed: ${message}`,
-      { input },
-      undefined,
-      startTime,
-    );
-  }
-}
diff --git a/src/mcp-server/tools/navigation.test.ts b/src/mcp-server/tools/navigation.test.ts
deleted file mode 100644
index 84cbdca..0000000
--- a/src/mcp-server/tools/navigation.test.ts
+++ /dev/null
@@ -1,787 +0,0 @@
-/**
- * Unit tests for navigation tool handlers.
- *
- * Tests handleNavigate, handleWaitForNotification, handleSwitchToTab, and handleCloseTab
- * with various navigation targets, tab operations, and error scenarios.
- */
-
-import { describe, it, expect, vi, beforeEach, afterEach } from 'vitest';
-
-import {
-  handleNavigate,
-  handleWaitForNotification,
-  handleSwitchToTab,
-  handleCloseTab,
-} from './navigation';
-import * as knowledgeStoreModule from '../knowledge-store.js';
-import * as sessionManagerModule from '../session-manager.js';
-import { createMockSessionManager, createMockPage } from '../test-utils';
-import { ErrorCodes } from '../types';
-
-describe('navigation', () => {
-  let mockSessionManager: ReturnType<typeof createMockSessionManager>;
-
-  beforeEach(() => {
-    mockSessionManager = createMockSessionManager({
-      hasActive: true,
-      sessionId: 'test-session-123',
-    });
-    vi.spyOn(sessionManagerModule, 'getSessionManager').mockReturnValue(
-      mockSessionManager,
-    );
-    // Mock knowledge store to prevent "not initialized" errors
-    vi.spyOn(knowledgeStoreModule, 'knowledgeStore', 'get').mockReturnValue({
-      recordStep: vi.fn().mockResolvedValue(undefined),
-      getLastSteps: vi.fn().mockResolvedValue([]),
-      searchSteps: vi.fn().mockResolvedValue([]),
-      summarizeSession: vi
-        .fn()
-        .mockResolvedValue({ sessionId: 'test', stepCount: 0, recipe: [] }),
-      listSessions: vi.fn().mockResolvedValue([]),
-      generatePriorKnowledge: vi.fn().mockResolvedValue(undefined),
-      writeSessionMetadata: vi.fn().mockResolvedValue('test-session'),
-    } as any);
-  });
-
-  afterEach(() => {
-    vi.restoreAllMocks();
-  });
-
-  describe('handleNavigate', () => {
-    describe('with home screen', () => {
-      it('navigates to home screen', async () => {
-        // Arrange
-        const mockPage = createMockPage();
-        vi.spyOn(mockPage, 'url').mockReturnValue(
-          'chrome-extension://ext-123/home.html',
-        );
-        vi.spyOn(mockSessionManager, 'getPage').mockReturnValue(mockPage);
-        vi.spyOn(mockSessionManager, 'navigateToHome').mockResolvedValue(
-          undefined,
-        );
-
-        // Act
-        const result = await handleNavigate({ screen: 'home' });
-
-        // Assert
-        expect(result.ok).toBe(true);
-        if (result.ok) {
-          expect(result.result.navigated).toBe(true);
-          expect(result.result.currentUrl).toBe(
-            'chrome-extension://ext-123/home.html',
-          );
-        }
-        expect(mockSessionManager.navigateToHome).toHaveBeenCalled();
-      });
-    });
-
-    describe('with settings screen', () => {
-      it('navigates to settings screen', async () => {
-        // Arrange
-        const mockPage = createMockPage();
-        vi.spyOn(mockPage, 'url').mockReturnValue(
-          'chrome-extension://ext-123/settings.html',
-        );
-        vi.spyOn(mockSessionManager, 'getPage').mockReturnValue(mockPage);
-        vi.spyOn(mockSessionManager, 'navigateToSettings').mockResolvedValue(
-          undefined,
-        );
-
-        // Act
-        const result = await handleNavigate({ screen: 'settings' });
-
-        // Assert
-        expect(result.ok).toBe(true);
-        if (result.ok) {
-          expect(result.result.navigated).toBe(true);
-          expect(result.result.currentUrl).toBe(
-            'chrome-extension://ext-123/settings.html',
-          );
-        }
-        expect(mockSessionManager.navigateToSettings).toHaveBeenCalled();
-      });
-    });
-
-    describe('with notification screen', () => {
-      it('navigates to notification screen', async () => {
-        // Arrange
-        const mockPage = createMockPage();
-        vi.spyOn(mockPage, 'url').mockReturnValue(
-          'chrome-extension://ext-123/notification.html',
-        );
-        vi.spyOn(mockSessionManager, 'getPage').mockReturnValue(mockPage);
-        vi.spyOn(
-          mockSessionManager,
-          'navigateToNotification',
-        ).mockResolvedValue(undefined);
-
-        // Act
-        const result = await handleNavigate({ screen: 'notification' });
-
-        // Assert
-        expect(result.ok).toBe(true);
-        if (result.ok) {
-          expect(result.result.navigated).toBe(true);
-          expect(result.result.currentUrl).toBe(
-            'chrome-extension://ext-123/notification.html',
-          );
-        }
-        expect(mockSessionManager.navigateToNotification).toHaveBeenCalled();
-      });
-    });
-
-    describe('with URL screen', () => {
-      it('navigates to custom URL', async () => {
-        // Arrange
-        const mockPage = createMockPage();
-        vi.spyOn(mockPage, 'url').mockReturnValue('https://app.uniswap.org');
-        vi.spyOn(mockSessionManager, 'getPage').mockReturnValue(mockPage);
-        vi.spyOn(mockSessionManager, 'navigateToUrl').mockResolvedValue(
-          mockPage,
-        );
-
-        // Act
-        const result = await handleNavigate({
-          screen: 'url',
-          url: 'https://app.uniswap.org',
-        });
-
-        // Assert
-        expect(result.ok).toBe(true);
-        if (result.ok) {
-          expect(result.result.navigated).toBe(true);
-          expect(result.result.currentUrl).toBe('https://app.uniswap.org');
-        }
-        expect(mockSessionManager.navigateToUrl).toHaveBeenCalledWith(
-          'https://app.uniswap.org',
-        );
-      });
-
-      it('returns error when URL is missing', async () => {
-        // Act
-        const result = await handleNavigate({ screen: 'url' } as any);
-
-        // Assert
-        expect(result.ok).toBe(false);
-        if (!result.ok) {
-          expect(result.error.code).toBe(ErrorCodes.MM_INVALID_INPUT);
-          expect(result.error.message).toContain('url is required');
-        }
-      });
-    });
-
-    describe('with invalid screen', () => {
-      it('returns error for unknown screen', async () => {
-        // Act
-        const result = await handleNavigate({ screen: 'invalid' } as any);
-
-        // Assert
-        expect(result.ok).toBe(false);
-        if (!result.ok) {
-          expect(result.error.code).toBe(ErrorCodes.MM_INVALID_INPUT);
-          expect(result.error.message).toContain('Unknown screen');
-        }
-      });
-    });
-
-    describe('with navigation errors', () => {
-      it('returns error when navigation fails', async () => {
-        // Arrange
-        vi.spyOn(mockSessionManager, 'navigateToHome').mockRejectedValue(
-          new Error('Navigation failed'),
-        );
-
-        // Act
-        const result = await handleNavigate({ screen: 'home' });
-
-        // Assert
-        expect(result.ok).toBe(false);
-        if (!result.ok) {
-          expect(result.error.code).toBe(ErrorCodes.MM_NAVIGATION_FAILED);
-        }
-      });
-
-      it('returns error when page closed during navigation', async () => {
-        // Arrange
-        vi.spyOn(mockSessionManager, 'navigateToSettings').mockRejectedValue(
-          new Error('Target page, context or browser has been closed'),
-        );
-
-        // Act
-        const result = await handleNavigate({ screen: 'settings' });
-
-        // Assert
-        expect(result.ok).toBe(false);
-        if (!result.ok) {
-          expect(result.error.code).toBe(ErrorCodes.MM_NAVIGATION_FAILED);
-          expect(result.error.message).toContain(
-            'Page closed during navigation',
-          );
-        }
-      });
-    });
-
-    describe('without active session', () => {
-      it('returns error when no session active', async () => {
-        // Arrange
-        vi.spyOn(mockSessionManager, 'hasActiveSession').mockReturnValue(false);
-
-        // Act
-        const result = await handleNavigate({ screen: 'home' });
-
-        // Assert
-        expect(result.ok).toBe(false);
-        if (!result.ok) {
-          expect(result.error.code).toBe(ErrorCodes.MM_NO_ACTIVE_SESSION);
-        }
-      });
-    });
-  });
-
-  describe('handleWaitForNotification', () => {
-    describe('with default timeout', () => {
-      it('waits for notification popup', async () => {
-        // Arrange
-        const mockNotificationPage = createMockPage();
-        vi.spyOn(mockNotificationPage, 'url').mockReturnValue(
-          'chrome-extension://ext-123/notification.html',
-        );
-        vi.spyOn(
-          mockSessionManager,
-          'waitForNotificationPage',
-        ).mockResolvedValue(mockNotificationPage);
-
-        // Act
-        const result = await handleWaitForNotification({});
-
-        // Assert
-        expect(result.ok).toBe(true);
-        if (result.ok) {
-          expect(result.result.found).toBe(true);
-          expect(result.result.pageUrl).toBe(
-            'chrome-extension://ext-123/notification.html',
-          );
-        }
-        expect(mockSessionManager.waitForNotificationPage).toHaveBeenCalledWith(
-          15000,
-        );
-      });
-    });
-
-    describe('with custom timeout', () => {
-      it('uses custom timeout value', async () => {
-        // Arrange
-        const mockNotificationPage = createMockPage();
-        vi.spyOn(mockNotificationPage, 'url').mockReturnValue(
-          'chrome-extension://ext-123/notification.html',
-        );
-        vi.spyOn(
-          mockSessionManager,
-          'waitForNotificationPage',
-        ).mockResolvedValue(mockNotificationPage);
-
-        // Act
-        const result = await handleWaitForNotification({ timeoutMs: 30000 });
-
-        // Assert
-        expect(result.ok).toBe(true);
-        if (result.ok) {
-          expect(result.result.found).toBe(true);
-        }
-        expect(mockSessionManager.waitForNotificationPage).toHaveBeenCalledWith(
-          30000,
-        );
-      });
-    });
-
-    describe('with timeout errors', () => {
-      it('returns error when notification not found within timeout', async () => {
-        // Arrange
-        vi.spyOn(
-          mockSessionManager,
-          'waitForNotificationPage',
-        ).mockRejectedValue(new Error('Timeout 15000ms exceeded'));
-
-        // Act
-        const result = await handleWaitForNotification({});
-
-        // Assert
-        expect(result.ok).toBe(false);
-        if (!result.ok) {
-          expect(result.error.code).toBe(ErrorCodes.MM_NOTIFICATION_TIMEOUT);
-        }
-      });
-
-      it('returns error when browser closed during wait', async () => {
-        // Arrange
-        vi.spyOn(
-          mockSessionManager,
-          'waitForNotificationPage',
-        ).mockRejectedValue(new Error('browser has been closed'));
-
-        // Act
-        const result = await handleWaitForNotification({});
-
-        // Assert
-        expect(result.ok).toBe(false);
-        if (!result.ok) {
-          expect(result.error.code).toBe(ErrorCodes.MM_NOTIFICATION_TIMEOUT);
-          expect(result.error.message).toContain(
-            'Browser closed while waiting for notification',
-          );
-        }
-      });
-    });
-
-    describe('without active session', () => {
-      it('returns error when no session active', async () => {
-        // Arrange
-        vi.spyOn(mockSessionManager, 'hasActiveSession').mockReturnValue(false);
-
-        // Act
-        const result = await handleWaitForNotification({});
-
-        // Assert
-        expect(result.ok).toBe(false);
-        if (!result.ok) {
-          expect(result.error.code).toBe(ErrorCodes.MM_NO_ACTIVE_SESSION);
-        }
-      });
-    });
-  });
-
-  describe('handleSwitchToTab', () => {
-    describe('with role matching', () => {
-      it('switches to tab by role', async () => {
-        // Arrange
-        const mockExtensionPage = createMockPage();
-        vi.spyOn(mockExtensionPage, 'url').mockReturnValue(
-          'chrome-extension://ext-123/home.html',
-        );
-        vi.spyOn(mockExtensionPage, 'bringToFront').mockResolvedValue(
-          undefined,
-        );
-
-        const mockDappPage = createMockPage();
-        vi.spyOn(mockDappPage, 'url').mockReturnValue(
-          'https://app.uniswap.org',
-        );
-        vi.spyOn(mockDappPage, 'bringToFront').mockResolvedValue(undefined);
-
-        vi.spyOn(mockSessionManager, 'getPage').mockReturnValue(mockDappPage);
-        vi.spyOn(mockSessionManager, 'getTrackedPages').mockReturnValue([
-          {
-            page: mockExtensionPage,
-            role: 'extension',
-            url: 'chrome-extension://ext-123/home.html',
-          },
-          {
-            page: mockDappPage,
-            role: 'dapp',
-            url: 'https://app.uniswap.org',
-          },
-        ]);
-        vi.spyOn(mockSessionManager, 'setActivePage');
-
-        // Act
-        const result = await handleSwitchToTab({ role: 'dapp' });
-
-        // Assert
-        expect(result.ok).toBe(true);
-        if (result.ok) {
-          expect(result.result.switched).toBe(true);
-          expect(result.result.activeTab.role).toBe('dapp');
-          expect(result.result.activeTab.url).toBe('https://app.uniswap.org');
-        }
-        expect(mockDappPage.bringToFront).toHaveBeenCalled();
-        expect(mockSessionManager.setActivePage).toHaveBeenCalledWith(
-          mockDappPage,
-        );
-      });
-    });
-
-    describe('with URL matching', () => {
-      it('switches to tab by URL prefix', async () => {
-        // Arrange
-        const mockExtensionPage = createMockPage();
-        vi.spyOn(mockExtensionPage, 'url').mockReturnValue(
-          'chrome-extension://ext-123/home.html',
-        );
-        vi.spyOn(mockExtensionPage, 'bringToFront').mockResolvedValue(
-          undefined,
-        );
-
-        const mockDappPage = createMockPage();
-        vi.spyOn(mockDappPage, 'url').mockReturnValue(
-          'https://app.uniswap.org/swap',
-        );
-        vi.spyOn(mockDappPage, 'bringToFront').mockResolvedValue(undefined);
-
-        vi.spyOn(mockSessionManager, 'getPage').mockReturnValue(mockDappPage);
-        vi.spyOn(mockSessionManager, 'getTrackedPages').mockReturnValue([
-          {
-            page: mockExtensionPage,
-            role: 'extension',
-            url: 'chrome-extension://ext-123/home.html',
-          },
-          {
-            page: mockDappPage,
-            role: 'dapp',
-            url: 'https://app.uniswap.org/swap',
-          },
-        ]);
-        vi.spyOn(mockSessionManager, 'setActivePage');
-
-        // Act
-        const result = await handleSwitchToTab({
-          url: 'https://app.uniswap.org',
-        });
-
-        // Assert
-        expect(result.ok).toBe(true);
-        if (result.ok) {
-          expect(result.result.switched).toBe(true);
-          expect(result.result.activeTab.url).toBe(
-            'https://app.uniswap.org/swap',
-          );
-        }
-        expect(mockDappPage.bringToFront).toHaveBeenCalled();
-      });
-    });
-
-    describe('with invalid input', () => {
-      it('returns error when neither role nor url provided', async () => {
-        // Act
-        const result = await handleSwitchToTab({} as any);
-
-        // Assert
-        expect(result.ok).toBe(false);
-        if (!result.ok) {
-          expect(result.error.code).toBe(ErrorCodes.MM_INVALID_INPUT);
-          expect(result.error.message).toContain(
-            'Either role or url must be provided',
-          );
-        }
-      });
-    });
-
-    describe('with tab not found', () => {
-      it('returns error when no matching tab found by role', async () => {
-        // Arrange
-        const mockExtensionPage = createMockPage();
-        vi.spyOn(mockSessionManager, 'getTrackedPages').mockReturnValue([
-          {
-            page: mockExtensionPage,
-            role: 'extension',
-            url: 'chrome-extension://ext-123/home.html',
-          },
-        ]);
-
-        // Act
-        const result = await handleSwitchToTab({ role: 'dapp' });
-
-        // Assert
-        expect(result.ok).toBe(false);
-        if (!result.ok) {
-          expect(result.error.code).toBe(ErrorCodes.MM_TAB_NOT_FOUND);
-          expect(result.error.message).toContain('No tab found matching: dapp');
-        }
-      });
-
-      it('returns error when no matching tab found by URL', async () => {
-        // Arrange
-        const mockExtensionPage = createMockPage();
-        vi.spyOn(mockSessionManager, 'getTrackedPages').mockReturnValue([
-          {
-            page: mockExtensionPage,
-            role: 'extension',
-            url: 'chrome-extension://ext-123/home.html',
-          },
-        ]);
-
-        // Act
-        const result = await handleSwitchToTab({
-          url: 'https://app.uniswap.org',
-        });
-
-        // Assert
-        expect(result.ok).toBe(false);
-        if (!result.ok) {
-          expect(result.error.code).toBe(ErrorCodes.MM_TAB_NOT_FOUND);
-        }
-      });
-    });
-
-    describe('without active session', () => {
-      it('returns error when no session active', async () => {
-        // Arrange
-        vi.spyOn(mockSessionManager, 'hasActiveSession').mockReturnValue(false);
-
-        // Act
-        const result = await handleSwitchToTab({ role: 'dapp' });
-
-        // Assert
-        expect(result.ok).toBe(false);
-        if (!result.ok) {
-          expect(result.error.code).toBe(ErrorCodes.MM_NO_ACTIVE_SESSION);
-        }
-      });
-    });
-  });
-
-  describe('handleCloseTab', () => {
-    describe('with role matching', () => {
-      it('closes tab by role', async () => {
-        // Arrange
-        const mockExtensionPage = createMockPage();
-        vi.spyOn(mockExtensionPage, 'url').mockReturnValue(
-          'chrome-extension://ext-123/home.html',
-        );
-
-        const mockDappPage = createMockPage();
-        vi.spyOn(mockDappPage, 'url').mockReturnValue(
-          'https://app.uniswap.org',
-        );
-        vi.spyOn(mockDappPage, 'close').mockResolvedValue(undefined);
-
-        vi.spyOn(mockSessionManager, 'getPage').mockReturnValue(
-          mockExtensionPage,
-        );
-        vi.spyOn(mockSessionManager, 'getTrackedPages').mockReturnValue([
-          {
-            page: mockExtensionPage,
-            role: 'extension',
-            url: 'chrome-extension://ext-123/home.html',
-          },
-          {
-            page: mockDappPage,
-            role: 'dapp',
-            url: 'https://app.uniswap.org',
-          },
-        ]);
-
-        // Act
-        const result = await handleCloseTab({ role: 'dapp' });
-
-        // Assert
-        expect(result.ok).toBe(true);
-        if (result.ok) {
-          expect(result.result.closed).toBe(true);
-          expect(result.result.closedUrl).toBe('https://app.uniswap.org');
-        }
-        expect(mockDappPage.close).toHaveBeenCalled();
-      });
-    });
-
-    describe('with URL matching', () => {
-      it('closes tab by URL prefix', async () => {
-        // Arrange
-        const mockExtensionPage = createMockPage();
-        vi.spyOn(mockExtensionPage, 'url').mockReturnValue(
-          'chrome-extension://ext-123/home.html',
-        );
-
-        const mockDappPage = createMockPage();
-        vi.spyOn(mockDappPage, 'url').mockReturnValue(
-          'https://app.uniswap.org/swap',
-        );
-        vi.spyOn(mockDappPage, 'close').mockResolvedValue(undefined);
-
-        vi.spyOn(mockSessionManager, 'getPage').mockReturnValue(
-          mockExtensionPage,
-        );
-        vi.spyOn(mockSessionManager, 'getTrackedPages').mockReturnValue([
-          {
-            page: mockExtensionPage,
-            role: 'extension',
-            url: 'chrome-extension://ext-123/home.html',
-          },
-          {
-            page: mockDappPage,
-            role: 'dapp',
-            url: 'https://app.uniswap.org/swap',
-          },
-        ]);
-
-        // Act
-        const result = await handleCloseTab({ url: 'https://app.uniswap.org' });
-
-        // Assert
-        expect(result.ok).toBe(true);
-        if (result.ok) {
-          expect(result.result.closed).toBe(true);
-          expect(result.result.closedUrl).toBe('https://app.uniswap.org/swap');
-        }
-        expect(mockDappPage.close).toHaveBeenCalled();
-      });
-    });
-
-    describe('with active tab closure', () => {
-      it('switches to extension tab when closing active tab', async () => {
-        // Arrange
-        const mockExtensionPage = createMockPage();
-        vi.spyOn(mockExtensionPage, 'url').mockReturnValue(
-          'chrome-extension://ext-123/home.html',
-        );
-        vi.spyOn(mockExtensionPage, 'bringToFront').mockResolvedValue(
-          undefined,
-        );
-
-        const mockDappPage = createMockPage();
-        vi.spyOn(mockDappPage, 'url').mockReturnValue(
-          'https://app.uniswap.org',
-        );
-        vi.spyOn(mockDappPage, 'close').mockResolvedValue(undefined);
-
-        vi.spyOn(mockSessionManager, 'getPage').mockReturnValue(mockDappPage);
-        vi.spyOn(mockSessionManager, 'getTrackedPages').mockReturnValue([
-          {
-            page: mockExtensionPage,
-            role: 'extension',
-            url: 'chrome-extension://ext-123/home.html',
-          },
-          {
-            page: mockDappPage,
-            role: 'dapp',
-            url: 'https://app.uniswap.org',
-          },
-        ]);
-        vi.spyOn(mockSessionManager, 'setActivePage');
-
-        // Act
-        const result = await handleCloseTab({ role: 'dapp' });
-
-        // Assert
-        expect(result.ok).toBe(true);
-        if (result.ok) {
-          expect(result.result.closed).toBe(true);
-        }
-        expect(mockExtensionPage.bringToFront).toHaveBeenCalled();
-        expect(mockSessionManager.setActivePage).toHaveBeenCalledWith(
-          mockExtensionPage,
-        );
-        expect(mockDappPage.close).toHaveBeenCalled();
-      });
-
-      it('does not switch when closing non-active tab', async () => {
-        // Arrange
-        const mockExtensionPage = createMockPage();
-        vi.spyOn(mockExtensionPage, 'url').mockReturnValue(
-          'chrome-extension://ext-123/home.html',
-        );
-        vi.spyOn(mockExtensionPage, 'bringToFront').mockResolvedValue(
-          undefined,
-        );
-
-        const mockDappPage = createMockPage();
-        vi.spyOn(mockDappPage, 'url').mockReturnValue(
-          'https://app.uniswap.org',
-        );
-        vi.spyOn(mockDappPage, 'close').mockResolvedValue(undefined);
-
-        vi.spyOn(mockSessionManager, 'getPage').mockReturnValue(
-          mockExtensionPage,
-        );
-        vi.spyOn(mockSessionManager, 'getTrackedPages').mockReturnValue([
-          {
-            page: mockExtensionPage,
-            role: 'extension',
-            url: 'chrome-extension://ext-123/home.html',
-          },
-          {
-            page: mockDappPage,
-            role: 'dapp',
-            url: 'https://app.uniswap.org',
-          },
-        ]);
-        vi.spyOn(mockSessionManager, 'setActivePage');
-
-        // Act
-        const result = await handleCloseTab({ role: 'dapp' });
-
-        // Assert
-        expect(result.ok).toBe(true);
-        expect(mockExtensionPage.bringToFront).not.toHaveBeenCalled();
-        expect(mockSessionManager.setActivePage).not.toHaveBeenCalled();
-        expect(mockDappPage.close).toHaveBeenCalled();
-      });
-    });
-
-    describe('with invalid input', () => {
-      it('returns error when neither role nor url provided', async () => {
-        // Act
-        const result = await handleCloseTab({} as any);
-
-        // Assert
-        expect(result.ok).toBe(false);
-        if (!result.ok) {
-          expect(result.error.code).toBe(ErrorCodes.MM_INVALID_INPUT);
-          expect(result.error.message).toContain(
-            'Either role or url must be provided',
-          );
-        }
-      });
-    });
-
-    describe('with tab not found', () => {
-      it('returns error when no matching tab found by role', async () => {
-        // Arrange
-        const mockExtensionPage = createMockPage();
-        vi.spyOn(mockSessionManager, 'getTrackedPages').mockReturnValue([
-          {
-            page: mockExtensionPage,
-            role: 'extension',
-            url: 'chrome-extension://ext-123/home.html',
-          },
-        ]);
-
-        // Act
-        const result = await handleCloseTab({ role: 'dapp' });
-
-        // Assert
-        expect(result.ok).toBe(false);
-        if (!result.ok) {
-          expect(result.error.code).toBe(ErrorCodes.MM_TAB_NOT_FOUND);
-          expect(result.error.message).toContain('No tab found matching: dapp');
-        }
-      });
-
-      it('returns error when no matching tab found by URL', async () => {
-        // Arrange
-        const mockExtensionPage = createMockPage();
-        vi.spyOn(mockSessionManager, 'getTrackedPages').mockReturnValue([
-          {
-            page: mockExtensionPage,
-            role: 'extension',
-            url: 'chrome-extension://ext-123/home.html',
-          },
-        ]);
-
-        // Act
-        const result = await handleCloseTab({ url: 'https://app.uniswap.org' });
-
-        // Assert
-        expect(result.ok).toBe(false);
-        if (!result.ok) {
-          expect(result.error.code).toBe(ErrorCodes.MM_TAB_NOT_FOUND);
-        }
-      });
-    });
-
-    describe('without active session', () => {
-      it('returns error when no session active', async () => {
-        // Arrange
-        vi.spyOn(mockSessionManager, 'hasActiveSession').mockReturnValue(false);
-
-        // Act
-        const result = await handleCloseTab({ role: 'dapp' });
-
-        // Assert
-        expect(result.ok).toBe(false);
-        if (!result.ok) {
-          expect(result.error.code).toBe(ErrorCodes.MM_NO_ACTIVE_SESSION);
-        }
-      });
-    });
-  });
-});
diff --git a/src/mcp-server/tools/navigation.ts b/src/mcp-server/tools/navigation.ts
deleted file mode 100644
index 83a59a2..0000000
--- a/src/mcp-server/tools/navigation.ts
+++ /dev/null
@@ -1,329 +0,0 @@
-import { DEFAULT_INTERACTION_TIMEOUT_MS } from '../constants.js';
-import { getSessionManager } from '../session-manager.js';
-import {
-  classifyNavigationError,
-  classifyTabError,
-  classifyNotificationError,
-} from './error-classification.js';
-import { runTool } from './run-tool.js';
-import type {
-  NavigateInput,
-  NavigateResult,
-  WaitForNotificationInput,
-  WaitForNotificationResult,
-  SwitchToTabInput,
-  SwitchToTabResult,
-  CloseTabInput,
-  CloseTabResult,
-  McpResponse,
-  HandlerOptions,
-} from '../types';
-import { ErrorCodes } from '../types';
-import { createErrorResponse } from '../utils';
-
-/**
- * Handles navigation to a specific screen or URL.
- *
- * @param input The navigate input containing target screen and optional URL
- * @param options Optional handler configuration
- * @returns Promise resolving to navigate result with current URL information
- */
-export async function handleNavigate(
-  input: NavigateInput,
-  options?: HandlerOptions,
-): Promise<McpResponse<NavigateResult>> {
-  const startTime = Date.now();
-  const sessionManager = getSessionManager();
-  const sessionId = sessionManager.getSessionId();
-
-  if (input.screen === 'url' && !input.url) {
-    return createErrorResponse(
-      ErrorCodes.MM_INVALID_INPUT,
-      'url is required when screen is "url"',
-      { input },
-      sessionId,
-      startTime,
-    );
-  }
-
-  const validScreens = ['home', 'settings', 'url', 'notification'];
-  if (!validScreens.includes(input.screen)) {
-    return createErrorResponse(
-      ErrorCodes.MM_INVALID_INPUT,
-      `Unknown screen: ${String(input.screen)}`,
-      { input },
-      sessionId,
-      startTime,
-    );
-  }
-
-  return runTool<NavigateInput, NavigateResult>({
-    toolName: 'mm_navigate',
-    input,
-    options,
-
-    /**
-     * Executes the navigation action to the target screen.
-     *
-     * @param context The tool execution context containing page and reference map
-     * @returns Promise resolving to navigate result with success status and URL
-     */
-    execute: async (context) => {
-      switch (input.screen) {
-        case 'home':
-          await sessionManager.navigateToHome();
-          break;
-        case 'settings':
-          await sessionManager.navigateToSettings();
-          break;
-        case 'url':
-          await sessionManager.navigateToUrl(input.url as string);
-          break;
-        case 'notification':
-          await sessionManager.navigateToNotification();
-          break;
-        default:
-          throw new Error(`Unsupported screen: ${String(input.screen)}`);
-      }
-
-      return {
-        navigated: true,
-        currentUrl: context.page.url(),
-      };
-    },
-
-    classifyError: classifyNavigationError,
-
-    /**
-     * Sanitizes input for knowledge store recording.
-     *
-     * @returns Sanitized input object with screen and URL information
-     */
-    sanitizeInputForRecording: () => ({
-      screen: input.screen,
-      url: input.url,
-    }),
-  });
-}
-
-/**
- * Handles waiting for a notification popup to appear.
- *
- * @param input The wait input containing timeout options
- * @param options Optional handler configuration
- * @returns Promise resolving to wait result with notification page URL
- */
-export async function handleWaitForNotification(
-  input: WaitForNotificationInput,
-  options?: HandlerOptions,
-): Promise<McpResponse<WaitForNotificationResult>> {
-  const sessionManager = getSessionManager();
-  const timeoutMs = input.timeoutMs ?? DEFAULT_INTERACTION_TIMEOUT_MS;
-
-  return runTool<WaitForNotificationInput, WaitForNotificationResult>({
-    toolName: 'mm_wait_for_notification',
-    input,
-    options,
-
-    /**
-     * Executes the wait action for notification popup.
-     *
-     * @returns Promise resolving to wait result with notification page URL
-     */
-    execute: async () => {
-      const notificationPage =
-        await sessionManager.waitForNotificationPage(timeoutMs);
-      const pageUrl = notificationPage.url();
-
-      return {
-        found: true,
-        pageUrl,
-      };
-    },
-
-    classifyError: classifyNotificationError,
-
-    /**
-     * Sanitizes input for knowledge store recording.
-     *
-     * @returns Sanitized input object with timeout information
-     */
-    sanitizeInputForRecording: () => ({ timeoutMs }),
-  });
-}
-
-/**
- * Handles switching to a different tab by role or URL.
- *
- * @param input The switch input containing tab role or URL to match
- * @param options Optional handler configuration
- * @returns Promise resolving to switch result with active tab information
- */
-export async function handleSwitchToTab(
-  input: SwitchToTabInput,
-  options?: HandlerOptions,
-): Promise<McpResponse<SwitchToTabResult>> {
-  const startTime = Date.now();
-  const sessionManager = getSessionManager();
-  const sessionId = sessionManager.getSessionId();
-
-  if (!input.role && !input.url) {
-    return createErrorResponse(
-      ErrorCodes.MM_INVALID_INPUT,
-      'Either role or url must be provided',
-      { input },
-      sessionId,
-      startTime,
-    );
-  }
-
-  return runTool<SwitchToTabInput, SwitchToTabResult>({
-    toolName: 'mm_switch_to_tab',
-    input,
-    options,
-
-    /**
-     * Executes the tab switch action.
-     *
-     * @param _context The tool execution context containing page and reference map
-     * @returns Promise resolving to switch result with active tab information
-     */
-    execute: async (_context) => {
-      const trackedPages = sessionManager.getTrackedPages();
-      const targetPage = trackedPages.find((trackedPage) => {
-        if (input.role) {
-          return trackedPage.role === input.role;
-        }
-        if (input.url) {
-          return trackedPage.url.startsWith(input.url);
-        }
-        return false;
-      });
-
-      if (!targetPage) {
-        const availableTabs = trackedPages.map((trackedPage) => ({
-          role: trackedPage.role,
-          url: trackedPage.url,
-        }));
-        throw new Error(
-          `No tab found matching: ${input.role ?? input.url}. Available tabs: ${JSON.stringify(availableTabs)}`,
-        );
-      }
-
-      await targetPage.page.bringToFront();
-      sessionManager.setActivePage(targetPage.page);
-
-      const updatedTrackedPages = sessionManager.getTrackedPages();
-      const activeTabInfo = updatedTrackedPages.find(
-        (trackedPage) => trackedPage.page === targetPage.page,
-      );
-
-      return {
-        switched: true,
-        activeTab: {
-          role: activeTabInfo?.role ?? 'other',
-          url: targetPage.page.url(),
-        },
-      };
-    },
-
-    classifyError: classifyTabError,
-
-    /**
-     * Sanitizes input for knowledge store recording.
-     *
-     * @returns Sanitized input object with role and URL information
-     */
-    sanitizeInputForRecording: () => ({
-      role: input.role,
-      url: input.url,
-    }),
-  });
-}
-
-/**
- * Handles closing a tab by role or URL.
- *
- * @param input The close input containing tab role or URL to match
- * @param options Optional handler configuration
- * @returns Promise resolving to close result with closed tab URL
- */
-export async function handleCloseTab(
-  input: CloseTabInput,
-  options?: HandlerOptions,
-): Promise<McpResponse<CloseTabResult>> {
-  const startTime = Date.now();
-  const sessionManager = getSessionManager();
-  const sessionId = sessionManager.getSessionId();
-
-  if (!input.role && !input.url) {
-    return createErrorResponse(
-      ErrorCodes.MM_INVALID_INPUT,
-      'Either role or url must be provided',
-      { input },
-      sessionId,
-      startTime,
-    );
-  }
-
-  return runTool<CloseTabInput, CloseTabResult>({
-    toolName: 'mm_close_tab',
-    input,
-    options,
-
-    /**
-     * Executes the tab close action.
-     *
-     * @param context The tool execution context containing page and reference map
-     * @returns Promise resolving to close result with closed tab URL
-     */
-    execute: async (context) => {
-      const trackedPages = sessionManager.getTrackedPages();
-      const targetPage = trackedPages.find((trackedPage) => {
-        if (input.role) {
-          return trackedPage.role === input.role;
-        }
-        if (input.url) {
-          return trackedPage.url.startsWith(input.url);
-        }
-        return false;
-      });
-
-      if (!targetPage) {
-        throw new Error(`No tab found matching: ${input.role ?? input.url}`);
-      }
-
-      const closedUrl = targetPage.url;
-
-      const currentActivePage = context.page;
-      if (targetPage.page === currentActivePage) {
-        const extensionPage = trackedPages.find(
-          (trackedPage) => trackedPage.role === 'extension',
-        );
-        if (extensionPage) {
-          await extensionPage.page.bringToFront();
-          sessionManager.setActivePage(extensionPage.page);
-        }
-      }
-
-      await targetPage.page.close();
-
-      return {
-        closed: true,
-        closedUrl,
-      };
-    },
-
-    classifyError: classifyTabError,
-
-    /**
-     * Sanitizes input for knowledge store recording.
-     *
-     * @returns Sanitized input object with role and URL information
-     */
-    sanitizeInputForRecording: () => ({
-      role: input.role,
-      url: input.url,
-    }),
-  });
-}
diff --git a/src/mcp-server/tools/registry.test.ts b/src/mcp-server/tools/registry.test.ts
deleted file mode 100644
index 084c489..0000000
--- a/src/mcp-server/tools/registry.test.ts
+++ /dev/null
@@ -1,156 +0,0 @@
-import { describe, it, expect } from 'vitest';
-
-import {
-  getToolHandler,
-  hasToolHandler,
-  buildToolHandlersRecord,
-  toolHandlers,
-} from './registry.js';
-
-describe('tool registry', () => {
-  describe('getToolHandler', () => {
-    it('returns handler for prefixed tool name', () => {
-      const handler = getToolHandler('mm_launch');
-
-      expect(handler).toBeDefined();
-      expect(typeof handler).toBe('function');
-    });
-
-    it('returns handler for base tool name', () => {
-      const handler = getToolHandler('launch');
-
-      expect(handler).toBeDefined();
-      expect(typeof handler).toBe('function');
-    });
-
-    it('returns undefined for unknown tool', () => {
-      const handler = getToolHandler('mm_unknown_tool');
-
-      expect(handler).toBeUndefined();
-    });
-
-    it('returns undefined for empty string', () => {
-      const handler = getToolHandler('');
-
-      expect(handler).toBeUndefined();
-    });
-
-    it('returns different handlers for different tools', () => {
-      const launchHandler = getToolHandler('mm_launch');
-      const cleanupHandler = getToolHandler('mm_cleanup');
-
-      expect(launchHandler).not.toBe(cleanupHandler);
-    });
-  });
-
-  describe('hasToolHandler', () => {
-    it('returns true for existing prefixed tool', () => {
-      const result = hasToolHandler('mm_click');
-
-      expect(result).toBe(true);
-    });
-
-    it('returns true for existing base tool', () => {
-      const result = hasToolHandler('click');
-
-      expect(result).toBe(true);
-    });
-
-    it('returns false for non-existent tool', () => {
-      const result = hasToolHandler('mm_nonexistent');
-
-      expect(result).toBe(false);
-    });
-
-    it('returns false for empty string', () => {
-      const result = hasToolHandler('');
-
-      expect(result).toBe(false);
-    });
-  });
-
-  describe('buildToolHandlersRecord', () => {
-    it('returns record with prefixed tool names', () => {
-      const handlers = buildToolHandlersRecord();
-
-      expect(handlers.mm_launch).toBeDefined();
-      expect(handlers.mm_cleanup).toBeDefined();
-      expect(handlers.mm_click).toBeDefined();
-      expect(handlers.mm_type).toBeDefined();
-    });
-
-    it('returns fresh record on each call', () => {
-      const handlers1 = buildToolHandlersRecord();
-      const handlers2 = buildToolHandlersRecord();
-
-      expect(handlers1).not.toBe(handlers2);
-      expect(handlers1).toStrictEqual(handlers2);
-    });
-
-    it('includes all 27 tools', () => {
-      const handlers = buildToolHandlersRecord();
-
-      expect(Object.keys(handlers)).toHaveLength(27);
-    });
-
-    it('all handlers are functions', () => {
-      const handlers = buildToolHandlersRecord();
-
-      for (const handler of Object.values(handlers)) {
-        expect(typeof handler).toBe('function');
-      }
-    });
-  });
-
-  describe('toolHandlers export', () => {
-    it('exports pre-built handlers record', () => {
-      expect(toolHandlers).toBeDefined();
-      expect(typeof toolHandlers).toBe('object');
-    });
-
-    it('contains all expected tools', () => {
-      const expectedTools = [
-        'mm_build',
-        'mm_launch',
-        'mm_cleanup',
-        'mm_get_state',
-        'mm_navigate',
-        'mm_wait_for_notification',
-        'mm_switch_to_tab',
-        'mm_close_tab',
-        'mm_list_testids',
-        'mm_accessibility_snapshot',
-        'mm_describe_screen',
-        'mm_screenshot',
-        'mm_click',
-        'mm_type',
-        'mm_wait_for',
-        'mm_knowledge_last',
-        'mm_knowledge_search',
-        'mm_knowledge_summarize',
-        'mm_knowledge_sessions',
-        'mm_seed_contract',
-        'mm_seed_contracts',
-        'mm_get_contract_address',
-        'mm_list_contracts',
-        'mm_run_steps',
-        'mm_set_context',
-        'mm_get_context',
-        'mm_clipboard',
-      ];
-
-      for (const tool of expectedTools) {
-        expect(toolHandlers[tool]).toBeDefined();
-        expect(typeof toolHandlers[tool]).toBe('function');
-      }
-    });
-
-    it('matches buildToolHandlersRecord output', () => {
-      const freshHandlers = buildToolHandlersRecord();
-
-      expect(Object.keys(toolHandlers)).toStrictEqual(
-        Object.keys(freshHandlers),
-      );
-    });
-  });
-});
diff --git a/src/mcp-server/tools/registry.ts b/src/mcp-server/tools/registry.ts
deleted file mode 100644
index 3b67886..0000000
--- a/src/mcp-server/tools/registry.ts
+++ /dev/null
@@ -1,10 +0,0 @@
-import { buildToolHandlersRecord } from './definitions.js';
-
-export {
-  getToolHandler,
-  hasToolHandler,
-  buildToolHandlersRecord,
-} from './definitions.js';
-export type { ToolHandler } from './batch.js';
-
-export const toolHandlers = buildToolHandlersRecord();
diff --git a/src/mcp-server/tools/run-tool.test.ts b/src/mcp-server/tools/run-tool.test.ts
deleted file mode 100644
index 3592062..0000000
--- a/src/mcp-server/tools/run-tool.test.ts
+++ /dev/null
@@ -1,958 +0,0 @@
-/**
- * Unit tests for the generic tool execution wrapper (runTool).
- *
- * Tests execution flow, observation collection policies, knowledge store recording,
- * error classification, timeout handling, and page closure detection.
- */
-
-import type { Page } from '@playwright/test';
-import { describe, it, expect, vi, beforeEach, afterEach } from 'vitest';
-
-import { runTool } from './run-tool';
-import type { ToolExecutionConfig } from './run-tool';
-import * as knowledgeStoreModule from '../knowledge-store.js';
-import * as sessionManagerModule from '../session-manager.js';
-import { createMockSessionManager } from '../test-utils';
-import { ErrorCodes } from '../types';
-import * as helpersModule from './helpers.js';
-
-describe('runTool', () => {
-  let mockSessionManager: ReturnType<typeof createMockSessionManager>;
-  let mockKnowledgeStore: {
-    recordStep: ReturnType<typeof vi.fn>;
-    getLastSteps: ReturnType<typeof vi.fn>;
-    searchSteps: ReturnType<typeof vi.fn>;
-    summarizeSession: ReturnType<typeof vi.fn>;
-    listSessions: ReturnType<typeof vi.fn>;
-    generatePriorKnowledge: ReturnType<typeof vi.fn>;
-    writeSessionMetadata: ReturnType<typeof vi.fn>;
-  };
-  let mockPage: Page;
-
-  beforeEach(() => {
-    mockSessionManager = createMockSessionManager({
-      hasActive: true,
-      sessionId: 'test-session-123',
-      environmentMode: 'e2e',
-    });
-    mockPage = {
-      url: () => 'chrome-extension://test/home.html',
-      isClosed: () => false,
-    } as unknown as Page;
-    vi.spyOn(mockSessionManager, 'getPage').mockReturnValue(mockPage);
-    vi.spyOn(mockSessionManager, 'getRefMap').mockReturnValue(new Map());
-
-    vi.spyOn(sessionManagerModule, 'getSessionManager').mockReturnValue(
-      mockSessionManager,
-    );
-
-    mockKnowledgeStore = {
-      recordStep: vi.fn().mockResolvedValue(undefined),
-      getLastSteps: vi.fn().mockResolvedValue([]),
-      searchSteps: vi.fn().mockResolvedValue([]),
-      summarizeSession: vi
-        .fn()
-        .mockResolvedValue({ sessionId: 'test', stepCount: 0, recipe: [] }),
-      listSessions: vi.fn().mockResolvedValue([]),
-      generatePriorKnowledge: vi.fn().mockResolvedValue(undefined),
-      writeSessionMetadata: vi.fn().mockResolvedValue('test-session'),
-    };
-    vi.spyOn(knowledgeStoreModule, 'knowledgeStore', 'get').mockReturnValue(
-      mockKnowledgeStore as any,
-    );
-  });
-
-  afterEach(() => {
-    vi.restoreAllMocks();
-  });
-
-  describe('basic execution', () => {
-    it('executes tool and returns success response', async () => {
-      // Arrange
-      vi.spyOn(helpersModule, 'collectObservation').mockResolvedValue({
-        state: {} as any,
-        testIds: [],
-        a11y: { nodes: [] },
-      });
-      const config: ToolExecutionConfig<{ value: string }, string> = {
-        toolName: 'mm_test_tool',
-        input: { value: 'test-input' },
-        execute: vi.fn().mockResolvedValue('success'),
-      };
-
-      // Act
-      const result = await runTool(config);
-
-      // Assert
-      expect(result.ok).toBe(true);
-      if (result.ok) {
-        expect(result.result).toBe('success');
-        expect(result.meta.sessionId).toBe('test-session-123');
-        expect(result.meta.durationMs).toBeGreaterThanOrEqual(0);
-      }
-    });
-
-    it('passes context to execute function', async () => {
-      // Arrange
-      vi.spyOn(helpersModule, 'collectObservation').mockResolvedValue({
-        state: {} as any,
-        testIds: [],
-        a11y: { nodes: [] },
-      });
-      const executeFn = vi.fn().mockResolvedValue({ result: 'ok' });
-      const config: ToolExecutionConfig<{ value: string }, { result: string }> =
-        {
-          toolName: 'mm_test_tool',
-          input: { value: 'test' },
-          execute: executeFn,
-        };
-
-      // Act
-      await runTool(config);
-
-      // Assert
-      expect(executeFn).toHaveBeenCalledWith({
-        sessionId: 'test-session-123',
-        page: mockPage,
-        refMap: expect.any(Map),
-        startTime: expect.any(Number),
-      });
-    });
-
-    it('handles ToolExecuteResult with custom observation', async () => {
-      // Arrange
-      const customObservation = {
-        state: { isLoaded: true } as any,
-        testIds: [{ testId: 'custom', tag: 'div', text: '', visible: true }],
-        a11y: { nodes: [] },
-      };
-      const config: ToolExecutionConfig<object, { data: string }> = {
-        toolName: 'mm_test_tool',
-        input: {},
-        observationPolicy: 'custom',
-        execute: vi.fn().mockResolvedValue({
-          result: { data: 'test' },
-          observation: customObservation,
-        }),
-      };
-
-      // Act
-      const result = await runTool(config);
-
-      // Assert
-      expect(result.ok).toBe(true);
-      if (result.ok) {
-        expect(result.result).toStrictEqual({ data: 'test' });
-      }
-      expect(mockKnowledgeStore.recordStep).toHaveBeenCalledWith(
-        expect.objectContaining({
-          observation: customObservation,
-        }),
-      );
-    });
-  });
-
-  describe('session validation', () => {
-    it('returns error when no active session and requiresSession is true', async () => {
-      // Arrange
-      vi.spyOn(mockSessionManager, 'hasActiveSession').mockReturnValue(false);
-      const config: ToolExecutionConfig<object, object> = {
-        toolName: 'mm_test_tool',
-        input: {},
-        requiresSession: true,
-        execute: vi.fn(),
-      };
-
-      // Act
-      const result = await runTool(config);
-
-      // Assert
-      expect(result.ok).toBe(false);
-      if (!result.ok) {
-        expect(result.error.code).toBe(ErrorCodes.MM_NO_ACTIVE_SESSION);
-        expect(result.error.message).toBe(
-          'No active session. Call launch first.',
-        );
-      }
-      expect(config.execute).not.toHaveBeenCalled();
-    });
-
-    it('executes tool when no active session but requiresSession is false', async () => {
-      // Arrange
-      vi.spyOn(mockSessionManager, 'hasActiveSession').mockReturnValue(false);
-      const executeFn = vi.fn().mockResolvedValue({ done: true });
-      const config: ToolExecutionConfig<object, { done: boolean }> = {
-        toolName: 'mm_build',
-        input: {},
-        requiresSession: false,
-        execute: executeFn,
-      };
-
-      // Act
-      const result = await runTool(config);
-
-      // Assert
-      expect(result.ok).toBe(true);
-      expect(executeFn).toHaveBeenCalled();
-    });
-
-    it('defaults requiresSession to true when not specified', async () => {
-      // Arrange
-      vi.spyOn(mockSessionManager, 'hasActiveSession').mockReturnValue(false);
-      const config: ToolExecutionConfig<object, object> = {
-        toolName: 'mm_click',
-        input: {},
-        execute: vi.fn(),
-      };
-
-      // Act
-      const result = await runTool(config);
-
-      // Assert
-      expect(result.ok).toBe(false);
-      if (!result.ok) {
-        expect(result.error.code).toBe(ErrorCodes.MM_NO_ACTIVE_SESSION);
-      }
-    });
-  });
-
-  describe('observation policies', () => {
-    describe('policy: none', () => {
-      it('collects minimal observation on success', async () => {
-        // Arrange
-        const collectObservationSpy = vi
-          .spyOn(helpersModule, 'collectObservation')
-          .mockResolvedValue({
-            state: {} as any,
-            testIds: [],
-            a11y: { nodes: [] },
-          });
-        const config: ToolExecutionConfig<object, object> = {
-          toolName: 'mm_test_tool',
-          input: {},
-          observationPolicy: 'none',
-          execute: vi.fn().mockResolvedValue({}),
-        };
-
-        // Act
-        await runTool(config);
-
-        // Assert
-        expect(collectObservationSpy).toHaveBeenCalledWith(mockPage, 'minimal');
-      });
-    });
-
-    describe('policy: default', () => {
-      it('collects full observation on success', async () => {
-        // Arrange
-        const collectObservationSpy = vi
-          .spyOn(helpersModule, 'collectObservation')
-          .mockResolvedValue({
-            state: {} as any,
-            testIds: [],
-            a11y: { nodes: [] },
-          });
-        const config: ToolExecutionConfig<object, object> = {
-          toolName: 'mm_test_tool',
-          input: {},
-          observationPolicy: 'default',
-          execute: vi.fn().mockResolvedValue({}),
-        };
-
-        // Act
-        await runTool(config);
-
-        // Assert
-        expect(collectObservationSpy).toHaveBeenCalledWith(mockPage, 'full');
-      });
-    });
-
-    describe('policy: failures', () => {
-      it('collects minimal observation on success', async () => {
-        // Arrange
-        const collectObservationSpy = vi
-          .spyOn(helpersModule, 'collectObservation')
-          .mockResolvedValue({
-            state: {} as any,
-            testIds: [],
-            a11y: { nodes: [] },
-          });
-        const config: ToolExecutionConfig<object, object> = {
-          toolName: 'mm_test_tool',
-          input: {},
-          observationPolicy: 'failures',
-          execute: vi.fn().mockResolvedValue({}),
-        };
-
-        // Act
-        await runTool(config);
-
-        // Assert
-        expect(collectObservationSpy).toHaveBeenCalledWith(mockPage, 'minimal');
-      });
-
-      it('collects full observation on failure', async () => {
-        // Arrange
-        const collectObservationSpy = vi
-          .spyOn(helpersModule, 'collectObservation')
-          .mockResolvedValue({
-            state: {} as any,
-            testIds: [],
-            a11y: { nodes: [] },
-          });
-        const config: ToolExecutionConfig<object, object> = {
-          toolName: 'mm_test_tool',
-          input: {},
-          observationPolicy: 'failures',
-          execute: vi.fn().mockRejectedValue(new Error('Test failure')),
-        };
-
-        // Act
-        await runTool(config);
-
-        // Assert
-        expect(collectObservationSpy).toHaveBeenCalledWith(mockPage, 'full');
-      });
-    });
-
-    describe('policy: custom', () => {
-      it('uses observation from execute result', async () => {
-        // Arrange
-        const customObservation = {
-          state: { isLoaded: true } as any,
-          testIds: [],
-          a11y: {
-            nodes: [{ ref: 'e1', role: 'button', name: 'Test', path: [] }],
-          },
-        };
-        const collectObservationSpy = vi.spyOn(
-          helpersModule,
-          'collectObservation',
-        );
-        const config: ToolExecutionConfig<object, { data: string }> = {
-          toolName: 'mm_test_tool',
-          input: {},
-          observationPolicy: 'custom',
-          execute: vi.fn().mockResolvedValue({
-            result: { data: 'test' },
-            observation: customObservation,
-          }),
-        };
-
-        // Act
-        await runTool(config);
-
-        // Assert
-        expect(collectObservationSpy).not.toHaveBeenCalled();
-        expect(mockKnowledgeStore.recordStep).toHaveBeenCalledWith(
-          expect.objectContaining({
-            observation: customObservation,
-          }),
-        );
-      });
-    });
-
-    it('uses options.observationPolicy over config.observationPolicy', async () => {
-      // Arrange
-      const collectObservationSpy = vi
-        .spyOn(helpersModule, 'collectObservation')
-        .mockResolvedValue({
-          state: {} as any,
-          testIds: [],
-          a11y: { nodes: [] },
-        });
-      const config: ToolExecutionConfig<object, object> = {
-        toolName: 'mm_test_tool',
-        input: {},
-        observationPolicy: 'default',
-        options: { observationPolicy: 'none' },
-        execute: vi.fn().mockResolvedValue({}),
-      };
-
-      // Act
-      await runTool(config);
-
-      // Assert
-      expect(collectObservationSpy).toHaveBeenCalledWith(mockPage, 'minimal');
-    });
-
-    it('skips observation collection when requiresSession is false', async () => {
-      // Arrange
-      vi.spyOn(mockSessionManager, 'hasActiveSession').mockReturnValue(false);
-      const collectObservationSpy = vi.spyOn(
-        helpersModule,
-        'collectObservation',
-      );
-      const config: ToolExecutionConfig<object, object> = {
-        toolName: 'mm_build',
-        input: {},
-        requiresSession: false,
-        observationPolicy: 'default',
-        execute: vi.fn().mockResolvedValue({}),
-      };
-
-      // Act
-      await runTool(config);
-
-      // Assert
-      expect(collectObservationSpy).not.toHaveBeenCalled();
-    });
-  });
-
-  describe('knowledge store recording', () => {
-    it('records successful step with all parameters', async () => {
-      // Arrange
-      vi.spyOn(helpersModule, 'collectObservation').mockResolvedValue({
-        state: {} as any,
-        testIds: [],
-        a11y: { nodes: [] },
-      });
-      const config: ToolExecutionConfig<
-        { testId: string },
-        { clicked: boolean }
-      > = {
-        toolName: 'mm_click',
-        input: { testId: 'send-button' },
-        execute: vi.fn().mockResolvedValue({ clicked: true }),
-        getTarget: (input) => ({ testId: input.testId }),
-      };
-
-      // Act
-      await runTool(config);
-
-      // Assert
-      expect(mockKnowledgeStore.recordStep).toHaveBeenCalledWith({
-        sessionId: 'test-session-123',
-        toolName: 'mm_click',
-        input: { testId: 'send-button' },
-        target: { testId: 'send-button' },
-        outcome: { ok: true },
-        observation: expect.any(Object),
-        durationMs: expect.any(Number),
-        context: 'e2e',
-      });
-    });
-
-    it('records failed step with error details', async () => {
-      // Arrange
-      vi.spyOn(helpersModule, 'collectObservation').mockResolvedValue({
-        state: {} as any,
-        testIds: [],
-        a11y: { nodes: [] },
-      });
-      const config: ToolExecutionConfig<{ testId: string }, object> = {
-        toolName: 'mm_click',
-        input: { testId: 'missing-button' },
-        execute: vi.fn().mockRejectedValue(new Error('Element not found')),
-        getTarget: (input) => ({ testId: input.testId }),
-        classifyError: () => ({
-          code: 'MM_TARGET_NOT_FOUND',
-          message: 'Element not found',
-        }),
-      };
-
-      // Act
-      await runTool(config);
-
-      // Assert
-      expect(mockKnowledgeStore.recordStep).toHaveBeenCalledWith({
-        sessionId: 'test-session-123',
-        toolName: 'mm_click',
-        input: { testId: 'missing-button' },
-        target: { testId: 'missing-button' },
-        outcome: {
-          ok: false,
-          error: { code: 'MM_TARGET_NOT_FOUND', message: 'Element not found' },
-        },
-        observation: expect.any(Object),
-        durationMs: expect.any(Number),
-        context: 'e2e',
-      });
-    });
-
-    it('uses sanitizeInputForRecording when provided', async () => {
-      // Arrange
-      vi.spyOn(helpersModule, 'collectObservation').mockResolvedValue({
-        state: {} as any,
-        testIds: [],
-        a11y: { nodes: [] },
-      });
-      const config: ToolExecutionConfig<
-        { action: string; text: string },
-        { success: boolean }
-      > = {
-        toolName: 'mm_clipboard',
-        input: { action: 'write', text: 'secret-srp-phrase' },
-        execute: vi.fn().mockResolvedValue({ success: true }),
-        sanitizeInputForRecording: (input) => ({
-          action: input.action,
-          textLength: input.text.length,
-        }),
-      };
-
-      // Act
-      await runTool(config);
-
-      // Assert
-      expect(mockKnowledgeStore.recordStep).toHaveBeenCalledWith(
-        expect.objectContaining({
-          input: { action: 'write', textLength: 17 },
-        }),
-      );
-    });
-
-    it('skips recording when sessionId is undefined', async () => {
-      // Arrange
-      vi.spyOn(mockSessionManager, 'getSessionId').mockReturnValue(undefined);
-      vi.spyOn(mockSessionManager, 'hasActiveSession').mockReturnValue(true);
-      vi.spyOn(helpersModule, 'collectObservation').mockResolvedValue({
-        state: {} as any,
-        testIds: [],
-        a11y: { nodes: [] },
-      });
-      const config: ToolExecutionConfig<object, object> = {
-        toolName: 'mm_test_tool',
-        input: {},
-        execute: vi.fn().mockResolvedValue({}),
-      };
-
-      // Act
-      await runTool(config);
-
-      // Assert
-      expect(mockKnowledgeStore.recordStep).not.toHaveBeenCalled();
-    });
-  });
-
-  describe('error classification', () => {
-    it('uses classifyError when provided', async () => {
-      // Arrange
-      vi.spyOn(helpersModule, 'collectObservation').mockResolvedValue({
-        state: {} as any,
-        testIds: [],
-        a11y: { nodes: [] },
-      });
-      const config: ToolExecutionConfig<object, object> = {
-        toolName: 'mm_click',
-        input: {},
-        execute: vi
-          .fn()
-          .mockRejectedValue(new Error('Timeout waiting for selector')),
-        classifyError: () => ({
-          code: 'MM_WAIT_TIMEOUT',
-          message: 'Element wait timeout',
-        }),
-      };
-
-      // Act
-      const result = await runTool(config);
-
-      // Assert
-      expect(result.ok).toBe(false);
-      if (!result.ok) {
-        expect(result.error.code).toBe('MM_WAIT_TIMEOUT');
-        expect(result.error.message).toBe('Element wait timeout');
-      }
-    });
-
-    it('generates default error code when classifyError not provided', async () => {
-      // Arrange
-      vi.spyOn(helpersModule, 'collectObservation').mockResolvedValue({
-        state: {} as any,
-        testIds: [],
-        a11y: { nodes: [] },
-      });
-      const config: ToolExecutionConfig<object, object> = {
-        toolName: 'mm_my_tool',
-        input: {},
-        execute: vi.fn().mockRejectedValue(new Error('Something went wrong')),
-      };
-
-      // Act
-      const result = await runTool(config);
-
-      // Assert
-      expect(result.ok).toBe(false);
-      if (!result.ok) {
-        expect(result.error.code).toBe('MM_MY_TOOL_FAILED');
-        expect(result.error.message).toBe('Something went wrong');
-      }
-    });
-
-    it('removes MM_ prefix when generating default error code', async () => {
-      // Arrange
-      vi.spyOn(helpersModule, 'collectObservation').mockResolvedValue({
-        state: {} as any,
-        testIds: [],
-        a11y: { nodes: [] },
-      });
-      const config: ToolExecutionConfig<object, object> = {
-        toolName: 'mm_click',
-        input: {},
-        execute: vi.fn().mockRejectedValue(new Error('Click failed')),
-      };
-
-      // Act
-      const result = await runTool(config);
-
-      // Assert
-      expect(result.ok).toBe(false);
-      if (!result.ok) {
-        expect(result.error.code).toBe('MM_CLICK_FAILED');
-      }
-    });
-  });
-
-  describe('error handling', () => {
-    it('returns error response when execute throws', async () => {
-      // Arrange
-      vi.spyOn(helpersModule, 'collectObservation').mockResolvedValue({
-        state: {} as any,
-        testIds: [],
-        a11y: { nodes: [] },
-      });
-      const config: ToolExecutionConfig<{ testId: string }, object> = {
-        toolName: 'mm_click',
-        input: { testId: 'test-button' },
-        execute: vi.fn().mockRejectedValue(new Error('Execution failed')),
-      };
-
-      // Act
-      const result = await runTool(config);
-
-      // Assert
-      expect(result.ok).toBe(false);
-      if (!result.ok) {
-        expect(result.error.message).toBe('Execution failed');
-        expect(result.error.details).toStrictEqual({
-          input: { testId: 'test-button' },
-        });
-      }
-    });
-
-    it('collects full observation on failure with default policy', async () => {
-      // Arrange
-      const collectObservationSpy = vi
-        .spyOn(helpersModule, 'collectObservation')
-        .mockResolvedValue({
-          state: {} as any,
-          testIds: [],
-          a11y: { nodes: [] },
-        });
-      const config: ToolExecutionConfig<object, object> = {
-        toolName: 'mm_click',
-        input: {},
-        observationPolicy: 'default',
-        execute: vi.fn().mockRejectedValue(new Error('Failed')),
-      };
-
-      // Act
-      await runTool(config);
-
-      // Assert
-      expect(collectObservationSpy).toHaveBeenCalledWith(mockPage, 'full');
-    });
-
-    it('collects minimal observation on failure with none policy', async () => {
-      // Arrange
-      const collectObservationSpy = vi
-        .spyOn(helpersModule, 'collectObservation')
-        .mockResolvedValue({
-          state: {} as any,
-          testIds: [],
-          a11y: { nodes: [] },
-        });
-      const config: ToolExecutionConfig<object, object> = {
-        toolName: 'mm_click',
-        input: {},
-        observationPolicy: 'none',
-        execute: vi.fn().mockRejectedValue(new Error('Failed')),
-      };
-
-      // Act
-      await runTool(config);
-
-      // Assert
-      expect(collectObservationSpy).toHaveBeenCalledWith(undefined, 'minimal');
-    });
-
-    it('handles observation collection failure gracefully', async () => {
-      // Arrange
-      const collectObservationSpy = vi
-        .spyOn(helpersModule, 'collectObservation')
-        .mockRejectedValueOnce(new Error('Page closed'))
-        .mockResolvedValue({
-          state: {} as any,
-          testIds: [],
-          a11y: { nodes: [] },
-        });
-      const config: ToolExecutionConfig<object, object> = {
-        toolName: 'mm_click',
-        input: {},
-        observationPolicy: 'failures',
-        execute: vi.fn().mockRejectedValue(new Error('Execution failed')),
-      };
-
-      // Act
-      const result = await runTool(config);
-
-      // Assert
-      expect(result.ok).toBe(false);
-      if (!result.ok) {
-        expect(result.error.message).toBe('Execution failed');
-      }
-      expect(collectObservationSpy).toHaveBeenCalled();
-    });
-  });
-
-  describe('page closure detection', () => {
-    it('creates empty observation when page is closed during failure handling', async () => {
-      // Arrange
-      vi.spyOn(mockSessionManager, 'hasActiveSession').mockReturnValue(true);
-      const collectObservationSpy = vi
-        .spyOn(helpersModule, 'collectObservation')
-        .mockRejectedValueOnce(
-          new Error('Target page, context or browser has been closed'),
-        )
-        .mockResolvedValue({
-          state: {} as any,
-          testIds: [],
-          a11y: { nodes: [] },
-        });
-      const config: ToolExecutionConfig<object, object> = {
-        toolName: 'mm_click',
-        input: {},
-        observationPolicy: 'default',
-        execute: vi.fn().mockRejectedValue(new Error('Click failed')),
-      };
-
-      // Act
-      await runTool(config);
-
-      // Assert
-      expect(collectObservationSpy).toHaveBeenCalledTimes(2);
-      expect(collectObservationSpy).toHaveBeenLastCalledWith(
-        undefined,
-        'minimal',
-      );
-    });
-  });
-
-  describe('timeout handling', () => {
-    it('includes duration in response even on timeout error', async () => {
-      // Arrange
-      vi.useFakeTimers();
-      vi.spyOn(helpersModule, 'collectObservation').mockResolvedValue({
-        state: {} as any,
-        testIds: [],
-        a11y: { nodes: [] },
-      });
-      const config: ToolExecutionConfig<object, object> = {
-        toolName: 'mm_wait_for',
-        input: {},
-        execute: vi.fn().mockImplementation(async () => {
-          await new Promise((resolve) => setTimeout(resolve, 100));
-          throw new Error('Timeout waiting for element');
-        }),
-        classifyError: () => ({
-          code: 'MM_WAIT_TIMEOUT',
-          message: 'Wait timeout',
-        }),
-      };
-
-      // Act
-      const resultPromise = runTool(config);
-      await vi.advanceTimersByTimeAsync(100);
-      const result = await resultPromise;
-
-      // Assert
-      expect(result.ok).toBe(false);
-      expect(result.meta.durationMs).toBe(100);
-
-      // Cleanup
-      vi.useRealTimers();
-    });
-  });
-
-  describe('getTarget function', () => {
-    it('extracts target from input when getTarget provided', async () => {
-      // Arrange
-      vi.spyOn(helpersModule, 'collectObservation').mockResolvedValue({
-        state: {} as any,
-        testIds: [],
-        a11y: { nodes: [] },
-      });
-      const config: ToolExecutionConfig<
-        { testId?: string; selector?: string; a11yRef?: string },
-        object
-      > = {
-        toolName: 'mm_click',
-        input: { testId: 'send-button', selector: '.btn' },
-        execute: vi.fn().mockResolvedValue({}),
-        getTarget: (input) => ({
-          testId: input.testId,
-          selector: input.selector,
-          a11yRef: input.a11yRef,
-        }),
-      };
-
-      // Act
-      await runTool(config);
-
-      // Assert
-      expect(mockKnowledgeStore.recordStep).toHaveBeenCalledWith(
-        expect.objectContaining({
-          target: {
-            testId: 'send-button',
-            selector: '.btn',
-            a11yRef: undefined,
-          },
-        }),
-      );
-    });
-
-    it('records undefined target when getTarget not provided', async () => {
-      // Arrange
-      vi.spyOn(helpersModule, 'collectObservation').mockResolvedValue({
-        state: {} as any,
-        testIds: [],
-        a11y: { nodes: [] },
-      });
-      const config: ToolExecutionConfig<{ testId: string }, object> = {
-        toolName: 'mm_click',
-        input: { testId: 'send-button' },
-        execute: vi.fn().mockResolvedValue({}),
-      };
-
-      // Act
-      await runTool(config);
-
-      // Assert
-      expect(mockKnowledgeStore.recordStep).toHaveBeenCalledWith(
-        expect.objectContaining({
-          target: undefined,
-        }),
-      );
-    });
-  });
-
-  describe('isToolExecuteResult type guard', () => {
-    it('handles plain result (not ToolExecuteResult)', async () => {
-      // Arrange
-      vi.spyOn(helpersModule, 'collectObservation').mockResolvedValue({
-        state: {} as any,
-        testIds: [],
-        a11y: { nodes: [] },
-      });
-      const config: ToolExecutionConfig<object, { simple: string }> = {
-        toolName: 'mm_test_tool',
-        input: {},
-        execute: vi.fn().mockResolvedValue({ simple: 'value' }),
-      };
-
-      // Act
-      const result = await runTool(config);
-
-      // Assert
-      expect(result.ok).toBe(true);
-      if (result.ok) {
-        expect(result.result).toStrictEqual({ simple: 'value' });
-      }
-    });
-
-    it('handles ToolExecuteResult wrapper', async () => {
-      // Arrange
-      vi.spyOn(helpersModule, 'collectObservation').mockResolvedValue({
-        state: {} as any,
-        testIds: [],
-        a11y: { nodes: [] },
-      });
-      const config: ToolExecutionConfig<object, { wrapped: string }> = {
-        toolName: 'mm_test_tool',
-        input: {},
-        execute: vi.fn().mockResolvedValue({
-          result: { wrapped: 'value' },
-        }),
-      };
-
-      // Act
-      const result = await runTool(config);
-
-      // Assert
-      expect(result.ok).toBe(true);
-      if (result.ok) {
-        expect(result.result).toStrictEqual({ wrapped: 'value' });
-      }
-    });
-
-    it('handles null result', async () => {
-      // Arrange
-      vi.spyOn(helpersModule, 'collectObservation').mockResolvedValue({
-        state: {} as any,
-        testIds: [],
-        a11y: { nodes: [] },
-      });
-      const config: ToolExecutionConfig<object, null> = {
-        toolName: 'mm_test_tool',
-        input: {},
-        execute: vi.fn().mockResolvedValue(null),
-      };
-
-      // Act
-      const result = await runTool(config);
-
-      // Assert
-      expect(result.ok).toBe(true);
-      if (result.ok) {
-        expect(result.result).toBeNull();
-      }
-    });
-
-    it('handles primitive result', async () => {
-      // Arrange
-      vi.spyOn(helpersModule, 'collectObservation').mockResolvedValue({
-        state: {} as any,
-        testIds: [],
-        a11y: { nodes: [] },
-      });
-      const config: ToolExecutionConfig<object, string> = {
-        toolName: 'mm_test_tool',
-        input: {},
-        execute: vi.fn().mockResolvedValue('string-result'),
-      };
-
-      // Act
-      const result = await runTool(config);
-
-      // Assert
-      expect(result.ok).toBe(true);
-      if (result.ok) {
-        expect(result.result).toBe('string-result');
-      }
-    });
-  });
-
-  describe('createEmptyObservation', () => {
-    it('creates empty observation when session has no ID on failure', async () => {
-      // Arrange
-      vi.spyOn(mockSessionManager, 'getSessionId').mockReturnValue(undefined);
-      vi.spyOn(helpersModule, 'collectObservation').mockResolvedValue({
-        state: {} as any,
-        testIds: [],
-        a11y: { nodes: [] },
-      });
-      const config: ToolExecutionConfig<object, object> = {
-        toolName: 'mm_test_tool',
-        input: {},
-        execute: vi.fn().mockRejectedValue(new Error('Failed')),
-      };
-
-      // Act
-      const result = await runTool(config);
-
-      // Assert
-      expect(result.ok).toBe(false);
-      expect(mockKnowledgeStore.recordStep).not.toHaveBeenCalled();
-    });
-  });
-});
diff --git a/src/mcp-server/tools/run-tool.ts b/src/mcp-server/tools/run-tool.ts
deleted file mode 100644
index d74c206..0000000
--- a/src/mcp-server/tools/run-tool.ts
+++ /dev/null
@@ -1,220 +0,0 @@
-import type { Page } from '@playwright/test';
-
-import type { ExtensionState } from '../../capabilities/types.js';
-import { knowledgeStore } from '../knowledge-store.js';
-import { getSessionManager } from '../session-manager.js';
-import { collectObservation } from './helpers.js';
-import type {
-  McpResponse,
-  HandlerOptions,
-  StepRecordObservation,
-  ErrorCode,
-} from '../types';
-import { ErrorCodes } from '../types';
-import {
-  createSuccessResponse,
-  createErrorResponse,
-  extractErrorMessage,
-  debugWarn,
-} from '../utils';
-
-/**
- * Creates an empty observation object for step recording.
- *
- * @returns Empty observation with default state, testIds, and a11y nodes
- */
-function createEmptyObservation(): StepRecordObservation {
-  return {
-    state: {} as ExtensionState,
-    testIds: [],
-    a11y: { nodes: [] },
-  };
-}
-
-export type ObservationPolicy = 'none' | 'default' | 'custom' | 'failures';
-
-export type ToolExecutionContext = {
-  sessionId: string | undefined;
-  page: Page;
-  refMap: Map<string, string>;
-  startTime: number;
-};
-
-export type ToolExecuteResult<TResult> = {
-  result: TResult;
-  observation?: StepRecordObservation;
-};
-
-export type ToolExecutionConfig<TInput, TResult> = {
-  toolName: string;
-  input: TInput;
-  options?: HandlerOptions;
-  requiresSession?: boolean;
-  observationPolicy?: ObservationPolicy;
-  execute: (
-    context: ToolExecutionContext,
-  ) => Promise<TResult | ToolExecuteResult<TResult>>;
-  classifyError?: (error: unknown) => {
-    code: string;
-    message: string;
-  };
-  getTarget?: (input: TInput) =>
-    | {
-        testId?: string;
-        selector?: string;
-        a11yRef?: string;
-      }
-    | undefined;
-  sanitizeInputForRecording?: (input: TInput) => Record<string, unknown>;
-};
-
-/**
- * Type guard to check if result is a ToolExecuteResult with observation.
- *
- * @param result The result to check
- * @returns True if result is a ToolExecuteResult with observation property
- */
-function isToolExecuteResult<TResult>(
-  result: TResult | ToolExecuteResult<TResult>,
-): result is ToolExecuteResult<TResult> {
-  return (
-    typeof result === 'object' &&
-    result !== null &&
-    'result' in result &&
-    Object.prototype.hasOwnProperty.call(result, 'result')
-  );
-}
-
-/**
- * Executes a tool with error handling, observation collection, and knowledge store recording.
- *
- * @param config The tool execution configuration with input, execute function, and error handling
- * @returns Promise resolving to MCP response with tool result or error information
- */
-export async function runTool<TInput, TResult>(
-  config: ToolExecutionConfig<TInput, TResult>,
-): Promise<McpResponse<TResult>> {
-  const startTime = Date.now();
-  const sessionManager = getSessionManager();
-  const sessionId = sessionManager.getSessionId();
-  const requiresSession = config.requiresSession ?? true;
-
-  const effectivePolicy =
-    config.options?.observationPolicy ?? config.observationPolicy ?? 'default';
-
-  try {
-    if (requiresSession && !sessionManager.hasActiveSession()) {
-      return createErrorResponse(
-        ErrorCodes.MM_NO_ACTIVE_SESSION,
-        'No active session. Call launch first.',
-        { input: config.input },
-        undefined,
-        startTime,
-      );
-    }
-
-    const context: ToolExecutionContext = {
-      sessionId,
-      page: requiresSession ? sessionManager.getPage() : (undefined as never),
-      refMap: requiresSession ? sessionManager.getRefMap() : new Map(),
-      startTime,
-    };
-
-    const executeResult = await config.execute(context);
-
-    let result: TResult;
-    let customObservation: StepRecordObservation | undefined;
-
-    if (isToolExecuteResult<TResult>(executeResult)) {
-      result = executeResult.result;
-      customObservation = executeResult.observation;
-    } else {
-      result = executeResult;
-    }
-
-    let observation: StepRecordObservation | undefined;
-
-    if (effectivePolicy === 'custom' && customObservation) {
-      observation = customObservation;
-    } else if (effectivePolicy === 'default' && requiresSession) {
-      observation = await collectObservation(context.page, 'full');
-    } else if (
-      (effectivePolicy === 'none' || effectivePolicy === 'failures') &&
-      requiresSession
-    ) {
-      observation = await collectObservation(context.page, 'minimal');
-    }
-
-    if (sessionId) {
-      const recordInput = config.sanitizeInputForRecording
-        ? config.sanitizeInputForRecording(config.input)
-        : (config.input as Record<string, unknown>);
-
-      await knowledgeStore.recordStep({
-        sessionId,
-        toolName: config.toolName,
-        input: recordInput,
-        target: config.getTarget?.(config.input),
-        outcome: { ok: true },
-        observation: observation ?? createEmptyObservation(),
-        durationMs: Date.now() - startTime,
-        context: sessionManager.getEnvironmentMode(),
-      });
-    }
-
-    return createSuccessResponse<TResult>(result, sessionId, startTime);
-  } catch (error) {
-    const errorInfo = config.classifyError?.(error) ?? {
-      code: `MM_${config.toolName.toUpperCase().replace(/^MM_/u, '')}_FAILED`,
-      message: extractErrorMessage(error),
-    };
-
-    let failureObservation: StepRecordObservation = createEmptyObservation();
-
-    if (requiresSession && sessionManager.hasActiveSession()) {
-      if (effectivePolicy === 'failures' || effectivePolicy === 'default') {
-        try {
-          const page = sessionManager.getPage();
-          failureObservation = await collectObservation(page, 'full');
-        } catch (collectError) {
-          debugWarn('run-tool.collectObservation', collectError);
-          failureObservation = await collectObservation(undefined, 'minimal');
-        }
-      } else if (effectivePolicy === 'none') {
-        try {
-          failureObservation = await collectObservation(undefined, 'minimal');
-        } catch (collectError) {
-          debugWarn('run-tool.collectObservation', collectError);
-        }
-      }
-    }
-
-    if (sessionId) {
-      const recordInput = config.sanitizeInputForRecording
-        ? config.sanitizeInputForRecording(config.input)
-        : (config.input as Record<string, unknown>);
-
-      await knowledgeStore.recordStep({
-        sessionId,
-        toolName: config.toolName,
-        input: recordInput,
-        target: config.getTarget?.(config.input),
-        outcome: {
-          ok: false,
-          error: { code: errorInfo.code, message: errorInfo.message },
-        },
-        observation: failureObservation,
-        durationMs: Date.now() - startTime,
-        context: sessionManager.getEnvironmentMode(),
-      });
-    }
-
-    return createErrorResponse(
-      errorInfo.code as ErrorCode,
-      errorInfo.message,
-      { input: config.input },
-      sessionId,
-      startTime,
-    );
-  }
-}
diff --git a/src/mcp-server/tools/screenshot.test.ts b/src/mcp-server/tools/screenshot.test.ts
deleted file mode 100644
index b21fda8..0000000
--- a/src/mcp-server/tools/screenshot.test.ts
+++ /dev/null
@@ -1,307 +0,0 @@
-/**
- * Unit tests for screenshot tool handler.
- *
- * Tests handleScreenshot with various options including base64 encoding,
- * selector scoping, and error handling.
- */
-
-import { describe, it, expect, vi, beforeEach, afterEach } from 'vitest';
-
-import { handleScreenshot } from './screenshot.js';
-import * as knowledgeStoreModule from '../knowledge-store.js';
-import * as sessionManagerModule from '../session-manager.js';
-import { createMockSessionManager } from '../test-utils';
-import { ErrorCodes } from '../types/errors.js';
-
-describe('screenshot', () => {
-  let mockSessionManager: ReturnType<typeof createMockSessionManager>;
-
-  beforeEach(() => {
-    mockSessionManager = createMockSessionManager({
-      hasActive: true,
-      sessionId: 'test-session-123',
-      sessionMetadata: {
-        schemaVersion: 1,
-        sessionId: 'test-session-123',
-        createdAt: new Date().toISOString(),
-        flowTags: [],
-        tags: [],
-        launch: { stateMode: 'default' },
-      },
-    });
-    vi.spyOn(sessionManagerModule, 'getSessionManager').mockReturnValue(
-      mockSessionManager,
-    );
-    // Mock knowledge store to prevent "not initialized" errors
-    vi.spyOn(knowledgeStoreModule, 'knowledgeStore', 'get').mockReturnValue({
-      recordStep: vi.fn().mockResolvedValue(undefined),
-      getLastSteps: vi.fn().mockResolvedValue([]),
-      searchSteps: vi.fn().mockResolvedValue([]),
-      summarizeSession: vi
-        .fn()
-        .mockResolvedValue({ sessionId: 'test', stepCount: 0, recipe: [] }),
-      listSessions: vi.fn().mockResolvedValue([]),
-      generatePriorKnowledge: vi.fn().mockResolvedValue(undefined),
-      writeSessionMetadata: vi.fn().mockResolvedValue('test-session'),
-    } as any);
-  });
-
-  afterEach(() => {
-    vi.restoreAllMocks();
-  });
-
-  describe('handleScreenshot', () => {
-    describe('basic screenshot', () => {
-      it('captures full page screenshot by default', async () => {
-        // Arrange
-        vi.spyOn(mockSessionManager, 'screenshot').mockResolvedValue({
-          path: '/path/to/screenshot.png',
-          width: 1280,
-          height: 720,
-          base64: undefined,
-        });
-
-        // Act
-        const result = await handleScreenshot({ name: 'test-screenshot' });
-
-        // Assert
-        expect(result.ok).toBe(true);
-        if (result.ok) {
-          expect(result.result.path).toBe('/path/to/screenshot.png');
-          expect(result.result.width).toBe(1280);
-          expect(result.result.height).toBe(720);
-          expect(result.result.base64).toBeUndefined();
-        }
-        expect(mockSessionManager.screenshot).toHaveBeenCalledWith({
-          name: 'test-screenshot',
-          fullPage: true,
-          selector: undefined,
-        });
-      });
-
-      it('captures viewport-only screenshot when fullPage is false', async () => {
-        // Arrange
-        vi.spyOn(mockSessionManager, 'screenshot').mockResolvedValue({
-          path: '/path/to/screenshot.png',
-          width: 1280,
-          height: 720,
-          base64: undefined,
-        });
-
-        // Act
-        const result = await handleScreenshot({
-          name: 'viewport-screenshot',
-          fullPage: false,
-        });
-
-        // Assert
-        expect(result.ok).toBe(true);
-        expect(mockSessionManager.screenshot).toHaveBeenCalledWith({
-          name: 'viewport-screenshot',
-          fullPage: false,
-          selector: undefined,
-        });
-      });
-    });
-
-    describe('with base64 encoding', () => {
-      it('includes base64 when includeBase64 is true', async () => {
-        // Arrange
-        vi.spyOn(mockSessionManager, 'screenshot').mockResolvedValue({
-          path: '/path/to/screenshot.png',
-          width: 1280,
-          height: 720,
-          base64:
-            'iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNk+M9QDwADhgGAWjR9awAAAABJRU5ErkJggg==',
-        });
-
-        // Act
-        const result = await handleScreenshot({
-          name: 'base64-screenshot',
-          includeBase64: true,
-        });
-
-        // Assert
-        expect(result.ok).toBe(true);
-        if (result.ok) {
-          expect(result.result.base64).toBe(
-            'iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNk+M9QDwADhgGAWjR9awAAAABJRU5ErkJggg==',
-          );
-        }
-      });
-
-      it('excludes base64 when includeBase64 is false', async () => {
-        // Arrange
-        vi.spyOn(mockSessionManager, 'screenshot').mockResolvedValue({
-          path: '/path/to/screenshot.png',
-          width: 1280,
-          height: 720,
-          base64:
-            'iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNk+M9QDwADhgGAWjR9awAAAABJRU5ErkJggg==',
-        });
-
-        // Act
-        const result = await handleScreenshot({
-          name: 'no-base64-screenshot',
-          includeBase64: false,
-        });
-
-        // Assert
-        expect(result.ok).toBe(true);
-        if (result.ok) {
-          expect(result.result.base64).toBeUndefined();
-        }
-      });
-    });
-
-    describe('with selector scoping', () => {
-      it('captures screenshot of specific element', async () => {
-        // Arrange
-        vi.spyOn(mockSessionManager, 'screenshot').mockResolvedValue({
-          path: '/path/to/element-screenshot.png',
-          width: 400,
-          height: 200,
-          base64: undefined,
-        });
-
-        // Act
-        const result = await handleScreenshot({
-          name: 'element-screenshot',
-          selector: '[data-testid="account-menu"]',
-        });
-
-        // Assert
-        expect(result.ok).toBe(true);
-        if (result.ok) {
-          expect(result.result.width).toBe(400);
-          expect(result.result.height).toBe(200);
-        }
-        expect(mockSessionManager.screenshot).toHaveBeenCalledWith({
-          name: 'element-screenshot',
-          fullPage: true,
-          selector: '[data-testid="account-menu"]',
-        });
-      });
-
-      it('combines selector with fullPage false', async () => {
-        // Arrange
-        vi.spyOn(mockSessionManager, 'screenshot').mockResolvedValue({
-          path: '/path/to/element-screenshot.png',
-          width: 400,
-          height: 200,
-          base64: undefined,
-        });
-
-        // Act
-        const result = await handleScreenshot({
-          name: 'element-viewport-screenshot',
-          selector: '.modal-content',
-          fullPage: false,
-        });
-
-        // Assert
-        expect(result.ok).toBe(true);
-        expect(mockSessionManager.screenshot).toHaveBeenCalledWith({
-          name: 'element-viewport-screenshot',
-          fullPage: false,
-          selector: '.modal-content',
-        });
-      });
-    });
-
-    describe('error handling', () => {
-      it('returns error when no active session', async () => {
-        // Arrange
-        vi.spyOn(mockSessionManager, 'hasActiveSession').mockReturnValue(false);
-
-        // Act
-        const result = await handleScreenshot({ name: 'test-screenshot' });
-
-        // Assert
-        expect(result.ok).toBe(false);
-        if (!result.ok) {
-          expect(result.error.code).toBe(ErrorCodes.MM_NO_ACTIVE_SESSION);
-        }
-      });
-
-      it('returns error when screenshot fails', async () => {
-        // Arrange
-        vi.spyOn(mockSessionManager, 'screenshot').mockRejectedValue(
-          new Error('Screenshot failed'),
-        );
-
-        // Act
-        const result = await handleScreenshot({ name: 'test-screenshot' });
-
-        // Assert
-        expect(result.ok).toBe(false);
-        if (!result.ok) {
-          expect(result.error.code).toBe(ErrorCodes.MM_SCREENSHOT_FAILED);
-          expect(result.error.message).toContain('Screenshot failed');
-        }
-      });
-
-      it('returns error when page is closed', async () => {
-        // Arrange
-        vi.spyOn(mockSessionManager, 'screenshot').mockRejectedValue(
-          new Error('Target page, context or browser has been closed'),
-        );
-
-        // Act
-        const result = await handleScreenshot({ name: 'test-screenshot' });
-
-        // Assert
-        expect(result.ok).toBe(false);
-        if (!result.ok) {
-          expect(result.error.code).toBe(ErrorCodes.MM_PAGE_CLOSED);
-        }
-      });
-    });
-
-    describe('input sanitization', () => {
-      it('sanitizes input for knowledge store recording', async () => {
-        // Arrange
-        vi.spyOn(mockSessionManager, 'screenshot').mockResolvedValue({
-          path: '/path/to/screenshot.png',
-          width: 1280,
-          height: 720,
-          base64: 'very-long-base64-string-that-should-not-be-recorded',
-        });
-
-        const recordStepMock = vi.fn().mockResolvedValue(undefined);
-        vi.spyOn(knowledgeStoreModule, 'knowledgeStore', 'get').mockReturnValue(
-          {
-            recordStep: recordStepMock,
-            getLastSteps: vi.fn().mockResolvedValue([]),
-            searchSteps: vi.fn().mockResolvedValue([]),
-            summarizeSession: vi.fn().mockResolvedValue({
-              sessionId: 'test',
-              stepCount: 0,
-              recipe: [],
-            }),
-            listSessions: vi.fn().mockResolvedValue([]),
-            generatePriorKnowledge: vi.fn().mockResolvedValue(undefined),
-            writeSessionMetadata: vi.fn().mockResolvedValue('test-session'),
-          } as any,
-        );
-
-        // Act
-        await handleScreenshot({
-          name: 'test-screenshot',
-          includeBase64: true,
-          selector: '[data-testid="test"]',
-        });
-
-        // Assert
-        expect(recordStepMock).toHaveBeenCalled();
-        const recordedInput = recordStepMock.mock.calls[0][0].input;
-        expect(recordedInput).toStrictEqual({
-          name: 'test-screenshot',
-          fullPage: undefined,
-          selector: '[data-testid="test"]',
-        });
-        expect(recordedInput.includeBase64).toBeUndefined();
-      });
-    });
-  });
-});
diff --git a/src/mcp-server/tools/screenshot.ts b/src/mcp-server/tools/screenshot.ts
deleted file mode 100644
index d6696ad..0000000
--- a/src/mcp-server/tools/screenshot.ts
+++ /dev/null
@@ -1,67 +0,0 @@
-import { getSessionManager } from '../session-manager.js';
-import { classifyScreenshotError } from './error-classification.js';
-import { runTool } from './run-tool.js';
-import type {
-  ScreenshotInput,
-  ScreenshotToolResult,
-  McpResponse,
-  HandlerOptions,
-} from '../types';
-
-/**
- * Handles the screenshot tool request.
- *
- * @param input - The screenshot input parameters.
- * @param options - Handler options including abort signal.
- * @returns Response with screenshot path and dimensions.
- */
-export async function handleScreenshot(
-  input: ScreenshotInput,
-  options?: HandlerOptions,
-): Promise<McpResponse<ScreenshotToolResult>> {
-  return runTool<ScreenshotInput, ScreenshotToolResult>({
-    toolName: 'mm_screenshot',
-    input,
-    options,
-    observationPolicy: 'none',
-
-    /**
-     * Executes the screenshot capture.
-     *
-     * @returns The screenshot result.
-     */
-    execute: async () => {
-      const sessionManager = getSessionManager();
-      const result = await sessionManager.screenshot({
-        name: input.name,
-        fullPage: input.fullPage ?? true,
-        selector: input.selector,
-      });
-
-      const response: ScreenshotToolResult = {
-        path: result.path,
-        width: result.width,
-        height: result.height,
-      };
-
-      if (input.includeBase64) {
-        response.base64 = result.base64;
-      }
-
-      return response;
-    },
-
-    classifyError: classifyScreenshotError,
-
-    /**
-     * Sanitizes input for knowledge store recording.
-     *
-     * @returns Sanitized input object.
-     */
-    sanitizeInputForRecording: () => ({
-      name: input.name,
-      fullPage: input.fullPage,
-      selector: input.selector,
-    }),
-  });
-}
diff --git a/src/mcp-server/tools/seeding.test.ts b/src/mcp-server/tools/seeding.test.ts
deleted file mode 100644
index e77efbb..0000000
--- a/src/mcp-server/tools/seeding.test.ts
+++ /dev/null
@@ -1,552 +0,0 @@
-/**
- * Unit tests for seeding tool handlers.
- *
- * Tests contract deployment handlers including single/multiple contract deployment,
- * address lookup, and contract listing with ContractSeedingCapability.
- */
-
-import { describe, it, expect, vi, beforeEach, afterEach } from 'vitest';
-
-import {
-  handleSeedContract,
-  handleSeedContracts,
-  handleGetContractAddress,
-  handleListDeployedContracts,
-} from './seeding.js';
-import type { ContractSeedingCapability } from '../../capabilities/types.js';
-import * as knowledgeStoreModule from '../knowledge-store.js';
-import * as sessionManagerModule from '../session-manager.js';
-import { createMockSessionManager } from '../test-utils';
-import { ErrorCodes } from '../types/errors.js';
-
-describe('seeding', () => {
-  let mockSessionManager: ReturnType<typeof createMockSessionManager>;
-  let mockSeedingCapability: ContractSeedingCapability;
-
-  beforeEach(() => {
-    mockSessionManager = createMockSessionManager({
-      hasActive: true,
-      sessionId: 'test-session-123',
-      sessionMetadata: {
-        schemaVersion: 1,
-        sessionId: 'test-session-123',
-        createdAt: new Date().toISOString(),
-        flowTags: [],
-        tags: [],
-        launch: { stateMode: 'default' },
-      },
-    });
-    vi.spyOn(sessionManagerModule, 'getSessionManager').mockReturnValue(
-      mockSessionManager,
-    );
-    // Mock knowledge store to prevent "not initialized" errors
-    vi.spyOn(knowledgeStoreModule, 'knowledgeStore', 'get').mockReturnValue({
-      recordStep: vi.fn().mockResolvedValue(undefined),
-      getLastSteps: vi.fn().mockResolvedValue([]),
-      searchSteps: vi.fn().mockResolvedValue([]),
-      summarizeSession: vi
-        .fn()
-        .mockResolvedValue({ sessionId: 'test', stepCount: 0, recipe: [] }),
-      listSessions: vi.fn().mockResolvedValue([]),
-      generatePriorKnowledge: vi.fn().mockResolvedValue(undefined),
-      writeSessionMetadata: vi.fn().mockResolvedValue('test-session'),
-    } as any);
-
-    // Create fresh mock seeding capability
-    mockSeedingCapability = {
-      deployContract: vi.fn(),
-      deployContracts: vi.fn(),
-      getContractAddress: vi.fn(),
-      listDeployedContracts: vi.fn(),
-      getAvailableContracts: vi.fn(),
-      clearRegistry: vi.fn(),
-      initialize: vi.fn(),
-    };
-  });
-
-  afterEach(() => {
-    vi.restoreAllMocks();
-  });
-
-  describe('handleSeedContract', () => {
-    it('deploys a single contract successfully', async () => {
-      // Arrange
-      const deployedAt = new Date().toISOString();
-      const mockedDeployContract = vi
-        .spyOn(mockSeedingCapability, 'deployContract')
-        .mockResolvedValue({
-          name: 'hst',
-          address: '0x1234567890123456789012345678901234567890',
-          deployedAt,
-        });
-
-      // Act
-      const result = await handleSeedContract(
-        { contractName: 'hst' },
-        { seedingCapability: mockSeedingCapability },
-      );
-
-      // Assert
-      expect(result.ok).toBe(true);
-      if (result.ok) {
-        expect(result.result.contractName).toBe('hst');
-        expect(result.result.contractAddress).toBe(
-          '0x1234567890123456789012345678901234567890',
-        );
-        expect(result.result.deployedAt).toBe(deployedAt);
-      }
-      expect(mockedDeployContract).toHaveBeenCalledWith('hst', {
-        hardfork: undefined,
-        deployerOptions: undefined,
-      });
-    });
-
-    it('deploys contract with custom hardfork', async () => {
-      // Arrange
-      const deployedAt = new Date().toISOString();
-      const mockedDeployContract = vi
-        .spyOn(mockSeedingCapability, 'deployContract')
-        .mockResolvedValue({
-          name: 'nfts',
-          address: '0xabcdefabcdefabcdefabcdefabcdefabcdefabcd',
-          deployedAt,
-        });
-
-      // Act
-      const result = await handleSeedContract(
-        { contractName: 'nfts', hardfork: 'shanghai' },
-        { seedingCapability: mockSeedingCapability },
-      );
-
-      // Assert
-      expect(result.ok).toBe(true);
-      expect(mockedDeployContract).toHaveBeenCalledWith('nfts', {
-        hardfork: 'shanghai',
-        deployerOptions: undefined,
-      });
-    });
-
-    it('deploys contract with deployer options', async () => {
-      // Arrange
-      const deployedAt = new Date().toISOString();
-      const mockedDeployContract = vi
-        .spyOn(mockSeedingCapability, 'deployContract')
-        .mockResolvedValue({
-          name: 'piggybank',
-          address: '0x9876543210987654321098765432109876543210',
-          deployedAt,
-        });
-
-      // Act
-      const result = await handleSeedContract(
-        {
-          contractName: 'piggybank',
-          deployerOptions: {
-            fromAddress: '0x1111111111111111111111111111111111111111',
-          },
-        },
-        { seedingCapability: mockSeedingCapability },
-      );
-
-      // Assert
-      expect(result.ok).toBe(true);
-      expect(mockedDeployContract).toHaveBeenCalledWith('piggybank', {
-        hardfork: undefined,
-        deployerOptions: {
-          fromAddress: '0x1111111111111111111111111111111111111111',
-        },
-      });
-    });
-
-    it('returns error when seeding capability not available', async () => {
-      // Act
-      const result = await handleSeedContract(
-        { contractName: 'hst' },
-        { seedingCapability: undefined },
-      );
-
-      // Assert
-      expect(result.ok).toBe(false);
-      if (!result.ok) {
-        expect(result.error.code).toBe(ErrorCodes.MM_CAPABILITY_NOT_AVAILABLE);
-        expect(result.error.message).toContain(
-          'ContractSeedingCapability not available',
-        );
-      }
-    });
-
-    it('returns error when deployment fails', async () => {
-      // Arrange
-      vi.spyOn(mockSeedingCapability, 'deployContract').mockRejectedValue(
-        new Error('Contract not found: unknown'),
-      );
-
-      // Act
-      const result = await handleSeedContract(
-        { contractName: 'hst' },
-        { seedingCapability: mockSeedingCapability },
-      );
-
-      // Assert
-      expect(result.ok).toBe(false);
-      if (!result.ok) {
-        expect(result.error.code).toBe(ErrorCodes.MM_CONTRACT_NOT_FOUND);
-        expect(result.error.message).toContain('Contract not found');
-      }
-    });
-
-    it('returns error when deployment fails with generic error', async () => {
-      // Arrange
-      vi.spyOn(mockSeedingCapability, 'deployContract').mockRejectedValue(
-        new Error('Deployment failed'),
-      );
-
-      // Act
-      const result = await handleSeedContract(
-        { contractName: 'hst' },
-        { seedingCapability: mockSeedingCapability },
-      );
-
-      // Assert
-      expect(result.ok).toBe(false);
-      if (!result.ok) {
-        expect(result.error.code).toBe(ErrorCodes.MM_SEED_FAILED);
-        expect(result.error.message).toContain('Deployment failed');
-      }
-    });
-  });
-
-  describe('handleSeedContracts', () => {
-    it('deploys multiple contracts successfully', async () => {
-      // Arrange
-      const deployedAt1 = new Date().toISOString();
-      const deployedAt2 = new Date(Date.now() + 1000).toISOString();
-      const mockedDeployContracts = vi
-        .spyOn(mockSeedingCapability, 'deployContracts')
-        .mockResolvedValue({
-          deployed: [
-            {
-              name: 'hst',
-              address: '0x1234567890123456789012345678901234567890',
-              deployedAt: deployedAt1,
-            },
-            {
-              name: 'nfts',
-              address: '0xabcdefabcdefabcdefabcdefabcdefabcdefabcd',
-              deployedAt: deployedAt2,
-            },
-          ],
-          failed: [],
-        });
-
-      // Act
-      const result = await handleSeedContracts(
-        { contracts: ['hst', 'nfts'] },
-        { seedingCapability: mockSeedingCapability },
-      );
-
-      // Assert
-      expect(result.ok).toBe(true);
-      if (result.ok) {
-        expect(result.result.deployed).toHaveLength(2);
-        expect(result.result.deployed[0].contractName).toBe('hst');
-        expect(result.result.deployed[1].contractName).toBe('nfts');
-        expect(result.result.failed).toHaveLength(0);
-      }
-      expect(mockedDeployContracts).toHaveBeenCalledWith(['hst', 'nfts'], {
-        hardfork: undefined,
-      });
-    });
-
-    it('deploys contracts with custom hardfork', async () => {
-      // Arrange
-      const deployedAt = new Date().toISOString();
-      const mockedDeployContracts = vi
-        .spyOn(mockSeedingCapability, 'deployContracts')
-        .mockResolvedValue({
-          deployed: [
-            {
-              name: 'hst',
-              address: '0x1234567890123456789012345678901234567890',
-              deployedAt,
-            },
-          ],
-          failed: [],
-        });
-
-      // Act
-      const result = await handleSeedContracts(
-        { contracts: ['hst'], hardfork: 'shanghai' },
-        { seedingCapability: mockSeedingCapability },
-      );
-
-      // Assert
-      expect(result.ok).toBe(true);
-      expect(mockedDeployContracts).toHaveBeenCalledWith(['hst'], {
-        hardfork: 'shanghai',
-      });
-    });
-
-    it('handles partial deployment failures', async () => {
-      // Arrange
-      const deployedAt = new Date().toISOString();
-      vi.spyOn(mockSeedingCapability, 'deployContracts').mockResolvedValue({
-        deployed: [
-          {
-            name: 'hst',
-            address: '0x1234567890123456789012345678901234567890',
-            deployedAt,
-          },
-        ],
-        failed: [
-          {
-            name: 'nfts',
-            error: 'Contract deployment failed',
-          },
-        ],
-      });
-
-      // Act
-      const result = await handleSeedContracts(
-        { contracts: ['hst', 'nfts'] },
-        { seedingCapability: mockSeedingCapability },
-      );
-
-      // Assert
-      expect(result.ok).toBe(true);
-      if (result.ok) {
-        expect(result.result.deployed).toHaveLength(1);
-        expect(result.result.failed).toHaveLength(1);
-        expect(result.result.failed[0].contractName).toBe('nfts');
-        expect(result.result.failed[0].error).toBe(
-          'Contract deployment failed',
-        );
-      }
-    });
-
-    it('returns error when seeding capability not available', async () => {
-      // Act
-      const result = await handleSeedContracts(
-        { contracts: ['hst'] },
-        { seedingCapability: undefined },
-      );
-
-      // Assert
-      expect(result.ok).toBe(false);
-      if (!result.ok) {
-        expect(result.error.code).toBe(ErrorCodes.MM_CAPABILITY_NOT_AVAILABLE);
-        expect(result.error.message).toContain(
-          'ContractSeedingCapability not available',
-        );
-      }
-    });
-
-    it('returns error when deployment fails completely', async () => {
-      // Arrange
-      vi.spyOn(mockSeedingCapability, 'deployContracts').mockRejectedValue(
-        new Error('Anvil not running'),
-      );
-
-      // Act
-      const result = await handleSeedContracts(
-        { contracts: ['hst'] },
-        { seedingCapability: mockSeedingCapability },
-      );
-
-      // Assert
-      expect(result.ok).toBe(false);
-      if (!result.ok) {
-        expect(result.error.code).toBe(ErrorCodes.MM_SEED_FAILED);
-        expect(result.error.message).toContain('Anvil not running');
-      }
-    });
-  });
-
-  describe('handleGetContractAddress', () => {
-    it('returns contract address when found', async () => {
-      // Arrange
-      const mockedGetContractAddress = vi
-        .spyOn(mockSeedingCapability, 'getContractAddress')
-        .mockReturnValue('0x1234567890123456789012345678901234567890');
-
-      // Act
-      const result = await handleGetContractAddress(
-        { contractName: 'hst' },
-        { seedingCapability: mockSeedingCapability },
-      );
-
-      // Assert
-      expect(result.ok).toBe(true);
-      if (result.ok) {
-        expect(result.result.contractName).toBe('hst');
-        expect(result.result.contractAddress).toBe(
-          '0x1234567890123456789012345678901234567890',
-        );
-      }
-      expect(mockedGetContractAddress).toHaveBeenCalledWith('hst');
-    });
-
-    it('returns null when contract not found', async () => {
-      // Arrange
-      vi.spyOn(mockSeedingCapability, 'getContractAddress').mockReturnValue(
-        null,
-      );
-
-      // Act
-      const result = await handleGetContractAddress(
-        { contractName: 'nfts' },
-        { seedingCapability: mockSeedingCapability },
-      );
-
-      // Assert
-      expect(result.ok).toBe(true);
-      if (result.ok) {
-        expect(result.result.contractName).toBe('nfts');
-        expect(result.result.contractAddress).toBeNull();
-      }
-    });
-
-    it('returns error when seeding capability not available', async () => {
-      // Act
-      const result = await handleGetContractAddress(
-        { contractName: 'hst' },
-        { seedingCapability: undefined },
-      );
-
-      // Assert
-      expect(result.ok).toBe(false);
-      if (!result.ok) {
-        expect(result.error.code).toBe(ErrorCodes.MM_CAPABILITY_NOT_AVAILABLE);
-        expect(result.error.message).toContain(
-          'ContractSeedingCapability not available',
-        );
-      }
-    });
-
-    it('returns error when lookup fails', async () => {
-      // Arrange
-      vi.spyOn(mockSeedingCapability, 'getContractAddress').mockImplementation(
-        () => {
-          throw new Error('Registry error');
-        },
-      );
-
-      // Act
-      const result = await handleGetContractAddress(
-        { contractName: 'hst' },
-        { seedingCapability: mockSeedingCapability },
-      );
-
-      // Assert
-      expect(result.ok).toBe(false);
-      if (!result.ok) {
-        expect(result.error.code).toBe(ErrorCodes.MM_SEED_FAILED);
-        expect(result.error.message).toContain('Registry error');
-      }
-    });
-  });
-
-  describe('handleListDeployedContracts', () => {
-    it('returns list of deployed contracts', async () => {
-      // Arrange
-      const deployedAt1 = new Date().toISOString();
-      const deployedAt2 = new Date(Date.now() + 1000).toISOString();
-      const mockedListDeployedContracts = vi
-        .spyOn(mockSeedingCapability, 'listDeployedContracts')
-        .mockReturnValue([
-          {
-            name: 'hst',
-            address: '0x1234567890123456789012345678901234567890',
-            deployedAt: deployedAt1,
-          },
-          {
-            name: 'nfts',
-            address: '0xabcdefabcdefabcdefabcdefabcdefabcdefabcd',
-            deployedAt: deployedAt2,
-          },
-        ]);
-
-      // Act
-      const result = await handleListDeployedContracts(
-        {},
-        { seedingCapability: mockSeedingCapability },
-      );
-
-      // Assert
-      expect(result.ok).toBe(true);
-      if (result.ok) {
-        expect(result.result.contracts).toHaveLength(2);
-        expect(result.result.contracts[0].contractName).toBe('hst');
-        expect(result.result.contracts[0].contractAddress).toBe(
-          '0x1234567890123456789012345678901234567890',
-        );
-        expect(result.result.contracts[0].deployedAt).toBe(deployedAt1);
-        expect(result.result.contracts[1].contractName).toBe('nfts');
-        expect(result.result.contracts[1].contractAddress).toBe(
-          '0xabcdefabcdefabcdefabcdefabcdefabcdefabcd',
-        );
-        expect(result.result.contracts[1].deployedAt).toBe(deployedAt2);
-      }
-      expect(mockedListDeployedContracts).toHaveBeenCalled();
-    });
-
-    it('returns empty list when no contracts deployed', async () => {
-      // Arrange
-      vi.spyOn(mockSeedingCapability, 'listDeployedContracts').mockReturnValue(
-        [],
-      );
-
-      // Act
-      const result = await handleListDeployedContracts(
-        {},
-        { seedingCapability: mockSeedingCapability },
-      );
-
-      // Assert
-      expect(result.ok).toBe(true);
-      if (result.ok) {
-        expect(result.result.contracts).toHaveLength(0);
-      }
-    });
-
-    it('returns error when seeding capability not available', async () => {
-      // Act
-      const result = await handleListDeployedContracts(
-        {},
-        { seedingCapability: undefined },
-      );
-
-      // Assert
-      expect(result.ok).toBe(false);
-      if (!result.ok) {
-        expect(result.error.code).toBe(ErrorCodes.MM_CAPABILITY_NOT_AVAILABLE);
-        expect(result.error.message).toContain(
-          'ContractSeedingCapability not available',
-        );
-      }
-    });
-
-    it('returns error when listing fails', async () => {
-      // Arrange
-      vi.spyOn(
-        mockSeedingCapability,
-        'listDeployedContracts',
-      ).mockImplementation(() => {
-        throw new Error('Registry error');
-      });
-
-      // Act
-      const result = await handleListDeployedContracts(
-        {},
-        { seedingCapability: mockSeedingCapability },
-      );
-
-      // Assert
-      expect(result.ok).toBe(false);
-      if (!result.ok) {
-        expect(result.error.code).toBe(ErrorCodes.MM_SEED_FAILED);
-        expect(result.error.message).toContain('Registry error');
-      }
-    });
-  });
-});
diff --git a/src/mcp-server/tools/seeding.ts b/src/mcp-server/tools/seeding.ts
deleted file mode 100644
index 0fd578c..0000000
--- a/src/mcp-server/tools/seeding.ts
+++ /dev/null
@@ -1,327 +0,0 @@
-import type { ContractSeedingCapability } from '../../capabilities/types.js';
-import { getSessionManager } from '../session-manager.js';
-import { classifySeedingError } from './error-classification.js';
-import { runTool } from './run-tool.js';
-import type {
-  SeedContractInput,
-  SeedContractsInput,
-  GetContractAddressInput,
-  ListDeployedContractsInput,
-  SeedContractResult,
-  SeedContractsResult,
-  GetContractAddressResult,
-  ListDeployedContractsResult,
-  McpResponse,
-  HandlerOptions,
-} from '../types';
-import { ErrorCodes } from '../types';
-import { createErrorResponse } from '../utils';
-
-export type SeedingToolOptions = HandlerOptions & {
-  seedingCapability?: ContractSeedingCapability;
-};
-
-/**
- * Validates that the seeding capability is available, returning either the capability or an error response.
- *
- * @param toolName The name of the tool requesting the capability
- * @param input The input provided to the tool
- * @param options Tool options containing the seeding capability
- * @param startTime Timestamp when the tool execution started
- * @returns The seeding capability if available, or an error response if not
- */
-function checkSeedingCapability<Type>(
-  toolName: string,
-  input: unknown,
-  options: SeedingToolOptions | undefined,
-  startTime: number,
-): McpResponse<Type> | ContractSeedingCapability {
-  const sessionManager = getSessionManager();
-  const sessionId = sessionManager.getSessionId();
-
-  if (!options?.seedingCapability) {
-    return createErrorResponse(
-      ErrorCodes.MM_CAPABILITY_NOT_AVAILABLE,
-      `ContractSeedingCapability not available. The ${toolName} tool requires running in e2e mode with the MetaMask extension wrapper, which provides Anvil chain and contract deployment support.`,
-      { capability: 'ContractSeedingCapability', input },
-      sessionId,
-      startTime,
-    ) as McpResponse<Type>;
-  }
-
-  return options.seedingCapability;
-}
-
-/**
- * Type guard to check if a result is a ContractSeedingCapability.
- *
- * @param result The value to check
- * @returns True if result is a ContractSeedingCapability, false if it's an error response
- */
-function isCapability(
-  result: McpResponse<unknown> | ContractSeedingCapability,
-): result is ContractSeedingCapability {
-  return (
-    typeof result === 'object' && result !== null && 'deployContract' in result
-  );
-}
-
-/**
- * Handles the mm_seed_contract tool to deploy a single smart contract.
- *
- * @param input The contract name and deployment options
- * @param options Tool options including seeding capability
- * @returns Promise resolving to the deployment result with contract address
- */
-export async function handleSeedContract(
-  input: SeedContractInput,
-  options?: SeedingToolOptions,
-): Promise<McpResponse<SeedContractResult>> {
-  const startTime = Date.now();
-  const capabilityOrError = checkSeedingCapability<SeedContractResult>(
-    'mm_seed_contract',
-    input,
-    options,
-    startTime,
-  );
-
-  if (!isCapability(capabilityOrError)) {
-    return capabilityOrError;
-  }
-
-  const seedingCapability = capabilityOrError;
-
-  return runTool<SeedContractInput, SeedContractResult>({
-    toolName: 'mm_seed_contract',
-    input,
-    options,
-    observationPolicy: 'none',
-
-    /**
-     * Executes the contract deployment using the seeding capability.
-     *
-     * @returns The deployed contract details including name, address, and timestamp
-     */
-    execute: async () => {
-      const deployed = await seedingCapability.deployContract(
-        input.contractName,
-        {
-          hardfork: input.hardfork,
-          deployerOptions: input.deployerOptions,
-        },
-      );
-
-      return {
-        contractName: deployed.name,
-        contractAddress: deployed.address,
-        deployedAt: deployed.deployedAt,
-      };
-    },
-
-    classifyError: classifySeedingError,
-
-    /**
-     * Sanitizes the input for recording in the knowledge store.
-     *
-     * @returns The sanitized input containing contract name and hardfork
-     */
-    sanitizeInputForRecording: () => ({
-      contractName: input.contractName,
-      hardfork: input.hardfork ?? 'prague',
-    }),
-  });
-}
-
-/**
- * Handles the mm_seed_contracts tool to deploy multiple smart contracts.
- *
- * @param input The list of contract names and deployment options
- * @param options Tool options including seeding capability
- * @returns Promise resolving to deployment results with deployed and failed contracts
- */
-export async function handleSeedContracts(
-  input: SeedContractsInput,
-  options?: SeedingToolOptions,
-): Promise<McpResponse<SeedContractsResult>> {
-  const startTime = Date.now();
-  const capabilityOrError = checkSeedingCapability<SeedContractsResult>(
-    'mm_seed_contracts',
-    input,
-    options,
-    startTime,
-  );
-
-  if (!isCapability(capabilityOrError)) {
-    return capabilityOrError;
-  }
-
-  const seedingCapability = capabilityOrError;
-
-  return runTool<SeedContractsInput, SeedContractsResult>({
-    toolName: 'mm_seed_contracts',
-    input,
-    options,
-    observationPolicy: 'none',
-
-    /**
-     * Executes the multi-contract deployment using the seeding capability.
-     *
-     * @returns The deployment results with deployed and failed contract lists
-     */
-    execute: async () => {
-      const seedResult = await seedingCapability.deployContracts(
-        input.contracts,
-        { hardfork: input.hardfork },
-      );
-
-      return {
-        deployed: seedResult.deployed.map((deployedContract) => ({
-          contractName: deployedContract.name,
-          contractAddress: deployedContract.address,
-          deployedAt: deployedContract.deployedAt,
-        })),
-        failed: seedResult.failed.map((failedDeployment) => ({
-          contractName: failedDeployment.name,
-          error: failedDeployment.error,
-        })),
-      };
-    },
-
-    classifyError: classifySeedingError,
-
-    /**
-     * Sanitizes the input for recording in the knowledge store.
-     *
-     * @returns The sanitized input containing contracts list and hardfork
-     */
-    sanitizeInputForRecording: () => ({
-      contracts: input.contracts,
-      hardfork: input.hardfork ?? 'prague',
-    }),
-  });
-}
-
-/**
- * Handles the mm_get_contract_address tool to retrieve a deployed contract's address.
- *
- * @param input The contract name to look up
- * @param options Tool options including seeding capability
- * @returns Promise resolving to the contract address or null if not found
- */
-export async function handleGetContractAddress(
-  input: GetContractAddressInput,
-  options?: SeedingToolOptions,
-): Promise<McpResponse<GetContractAddressResult>> {
-  const startTime = Date.now();
-  const capabilityOrError = checkSeedingCapability<GetContractAddressResult>(
-    'mm_get_contract_address',
-    input,
-    options,
-    startTime,
-  );
-
-  if (!isCapability(capabilityOrError)) {
-    return capabilityOrError;
-  }
-
-  const seedingCapability = capabilityOrError;
-
-  return runTool<GetContractAddressInput, GetContractAddressResult>({
-    toolName: 'mm_get_contract_address',
-    input,
-    options,
-    observationPolicy: 'none',
-
-    /**
-     * Executes the contract address lookup using the seeding capability.
-     *
-     * @returns The contract name and its deployed address
-     */
-    execute: async () => {
-      const address = seedingCapability.getContractAddress(input.contractName);
-
-      return {
-        contractName: input.contractName,
-        contractAddress: address,
-      };
-    },
-
-    classifyError: classifySeedingError,
-
-    /**
-     * Sanitizes the input for recording in the knowledge store.
-     *
-     * @returns The sanitized input containing the contract name
-     */
-    sanitizeInputForRecording: () => ({
-      contractName: input.contractName,
-    }),
-  });
-}
-
-/**
- * Handles the mm_list_contracts tool to list all deployed contracts in the session.
- *
- * @param _input Unused input parameter (no input required for this tool)
- * @param options Tool options including seeding capability
- * @returns Promise resolving to a list of all deployed contracts with their addresses
- */
-export async function handleListDeployedContracts(
-  _input: ListDeployedContractsInput,
-  options?: SeedingToolOptions,
-): Promise<McpResponse<ListDeployedContractsResult>> {
-  const startTime = Date.now();
-  const capabilityOrError = checkSeedingCapability<ListDeployedContractsResult>(
-    'mm_list_contracts',
-    _input,
-    options,
-    startTime,
-  );
-
-  if (!isCapability(capabilityOrError)) {
-    return capabilityOrError;
-  }
-
-  const seedingCapability = capabilityOrError;
-
-  return runTool<ListDeployedContractsInput, ListDeployedContractsResult>({
-    toolName: 'mm_list_contracts',
-    input: _input,
-    options,
-    observationPolicy: 'none',
-
-    /**
-     * Executes the contract listing using the seeding capability.
-     *
-     * @returns The list of all deployed contracts with their details
-     */
-    execute: async () => {
-      const deployed = seedingCapability.listDeployedContracts();
-
-      return {
-        contracts: deployed.map(
-          (deployedContract: {
-            /**
-             * The contract name
-             */
-            name: string;
-            /**
-             * The contract's deployed address
-             */
-            address: string;
-            /**
-             * The deployment timestamp
-             */
-            deployedAt: string;
-          }) => ({
-            contractName: deployedContract.name,
-            contractAddress: deployedContract.address,
-            deployedAt: deployedContract.deployedAt,
-          }),
-        ),
-      };
-    },
-
-    classifyError: classifySeedingError,
-  });
-}
diff --git a/src/mcp-server/tools/state.test.ts b/src/mcp-server/tools/state.test.ts
deleted file mode 100644
index 902e230..0000000
--- a/src/mcp-server/tools/state.test.ts
+++ /dev/null
@@ -1,358 +0,0 @@
-/**
- * Unit tests for state tool handler.
- *
- * Tests handleGetState with various scenarios including state snapshot capability,
- * tab tracking, and error handling.
- */
-
-import { describe, it, expect, vi, beforeEach, afterEach } from 'vitest';
-
-import { handleGetState } from './state.js';
-import type { StateSnapshotCapability } from '../../capabilities/types.js';
-import * as knowledgeStoreModule from '../knowledge-store.js';
-import * as sessionManagerModule from '../session-manager.js';
-import { createMockSessionManager, createMockPage } from '../test-utils';
-import { ErrorCodes } from '../types/errors.js';
-
-describe('state', () => {
-  let mockSessionManager: ReturnType<typeof createMockSessionManager>;
-
-  beforeEach(() => {
-    mockSessionManager = createMockSessionManager({
-      hasActive: true,
-      sessionId: 'test-session-123',
-      sessionMetadata: {
-        schemaVersion: 1,
-        sessionId: 'test-session-123',
-        createdAt: new Date().toISOString(),
-        flowTags: [],
-        tags: [],
-        launch: { stateMode: 'default' },
-      },
-    });
-    vi.spyOn(sessionManagerModule, 'getSessionManager').mockReturnValue(
-      mockSessionManager,
-    );
-    // Mock knowledge store to prevent "not initialized" errors
-    vi.spyOn(knowledgeStoreModule, 'knowledgeStore', 'get').mockReturnValue({
-      recordStep: vi.fn().mockResolvedValue(undefined),
-      getLastSteps: vi.fn().mockResolvedValue([]),
-      searchSteps: vi.fn().mockResolvedValue([]),
-      summarizeSession: vi
-        .fn()
-        .mockResolvedValue({ sessionId: 'test', stepCount: 0, recipe: [] }),
-      listSessions: vi.fn().mockResolvedValue([]),
-      generatePriorKnowledge: vi.fn().mockResolvedValue(undefined),
-      writeSessionMetadata: vi.fn().mockResolvedValue('test-session'),
-    } as any);
-  });
-
-  afterEach(() => {
-    vi.restoreAllMocks();
-  });
-
-  describe('handleGetState', () => {
-    describe('without state snapshot capability', () => {
-      it('returns extension state from session manager', async () => {
-        // Arrange
-        const mockPage = createMockPage();
-        vi.spyOn(mockPage, 'url').mockReturnValue(
-          'chrome-extension://ext-123/home.html',
-        );
-        vi.spyOn(mockSessionManager, 'getPage').mockReturnValue(mockPage);
-        vi.spyOn(mockSessionManager, 'getExtensionState').mockResolvedValue({
-          isLoaded: true,
-          currentUrl: 'chrome-extension://ext-123/home.html',
-          extensionId: 'ext-123',
-          isUnlocked: true,
-          currentScreen: 'home',
-          accountAddress: '0x1234567890123456789012345678901234567890',
-          networkName: 'Ethereum Mainnet',
-          chainId: 1,
-          balance: '1.5 ETH',
-        });
-        vi.spyOn(mockSessionManager, 'getTrackedPages').mockReturnValue([
-          {
-            page: mockPage,
-            role: 'extension',
-            url: 'chrome-extension://ext-123/home.html',
-          },
-        ]);
-
-        // Act
-        const result = await handleGetState();
-
-        // Assert
-        expect(result.ok).toBe(true);
-        if (result.ok) {
-          expect(result.result.state).toStrictEqual({
-            isLoaded: true,
-            currentUrl: 'chrome-extension://ext-123/home.html',
-            extensionId: 'ext-123',
-            isUnlocked: true,
-            currentScreen: 'home',
-            accountAddress: '0x1234567890123456789012345678901234567890',
-            networkName: 'Ethereum Mainnet',
-            chainId: 1,
-            balance: '1.5 ETH',
-          });
-          expect(result.result.tabs).toStrictEqual({
-            active: {
-              role: 'extension',
-              url: 'chrome-extension://ext-123/home.html',
-            },
-            tracked: [
-              {
-                role: 'extension',
-                url: 'chrome-extension://ext-123/home.html',
-              },
-            ],
-          });
-        }
-        expect(mockSessionManager.getExtensionState).toHaveBeenCalled();
-      });
-
-      it('includes multiple tracked pages in tabs', async () => {
-        // Arrange
-        const mockExtensionPage = createMockPage();
-        vi.spyOn(mockExtensionPage, 'url').mockReturnValue(
-          'chrome-extension://ext-123/home.html',
-        );
-        const mockDappPage = createMockPage();
-        vi.spyOn(mockDappPage, 'url').mockReturnValue(
-          'https://app.uniswap.org',
-        );
-
-        vi.spyOn(mockSessionManager, 'getPage').mockReturnValue(
-          mockExtensionPage,
-        );
-        vi.spyOn(mockSessionManager, 'getExtensionState').mockResolvedValue({
-          isLoaded: true,
-          currentUrl: 'chrome-extension://ext-123/home.html',
-          extensionId: 'ext-123',
-          isUnlocked: true,
-          currentScreen: 'home',
-          accountAddress: '0x1234567890123456789012345678901234567890',
-          networkName: 'Ethereum Mainnet',
-          chainId: 1,
-          balance: '1.5 ETH',
-        });
-        vi.spyOn(mockSessionManager, 'getTrackedPages').mockReturnValue([
-          {
-            page: mockExtensionPage,
-            role: 'extension',
-            url: 'chrome-extension://ext-123/home.html',
-          },
-          {
-            page: mockDappPage,
-            role: 'dapp',
-            url: 'https://app.uniswap.org',
-          },
-        ]);
-
-        // Act
-        const result = await handleGetState();
-
-        // Assert
-        expect(result.ok).toBe(true);
-        if (result.ok) {
-          expect(result.result.tabs).toBeDefined();
-          expect(result.result.tabs?.tracked).toHaveLength(2);
-          expect(result.result.tabs?.tracked).toStrictEqual([
-            { role: 'extension', url: 'chrome-extension://ext-123/home.html' },
-            { role: 'dapp', url: 'https://app.uniswap.org' },
-          ]);
-        }
-      });
-
-      it('handles active page without tracked page info', async () => {
-        // Arrange
-        const mockPage = createMockPage();
-        vi.spyOn(mockPage, 'url').mockReturnValue(
-          'chrome-extension://ext-123/home.html',
-        );
-        vi.spyOn(mockSessionManager, 'getPage').mockReturnValue(mockPage);
-        vi.spyOn(mockSessionManager, 'getExtensionState').mockResolvedValue({
-          isLoaded: true,
-          currentUrl: 'chrome-extension://ext-123/home.html',
-          extensionId: 'ext-123',
-          isUnlocked: false,
-          currentScreen: 'home',
-          accountAddress: null,
-          networkName: null,
-          chainId: null,
-          balance: null,
-        });
-        vi.spyOn(mockSessionManager, 'getTrackedPages').mockReturnValue([]);
-
-        // Act
-        const result = await handleGetState();
-
-        // Assert
-        expect(result.ok).toBe(true);
-        if (result.ok) {
-          expect(result.result.tabs).toBeDefined();
-          expect(result.result.tabs?.active.role).toBe('other');
-          expect(result.result.tabs?.active.url).toBe(
-            'chrome-extension://ext-123/home.html',
-          );
-        }
-      });
-    });
-
-    describe('with state snapshot capability', () => {
-      it('uses state snapshot capability when provided', async () => {
-        // Arrange
-        const mockPage = createMockPage();
-        vi.spyOn(mockPage, 'url').mockReturnValue(
-          'chrome-extension://ext-123/home.html',
-        );
-        vi.spyOn(mockSessionManager, 'getPage').mockReturnValue(mockPage);
-        vi.spyOn(mockSessionManager, 'getSessionState').mockReturnValue({
-          extensionId: 'ext-123',
-          ports: { anvil: 8545 },
-        });
-        vi.spyOn(mockSessionManager, 'getTrackedPages').mockReturnValue([
-          {
-            page: mockPage,
-            role: 'extension',
-            url: 'chrome-extension://ext-123/home.html',
-          },
-        ]);
-
-        const mockStateSnapshot: StateSnapshotCapability = {
-          getState: vi.fn().mockResolvedValue({
-            isLoaded: true,
-            currentUrl: 'chrome-extension://ext-123/home.html',
-            extensionId: 'ext-123',
-            isUnlocked: true,
-            currentScreen: 'home',
-            accountAddress: '0x1234567890123456789012345678901234567890',
-            networkName: 'Localhost 8545',
-            chainId: 1337,
-            balance: '25 ETH',
-          }),
-          detectCurrentScreen: vi.fn().mockResolvedValue('home'),
-        };
-
-        // Act
-        const result = await handleGetState({
-          stateSnapshotCapability: mockStateSnapshot,
-        });
-
-        // Assert
-        expect(result.ok).toBe(true);
-        if (result.ok) {
-          expect(result.result.state.chainId).toBe(1337);
-          expect(result.result.state.networkName).toBe('Localhost 8545');
-          expect(result.result.state.balance).toBe('25 ETH');
-        }
-        expect(mockStateSnapshot.getState).toHaveBeenCalledWith(mockPage, {
-          extensionId: 'ext-123',
-          chainId: 1337,
-        });
-        expect(mockSessionManager.getExtensionState).not.toHaveBeenCalled();
-      });
-
-      it('uses chainId 1 when anvil port not present', async () => {
-        // Arrange
-        const mockPage = createMockPage();
-        vi.spyOn(mockPage, 'url').mockReturnValue(
-          'chrome-extension://ext-123/home.html',
-        );
-        vi.spyOn(mockSessionManager, 'getPage').mockReturnValue(mockPage);
-        vi.spyOn(mockSessionManager, 'getSessionState').mockReturnValue({
-          extensionId: 'ext-123',
-          ports: {},
-        });
-        vi.spyOn(mockSessionManager, 'getTrackedPages').mockReturnValue([
-          {
-            page: mockPage,
-            role: 'extension',
-            url: 'chrome-extension://ext-123/home.html',
-          },
-        ]);
-
-        const mockStateSnapshot: StateSnapshotCapability = {
-          getState: vi.fn().mockResolvedValue({
-            isLoaded: true,
-            currentUrl: 'chrome-extension://ext-123/home.html',
-            extensionId: 'ext-123',
-            isUnlocked: true,
-            currentScreen: 'home',
-            accountAddress: '0x1234567890123456789012345678901234567890',
-            networkName: 'Ethereum Mainnet',
-            chainId: 1,
-            balance: '1.5 ETH',
-          }),
-          detectCurrentScreen: vi.fn().mockResolvedValue('home'),
-        };
-
-        // Act
-        const result = await handleGetState({
-          stateSnapshotCapability: mockStateSnapshot,
-        });
-
-        // Assert
-        expect(result.ok).toBe(true);
-        expect(mockStateSnapshot.getState).toHaveBeenCalledWith(mockPage, {
-          extensionId: 'ext-123',
-          chainId: 1,
-        });
-      });
-    });
-
-    describe('error handling', () => {
-      it('returns error when no active session', async () => {
-        // Arrange
-        vi.spyOn(mockSessionManager, 'hasActiveSession').mockReturnValue(false);
-
-        // Act
-        const result = await handleGetState();
-
-        // Assert
-        expect(result.ok).toBe(false);
-        if (!result.ok) {
-          expect(result.error.code).toBe(ErrorCodes.MM_NO_ACTIVE_SESSION);
-        }
-      });
-
-      it('returns error when getExtensionState fails', async () => {
-        // Arrange
-        const mockPage = createMockPage();
-        vi.spyOn(mockSessionManager, 'getPage').mockReturnValue(mockPage);
-        vi.spyOn(mockSessionManager, 'getExtensionState').mockRejectedValue(
-          new Error('Failed to get state'),
-        );
-
-        // Act
-        const result = await handleGetState();
-
-        // Assert
-        expect(result.ok).toBe(false);
-        if (!result.ok) {
-          expect(result.error.code).toBe(ErrorCodes.MM_STATE_FAILED);
-          expect(result.error.message).toContain('Failed to get state');
-        }
-      });
-
-      it('returns error when page is closed', async () => {
-        // Arrange
-        const mockPage = createMockPage();
-        vi.spyOn(mockSessionManager, 'getPage').mockReturnValue(mockPage);
-        vi.spyOn(mockSessionManager, 'getExtensionState').mockRejectedValue(
-          new Error('Target page, context or browser has been closed'),
-        );
-
-        // Act
-        const result = await handleGetState();
-
-        // Assert
-        expect(result.ok).toBe(false);
-        if (!result.ok) {
-          expect(result.error.code).toBe(ErrorCodes.MM_PAGE_CLOSED);
-        }
-      });
-    });
-  });
-});
diff --git a/src/mcp-server/tools/state.ts b/src/mcp-server/tools/state.ts
deleted file mode 100644
index 8d1f71c..0000000
--- a/src/mcp-server/tools/state.ts
+++ /dev/null
@@ -1,102 +0,0 @@
-import type { Page } from 'playwright';
-
-import { classifyStateError } from './error-classification.js';
-import { collectObservation } from './helpers.js';
-import { runTool } from './run-tool.js';
-import type {
-  StateSnapshotCapability,
-  ExtensionState,
-} from '../../capabilities/types.js';
-import { getSessionManager } from '../session-manager.js';
-import type { GetStateResult, McpResponse, HandlerOptions } from '../types';
-
-/**
- * Tool options for state-related operations.
- */
-export type StateToolOptions = HandlerOptions & {
-  /**
-   * Optional capability for taking state snapshots
-   */
-  stateSnapshotCapability?: StateSnapshotCapability;
-};
-
-/**
- * Retrieves the current extension state, using the snapshot capability if available.
- *
- * @param page The Playwright page object to query
- * @param sessionManager The session manager instance
- * @param stateSnapshotCapability Optional capability for detailed state snapshots
- * @returns Promise resolving to the current extension state
- */
-async function getState(
-  page: Page,
-  sessionManager: ReturnType<typeof getSessionManager>,
-  stateSnapshotCapability?: StateSnapshotCapability,
-): Promise<ExtensionState> {
-  if (stateSnapshotCapability) {
-    const extensionId = sessionManager.getSessionState()?.extensionId;
-    return stateSnapshotCapability.getState(page, {
-      extensionId,
-      chainId: sessionManager.getSessionState()?.ports?.anvil ? 1337 : 1,
-    });
-  }
-  return sessionManager.getExtensionState();
-}
-
-/**
- * Handles the mm_get_state tool to retrieve the current extension state.
- *
- * @param options Tool options including optional state snapshot capability
- * @returns Promise resolving to the current extension state and tab information
- */
-export async function handleGetState(
-  options?: StateToolOptions,
-): Promise<McpResponse<GetStateResult>> {
-  return runTool<Record<string, never>, GetStateResult>({
-    toolName: 'mm_get_state',
-    input: {},
-    options,
-    observationPolicy: 'custom',
-
-    /**
-     * Executes the state retrieval with tab and observation information.
-     *
-     * @param context The tool execution context containing the page
-     * @returns The extension state, tab information, and observation data
-     */
-    execute: async (context) => {
-      const sessionManager = getSessionManager();
-      const state = await getState(
-        context.page,
-        sessionManager,
-        options?.stateSnapshotCapability,
-      );
-
-      const trackedPages = sessionManager.getTrackedPages();
-      const activePage = sessionManager.getPage();
-      const activeTabInfo = trackedPages.find(
-        (trackedPage) => trackedPage.page === activePage,
-      );
-
-      const tabs = {
-        active: {
-          role: activeTabInfo?.role ?? 'other',
-          url: activePage.url(),
-        },
-        tracked: trackedPages.map((trackedPage) => ({
-          role: trackedPage.role,
-          url: trackedPage.url,
-        })),
-      };
-
-      const observation = await collectObservation(context.page, 'full', state);
-
-      return {
-        result: { state, tabs },
-        observation,
-      };
-    },
-
-    classifyError: classifyStateError,
-  });
-}
diff --git a/src/mcp-server/types/responses.ts b/src/mcp-server/types/responses.ts
deleted file mode 100644
index edb96c4..0000000
--- a/src/mcp-server/types/responses.ts
+++ /dev/null
@@ -1,27 +0,0 @@
-export type ResponseMeta = {
-  timestamp: string;
-  sessionId?: string;
-  durationMs: number;
-};
-
-export type SuccessResponse<Result = unknown> = {
-  meta: ResponseMeta;
-  ok: true;
-  result: Result;
-};
-
-export type ErrorDetails = {
-  code: string;
-  message: string;
-  details?: Record<string, unknown>;
-};
-
-export type ErrorResponse = {
-  error: ErrorDetails;
-  meta: ResponseMeta;
-  ok: false;
-};
-
-export type McpResponse<Result = unknown> =
-  | SuccessResponse<Result>
-  | ErrorResponse;
diff --git a/src/mcp-server/utils/index.ts b/src/mcp-server/utils/index.ts
deleted file mode 100644
index f4af15f..0000000
--- a/src/mcp-server/utils/index.ts
+++ /dev/null
@@ -1,14 +0,0 @@
-export { SENSITIVE_FIELD_PATTERNS, isSensitiveField } from './redaction.js';
-export { generateFilesafeTimestamp, generateSessionId } from './time.js';
-export { createSuccessResponse, createErrorResponse } from './response.js';
-export {
-  validateTargetSelection,
-  type TargetValidationResult,
-} from './targets.js';
-export { extractErrorMessage } from './errors.js';
-export { debugWarn } from './logger.js';
-export {
-  isValidTargetSelection,
-  isInvalidTargetSelection,
-  type TargetType,
-} from './type-guards.js';
diff --git a/src/mcp-server/utils/response.ts b/src/mcp-server/utils/response.ts
deleted file mode 100644
index 91d968e..0000000
--- a/src/mcp-server/utils/response.ts
+++ /dev/null
@@ -1,57 +0,0 @@
-import type { SuccessResponse, ErrorResponse, ErrorCode } from '../types';
-
-/**
- * Creates a standardized success response.
- *
- * @param result - The result data to include in the response.
- * @param sessionId - Optional session identifier.
- * @param startTime - Optional start time for duration calculation.
- * @returns A success response object.
- */
-export function createSuccessResponse<Result>(
-  result: Result,
-  sessionId?: string,
-  startTime?: number,
-): SuccessResponse<Result> {
-  return {
-    meta: {
-      timestamp: new Date().toISOString(),
-      sessionId,
-      durationMs: startTime ? Date.now() - startTime : 0,
-    },
-    ok: true,
-    result,
-  };
-}
-
-/**
- * Creates a standardized error response.
- *
- * @param code - The error code identifying the error type.
- * @param message - Human-readable error message.
- * @param details - Optional additional error details.
- * @param sessionId - Optional session identifier.
- * @param startTime - Optional start time for duration calculation.
- * @returns An error response object.
- */
-export function createErrorResponse(
-  code: ErrorCode,
-  message: string,
-  details?: Record<string, unknown>,
-  sessionId?: string,
-  startTime?: number,
-): ErrorResponse {
-  return {
-    error: {
-      code,
-      message,
-      details,
-    },
-    meta: {
-      timestamp: new Date().toISOString(),
-      sessionId,
-      durationMs: startTime ? Date.now() - startTime : 0,
-    },
-    ok: false,
-  };
-}
diff --git a/src/server/create-server.test.ts b/src/server/create-server.test.ts
new file mode 100644
index 0000000..ee651fa
--- /dev/null
+++ b/src/server/create-server.test.ts
@@ -0,0 +1,697 @@
+import * as fs from 'node:fs/promises';
+import * as http from 'node:http';
+import * as os from 'node:os';
+import * as path from 'node:path';
+import { describe, it, expect, beforeEach, afterEach, vi } from 'vitest';
+import type { MockInstance } from 'vitest';
+
+import type { ServerInstance } from './create-server.js';
+import {
+  createServer,
+  extractTargetFromInput,
+  extractScreenshotInfo,
+  extractToolOutcome,
+  buildResponseBody,
+} from './create-server.js';
+import { readDaemonState } from './daemon-state.js';
+import type { DaemonState, ServerConfig } from '../types/http.js';
+import { PACKAGE_VERSION } from '../version.js';
+
+const tmpDir = path.join(os.tmpdir(), `mm-create-server-test-${Date.now()}`);
+
+vi.mock('node:child_process', () => ({
+  execSync: () => Buffer.from(`${tmpDir}\n`),
+}));
+
+vi.mock('../tools/utils/discovery.js', () => ({
+  collectTestIds: vi.fn().mockResolvedValue([]),
+  collectTrimmedA11ySnapshot: vi.fn().mockResolvedValue({
+    nodes: [],
+    refMap: new Map(),
+  }),
+}));
+
+vi.mock('../knowledge-store/knowledge-store.js', () => {
+  const mockStore = {
+    recordStep: vi.fn().mockResolvedValue('/mock/path'),
+    writeSessionMetadata: vi.fn().mockResolvedValue('/mock/path'),
+    getLastSteps: vi.fn().mockResolvedValue([]),
+    searchSteps: vi.fn().mockResolvedValue([]),
+    summarizeSession: vi.fn().mockResolvedValue({ stepCount: 0, recipe: [] }),
+    listSessions: vi.fn().mockResolvedValue([]),
+    generatePriorKnowledge: vi.fn().mockResolvedValue(undefined),
+    getAllSessionIds: vi.fn().mockResolvedValue([]),
+    resolveSessionIds: vi.fn().mockResolvedValue([]),
+  };
+  return {
+    KnowledgeStore: vi.fn(() => mockStore),
+    createDefaultObservation: vi.fn(
+      (state: unknown, testIds?: unknown[], nodes?: unknown[]) => ({
+        state: state ?? {},
+        testIds: testIds ?? [],
+        a11y: { nodes: nodes ?? [] },
+      }),
+    ),
+    createKnowledgeStore: vi.fn(() => mockStore),
+    setKnowledgeStore: vi.fn(),
+    hasKnowledgeStore: vi.fn(() => false),
+    knowledgeStore: mockStore,
+  };
+});
+
+function createMockSessionManager() {
+  return {
+    hasActiveSession: vi.fn(() => false),
+    getSessionId: vi.fn(() => 'test-session'),
+    getSessionState: vi.fn(() => undefined),
+    getSessionMetadata: vi.fn(() => undefined),
+    launch: vi.fn(async () => ({
+      sessionId: 'test-session',
+      extensionId: 'test-ext',
+      state: {},
+    })),
+    cleanup: vi.fn(async () => true),
+    getPage: vi.fn(() => ({})),
+    setActivePage: vi.fn(),
+    getTrackedPages: vi.fn(() => []),
+    classifyPageRole: vi.fn(() => 'extension'),
+    getContext: vi.fn(() => ({})),
+    getExtensionId: vi.fn(() => 'test-ext'),
+    getExtensionState: vi.fn(async () => ({})),
+    takeScreenshot: vi.fn(async () => ({ path: '', base64: '' })),
+    getRefMap: vi.fn(() => new Map()),
+    setRefMap: vi.fn(),
+    setWorkflowContext: vi.fn(),
+    getEnvironmentMode: vi.fn(() => 'e2e'),
+    setContext: vi.fn(),
+    getContextInfo: vi.fn(() => ({
+      currentContext: 'e2e',
+      hasActiveSession: false,
+      sessionId: null,
+      capabilities: { available: [] },
+      canSwitchContext: true,
+    })),
+  };
+}
+
+let exitSpy: MockInstance;
+
+function buildConfig(overrides: Partial<ServerConfig> = {}): ServerConfig {
+  return {
+    sessionManager:
+      createMockSessionManager() as unknown as ServerConfig['sessionManager'],
+    contextFactory: () =>
+      ({}) as unknown as ReturnType<ServerConfig['contextFactory']>,
+    ...overrides,
+  };
+}
+
+async function httpRequest(
+  url: string,
+  options: {
+    method?: string;
+    headers?: Record<string, string>;
+    body?: string;
+  } = {},
+): Promise<{ status: number; json: () => Promise<unknown> }> {
+  return new Promise((resolve, reject) => {
+    const parsedUrl = new URL(url);
+    const req = http.request(
+      {
+        hostname: parsedUrl.hostname,
+        port: parsedUrl.port,
+        path: parsedUrl.pathname,
+        method: options.method ?? 'GET',
+        headers: options.headers,
+      },
+      (res) => {
+        let data = '';
+        res.on('data', (chunk: Buffer) => {
+          data += chunk.toString();
+        });
+        res.on('end', () => {
+          resolve({
+            status: res.statusCode ?? 0,
+            json: async () => JSON.parse(data) as unknown,
+          });
+        });
+      },
+    );
+    req.on('error', reject);
+    if (options.body) {
+      req.write(options.body);
+    }
+    req.end();
+  });
+}
+
+describe('extractTargetFromInput', () => {
+  it('returns undefined for null input', () => {
+    expect(extractTargetFromInput(null)).toBeUndefined();
+  });
+
+  it('returns undefined for non-object input', () => {
+    expect(extractTargetFromInput('string')).toBeUndefined();
+    expect(extractTargetFromInput(42)).toBeUndefined();
+  });
+
+  it('returns undefined when no target fields present', () => {
+    expect(extractTargetFromInput({ name: 'click' })).toBeUndefined();
+  });
+
+  it('extracts a11yRef', () => {
+    expect(extractTargetFromInput({ a11yRef: 'e1' })).toStrictEqual({
+      a11yRef: 'e1',
+      testId: undefined,
+      selector: undefined,
+    });
+  });
+
+  it('extracts testId', () => {
+    expect(extractTargetFromInput({ testId: 'btn' })).toStrictEqual({
+      a11yRef: undefined,
+      testId: 'btn',
+      selector: undefined,
+    });
+  });
+
+  it('extracts selector', () => {
+    expect(extractTargetFromInput({ selector: '.my-btn' })).toStrictEqual({
+      a11yRef: undefined,
+      testId: undefined,
+      selector: '.my-btn',
+    });
+  });
+
+  it('extracts multiple target fields', () => {
+    expect(
+      extractTargetFromInput({ a11yRef: 'e1', testId: 'btn' }),
+    ).toStrictEqual({
+      a11yRef: 'e1',
+      testId: 'btn',
+      selector: undefined,
+    });
+  });
+
+  it('ignores non-string target values', () => {
+    expect(extractTargetFromInput({ a11yRef: 42 })).toBeUndefined();
+  });
+});
+
+describe('extractScreenshotInfo', () => {
+  it('returns undefined for non-screenshot tools', () => {
+    expect(extractScreenshotInfo('click', {})).toBeUndefined();
+  });
+
+  it('returns undefined when toolResult is not an object', () => {
+    expect(extractScreenshotInfo('screenshot', null)).toBeUndefined();
+    expect(extractScreenshotInfo('screenshot', 'string')).toBeUndefined();
+  });
+
+  it('returns undefined when result is not ok', () => {
+    expect(extractScreenshotInfo('screenshot', { ok: false })).toBeUndefined();
+  });
+
+  it('returns undefined when result has no path', () => {
+    expect(
+      extractScreenshotInfo('screenshot', { ok: true, result: {} }),
+    ).toBeUndefined();
+  });
+
+  it('extracts screenshot path from result.path', () => {
+    expect(
+      extractScreenshotInfo('screenshot', {
+        ok: true,
+        result: { path: '/img.png' },
+      }),
+    ).toStrictEqual({ path: '/img.png' });
+  });
+
+  it('extracts screenshot path with dimensions', () => {
+    expect(
+      extractScreenshotInfo('screenshot', {
+        ok: true,
+        result: { path: '/img.png', width: 1280, height: 720 },
+      }),
+    ).toStrictEqual({
+      path: '/img.png',
+      dimensions: { width: 1280, height: 720 },
+    });
+  });
+
+  it('extracts screenshot from nested screenshot object', () => {
+    expect(
+      extractScreenshotInfo('describe_screen', {
+        ok: true,
+        result: { screenshot: { path: '/ss.png', width: 800, height: 600 } },
+      }),
+    ).toStrictEqual({
+      path: '/ss.png',
+      dimensions: { width: 800, height: 600 },
+    });
+  });
+
+  it('extracts nested screenshot without dimensions', () => {
+    expect(
+      extractScreenshotInfo('describe_screen', {
+        ok: true,
+        result: { screenshot: { path: '/ss.png' } },
+      }),
+    ).toStrictEqual({ path: '/ss.png' });
+  });
+
+  it('returns undefined when result.result is null', () => {
+    expect(
+      extractScreenshotInfo('screenshot', { ok: true, result: null }),
+    ).toBeUndefined();
+  });
+
+  it('returns undefined when nested screenshot has no path', () => {
+    expect(
+      extractScreenshotInfo('describe_screen', {
+        ok: true,
+        result: { screenshot: { width: 800 } },
+      }),
+    ).toBeUndefined();
+  });
+
+  it('returns undefined when nested screenshot is null', () => {
+    expect(
+      extractScreenshotInfo('describe_screen', {
+        ok: true,
+        result: { screenshot: null },
+      }),
+    ).toBeUndefined();
+  });
+});
+
+describe('extractToolOutcome', () => {
+  it('returns ok:true for non-object input', () => {
+    expect(extractToolOutcome(null)).toStrictEqual({ ok: true });
+    expect(extractToolOutcome('string')).toStrictEqual({ ok: true });
+  });
+
+  it('returns ok:true when ok not in result', () => {
+    expect(extractToolOutcome({ result: 'data' })).toStrictEqual({ ok: true });
+  });
+
+  it('returns ok:true for successful result', () => {
+    expect(extractToolOutcome({ ok: true, result: 'data' })).toStrictEqual({
+      ok: true,
+    });
+  });
+
+  it('returns ok:false with error for failed result', () => {
+    expect(
+      extractToolOutcome({
+        ok: false,
+        error: { code: 'ERR', message: 'fail' },
+      }),
+    ).toStrictEqual({
+      ok: false,
+      error: { code: 'ERR', message: 'fail' },
+    });
+  });
+
+  it('returns ok:false without error when no error field', () => {
+    expect(extractToolOutcome({ ok: false })).toStrictEqual({ ok: false });
+  });
+});
+
+describe('buildResponseBody', () => {
+  it('returns toolResult as-is for non-object', () => {
+    expect(buildResponseBody('string', undefined)).toBe('string');
+    expect(buildResponseBody(null, undefined)).toBeNull();
+  });
+
+  it('returns toolResult when no observations', () => {
+    const result = { ok: true, data: 'test' };
+    expect(buildResponseBody(result, undefined)).toStrictEqual(result);
+  });
+
+  it('merges observations into result', () => {
+    const result = { ok: true };
+    const obs = { state: {}, testIds: [], a11y: { nodes: [] } };
+    expect(buildResponseBody(result, obs as any)).toStrictEqual({
+      ok: true,
+      observations: obs,
+    });
+  });
+});
+
+describe('createServer integration', () => {
+  let server: ServerInstance;
+  let state: DaemonState;
+
+  beforeEach(async () => {
+    await fs.mkdir(tmpDir, { recursive: true });
+    exitSpy = vi.spyOn(process, 'exit').mockImplementation((() => {}) as never);
+
+    server = createServer(buildConfig());
+    state = await server.start();
+  });
+
+  afterEach(async () => {
+    await server.stop();
+    exitSpy.mockRestore();
+    await fs.rm(tmpDir, { recursive: true, force: true }).catch(() => {});
+  });
+
+  it('gET /health returns 200 with status and nonce', async () => {
+    const res = await httpRequest(`http://127.0.0.1:${state.port}/health`);
+    const body = (await res.json()) as { status: string; nonce: string };
+
+    expect(res.status).toBe(200);
+    expect(body.status).toBe('ok');
+    expect(body.nonce).toBe(state.nonce);
+  });
+
+  it('gET /status returns daemon info', async () => {
+    const res = await httpRequest(`http://127.0.0.1:${state.port}/status`);
+    const body = (await res.json()) as {
+      daemon: { pid: number; port: number };
+      ports: Record<string, number>;
+    };
+
+    expect(res.status).toBe(200);
+    expect(body.daemon.pid).toBe(process.pid);
+    expect(body.daemon.port).toBe(state.port);
+    expect(body.ports).toBeDefined();
+  });
+
+  it('pOST /launch delegates to session manager', async () => {
+    const res = await httpRequest(`http://127.0.0.1:${state.port}/launch`, {
+      method: 'POST',
+      headers: { 'Content-Type': 'application/json' },
+      body: JSON.stringify({ state: 'default' }),
+    });
+    const body = (await res.json()) as { ok: boolean };
+
+    expect(res.status).toBe(200);
+    expect(body.ok).toBe(true);
+  });
+
+  it('pOST /cleanup delegates to session manager', async () => {
+    const res = await httpRequest(`http://127.0.0.1:${state.port}/cleanup`, {
+      method: 'POST',
+      headers: { 'Content-Type': 'application/json' },
+      body: JSON.stringify({}),
+    });
+    const body = (await res.json()) as { ok: boolean };
+
+    expect(res.status).toBe(200);
+    expect(body.ok).toBe(true);
+  });
+
+  it('pOST /tool/nonexistent returns 404', async () => {
+    const res = await httpRequest(
+      `http://127.0.0.1:${state.port}/tool/nonexistent`,
+      {
+        method: 'POST',
+        headers: { 'Content-Type': 'application/json' },
+        body: JSON.stringify({}),
+      },
+    );
+    const body = (await res.json()) as {
+      ok: boolean;
+      error: { code: string };
+    };
+
+    expect(res.status).toBe(404);
+    expect(body.ok).toBe(false);
+    expect(body.error.code).toBe('TOOL_NOT_FOUND');
+  });
+
+  it('writes .mm-server on start', async () => {
+    const daemonState = await readDaemonState(tmpDir);
+    expect(daemonState).not.toBeNull();
+    expect(daemonState?.port).toBe(state.port);
+    expect(daemonState?.nonce).toBe(state.nonce);
+    expect(daemonState?.version).toBe(PACKAGE_VERSION);
+  });
+
+  it('passes workflow context to session manager on start', async () => {
+    await server.stop();
+
+    const workflowContext = { config: { environment: 'e2e' as const } };
+    const mockSM = createMockSessionManager();
+    const customServer = createServer(
+      buildConfig({
+        sessionManager: mockSM as unknown as ServerConfig['sessionManager'],
+        contextFactory: () =>
+          workflowContext as unknown as ReturnType<
+            ServerConfig['contextFactory']
+          >,
+      }),
+    );
+
+    await customServer.start();
+    expect(mockSM.setWorkflowContext).toHaveBeenCalledWith(workflowContext);
+    await customServer.stop();
+  });
+
+  it('removes .mm-server on stop', async () => {
+    await server.stop();
+    const daemonState = await readDaemonState(tmpDir);
+    expect(daemonState).toBeNull();
+  });
+
+  it('serializes concurrent launch requests through the queue', async () => {
+    const [res1, res2] = await Promise.all([
+      httpRequest(`http://127.0.0.1:${state.port}/launch`, {
+        method: 'POST',
+        headers: { 'Content-Type': 'application/json' },
+        body: JSON.stringify({}),
+      }),
+      httpRequest(`http://127.0.0.1:${state.port}/launch`, {
+        method: 'POST',
+        headers: { 'Content-Type': 'application/json' },
+        body: JSON.stringify({}),
+      }),
+    ]);
+
+    expect(res1.status).toBe(200);
+    expect(res2.status).toBe(200);
+  });
+
+  it('stop() is idempotent', async () => {
+    await server.stop();
+    expect(await server.stop()).toBeUndefined();
+  });
+
+  describe('POST /tool/:name input validation', () => {
+    it('returns 400 for missing required field', async () => {
+      const res = await httpRequest(
+        `http://127.0.0.1:${state.port}/tool/click`,
+        {
+          method: 'POST',
+          headers: { 'Content-Type': 'application/json' },
+          body: JSON.stringify({}),
+        },
+      );
+      const body = (await res.json()) as {
+        ok: boolean;
+        error: { code: string; message: string };
+      };
+
+      expect(res.status).toBe(400);
+      expect(body.ok).toBe(false);
+      expect(body.error.code).toBe('VALIDATION_ERROR');
+    });
+
+    it('returns 400 for invalid enum value', async () => {
+      const res = await httpRequest(
+        `http://127.0.0.1:${state.port}/tool/navigate`,
+        {
+          method: 'POST',
+          headers: { 'Content-Type': 'application/json' },
+          body: JSON.stringify({ screen: 'nonexistent' }),
+        },
+      );
+      const body = (await res.json()) as {
+        ok: boolean;
+        error: { code: string; message: string };
+      };
+
+      expect(res.status).toBe(400);
+      expect(body.ok).toBe(false);
+      expect(body.error.code).toBe('VALIDATION_ERROR');
+    });
+
+    it('returns 400 when cross-field refine fails', async () => {
+      const res = await httpRequest(
+        `http://127.0.0.1:${state.port}/tool/clipboard`,
+        {
+          method: 'POST',
+          headers: { 'Content-Type': 'application/json' },
+          body: JSON.stringify({ action: 'write' }),
+        },
+      );
+      const body = (await res.json()) as {
+        ok: boolean;
+        error: { code: string; message: string };
+      };
+
+      expect(res.status).toBe(400);
+      expect(body.ok).toBe(false);
+      expect(body.error.code).toBe('VALIDATION_ERROR');
+      expect(body.error.message).toContain(
+        "text is required when action is 'write'",
+      );
+    });
+
+    it('returns 400 for wrong field type', async () => {
+      const res = await httpRequest(
+        `http://127.0.0.1:${state.port}/tool/wait_for_notification`,
+        {
+          method: 'POST',
+          headers: { 'Content-Type': 'application/json' },
+          body: JSON.stringify({ timeoutMs: 'not-a-number' }),
+        },
+      );
+      const body = (await res.json()) as {
+        ok: boolean;
+        error: { code: string; message: string };
+      };
+
+      expect(res.status).toBe(400);
+      expect(body.ok).toBe(false);
+      expect(body.error.code).toBe('VALIDATION_ERROR');
+    });
+
+    it('passes validation for valid input (empty schema)', async () => {
+      const res = await httpRequest(
+        `http://127.0.0.1:${state.port}/tool/get_state`,
+        {
+          method: 'POST',
+          headers: { 'Content-Type': 'application/json' },
+          body: JSON.stringify({}),
+        },
+      );
+
+      expect(res.status).not.toBe(400);
+    });
+  });
+});
+
+describe('createServer with active session', () => {
+  let server: ServerInstance;
+  let state: DaemonState;
+  let mockSM: ReturnType<typeof createMockSessionManager>;
+
+  beforeEach(async () => {
+    await fs.mkdir(tmpDir, { recursive: true });
+    exitSpy = vi.spyOn(process, 'exit').mockImplementation((() => {}) as never);
+
+    mockSM = createMockSessionManager();
+    mockSM.hasActiveSession.mockReturnValue(true);
+    mockSM.getExtensionState.mockResolvedValue({
+      isLoaded: true,
+      currentUrl: 'chrome-extension://test/home.html',
+    });
+
+    server = createServer(
+      buildConfig({
+        sessionManager: mockSM as unknown as ServerConfig['sessionManager'],
+      }),
+    );
+    state = await server.start();
+  });
+
+  afterEach(async () => {
+    await server.stop();
+    exitSpy.mockRestore();
+    await fs.rm(tmpDir, { recursive: true, force: true }).catch(() => {});
+  });
+
+  it('collects observations and records knowledge for tool execution', async () => {
+    const res = await httpRequest(`http://127.0.0.1:${state.port}/cleanup`, {
+      method: 'POST',
+      headers: { 'Content-Type': 'application/json' },
+      body: JSON.stringify({}),
+    });
+    const body = (await res.json()) as { ok: boolean; observations?: unknown };
+
+    expect(res.status).toBe(200);
+    expect(body.ok).toBe(true);
+    expect(body.observations).toBeDefined();
+  });
+
+  it('records error step when tool execution throws', async () => {
+    mockSM.cleanup.mockRejectedValueOnce(new Error('Browser crash'));
+
+    const res = await httpRequest(`http://127.0.0.1:${state.port}/cleanup`, {
+      method: 'POST',
+      headers: { 'Content-Type': 'application/json' },
+      body: JSON.stringify({}),
+    });
+    const body = (await res.json()) as {
+      ok: boolean;
+      error: { code: string; message: string };
+    };
+
+    expect(res.status).toBe(500);
+    expect(body.ok).toBe(false);
+    expect(body.error.code).toBe('TOOL_EXECUTION_FAILED');
+    expect(body.error.message).toContain('Browser crash');
+  });
+
+  it('handles observation collection failure gracefully', async () => {
+    mockSM.getPage.mockImplementation(() => {
+      throw new Error('Page closed');
+    });
+
+    const res = await httpRequest(`http://127.0.0.1:${state.port}/cleanup`, {
+      method: 'POST',
+      headers: { 'Content-Type': 'application/json' },
+      body: JSON.stringify({}),
+    });
+    const body = (await res.json()) as { ok: boolean };
+
+    expect(res.status).toBe(200);
+    expect(body.ok).toBe(true);
+  });
+
+  it('records step with environment context', async () => {
+    const res = await httpRequest(
+      `http://127.0.0.1:${state.port}/tool/get_state`,
+      {
+        method: 'POST',
+        headers: { 'Content-Type': 'application/json' },
+        body: JSON.stringify({}),
+      },
+    );
+
+    expect(res.status).toBe(200);
+  });
+});
+
+describe('createServer with logging', () => {
+  let server: ServerInstance;
+  let state: DaemonState;
+
+  beforeEach(async () => {
+    await fs.mkdir(tmpDir, { recursive: true });
+    exitSpy = vi.spyOn(process, 'exit').mockImplementation((() => {}) as never);
+
+    server = createServer(
+      buildConfig({ logFilePath: path.join(tmpDir, 'daemon.log') }),
+    );
+    state = await server.start();
+  });
+
+  afterEach(async () => {
+    await server.stop();
+    exitSpy.mockRestore();
+    await fs.rm(tmpDir, { recursive: true, force: true }).catch(() => {});
+  });
+
+  it('writes request logs to file', async () => {
+    await httpRequest(`http://127.0.0.1:${state.port}/health`);
+    await new Promise((resolve) => setTimeout(resolve, 100));
+    const logContent = await fs
+      .readFile(path.join(tmpDir, 'daemon.log'), 'utf-8')
+      .catch(() => '');
+    expect(logContent).toContain('/health');
+  });
+});
diff --git a/src/server/create-server.ts b/src/server/create-server.ts
new file mode 100644
index 0000000..bd51d13
--- /dev/null
+++ b/src/server/create-server.ts
@@ -0,0 +1,648 @@
+import express from 'express';
+import { execSync } from 'node:child_process';
+import { randomUUID } from 'node:crypto';
+import * as fs from 'node:fs/promises';
+import * as http from 'node:http';
+
+import { writeDaemonState, removeDaemonState } from './daemon-state.js';
+import { allocatePort } from './port-allocator.js';
+import { RequestQueue } from './request-queue.js';
+import type { WorkflowContext } from '../capabilities/context.js';
+import type { ExtensionState } from '../capabilities/types.js';
+import {
+  KnowledgeStore,
+  createDefaultObservation,
+} from '../knowledge-store/knowledge-store.js';
+import { toolRegistry } from '../tools/registry.js';
+import type {
+  StepRecordObservation,
+  StepRecordOutcome,
+  StepRecordTool,
+} from '../tools/types/step-record.js';
+import { OBSERVATION_TESTID_LIMIT } from '../tools/utils/constants.js';
+import {
+  collectTestIds,
+  collectTrimmedA11ySnapshot,
+} from '../tools/utils/discovery.js';
+import type { DaemonState, ServerConfig, ToolContext } from '../types/http.js';
+import { extractErrorMessage } from '../utils/errors.js';
+import type { ToolName } from '../validation/schemas.js';
+import { toolSchemas } from '../validation/schemas.js';
+import { PACKAGE_VERSION } from '../version.js';
+
+/**
+ * Extracts target selection fields from a tool's validated input.
+ * Interaction tools (click, type, wait_for) include a11yRef, testId, or selector.
+ *
+ * @param input - The validated tool input.
+ * @returns The target info for knowledge recording, or undefined if not applicable.
+ */
+export function extractTargetFromInput(
+  input: unknown,
+): StepRecordTool['target'] | undefined {
+  if (typeof input !== 'object' || input === null) {
+    return undefined;
+  }
+  const obj = input as Record<string, unknown>;
+  const a11yRef = typeof obj.a11yRef === 'string' ? obj.a11yRef : undefined;
+  const testId = typeof obj.testId === 'string' ? obj.testId : undefined;
+  const selector = typeof obj.selector === 'string' ? obj.selector : undefined;
+  if (!a11yRef && !testId && !selector) {
+    return undefined;
+  }
+  return { a11yRef, testId, selector };
+}
+
+/**
+ * Extracts screenshot artifact metadata from a successful tool result.
+ * Applies to `screenshot` and `describe_screen` tools.
+ *
+ * @param toolName - The name of the tool that produced the result.
+ * @param toolResult - The raw result from the tool execution.
+ * @returns Screenshot path and dimensions, or undefined if not applicable.
+ */
+export function extractScreenshotInfo(
+  toolName: string,
+  toolResult: unknown,
+):
+  | { path: string; dimensions?: { width: number; height: number } }
+  | undefined {
+  if (toolName !== 'screenshot' && toolName !== 'describe_screen') {
+    return undefined;
+  }
+  if (typeof toolResult !== 'object' || toolResult === null) {
+    return undefined;
+  }
+  const result = toolResult as Record<string, unknown>;
+  if (
+    !result.ok ||
+    typeof result.result !== 'object' ||
+    result.result === null
+  ) {
+    return undefined;
+  }
+  const data = result.result as Record<string, unknown>;
+
+  if (typeof data.path === 'string') {
+    return {
+      path: data.path,
+      ...(typeof data.width === 'number' && typeof data.height === 'number'
+        ? { dimensions: { width: data.width, height: data.height } }
+        : {}),
+    };
+  }
+
+  if (typeof data.screenshot === 'object' && data.screenshot !== null) {
+    const ss = data.screenshot as Record<string, unknown>;
+    if (typeof ss.path === 'string') {
+      return {
+        path: ss.path,
+        ...(typeof ss.width === 'number' && typeof ss.height === 'number'
+          ? { dimensions: { width: ss.width, height: ss.height } }
+          : {}),
+      };
+    }
+  }
+
+  return undefined;
+}
+
+export type ServerInstance = {
+  start(): Promise<DaemonState>;
+  stop(): Promise<void>;
+};
+
+/**
+ * Extracts a structured outcome from a raw tool result for knowledge recording.
+ *
+ * @param toolResult - The raw result returned by a tool function.
+ * @returns A normalized outcome with ok status and optional error details.
+ */
+export function extractToolOutcome(toolResult: unknown): {
+  ok: boolean;
+  error?: { code: string; message: string };
+} {
+  if (
+    typeof toolResult !== 'object' ||
+    toolResult === null ||
+    !('ok' in toolResult)
+  ) {
+    return { ok: true };
+  }
+
+  const typed = toolResult as {
+    ok: boolean;
+    error?: { code: string; message: string };
+  };
+  if (typed.ok) {
+    return { ok: true };
+  }
+
+  return typed.error ? { ok: false, error: typed.error } : { ok: false };
+}
+
+/**
+ * Merges a tool result with observation data into the HTTP response body.
+ *
+ * @param toolResult - The raw result returned by a tool function.
+ * @param observations - Optional observation snapshot to attach.
+ * @returns The response body suitable for res.json().
+ */
+export function buildResponseBody(
+  toolResult: unknown,
+  observations: StepRecordObservation | undefined,
+): unknown {
+  if (typeof toolResult !== 'object' || toolResult === null) {
+    return toolResult;
+  }
+
+  if (!observations) {
+    return toolResult;
+  }
+
+  return { ...(toolResult as Record<string, unknown>), observations };
+}
+
+/**
+ * Creates an HTTP daemon server for agent-driven browser testing.
+ *
+ * @param config - The server configuration options.
+ * @returns The server instance with start and stop methods.
+ */
+export function createServer(config: ServerConfig): ServerInstance {
+  const app = express();
+  const queue = new RequestQueue(config.requestTimeoutMs);
+  const nonce = randomUUID();
+  const knowledgeStore = new KnowledgeStore();
+
+  let httpServer: http.Server | null = null;
+  let worktreeRoot = '';
+  let startedAt = '';
+  let daemonPort = 0;
+  let workflowContext: WorkflowContext | null = null;
+  let subPorts: { anvil: number; fixture: number; mock: number } = {
+    anvil: 0,
+    fixture: 0,
+    mock: 0,
+  };
+  let shuttingDown = false;
+  let shutdownHandler: (() => void) | null = null;
+  let lastRequestTime = Date.now();
+  let idleCheckInterval: ReturnType<typeof setInterval> | null = null;
+
+  // eslint-disable-next-line import-x/no-named-as-default-member
+  app.use(express.json({ limit: '10mb' }));
+
+  app.use((req, res, next) => {
+    lastRequestTime = Date.now();
+    const requestStartedAt = lastRequestTime;
+    res.on('finish', () => {
+      const duration = Date.now() - requestStartedAt;
+      appendLog(
+        config.logFilePath,
+        `[INFO] ${req.method} ${req.path} ${res.statusCode} ${duration}ms`,
+      );
+    });
+    next();
+  });
+
+  app.get('/health', (_req, res) => {
+    res.json({ status: 'ok', nonce });
+  });
+
+  app.get('/status', (_req, res) => {
+    res.json({
+      daemon: {
+        pid: process.pid,
+        port: daemonPort,
+        uptime: process.uptime(),
+        startedAt,
+      },
+      ports: subPorts,
+    });
+  });
+
+  /**
+   * Builds a lazy ToolContext where `page` and `refMap` are only accessed
+   * when a tool actually reads them, avoiding throws for non-session tools.
+   *
+   * @param wfCtx - The current workflow context to embed in the tool context.
+   * @returns A ToolContext with lazy page and refMap accessors.
+   */
+  function buildToolContext(wfCtx: WorkflowContext): ToolContext {
+    return {
+      sessionManager: config.sessionManager,
+      get page(): ReturnType<typeof config.sessionManager.getPage> {
+        return config.sessionManager.getPage();
+      },
+      get refMap(): Map<string, string> {
+        return config.sessionManager.hasActiveSession()
+          ? config.sessionManager.getRefMap()
+          : new Map<string, string>();
+      },
+      workflowContext: wfCtx,
+      knowledgeStore,
+      toolRegistry,
+    };
+  }
+
+  /**
+   * Records a tool execution step to the knowledge store.
+   * Failures are silently caught — recording must never block tool responses.
+   *
+   * @param toolName - The registered tool name.
+   * @param validatedInput - The validated input payload.
+   * @param outcome - The tool execution outcome.
+   * @param observation - The post-execution observation snapshot.
+   * @param toolResult - The raw tool result (for screenshot extraction).
+   * @param startTime - The epoch timestamp when execution started.
+   */
+  async function recordToolStep(
+    toolName: string,
+    validatedInput: unknown,
+    outcome: StepRecordOutcome,
+    observation: StepRecordObservation | undefined,
+    toolResult: unknown,
+    startTime: number,
+  ): Promise<void> {
+    try {
+      const sessionId = config.sessionManager.getSessionId();
+      if (!sessionId) {
+        return;
+      }
+
+      const target = extractTargetFromInput(validatedInput);
+      const screenshotInfo = extractScreenshotInfo(toolName, toolResult);
+
+      let executionContext: 'e2e' | 'prod' | undefined;
+      try {
+        executionContext = config.sessionManager.getEnvironmentMode();
+      } catch {
+        // session manager may not support environment mode
+      }
+
+      await knowledgeStore.recordStep({
+        sessionId,
+        toolName,
+        input: validatedInput as Record<string, unknown>,
+        target,
+        outcome,
+        observation:
+          observation ?? createDefaultObservation({} as ExtensionState),
+        durationMs: Date.now() - startTime,
+        ...(screenshotInfo ? { screenshotPath: screenshotInfo.path } : {}),
+        ...(screenshotInfo?.dimensions
+          ? { screenshotDimensions: screenshotInfo.dimensions }
+          : {}),
+        context: executionContext,
+      });
+    } catch {
+      // non-fatal: recording failure must not block tool responses
+    }
+  }
+
+  /**
+   * Shared tool executor — validates input, runs through the queue,
+   * records knowledge steps, and collects observations.
+   *
+   * @param toolName - The registered tool name to execute.
+   * @param rawInput - The unvalidated input payload from the request body.
+   * @param res - The Express response object to write the result to.
+   */
+  async function executeTool(
+    toolName: string,
+    rawInput: unknown,
+    res: express.Response,
+  ): Promise<void> {
+    const tool = toolRegistry.get(toolName);
+    if (!tool) {
+      res.status(404).json({
+        ok: false,
+        error: { code: 'TOOL_NOT_FOUND', message: `Unknown tool: ${toolName}` },
+      });
+      return;
+    }
+
+    if (!workflowContext) {
+      res.status(503).json({
+        ok: false,
+        error: {
+          code: 'SERVER_NOT_STARTED',
+          message: 'Server has not been started yet.',
+        },
+      });
+      return;
+    }
+
+    const schema =
+      toolName in toolSchemas ? toolSchemas[toolName as ToolName] : undefined;
+    let validatedInput = rawInput;
+
+    if (schema) {
+      const parsed = schema.safeParse(rawInput);
+      if (!parsed.success) {
+        res.status(400).json({
+          ok: false,
+          error: {
+            code: 'VALIDATION_ERROR',
+            message: parsed.error.issues
+              .map((i) =>
+                i.path.length > 0
+                  ? `${i.path.join('.')}: ${i.message}`
+                  : i.message,
+              )
+              .join('; '),
+          },
+        });
+        return;
+      }
+      validatedInput = parsed.data;
+    }
+
+    const startTime = Date.now();
+    const currentWorkflowContext = workflowContext;
+
+    try {
+      const { toolResult, observations } = await queue.enqueue(async () => {
+        const context = buildToolContext(currentWorkflowContext);
+        const result = await tool(validatedInput, context);
+
+        let obs: StepRecordObservation | undefined;
+        if (config.sessionManager.hasActiveSession()) {
+          try {
+            const page = config.sessionManager.getPage();
+            const state = await config.sessionManager.getExtensionState();
+            const testIds = await collectTestIds(
+              page,
+              OBSERVATION_TESTID_LIMIT,
+            );
+            const { nodes, refMap: newRefMap } =
+              await collectTrimmedA11ySnapshot(page);
+            config.sessionManager.setRefMap(newRefMap);
+            obs = createDefaultObservation(state, testIds, nodes);
+          } catch {
+            // non-fatal: observation failure must not block the tool response
+          }
+        }
+
+        return { toolResult: result, observations: obs };
+      });
+
+      await recordToolStep(
+        toolName,
+        validatedInput,
+        extractToolOutcome(toolResult),
+        observations,
+        toolResult,
+        startTime,
+      );
+
+      res.json(buildResponseBody(toolResult, observations));
+    } catch (error) {
+      await recordToolStep(
+        toolName,
+        validatedInput,
+        {
+          ok: false,
+          error: {
+            code: 'TOOL_EXECUTION_FAILED',
+            message: extractErrorMessage(error),
+          },
+        },
+        undefined,
+        undefined,
+        startTime,
+      );
+
+      res.status(500).json({
+        ok: false,
+        error: {
+          code: 'TOOL_EXECUTION_FAILED',
+          message: extractErrorMessage(error),
+        },
+      });
+    }
+  }
+
+  app.post('/launch', async (req, res) => {
+    await executeTool('launch', req.body, res);
+  });
+
+  app.post('/cleanup', async (_req, res) => {
+    await executeTool('cleanup', {}, res);
+  });
+
+  app.post(
+    '/tool/:name',
+    async (req: express.Request<{ name: string }>, res) => {
+      await executeTool(req.params.name, req.body, res);
+    },
+  );
+
+  app.use(
+    (
+      error: Error,
+      _req: express.Request,
+      res: express.Response,
+      _next: express.NextFunction,
+    ) => {
+      appendLog(config.logFilePath, `[ERROR] ${error.message}`);
+      res.status(500).json({
+        ok: false,
+        error: {
+          code: 'INTERNAL_ERROR',
+          message: error.message,
+        },
+      });
+    },
+  );
+
+  const instance: ServerInstance = {
+    async start(): Promise<DaemonState> {
+      worktreeRoot = execSync('git rev-parse --show-toplevel', {
+        cwd: process.cwd(),
+      })
+        .toString()
+        .trim();
+
+      // Allocate sub-ports for external services (anvil, fixture, mock).
+      // These use allocate-then-close because the external services bind
+      // their own sockets — a small TOCTOU window is acceptable here.
+      const [anvilAlloc, fixtureAlloc, mockAlloc] = await Promise.all([
+        allocatePort(),
+        allocatePort(),
+        allocatePort(),
+      ]);
+
+      subPorts = {
+        anvil: anvilAlloc.port,
+        fixture: fixtureAlloc.port,
+        mock: mockAlloc.port,
+      };
+
+      await Promise.all([
+        new Promise<void>((resolve) =>
+          anvilAlloc.server.close(() => resolve()),
+        ),
+        new Promise<void>((resolve) =>
+          fixtureAlloc.server.close(() => resolve()),
+        ),
+        new Promise<void>((resolve) => mockAlloc.server.close(() => resolve())),
+      ]);
+
+      workflowContext = config.contextFactory({ ports: subPorts });
+      config.sessionManager.setWorkflowContext(workflowContext);
+      startedAt = new Date().toISOString();
+
+      // Bind daemon directly to port 0 to eliminate TOCTOU race —
+      // the OS assigns the port atomically at listen time.
+      httpServer = await new Promise<http.Server>((resolve, reject) => {
+        const srv = http.createServer(app);
+        srv.listen(0, '127.0.0.1', () => {
+          const addr = srv.address();
+          if (addr && typeof addr !== 'string') {
+            daemonPort = addr.port;
+          }
+          resolve(srv);
+        });
+        srv.on('error', reject);
+      });
+
+      const state: DaemonState = {
+        port: daemonPort,
+        pid: process.pid,
+        startedAt,
+        nonce,
+        version: PACKAGE_VERSION,
+        subPorts,
+      };
+
+      await writeDaemonState(worktreeRoot, state);
+      appendLog(
+        config.logFilePath,
+        `[INFO] Daemon started on port ${daemonPort} (pid ${process.pid})`,
+      );
+
+      shutdownHandler = (): void => {
+        instance
+          .stop()
+          .then(() => process.exit(0))
+          .catch((error: Error) => {
+            appendLog(
+              config.logFilePath,
+              `[ERROR] Daemon failed to shut down: ${error.message}`,
+            );
+            process.exit(1);
+          });
+      };
+
+      process.on('SIGTERM', shutdownHandler);
+      process.on('SIGINT', shutdownHandler);
+
+      const { idleShutdownMs } = config;
+      if (idleShutdownMs && idleShutdownMs > 0) {
+        const checkMs = Math.min(idleShutdownMs / 10, 60_000);
+        idleCheckInterval = setInterval(() => {
+          if (Date.now() - lastRequestTime > idleShutdownMs) {
+            appendLog(
+              config.logFilePath,
+              '[INFO] Idle timeout reached, shutting down',
+            );
+            if (idleCheckInterval) {
+              clearInterval(idleCheckInterval);
+              idleCheckInterval = null;
+            }
+            shutdownHandler?.();
+          }
+        }, checkMs);
+        idleCheckInterval.unref();
+      }
+
+      return state;
+    },
+
+    async stop(): Promise<void> {
+      if (shuttingDown) {
+        return;
+      }
+      shuttingDown = true;
+
+      appendLog(config.logFilePath, '[INFO] Daemon shutting down');
+
+      // 1. Remove signal handlers
+      if (shutdownHandler) {
+        process.removeListener('SIGTERM', shutdownHandler);
+        process.removeListener('SIGINT', shutdownHandler);
+        shutdownHandler = null;
+      }
+
+      // 2. Clear idle check interval
+      if (idleCheckInterval) {
+        clearInterval(idleCheckInterval);
+        idleCheckInterval = null;
+      }
+
+      // 3. Stop accepting new connections, wait for in-flight (max 10s)
+      await new Promise<void>((resolve) => {
+        if (!httpServer) {
+          resolve();
+          return;
+        }
+
+        const forceClose = setTimeout(() => {
+          httpServer?.closeAllConnections();
+          resolve();
+        }, 10_000);
+
+        httpServer.close(() => {
+          clearTimeout(forceClose);
+          httpServer = null;
+          resolve();
+        });
+      });
+
+      // 4. Clean up session
+      try {
+        await config.sessionManager.cleanup();
+      } catch (error) {
+        appendLog(
+          config.logFilePath,
+          `[ERROR] Cleanup failed: ${extractErrorMessage(error)}`,
+          true,
+        );
+      }
+
+      // 5. Remove .mm-server file
+      if (worktreeRoot) {
+        await removeDaemonState(worktreeRoot);
+      }
+
+      appendLog(config.logFilePath, '[INFO] Daemon stopped');
+    },
+  };
+
+  return instance;
+}
+
+/**
+ * Appends a timestamped line to the daemon log file.
+ *
+ * @param logFilePath - Path to the log file, or undefined to skip file logging.
+ * @param message - The log message to append.
+ * @param fatal - Whether to also write to stderr.
+ */
+function appendLog(
+  logFilePath: string | undefined,
+  message: string,
+  fatal = false,
+): void {
+  const line = `[${new Date().toISOString()}] ${message}\n`;
+  if (fatal) {
+    process.stderr.write(line);
+  }
+  if (logFilePath) {
+    fs.appendFile(logFilePath, line, 'utf-8').catch((error) => {
+      process.stderr.write(`Failed to write log: ${error.message}\n`);
+    });
+  }
+}
diff --git a/src/server/daemon-state.test.ts b/src/server/daemon-state.test.ts
new file mode 100644
index 0000000..43c4cec
--- /dev/null
+++ b/src/server/daemon-state.test.ts
@@ -0,0 +1,232 @@
+/* eslint-disable n/no-unsupported-features/node-builtins */
+import * as fs from 'node:fs/promises';
+import * as os from 'node:os';
+import * as path from 'node:path';
+import { describe, it, expect, beforeEach, afterEach, vi } from 'vitest';
+
+import {
+  writeDaemonState,
+  readDaemonState,
+  removeDaemonState,
+  acquireStartupLock,
+  releaseStartupLock,
+  isDaemonAlive,
+  isDaemonVersionMatch,
+  generateNonce,
+} from './daemon-state.js';
+import type { DaemonState } from '../types/http.js';
+import { PACKAGE_VERSION } from '../version.js';
+
+const tmpDir = path.join(os.tmpdir(), `mm-daemon-state-test-${Date.now()}`);
+
+const mockState: DaemonState = {
+  port: 12345,
+  pid: process.pid,
+  startedAt: new Date().toISOString(),
+  nonce: 'test-nonce-abc',
+  version: PACKAGE_VERSION,
+  subPorts: { anvil: 8545, fixture: 8546, mock: 8547 },
+};
+
+describe('daemon-state', () => {
+  beforeEach(async () => {
+    await fs.mkdir(tmpDir, { recursive: true });
+  });
+
+  afterEach(async () => {
+    vi.restoreAllMocks();
+    await fs.rm(tmpDir, { recursive: true, force: true });
+  });
+
+  describe('writeDaemonState / readDaemonState', () => {
+    it('writes and reads state atomically', async () => {
+      await writeDaemonState(tmpDir, mockState);
+      const read = await readDaemonState(tmpDir);
+      expect(read).toStrictEqual(mockState);
+    });
+
+    it('overwrites existing state', async () => {
+      await writeDaemonState(tmpDir, mockState);
+      const updated: DaemonState = { ...mockState, port: 99999 };
+      await writeDaemonState(tmpDir, updated);
+      const read = await readDaemonState(tmpDir);
+      expect(read?.port).toBe(99999);
+    });
+  });
+
+  describe('readDaemonState', () => {
+    it('returns null when file does not exist', async () => {
+      const result = await readDaemonState(tmpDir);
+      expect(result).toBeNull();
+    });
+
+    it('returns null for invalid JSON', async () => {
+      await fs.writeFile(path.join(tmpDir, '.mm-server'), 'not-json', 'utf-8');
+      const result = await readDaemonState(tmpDir);
+      expect(result).toBeNull();
+    });
+  });
+
+  describe('removeDaemonState', () => {
+    it('removes the state file', async () => {
+      await writeDaemonState(tmpDir, mockState);
+      await removeDaemonState(tmpDir);
+      const result = await readDaemonState(tmpDir);
+      expect(result).toBeNull();
+    });
+
+    it('does not throw when file does not exist', async () => {
+      expect(await removeDaemonState(tmpDir)).toBeUndefined();
+    });
+  });
+
+  describe('isDaemonAlive', () => {
+    it('returns false for an unreachable port', async () => {
+      const alive = await isDaemonAlive({ ...mockState, port: 1 });
+      expect(alive).toBe(false);
+    });
+
+    it('returns false when response.ok is false', async () => {
+      vi.spyOn(globalThis, 'fetch').mockResolvedValue({
+        ok: false,
+      } as Response);
+
+      const alive = await isDaemonAlive(mockState);
+
+      expect(alive).toBe(false);
+    });
+
+    it('returns false when nonce does not match', async () => {
+      vi.spyOn(globalThis, 'fetch').mockResolvedValue({
+        ok: true,
+        json: vi.fn().mockResolvedValue({ nonce: 'different-nonce' }),
+      } as unknown as Response);
+
+      const alive = await isDaemonAlive({
+        ...mockState,
+        nonce: 'expected-nonce',
+      });
+
+      expect(alive).toBe(false);
+    });
+  });
+
+  describe('acquireStartupLock / releaseStartupLock', () => {
+    it('creates the lock file and writes the current pid', async () => {
+      const acquired = await acquireStartupLock(tmpDir);
+
+      expect(acquired).toBe(true);
+      expect(
+        await fs.readFile(path.join(tmpDir, '.mm-server.lock'), 'utf-8'),
+      ).toBe(`${process.pid}\n`);
+    });
+
+    it('returns false when another process holds a fresh lock', async () => {
+      await fs.writeFile(
+        path.join(tmpDir, '.mm-server.lock'),
+        `${process.pid}\n`,
+      );
+
+      const acquired = await acquireStartupLock(tmpDir);
+
+      expect(acquired).toBe(false);
+    });
+
+    it('reclaims a stale lock by age', async () => {
+      const lockPath = path.join(tmpDir, '.mm-server.lock');
+      const staleTime = new Date(Date.now() - 31_000);
+
+      await fs.writeFile(lockPath, `${process.pid}\n`);
+      await fs.utimes(lockPath, staleTime, staleTime);
+
+      const acquired = await acquireStartupLock(tmpDir);
+
+      expect(acquired).toBe(true);
+      expect(await fs.readFile(lockPath, 'utf-8')).toBe(`${process.pid}\n`);
+    });
+
+    it('reclaims a stale lock for a dead pid', async () => {
+      const lockPath = path.join(tmpDir, '.mm-server.lock');
+
+      await fs.writeFile(lockPath, '999999\n');
+
+      const acquired = await acquireStartupLock(tmpDir);
+
+      expect(acquired).toBe(true);
+      expect(await fs.readFile(lockPath, 'utf-8')).toBe(`${process.pid}\n`);
+    });
+
+    it('returns false when stale lock check errors', async () => {
+      await fs.writeFile(path.join(tmpDir, '.mm-server.lock'), '12345\n');
+      await fs.chmod(path.join(tmpDir, '.mm-server.lock'), 0o000);
+
+      const acquired = await acquireStartupLock(tmpDir);
+
+      expect(acquired).toBe(false);
+    });
+
+    it('throws when lock creation fails with a non-EEXIST error', async () => {
+      await fs.rm(tmpDir, { recursive: true, force: true });
+
+      await expect(acquireStartupLock(tmpDir)).rejects.toMatchObject({
+        code: 'ENOENT',
+      });
+    });
+
+    it('removes the lock file', async () => {
+      const lockPath = path.join(tmpDir, '.mm-server.lock');
+
+      await fs.writeFile(lockPath, `${process.pid}\n`);
+      await releaseStartupLock(tmpDir);
+
+      await expect(fs.access(lockPath)).rejects.toMatchObject({
+        code: 'ENOENT',
+      });
+    });
+
+    it('ignores ENOENT when releasing the lock', async () => {
+      expect(await releaseStartupLock(tmpDir)).toBeUndefined();
+    });
+
+    it('throws when lock release fails with a non-ENOENT error', async () => {
+      await fs.mkdir(path.join(tmpDir, '.mm-server.lock'));
+
+      await expect(releaseStartupLock(tmpDir)).rejects.toMatchObject({
+        code: 'EPERM',
+      });
+    });
+  });
+
+  describe('isDaemonVersionMatch', () => {
+    it('returns true when version matches PACKAGE_VERSION', () => {
+      expect(isDaemonVersionMatch(mockState)).toBe(true);
+    });
+
+    it('returns false when version differs', () => {
+      expect(isDaemonVersionMatch({ ...mockState, version: '0.0.0' })).toBe(
+        false,
+      );
+    });
+
+    it('returns false when version is absent (pre-version-tracking daemon)', () => {
+      const { version: _, ...stateWithoutVersion } = mockState;
+      expect(isDaemonVersionMatch(stateWithoutVersion as DaemonState)).toBe(
+        false,
+      );
+    });
+  });
+
+  describe('generateNonce', () => {
+    it('returns a non-empty string', () => {
+      const nonce = generateNonce();
+      expect(typeof nonce).toBe('string');
+      expect(nonce.length).toBeGreaterThan(0);
+    });
+
+    it('returns unique values on successive calls', () => {
+      const a = generateNonce();
+      const b = generateNonce();
+      expect(a).not.toBe(b);
+    });
+  });
+});
diff --git a/src/server/daemon-state.ts b/src/server/daemon-state.ts
new file mode 100644
index 0000000..c4e7285
--- /dev/null
+++ b/src/server/daemon-state.ts
@@ -0,0 +1,209 @@
+import { randomUUID } from 'node:crypto';
+import { constants } from 'node:fs';
+import * as fs from 'node:fs/promises';
+import * as path from 'node:path';
+
+import type { DaemonState } from '../types/http.js';
+import { PACKAGE_VERSION } from '../version.js';
+
+const DAEMON_STATE_FILE = '.mm-server';
+const DAEMON_STATE_TMP_FILE = '.mm-server.tmp';
+const DAEMON_LOCK_FILE = '.mm-server.lock';
+const LOCK_STALE_MS = 30_000;
+
+/**
+ * Writes daemon state atomically using rename pattern.
+ * Writes to .mm-server.tmp first, then renames to .mm-server.
+ *
+ * @param worktreeRoot - Absolute path to the git worktree root.
+ * @param state - The daemon state to persist.
+ */
+export async function writeDaemonState(
+  worktreeRoot: string,
+  state: DaemonState,
+): Promise<void> {
+  const tmpPath = path.join(worktreeRoot, DAEMON_STATE_TMP_FILE);
+  const finalPath = path.join(worktreeRoot, DAEMON_STATE_FILE);
+  await fs.writeFile(tmpPath, JSON.stringify(state, null, 2), 'utf-8');
+  await fs.rename(tmpPath, finalPath);
+}
+
+/**
+ * Reads daemon state from .mm-server file.
+ * Returns null if file doesn't exist, JSON is invalid, or required fields are missing.
+ *
+ * @param worktreeRoot - Absolute path to the git worktree root.
+ * @returns The parsed daemon state, or null if unavailable.
+ */
+export async function readDaemonState(
+  worktreeRoot: string,
+): Promise<DaemonState | null> {
+  const filePath = path.join(worktreeRoot, DAEMON_STATE_FILE);
+  try {
+    const content = await fs.readFile(filePath, 'utf-8');
+    const parsed = JSON.parse(content) as Record<string, unknown>;
+    if (
+      typeof parsed.port !== 'number' ||
+      typeof parsed.pid !== 'number' ||
+      typeof parsed.nonce !== 'string' ||
+      typeof parsed.startedAt !== 'string'
+    ) {
+      return null;
+    }
+    return parsed as DaemonState;
+  } catch {
+    return null;
+  }
+}
+
+/**
+ * Removes the .mm-server file.
+ * Silently ignores if file doesn't exist.
+ *
+ * @param worktreeRoot - Absolute path to the git worktree root.
+ */
+export async function removeDaemonState(worktreeRoot: string): Promise<void> {
+  const filePath = path.join(worktreeRoot, DAEMON_STATE_FILE);
+  try {
+    await fs.unlink(filePath);
+  } catch (error) {
+    if ((error as NodeJS.ErrnoException).code !== 'ENOENT') {
+      throw error;
+    }
+  }
+}
+
+/**
+ * Checks if a daemon is alive by sending GET /health and verifying the nonce.
+ * Returns false if connection refused, timeout, or nonce mismatch.
+ *
+ * @param state - The daemon state containing port and nonce to verify.
+ * @returns Whether the daemon is responding and matches the expected nonce.
+ */
+export async function isDaemonAlive(state: DaemonState): Promise<boolean> {
+  try {
+    const controller = new AbortController();
+    const timeout = setTimeout(() => controller.abort(), 2000);
+    try {
+      const response = await fetch(`http://127.0.0.1:${state.port}/health`, {
+        signal: controller.signal,
+      });
+      if (!response.ok) {
+        return false;
+      }
+      const body = (await response.json()) as { nonce?: string };
+      return body.nonce === state.nonce;
+    } finally {
+      clearTimeout(timeout);
+    }
+  } catch {
+    return false;
+  }
+}
+
+/**
+ * Checks whether the daemon's package version matches the current CLI version.
+ * Returns false if the daemon state has no version (pre-version-tracking daemon).
+ *
+ * @param state - The daemon state to check.
+ * @returns Whether the versions match.
+ */
+export function isDaemonVersionMatch(state: DaemonState): boolean {
+  return state.version === PACKAGE_VERSION;
+}
+
+/**
+ * Generates a new random nonce for daemon identification.
+ *
+ * @returns A UUID string.
+ */
+export function generateNonce(): string {
+  return randomUUID();
+}
+
+/**
+ * Acquires an exclusive startup lock for the worktree.
+ * Uses O_CREAT | O_EXCL to atomically create the lock file — if it already
+ * exists, checks whether the lock is stale (dead PID or older than 30s)
+ * and reclaims it if so.
+ *
+ * @param worktreeRoot - Absolute path to the git worktree root.
+ * @returns true if the lock was acquired, false if another process holds it.
+ */
+export async function acquireStartupLock(
+  worktreeRoot: string,
+): Promise<boolean> {
+  const lockPath = path.join(worktreeRoot, DAEMON_LOCK_FILE);
+  try {
+    // eslint-disable-next-line no-bitwise
+    const flags = constants.O_CREAT | constants.O_EXCL | constants.O_WRONLY;
+    const fd = await fs.open(lockPath, flags);
+    await fd.write(`${process.pid}\n`);
+    await fd.close();
+    return true;
+  } catch (error) {
+    if ((error as NodeJS.ErrnoException).code === 'EEXIST') {
+      if (await isLockStale(lockPath)) {
+        try {
+          await fs.unlink(lockPath);
+        } catch {
+          return false;
+        }
+        return acquireStartupLock(worktreeRoot);
+      }
+      return false;
+    }
+    throw error;
+  }
+}
+
+/**
+ * Checks whether a lock file is stale by examining PID liveness and file age.
+ *
+ * @param lockPath - Absolute path to the lock file.
+ * @returns true if the lock holder is dead or the file is older than LOCK_STALE_MS.
+ */
+async function isLockStale(lockPath: string): Promise<boolean> {
+  try {
+    const [content, stat] = await Promise.all([
+      fs.readFile(lockPath, 'utf-8'),
+      fs.stat(lockPath),
+    ]);
+
+    const ageMs = Date.now() - stat.mtimeMs;
+    if (ageMs > LOCK_STALE_MS) {
+      return true;
+    }
+
+    const pid = parseInt(content.trim(), 10);
+    if (!isNaN(pid)) {
+      try {
+        process.kill(pid, 0);
+        return false;
+      } catch {
+        return true;
+      }
+    }
+
+    return false;
+  } catch {
+    return false;
+  }
+}
+
+/**
+ * Releases the startup lock for the worktree.
+ * Silently ignores if the lock file doesn't exist.
+ *
+ * @param worktreeRoot - Absolute path to the git worktree root.
+ */
+export async function releaseStartupLock(worktreeRoot: string): Promise<void> {
+  const lockPath = path.join(worktreeRoot, DAEMON_LOCK_FILE);
+  try {
+    await fs.unlink(lockPath);
+  } catch (error) {
+    if ((error as NodeJS.ErrnoException).code !== 'ENOENT') {
+      throw error;
+    }
+  }
+}
diff --git a/src/server/port-allocator.test.ts b/src/server/port-allocator.test.ts
new file mode 100644
index 0000000..c2f89c5
--- /dev/null
+++ b/src/server/port-allocator.test.ts
@@ -0,0 +1,48 @@
+import { describe, it, expect } from 'vitest';
+
+import { allocatePort } from './port-allocator.js';
+
+describe('allocatePort', () => {
+  it('returns a valid port number', async () => {
+    const { port, server } = await allocatePort();
+    try {
+      expect(port).toBeGreaterThan(0);
+      expect(port).toBeLessThan(65536);
+    } finally {
+      server.close();
+    }
+  });
+
+  it('returns different ports on concurrent calls', async () => {
+    const [a, b] = await Promise.all([allocatePort(), allocatePort()]);
+    try {
+      expect(a.port).not.toBe(b.port);
+    } finally {
+      a.server.close();
+      b.server.close();
+    }
+  });
+
+  it('returns a server that is already listening', async () => {
+    const { server } = await allocatePort();
+    try {
+      expect(server.listening).toBe(true);
+    } finally {
+      server.close();
+    }
+  });
+
+  it('binds to 127.0.0.1', async () => {
+    const { server } = await allocatePort();
+    try {
+      const address = server.address();
+      expect(address).not.toBeNull();
+      expect(typeof address).toBe('object');
+      if (typeof address === 'object' && address !== null) {
+        expect(address.address).toBe('127.0.0.1');
+      }
+    } finally {
+      server.close();
+    }
+  });
+});
diff --git a/src/server/port-allocator.ts b/src/server/port-allocator.ts
new file mode 100644
index 0000000..dbc7982
--- /dev/null
+++ b/src/server/port-allocator.ts
@@ -0,0 +1,27 @@
+import * as net from 'node:net';
+
+/**
+ * Allocates an available port by binding to port 0.
+ * Returns both the port number AND the bound server to avoid port-grab race conditions.
+ * The caller is responsible for passing the server to Express or closing it.
+ *
+ * @returns The allocated port and bound server.
+ */
+export async function allocatePort(): Promise<{
+  port: number;
+  server: net.Server;
+}> {
+  return new Promise((resolve, reject) => {
+    const server = net.createServer();
+    server.listen(0, '127.0.0.1', () => {
+      const address = server.address();
+      if (!address || typeof address === 'string') {
+        server.close();
+        reject(new Error('Failed to get server address'));
+        return;
+      }
+      resolve({ port: address.port, server });
+    });
+    server.on('error', reject);
+  });
+}
diff --git a/src/server/request-queue.test.ts b/src/server/request-queue.test.ts
new file mode 100644
index 0000000..7729a3f
--- /dev/null
+++ b/src/server/request-queue.test.ts
@@ -0,0 +1,92 @@
+import { describe, it, expect } from 'vitest';
+
+import { RequestQueue } from './request-queue.js';
+
+async function sleep(ms: number): Promise<void> {
+  return new Promise((resolve) => setTimeout(resolve, ms));
+}
+
+describe('RequestQueue', () => {
+  it('executes enqueued functions sequentially', async () => {
+    const queue = new RequestQueue();
+    const results: number[] = [];
+
+    await Promise.all([
+      queue.enqueue(async () => {
+        await sleep(30);
+        results.push(1);
+      }),
+      queue.enqueue(async () => {
+        results.push(2);
+      }),
+      queue.enqueue(async () => {
+        results.push(3);
+      }),
+    ]);
+
+    expect(results).toStrictEqual([1, 2, 3]);
+  });
+
+  it('returns the value produced by the enqueued function', async () => {
+    const queue = new RequestQueue();
+    const result = await queue.enqueue(async () => 42);
+    expect(result).toBe(42);
+  });
+
+  it('returns values from concurrent enqueues in order', async () => {
+    const queue = new RequestQueue();
+    const [a, b, c] = await Promise.all([
+      queue.enqueue(async () => 'first'),
+      queue.enqueue(async () => 'second'),
+      queue.enqueue(async () => 'third'),
+    ]);
+
+    expect(a).toBe('first');
+    expect(b).toBe('second');
+    expect(c).toBe('third');
+  });
+
+  it('rejects when the function exceeds the timeout', async () => {
+    const queue = new RequestQueue(50);
+
+    await expect(
+      queue.enqueue(
+        async () => new Promise((resolve) => setTimeout(resolve, 500)),
+      ),
+    ).rejects.toThrowError('timed out');
+  });
+
+  it('remains functional after a timeout rejection', async () => {
+    const queue = new RequestQueue(50);
+
+    await queue
+      .enqueue(async () => new Promise((resolve) => setTimeout(resolve, 500)))
+      .catch(() => {});
+
+    const result = await queue.enqueue(async () => 'recovered');
+    expect(result).toBe('recovered');
+  });
+
+  it('propagates errors thrown by the enqueued function', async () => {
+    const queue = new RequestQueue();
+
+    await expect(
+      queue.enqueue(async () => {
+        throw new Error('task failed');
+      }),
+    ).rejects.toThrowError('task failed');
+  });
+
+  it('continues processing after an error in a previous task', async () => {
+    const queue = new RequestQueue();
+
+    await queue
+      .enqueue(async () => {
+        throw new Error('fail');
+      })
+      .catch(() => {});
+
+    const result = await queue.enqueue(async () => 'after-error');
+    expect(result).toBe('after-error');
+  });
+});
diff --git a/src/server/request-queue.ts b/src/server/request-queue.ts
new file mode 100644
index 0000000..79f52b3
--- /dev/null
+++ b/src/server/request-queue.ts
@@ -0,0 +1,64 @@
+import { debugWarn } from '../utils';
+
+/**
+ * Async mutex for serializing concurrent tool requests.
+ * Ensures only one tool executes at a time.
+ */
+export class RequestQueue {
+  #queue: Promise<void> = Promise.resolve();
+
+  readonly #timeoutMs: number;
+
+  /**
+   * @param timeoutMs - Maximum milliseconds a queued task may run.
+   */
+  constructor(timeoutMs = 30_000) {
+    this.#timeoutMs = timeoutMs;
+  }
+
+  /**
+   * Enqueues an async task for serial execution with a timeout.
+   *
+   * @param fn - The async function to execute.
+   * @returns The resolved value of the provided function.
+   */
+  async enqueue<Result>(fn: () => Promise<Result>): Promise<Result> {
+    let release!: () => void;
+    const next = new Promise<void>((resolve) => {
+      release = resolve;
+    });
+    const prev = this.#queue;
+    this.#queue = next;
+    await prev;
+    let timer: ReturnType<typeof setTimeout> | undefined;
+    const fnPromise = fn();
+    try {
+      return await Promise.race([
+        fnPromise,
+        new Promise<never>((_resolve, reject) => {
+          timer = setTimeout(
+            () =>
+              reject(
+                new Error(
+                  `Tool execution timed out after ${this.#timeoutMs}ms`,
+                ),
+              ),
+            this.#timeoutMs,
+          );
+        }),
+      ]);
+    } finally {
+      if (timer !== undefined) {
+        clearTimeout(timer);
+      }
+      // Wait for the task to actually settle before releasing the mutex,
+      // even after a timeout rejection. This preserves the serialization
+      // guarantee — the next task cannot start while a timed-out task
+      // is still running and potentially mutating shared state.
+      await fnPromise.catch((error) => {
+        debugWarn('request-queue.enqueue', error);
+      });
+      release();
+    }
+  }
+}
diff --git a/src/mcp-server/session-manager.ts b/src/server/session-manager.ts
similarity index 85%
rename from src/mcp-server/session-manager.ts
rename to src/server/session-manager.ts
index 5de75bc..2ac10fe 100644
--- a/src/mcp-server/session-manager.ts
+++ b/src/server/session-manager.ts
@@ -1,5 +1,5 @@
 /**
- * Generic Session Manager Interface for Browser Extension MCP Servers.
+ * Generic Session Manager Interface for Browser Extension HTTP Servers.
  *
  * This module defines the interface that concrete session managers must implement.
  * The interface abstracts browser session management, page tracking, and extension state.
@@ -10,8 +10,10 @@
 
 import type { Page, BrowserContext } from '@playwright/test';
 
-import type { TabRole, SessionState, SessionMetadata } from './types';
-import type { EnvironmentMode } from '../capabilities/context.js';
+import type {
+  EnvironmentMode,
+  WorkflowContext,
+} from '../capabilities/context.js';
 import type {
   ExtensionState,
   BuildCapability,
@@ -21,6 +23,7 @@ import type {
   StateSnapshotCapability,
   ScreenshotResult,
 } from '../capabilities/types.js';
+import type { TabRole, SessionState, SessionMetadata } from '../tools/types';
 
 /**
  * Represents a tracked browser page with its role and URL.
@@ -265,6 +268,18 @@ export type ISessionManager = {
   // Environment Configuration
   // -----------------------------------------------------------------------------
 
+  /**
+   * Set the workflow context created by the server's context factory.
+   *
+   * Called by `createServer` during startup so that the session manager has
+   * access to the same capability objects that tools receive. Implementations
+   * should store the context and expose its capabilities through the
+   * individual capability getters.
+   *
+   * @param context - The workflow context produced by the configured `contextFactory`.
+   */
+  setWorkflowContext(context: WorkflowContext): void;
+
   /**
    * Get the current environment mode.
    *
@@ -295,47 +310,3 @@ export type ISessionManager = {
     canSwitchContext: boolean;
   };
 };
-
-/**
- * Session manager instance holder.
- *
- * In the core package, this is undefined by default.
- * Extension implementations should call setSessionManager() to inject
- * their concrete implementation.
- */
-let _sessionManager: ISessionManager | undefined;
-
-/**
- * Set the session manager instance.
- *
- * This should be called by extension-specific code during server initialization.
- *
- * @param manager The session manager implementation to inject
- */
-export function setSessionManager(manager: ISessionManager): void {
-  _sessionManager = manager;
-}
-
-/**
- * Get the session manager instance.
- *
- * @throws Error if no session manager has been set
- * @returns The session manager instance
- */
-export function getSessionManager(): ISessionManager {
-  if (!_sessionManager) {
-    throw new Error(
-      'Session manager not initialized. Call setSessionManager() first.',
-    );
-  }
-  return _sessionManager;
-}
-
-/**
- * Check if a session manager has been set.
- *
- * @returns True if a session manager is set, false otherwise
- */
-export function hasSessionManager(): boolean {
-  return _sessionManager !== undefined;
-}
diff --git a/src/tools/batch.test.ts b/src/tools/batch.test.ts
new file mode 100644
index 0000000..3c5e061
--- /dev/null
+++ b/src/tools/batch.test.ts
@@ -0,0 +1,458 @@
+import { describe, it, expect, vi } from 'vitest';
+
+import { runStepsTool } from './batch.js';
+import { createMockSessionManager } from './test-utils/mock-factories.js';
+import { ErrorCodes } from './types/errors.js';
+import type { ToolContext, ToolFunction } from '../types/http.js';
+
+function createMockContext(
+  options: {
+    hasActive?: boolean;
+    toolRegistry?: Map<string, ToolFunction<any, any>>;
+  } = {},
+): ToolContext {
+  const { hasActive = true, toolRegistry } = options;
+
+  return {
+    sessionManager: createMockSessionManager({ hasActive }),
+    page: {} as ToolContext['page'],
+    refMap: new Map(),
+    workflowContext: {},
+    knowledgeStore: {},
+    toolRegistry,
+  } as unknown as ToolContext;
+}
+
+describe('runStepsTool', () => {
+  it('returns error when no active session', async () => {
+    const context = createMockContext({ hasActive: false });
+
+    const result = await runStepsTool(
+      { steps: [{ tool: 'click', args: { testId: 'button' } }] },
+      context,
+    );
+
+    expect(result.ok).toBe(false);
+    if (!result.ok) {
+      expect(result.error.code).toBe(ErrorCodes.MM_NO_ACTIVE_SESSION);
+    }
+  });
+
+  it('returns internal error when tool registry is missing', async () => {
+    const context = createMockContext();
+
+    const result = await runStepsTool(
+      { steps: [{ tool: 'click', args: { testId: 'button' } }] },
+      context,
+    );
+
+    expect(result.ok).toBe(false);
+    if (!result.ok) {
+      expect(result.error.code).toBe(ErrorCodes.MM_INTERNAL_ERROR);
+      expect(result.error.message).toContain('Tool registry not available');
+    }
+  });
+
+  it('executes a single step successfully', async () => {
+    const clickHandler = vi.fn().mockResolvedValue({
+      ok: true,
+      result: 'clicked',
+    });
+    const context = createMockContext({
+      toolRegistry: new Map([['click', clickHandler]]),
+    });
+
+    const result = await runStepsTool(
+      { steps: [{ tool: 'click', args: { testId: 'button' } }] },
+      context,
+    );
+
+    expect(clickHandler).toHaveBeenCalledWith(
+      { testId: 'button', timeoutMs: 15000 },
+      context,
+    );
+    expect(result.ok).toBe(true);
+    if (result.ok) {
+      expect(result.result.steps).toHaveLength(1);
+      expect(result.result.steps[0]).toMatchObject({
+        tool: 'click',
+        ok: true,
+        result: 'clicked',
+      });
+      expect(result.result.steps[0].meta.durationMs).toBeGreaterThanOrEqual(0);
+      expect(result.result.steps[0].meta.timestamp).toStrictEqual(
+        expect.any(String),
+      );
+      expect(result.result.summary).toMatchObject({
+        ok: true,
+        total: 1,
+        succeeded: 1,
+        failed: 0,
+      });
+    }
+  });
+
+  it('returns unknown tool error in the step result', async () => {
+    const context = createMockContext({ toolRegistry: new Map() });
+
+    const result = await runStepsTool(
+      { steps: [{ tool: 'unknown_tool', args: {} }] },
+      context,
+    );
+
+    expect(result.ok).toBe(true);
+    if (result.ok) {
+      expect(result.result.steps[0]).toMatchObject({
+        tool: 'unknown_tool',
+        ok: false,
+        error: {
+          code: ErrorCodes.MM_UNKNOWN_TOOL,
+          message: 'Unknown tool: unknown_tool',
+        },
+      });
+      expect(result.result.summary).toMatchObject({
+        ok: false,
+        total: 1,
+        succeeded: 0,
+        failed: 1,
+      });
+    }
+  });
+
+  it('records a failed step when a handler returns ok false', async () => {
+    const clickHandler = vi.fn().mockResolvedValue({
+      ok: false,
+      error: { code: 'MM_CLICK_FAILED', message: 'Click failed' },
+    });
+    const context = createMockContext({
+      toolRegistry: new Map([['click', clickHandler]]),
+    });
+
+    const result = await runStepsTool(
+      { steps: [{ tool: 'click', args: { testId: 'btn' } }] },
+      context,
+    );
+
+    expect(clickHandler).toHaveBeenCalledWith(
+      { testId: 'btn', timeoutMs: 15000 },
+      context,
+    );
+    expect(result.ok).toBe(true);
+    if (result.ok) {
+      expect(result.result.steps[0]).toMatchObject({
+        tool: 'click',
+        ok: false,
+        error: { code: 'MM_CLICK_FAILED', message: 'Click failed' },
+      });
+      expect(result.result.summary).toMatchObject({
+        ok: false,
+        total: 1,
+        succeeded: 0,
+        failed: 1,
+      });
+    }
+  });
+
+  it('stops on error when stopOnError is true', async () => {
+    const clickHandler = vi.fn().mockResolvedValue({
+      ok: false,
+      error: { code: 'ERR', message: 'fail' },
+    });
+    const typeHandler = vi
+      .fn()
+      .mockResolvedValue({ ok: true, result: 'typed' });
+    const context = createMockContext({
+      toolRegistry: new Map([
+        ['click', clickHandler],
+        ['type', typeHandler],
+      ]),
+    });
+
+    const result = await runStepsTool(
+      {
+        steps: [
+          { tool: 'click', args: { testId: 'btn' } },
+          { tool: 'type', args: { testId: 'input', text: 'hello' } },
+        ],
+        stopOnError: true,
+      },
+      context,
+    );
+
+    expect(clickHandler).toHaveBeenCalledTimes(1);
+    expect(typeHandler).not.toHaveBeenCalled();
+    expect(result.ok).toBe(true);
+    if (result.ok) {
+      expect(result.result.steps).toHaveLength(1);
+      expect(result.result.summary).toMatchObject({
+        ok: false,
+        total: 1,
+        succeeded: 0,
+        failed: 1,
+      });
+    }
+  });
+
+  it('collects multiple step results with mixed outcomes', async () => {
+    const clickHandler = vi.fn().mockResolvedValue({
+      ok: true,
+      result: 'clicked',
+    });
+    const typeHandler = vi.fn().mockResolvedValue({
+      ok: false,
+      error: { code: 'MM_TYPE_FAILED', message: 'Type failed' },
+    });
+    const context = createMockContext({
+      toolRegistry: new Map([
+        ['click', clickHandler],
+        ['type', typeHandler],
+      ]),
+    });
+
+    const result = await runStepsTool(
+      {
+        steps: [
+          { tool: 'click', args: { testId: 'button' } },
+          { tool: 'unknown_tool', args: {} },
+          { tool: 'type', args: { testId: 'input', text: 'hello' } },
+        ],
+      },
+      context,
+    );
+
+    expect(clickHandler).toHaveBeenCalledTimes(1);
+    expect(typeHandler).toHaveBeenCalledTimes(1);
+    expect(result.ok).toBe(true);
+    if (result.ok) {
+      expect(result.result.steps).toHaveLength(3);
+      expect(result.result.steps.map((step) => step.ok)).toStrictEqual([
+        true,
+        false,
+        false,
+      ]);
+      expect(result.result.steps[1].error?.code).toBe(
+        ErrorCodes.MM_UNKNOWN_TOOL,
+      );
+      expect(result.result.steps[2].error?.code).toBe('MM_TYPE_FAILED');
+      expect(result.result.summary).toMatchObject({
+        ok: false,
+        total: 3,
+        succeeded: 1,
+        failed: 2,
+      });
+      expect(result.result.summary.durationMs).toBeGreaterThanOrEqual(0);
+    }
+  });
+
+  it('records internal error when a handler throws', async () => {
+    const clickHandler = vi.fn().mockRejectedValue(new Error('Timeout'));
+    const context = createMockContext({
+      toolRegistry: new Map([['click', clickHandler]]),
+    });
+
+    const result = await runStepsTool(
+      { steps: [{ tool: 'click', args: { testId: 'btn' } }] },
+      context,
+    );
+
+    expect(result.ok).toBe(true);
+    if (result.ok) {
+      expect(result.result.steps[0]).toMatchObject({
+        tool: 'click',
+        ok: false,
+        error: {
+          code: ErrorCodes.MM_INTERNAL_ERROR,
+        },
+      });
+      expect(result.result.steps[0].error?.message).toContain('Timeout');
+      expect(result.result.summary).toMatchObject({
+        ok: false,
+        total: 1,
+        succeeded: 0,
+        failed: 1,
+      });
+    }
+  });
+
+  it('stops on error for unknown tool when stopOnError is true', async () => {
+    const typeHandler = vi
+      .fn()
+      .mockResolvedValue({ ok: true, result: 'typed' });
+    const context = createMockContext({
+      toolRegistry: new Map([['type', typeHandler]]),
+    });
+
+    const result = await runStepsTool(
+      {
+        steps: [
+          { tool: 'unknown_tool', args: {} },
+          { tool: 'type', args: { testId: 'input', text: 'hello' } },
+        ],
+        stopOnError: true,
+      },
+      context,
+    );
+
+    expect(typeHandler).not.toHaveBeenCalled();
+    expect(result.ok).toBe(true);
+    if (result.ok) {
+      expect(result.result.steps).toHaveLength(1);
+      expect(result.result.steps[0]).toMatchObject({
+        tool: 'unknown_tool',
+        ok: false,
+        error: {
+          code: ErrorCodes.MM_UNKNOWN_TOOL,
+        },
+      });
+      expect(result.result.summary).toMatchObject({
+        ok: false,
+        total: 1,
+        succeeded: 0,
+        failed: 1,
+      });
+    }
+  });
+
+  it('returns validation error for invalid tool args', async () => {
+    const clickHandler = vi.fn().mockResolvedValue({
+      ok: true,
+      result: 'clicked',
+    });
+    const typeHandler = vi
+      .fn()
+      .mockResolvedValue({ ok: true, result: 'typed' });
+    const context = createMockContext({
+      toolRegistry: new Map([
+        ['click', clickHandler],
+        ['type', typeHandler],
+      ]),
+    });
+
+    const result = await runStepsTool(
+      {
+        steps: [
+          { tool: 'click', args: {} },
+          { tool: 'type', args: { testId: 'input', text: 'hello' } },
+        ],
+      },
+      context,
+    );
+
+    expect(clickHandler).not.toHaveBeenCalled();
+    expect(typeHandler).toHaveBeenCalledTimes(1);
+    expect(result.ok).toBe(true);
+    if (result.ok) {
+      expect(result.result.steps).toHaveLength(2);
+      expect(result.result.steps[0]).toMatchObject({
+        tool: 'click',
+        ok: false,
+        error: {
+          code: 'VALIDATION_ERROR',
+        },
+      });
+      expect(result.result.steps[0].error?.message).toContain('Exactly one of');
+      expect(result.result.steps[1]).toMatchObject({
+        tool: 'type',
+        ok: true,
+      });
+      expect(result.result.summary).toMatchObject({
+        ok: false,
+        total: 2,
+        succeeded: 1,
+        failed: 1,
+      });
+    }
+  });
+
+  it('stops on validation error when stopOnError is true', async () => {
+    const clickHandler = vi.fn().mockResolvedValue({
+      ok: true,
+      result: 'clicked',
+    });
+    const typeHandler = vi
+      .fn()
+      .mockResolvedValue({ ok: true, result: 'typed' });
+    const context = createMockContext({
+      toolRegistry: new Map([
+        ['click', clickHandler],
+        ['type', typeHandler],
+      ]),
+    });
+
+    const result = await runStepsTool(
+      {
+        steps: [
+          { tool: 'click', args: {} },
+          { tool: 'type', args: { testId: 'input', text: 'hello' } },
+        ],
+        stopOnError: true,
+      },
+      context,
+    );
+
+    expect(clickHandler).not.toHaveBeenCalled();
+    expect(typeHandler).not.toHaveBeenCalled();
+    expect(result.ok).toBe(true);
+    if (result.ok) {
+      expect(result.result.steps).toHaveLength(1);
+      expect(result.result.steps[0]).toMatchObject({
+        tool: 'click',
+        ok: false,
+        error: {
+          code: 'VALIDATION_ERROR',
+        },
+      });
+      expect(result.result.summary).toMatchObject({
+        ok: false,
+        total: 1,
+        succeeded: 0,
+        failed: 1,
+      });
+    }
+  });
+
+  it('stops on handler throw when stopOnError is true', async () => {
+    const clickHandler = vi.fn().mockRejectedValue(new Error('Timeout'));
+    const typeHandler = vi
+      .fn()
+      .mockResolvedValue({ ok: true, result: 'typed' });
+    const context = createMockContext({
+      toolRegistry: new Map([
+        ['click', clickHandler],
+        ['type', typeHandler],
+      ]),
+    });
+
+    const result = await runStepsTool(
+      {
+        steps: [
+          { tool: 'click', args: { testId: 'btn' } },
+          { tool: 'type', args: { testId: 'input', text: 'hello' } },
+        ],
+        stopOnError: true,
+      },
+      context,
+    );
+
+    expect(clickHandler).toHaveBeenCalledTimes(1);
+    expect(typeHandler).not.toHaveBeenCalled();
+    expect(result.ok).toBe(true);
+    if (result.ok) {
+      expect(result.result.steps).toHaveLength(1);
+      expect(result.result.steps[0]).toMatchObject({
+        tool: 'click',
+        ok: false,
+        error: {
+          code: ErrorCodes.MM_INTERNAL_ERROR,
+        },
+      });
+      expect(result.result.steps[0].error?.message).toContain('Timeout');
+      expect(result.result.summary).toMatchObject({
+        ok: false,
+        total: 1,
+        succeeded: 0,
+        failed: 1,
+      });
+    }
+  });
+});
diff --git a/src/tools/batch.ts b/src/tools/batch.ts
new file mode 100644
index 0000000..70b9f02
--- /dev/null
+++ b/src/tools/batch.ts
@@ -0,0 +1,160 @@
+import type { RunStepsInput, RunStepsResult, StepResult } from './types';
+import { ErrorCodes } from './types';
+import { createToolError, createToolSuccess } from './utils.js';
+import type { ToolContext, ToolFunction, ToolResponse } from '../types/http.js';
+import { extractErrorMessage } from '../utils';
+import type { ToolName } from '../validation/schemas.js';
+import { toolSchemas } from '../validation/schemas.js';
+
+/**
+ * Executes a batch of tool steps sequentially.
+ *
+ * @param input - The batch step definitions and options.
+ * @param context - The tool execution context.
+ * @returns The aggregated step results and summary.
+ */
+export async function runStepsTool(
+  input: RunStepsInput,
+  context: ToolContext,
+): Promise<ToolResponse<RunStepsResult>> {
+  if (!context.sessionManager.hasActiveSession()) {
+    return createToolError(
+      ErrorCodes.MM_NO_ACTIVE_SESSION,
+      'No active session. Call launch first.',
+    );
+  }
+
+  if (!context.toolRegistry) {
+    return createToolError(
+      ErrorCodes.MM_INTERNAL_ERROR,
+      'Tool registry not available.',
+    );
+  }
+
+  // TODO: implement observation policy filtering using input.includeObservations
+  const { steps: stepInputs, stopOnError = false } = input;
+  const stepResults: StepResult[] = [];
+  let succeeded = 0;
+  let failed = 0;
+  const batchStartTime = Date.now();
+
+  for (const stepInput of stepInputs) {
+    const stepStartTime = Date.now();
+    const { tool, args = {} } = stepInput;
+    const handler = context.toolRegistry.get(tool) as
+      | ToolFunction<Record<string, unknown>, unknown>
+      | undefined;
+
+    if (!handler) {
+      stepResults.push({
+        tool,
+        ok: false,
+        error: {
+          code: ErrorCodes.MM_UNKNOWN_TOOL,
+          message: `Unknown tool: ${tool}`,
+        },
+        meta: {
+          durationMs: Date.now() - stepStartTime,
+          timestamp: new Date().toISOString(),
+        },
+      });
+      failed += 1;
+
+      if (stopOnError) {
+        break;
+      }
+
+      continue;
+    }
+
+    const schema =
+      tool in toolSchemas ? toolSchemas[tool as ToolName] : undefined;
+    let validatedArgs: Record<string, unknown> = args;
+    if (schema) {
+      const parsed = schema.safeParse(args);
+      if (!parsed.success) {
+        stepResults.push({
+          tool,
+          ok: false,
+          error: {
+            code: 'VALIDATION_ERROR',
+            message: parsed.error.issues
+              .map((i) =>
+                i.path.length > 0
+                  ? `${i.path.join('.')}: ${i.message}`
+                  : i.message,
+              )
+              .join('; '),
+          },
+          meta: {
+            durationMs: Date.now() - stepStartTime,
+            timestamp: new Date().toISOString(),
+          },
+        });
+        failed += 1;
+
+        if (stopOnError) {
+          break;
+        }
+
+        continue;
+      }
+      validatedArgs = parsed.data as Record<string, unknown>;
+    }
+
+    try {
+      const response = await handler(validatedArgs, context);
+
+      stepResults.push({
+        tool,
+        ok: response.ok,
+        result: response.ok ? response.result : undefined,
+        error: response.ok ? undefined : response.error,
+        meta: {
+          durationMs: Date.now() - stepStartTime,
+          timestamp: new Date().toISOString(),
+        },
+      });
+
+      // TODO: implement observation policy filtering
+
+      if (response.ok) {
+        succeeded += 1;
+      } else {
+        failed += 1;
+        if (stopOnError) {
+          break;
+        }
+      }
+    } catch (error) {
+      stepResults.push({
+        tool,
+        ok: false,
+        error: {
+          code: ErrorCodes.MM_INTERNAL_ERROR,
+          message: `Unexpected error: ${extractErrorMessage(error)}`,
+        },
+        meta: {
+          durationMs: Date.now() - stepStartTime,
+          timestamp: new Date().toISOString(),
+        },
+      });
+      failed += 1;
+
+      if (stopOnError) {
+        break;
+      }
+    }
+  }
+
+  return createToolSuccess({
+    steps: stepResults,
+    summary: {
+      ok: failed === 0,
+      total: stepResults.length,
+      succeeded,
+      failed,
+      durationMs: Date.now() - batchStartTime,
+    },
+  });
+}
diff --git a/src/tools/build.test.ts b/src/tools/build.test.ts
new file mode 100644
index 0000000..4429237
--- /dev/null
+++ b/src/tools/build.test.ts
@@ -0,0 +1,192 @@
+/**
+ * Unit tests for build tool handler.
+ *
+ * Tests the build handler with BuildCapability and legacy build paths,
+ * including success/failure scenarios and build options handling.
+ */
+
+import { beforeEach, describe, expect, it, vi } from 'vitest';
+
+import { buildTool } from './build.js';
+import type { BuildCapability } from '../capabilities/types.js';
+import { createMockSessionManager } from './test-utils/mock-factories.js';
+import { ErrorCodes } from './types/errors.js';
+import type { ToolContext } from '../types/http.js';
+
+function createMockContext(
+  options: { buildCapability?: BuildCapability } = {},
+) {
+  const sessionManager = createMockSessionManager({
+    hasActive: true,
+    sessionId: 'test-session-123',
+    sessionMetadata: {
+      schemaVersion: 1,
+      sessionId: 'test-session-123',
+      createdAt: new Date().toISOString(),
+      flowTags: [],
+      tags: [],
+      launch: { stateMode: 'default' },
+    },
+  });
+
+  sessionManager.getBuildCapability.mockReturnValue(options.buildCapability);
+
+  return {
+    sessionManager,
+    page: {},
+    refMap: new Map(),
+    workflowContext: {},
+    knowledgeStore: {},
+  } as unknown as ToolContext;
+}
+
+describe('buildTool', () => {
+  let mockBuildCapability: BuildCapability;
+
+  beforeEach(() => {
+    mockBuildCapability = {
+      build: vi.fn(),
+      getExtensionPath: vi.fn(),
+      isBuilt: vi.fn(),
+    };
+  });
+
+  describe('with capability', () => {
+    it('builds extension successfully with default buildType', async () => {
+      vi.spyOn(mockBuildCapability, 'build').mockResolvedValue({
+        success: true,
+        extensionPath: '/path/to/dist/chrome',
+        durationMs: 5000,
+      });
+      const context = createMockContext({
+        buildCapability: mockBuildCapability,
+      });
+
+      const result = await buildTool({}, context);
+
+      expect(result.ok).toBe(true);
+      if (result.ok) {
+        expect(result.result.buildType).toBe('build:test');
+        expect(result.result.extensionPathResolved).toBe(
+          '/path/to/dist/chrome',
+        );
+      }
+      expect(mockBuildCapability.build).toHaveBeenCalledWith({
+        buildType: undefined,
+        force: undefined,
+      });
+    });
+
+    it('builds extension with explicit buildType', async () => {
+      vi.spyOn(mockBuildCapability, 'build').mockResolvedValue({
+        success: true,
+        extensionPath: '/path/to/dist/chrome',
+        durationMs: 5000,
+      });
+      const context = createMockContext({
+        buildCapability: mockBuildCapability,
+      });
+
+      const result = await buildTool({ buildType: 'build:test' }, context);
+
+      expect(result.ok).toBe(true);
+      if (result.ok) {
+        expect(result.result.buildType).toBe('build:test');
+        expect(result.result.extensionPathResolved).toBe(
+          '/path/to/dist/chrome',
+        );
+      }
+      expect(mockBuildCapability.build).toHaveBeenCalledWith({
+        buildType: 'build:test',
+        force: undefined,
+      });
+    });
+
+    it('builds extension with force flag', async () => {
+      vi.spyOn(mockBuildCapability, 'build').mockResolvedValue({
+        success: true,
+        extensionPath: '/path/to/dist/chrome',
+        durationMs: 5000,
+      });
+      const context = createMockContext({
+        buildCapability: mockBuildCapability,
+      });
+
+      const result = await buildTool({ force: true }, context);
+
+      expect(result.ok).toBe(true);
+      expect(mockBuildCapability.build).toHaveBeenCalledWith({
+        buildType: undefined,
+        force: true,
+      });
+    });
+
+    it('returns error when build fails with error message', async () => {
+      vi.spyOn(mockBuildCapability, 'build').mockResolvedValue({
+        success: false,
+        extensionPath: '',
+        durationMs: 1000,
+        error: 'Compilation error',
+      });
+      const context = createMockContext({
+        buildCapability: mockBuildCapability,
+      });
+
+      const result = await buildTool({}, context);
+
+      expect(result.ok).toBe(false);
+      if (!result.ok) {
+        expect(result.error.code).toBe(ErrorCodes.MM_BUILD_FAILED);
+        expect(result.error.message).toContain('Compilation error');
+      }
+    });
+
+    it('returns error when build fails without error message', async () => {
+      vi.spyOn(mockBuildCapability, 'build').mockResolvedValue({
+        success: false,
+        extensionPath: '',
+        durationMs: 1000,
+      });
+      const context = createMockContext({
+        buildCapability: mockBuildCapability,
+      });
+
+      const result = await buildTool({}, context);
+
+      expect(result.ok).toBe(false);
+      if (!result.ok) {
+        expect(result.error.code).toBe(ErrorCodes.MM_BUILD_FAILED);
+        expect(result.error.message).toContain('Unknown error');
+      }
+    });
+
+    it('returns error when build throws exception', async () => {
+      vi.spyOn(mockBuildCapability, 'build').mockRejectedValue(
+        new Error('Build process crashed'),
+      );
+      const context = createMockContext({
+        buildCapability: mockBuildCapability,
+      });
+
+      const result = await buildTool({}, context);
+
+      expect(result.ok).toBe(false);
+      if (!result.ok) {
+        expect(result.error.code).toBe(ErrorCodes.MM_BUILD_FAILED);
+        expect(result.error.message).toContain('Build process crashed');
+      }
+    });
+  });
+
+  it('returns error when build capability is unavailable', async () => {
+    const context = createMockContext();
+
+    const result = await buildTool({}, context);
+
+    expect(result.ok).toBe(false);
+    if (!result.ok) {
+      expect(result.error.code).toBe(ErrorCodes.MM_CAPABILITY_NOT_AVAILABLE);
+      expect(result.error.message).toContain('BuildCapability not available');
+    }
+  });
+});
diff --git a/src/tools/build.ts b/src/tools/build.ts
new file mode 100644
index 0000000..316b756
--- /dev/null
+++ b/src/tools/build.ts
@@ -0,0 +1,52 @@
+import type { BuildInput, BuildToolResult } from './types';
+import { ErrorCodes } from './types';
+import { createToolError, createToolSuccess } from './utils.js';
+import type { ToolContext, ToolResponse } from '../types/http.js';
+import { extractErrorMessage } from '../utils';
+
+/**
+ * Triggers an extension build using the configured build capability.
+ *
+ * @param input - The build configuration options.
+ * @param context - The tool execution context.
+ * @returns The build result with the resolved extension path.
+ */
+export async function buildTool(
+  input: BuildInput,
+  context: ToolContext,
+): Promise<ToolResponse<BuildToolResult>> {
+  const buildCapability =
+    context.workflowContext.build ??
+    context.sessionManager.getBuildCapability();
+
+  if (!buildCapability) {
+    return createToolError(
+      ErrorCodes.MM_CAPABILITY_NOT_AVAILABLE,
+      'BuildCapability not available. The mm_build tool requires either: (1) running in e2e mode with the MetaMask extension wrapper, or (2) running directly in the metamask-extension repository with dependencies installed.',
+    );
+  }
+
+  try {
+    const result = await buildCapability.build({
+      buildType: input.buildType,
+      force: input.force,
+    });
+
+    if (!result.success) {
+      return createToolError(
+        ErrorCodes.MM_BUILD_FAILED,
+        `Build failed: ${result.error ?? 'Unknown error'}`,
+      );
+    }
+
+    return createToolSuccess({
+      buildType: input.buildType ?? 'build:test',
+      extensionPathResolved: result.extensionPath,
+    });
+  } catch (error) {
+    return createToolError(
+      ErrorCodes.MM_BUILD_FAILED,
+      `Build failed: ${extractErrorMessage(error)}`,
+    );
+  }
+}
diff --git a/src/tools/cleanup.test.ts b/src/tools/cleanup.test.ts
new file mode 100644
index 0000000..5348703
--- /dev/null
+++ b/src/tools/cleanup.test.ts
@@ -0,0 +1,74 @@
+/**
+ * Unit tests for cleanup tool handler.
+ *
+ * Tests session cleanup with various session states.
+ */
+
+import { describe, it, expect, vi, beforeEach } from 'vitest';
+
+import { cleanupTool } from './cleanup.js';
+import { createMockSessionManager } from './test-utils/mock-factories.js';
+import type { ToolContext } from '../types/http.js';
+
+function createMockContext(hasActive = false): ToolContext {
+  return {
+    sessionManager: createMockSessionManager({ hasActive }),
+    page: {},
+    refMap: new Map(),
+    workflowContext: {},
+    knowledgeStore: {},
+  } as unknown as ToolContext;
+}
+
+describe('cleanupTool', () => {
+  beforeEach(() => {
+    vi.clearAllMocks();
+  });
+
+  it('cleans up active session successfully', async () => {
+    const context = createMockContext(true);
+    vi.spyOn(context.sessionManager, 'cleanup').mockResolvedValue(true);
+
+    const result = await cleanupTool({}, context);
+
+    expect(result.ok).toBe(true);
+    if (result.ok) {
+      expect(result.result.cleanedUp).toBe(true);
+    }
+    expect(context.sessionManager.cleanup).toHaveBeenCalled();
+  });
+
+  it('returns false when no session to clean up', async () => {
+    const context = createMockContext(false);
+    vi.spyOn(context.sessionManager, 'cleanup').mockResolvedValue(false);
+
+    const result = await cleanupTool({}, context);
+
+    expect(result.ok).toBe(true);
+    if (result.ok) {
+      expect(result.result.cleanedUp).toBe(false);
+    }
+  });
+
+  it('cleans up multiple times without error', async () => {
+    const context = createMockContext(true);
+    vi.spyOn(context.sessionManager, 'cleanup')
+      .mockResolvedValueOnce(true)
+      .mockResolvedValueOnce(false);
+
+    const result1 = await cleanupTool({}, context);
+    const result2 = await cleanupTool({}, context);
+
+    expect(result1.ok).toBe(true);
+    if (result1.ok) {
+      expect(result1.result.cleanedUp).toBe(true);
+    }
+
+    expect(result2.ok).toBe(true);
+    if (result2.ok) {
+      expect(result2.result.cleanedUp).toBe(false);
+    }
+
+    expect(context.sessionManager.cleanup).toHaveBeenCalledTimes(2);
+  });
+});
diff --git a/src/tools/cleanup.ts b/src/tools/cleanup.ts
new file mode 100644
index 0000000..cae4b36
--- /dev/null
+++ b/src/tools/cleanup.ts
@@ -0,0 +1,19 @@
+import type { CleanupInput, CleanupResult } from './types';
+import { createToolSuccess } from './utils.js';
+import type { ToolContext, ToolResponse } from '../types/http.js';
+
+/**
+ * Tears down the active browser session and cleans up resources.
+ *
+ * @param _input - Unused input parameters.
+ * @param context - The tool execution context.
+ * @returns The cleanup result indicating what was cleaned up.
+ */
+export async function cleanupTool(
+  _input: CleanupInput,
+  context: ToolContext,
+): Promise<ToolResponse<CleanupResult>> {
+  const cleanedUp = await context.sessionManager.cleanup();
+
+  return createToolSuccess({ cleanedUp });
+}
diff --git a/src/tools/clipboard.test.ts b/src/tools/clipboard.test.ts
new file mode 100644
index 0000000..d067712
--- /dev/null
+++ b/src/tools/clipboard.test.ts
@@ -0,0 +1,215 @@
+/**
+ * Unit tests for clipboard tool handler.
+ *
+ * Tests CDP-based clipboard operations (read/write) with proper mocking.
+ */
+
+import { describe, it, expect, vi } from 'vitest';
+
+import { clipboardTool } from './clipboard.js';
+import { createMockSessionManager } from './test-utils/mock-factories.js';
+import { ErrorCodes } from './types/errors.js';
+import type { ToolContext } from '../types/http.js';
+
+function createMockContext(
+  options: {
+    hasActive?: boolean;
+    cdpSession?: {
+      send: ReturnType<typeof vi.fn>;
+      detach: ReturnType<typeof vi.fn>;
+    };
+  } = {},
+): ToolContext {
+  const { hasActive = true, cdpSession } = options;
+
+  const mockCdpSession = cdpSession ?? {
+    send: vi.fn().mockResolvedValue(undefined),
+    detach: vi.fn().mockResolvedValue(undefined),
+  };
+
+  const mockPage = {
+    context: vi.fn().mockReturnValue({
+      newCDPSession: vi.fn().mockResolvedValue(mockCdpSession),
+    }),
+  };
+
+  return {
+    sessionManager: createMockSessionManager({ hasActive }),
+    page: mockPage,
+    refMap: new Map(),
+    workflowContext: {},
+    knowledgeStore: {},
+  } as unknown as ToolContext;
+}
+
+describe('clipboardTool', () => {
+  describe('write action', () => {
+    it('writes text to clipboard via CDP', async () => {
+      const cdpSession = {
+        send: vi.fn().mockResolvedValue(undefined),
+        detach: vi.fn().mockResolvedValue(undefined),
+      };
+      const context = createMockContext({ cdpSession });
+
+      const result = await clipboardTool(
+        { action: 'write', text: 'test content' },
+        context,
+      );
+
+      expect(result.ok).toBe(true);
+      if (result.ok) {
+        expect(result.result.action).toBe('write');
+        expect(result.result.success).toBe(true);
+        expect(result.result.text).toBe('test content');
+      }
+      expect(cdpSession.send).toHaveBeenCalledWith('Runtime.evaluate', {
+        expression: 'navigator.clipboard.writeText("test content")',
+        awaitPromise: true,
+        userGesture: true,
+      });
+      expect(cdpSession.detach).toHaveBeenCalled();
+    });
+
+    it('detaches CDP session even if write fails', async () => {
+      const cdpSession = {
+        send: vi.fn().mockRejectedValue(new Error('Write failed')),
+        detach: vi.fn().mockResolvedValue(undefined),
+      };
+      const context = createMockContext({ cdpSession });
+
+      const result = await clipboardTool(
+        { action: 'write', text: 'test' },
+        context,
+      );
+
+      expect(result.ok).toBe(false);
+      expect(cdpSession.detach).toHaveBeenCalled();
+    });
+  });
+
+  describe('read action', () => {
+    it('reads text from clipboard via CDP', async () => {
+      const cdpSession = {
+        send: vi.fn().mockResolvedValue({
+          result: { value: 'clipboard content' },
+        }),
+        detach: vi.fn().mockResolvedValue(undefined),
+      };
+      const context = createMockContext({ cdpSession });
+
+      const result = await clipboardTool({ action: 'read' }, context);
+
+      expect(result.ok).toBe(true);
+      if (result.ok) {
+        expect(result.result.action).toBe('read');
+        expect(result.result.success).toBe(true);
+        expect(result.result.text).toBe('clipboard content');
+      }
+      expect(cdpSession.send).toHaveBeenCalledWith('Runtime.evaluate', {
+        expression: 'navigator.clipboard.readText()',
+        awaitPromise: true,
+        userGesture: true,
+      });
+    });
+
+    it('uses description when value is missing', async () => {
+      const cdpSession = {
+        send: vi.fn().mockResolvedValue({
+          result: { description: 'fallback content' },
+        }),
+        detach: vi.fn().mockResolvedValue(undefined),
+      };
+      const context = createMockContext({ cdpSession });
+
+      const result = await clipboardTool({ action: 'read' }, context);
+
+      expect(result.ok).toBe(true);
+      if (result.ok) {
+        expect(result.result.text).toBe('fallback content');
+      }
+    });
+
+    it('returns empty string when result is missing', async () => {
+      const cdpSession = {
+        send: vi.fn().mockResolvedValue({ result: {} }),
+        detach: vi.fn().mockResolvedValue(undefined),
+      };
+      const context = createMockContext({ cdpSession });
+
+      const result = await clipboardTool({ action: 'read' }, context);
+
+      expect(result.ok).toBe(true);
+      if (result.ok) {
+        expect(result.result.text).toBe('');
+      }
+    });
+  });
+
+  describe('error classification', () => {
+    it('classifies permission denied errors', async () => {
+      const cdpSession = {
+        send: vi.fn().mockRejectedValue(new Error('permissions denied')),
+        detach: vi.fn().mockResolvedValue(undefined),
+      };
+      const context = createMockContext({ cdpSession });
+
+      const result = await clipboardTool({ action: 'read' }, context);
+
+      expect(result.ok).toBe(false);
+      if (!result.ok) {
+        expect(result.error.code).toBe('MM_CLIPBOARD_PERMISSION_DENIED');
+        expect(result.error.message).toContain('Clipboard permission denied');
+      }
+    });
+
+    it('classifies LavaMoat blocked errors', async () => {
+      const cdpSession = {
+        send: vi.fn().mockRejectedValue(new Error('LavaMoat policy violation')),
+        detach: vi.fn().mockResolvedValue(undefined),
+      };
+      const context = createMockContext({ cdpSession });
+
+      const result = await clipboardTool(
+        { action: 'write', text: 'test' },
+        context,
+      );
+
+      expect(result.ok).toBe(false);
+      if (!result.ok) {
+        expect(result.error.code).toBe('MM_CLIPBOARD_LAVAMOAT_BLOCKED');
+        expect(result.error.message).toContain(
+          'Clipboard blocked by LavaMoat policy',
+        );
+      }
+    });
+
+    it('classifies generic clipboard errors', async () => {
+      const cdpSession = {
+        send: vi.fn().mockRejectedValue(new Error('Unknown error')),
+        detach: vi.fn().mockResolvedValue(undefined),
+      };
+      const context = createMockContext({ cdpSession });
+
+      const result = await clipboardTool({ action: 'read' }, context);
+
+      expect(result.ok).toBe(false);
+      if (!result.ok) {
+        expect(result.error.code).toBe('MM_CLIPBOARD_FAILED');
+        expect(result.error.message).toContain('Clipboard operation failed');
+      }
+    });
+  });
+
+  describe('session validation', () => {
+    it('returns error when no active session', async () => {
+      const context = createMockContext({ hasActive: false });
+
+      const result = await clipboardTool({ action: 'read' }, context);
+
+      expect(result.ok).toBe(false);
+      if (!result.ok) {
+        expect(result.error.code).toBe(ErrorCodes.MM_NO_ACTIVE_SESSION);
+      }
+    });
+  });
+});
diff --git a/src/tools/clipboard.ts b/src/tools/clipboard.ts
new file mode 100644
index 0000000..ade71a3
--- /dev/null
+++ b/src/tools/clipboard.ts
@@ -0,0 +1,82 @@
+import type { ClipboardInput, ClipboardResult } from './types';
+import {
+  createToolError,
+  createToolSuccess,
+  requireActiveSession,
+} from './utils.js';
+import type { ToolContext, ToolResponse } from '../types/http.js';
+
+/**
+ * Reads from or writes to the system clipboard via CDP.
+ *
+ * @param input - The clipboard action and optional text payload.
+ * @param context - The tool execution context.
+ * @returns The clipboard operation result with the text content.
+ */
+export async function clipboardTool(
+  input: ClipboardInput,
+  context: ToolContext,
+): Promise<ToolResponse<ClipboardResult>> {
+  const missingSession = requireActiveSession<ClipboardResult>(context);
+  if (missingSession) {
+    return missingSession;
+  }
+
+  try {
+    const cdpSession = await context.page.context().newCDPSession(context.page);
+
+    try {
+      if (input.action === 'write') {
+        await cdpSession.send('Runtime.evaluate', {
+          expression: `navigator.clipboard.writeText(${JSON.stringify(input.text)})`,
+          awaitPromise: true,
+          userGesture: true,
+        });
+
+        return createToolSuccess({
+          action: 'write',
+          success: true,
+          text: input.text,
+        });
+      }
+
+      const result = await cdpSession.send('Runtime.evaluate', {
+        expression: 'navigator.clipboard.readText()',
+        awaitPromise: true,
+        userGesture: true,
+      });
+
+      const clipboardText =
+        result.result?.value ?? result.result?.description ?? '';
+
+      return createToolSuccess({
+        action: 'read',
+        success: true,
+        text: clipboardText as string,
+      });
+    } finally {
+      await cdpSession.detach().catch(() => undefined);
+    }
+  } catch (error) {
+    const message = error instanceof Error ? error.message : String(error);
+
+    if (message.includes('permissions') || message.includes('denied')) {
+      return createToolError(
+        'MM_CLIPBOARD_PERMISSION_DENIED',
+        `Clipboard permission denied: ${message}`,
+      );
+    }
+
+    if (message.includes('LavaMoat') || message.includes('policy')) {
+      return createToolError(
+        'MM_CLIPBOARD_LAVAMOAT_BLOCKED',
+        `Clipboard blocked by LavaMoat policy: ${message}`,
+      );
+    }
+
+    return createToolError(
+      'MM_CLIPBOARD_FAILED',
+      `Clipboard operation failed: ${message}`,
+    );
+  }
+}
diff --git a/src/tools/context.test.ts b/src/tools/context.test.ts
new file mode 100644
index 0000000..2af9de5
--- /dev/null
+++ b/src/tools/context.test.ts
@@ -0,0 +1,176 @@
+/**
+ * Unit tests for context tool handlers.
+ *
+ * Tests context switching (e2e/prod) and context info retrieval.
+ */
+
+import { describe, it, expect, vi } from 'vitest';
+
+import { setContextTool, getContextTool } from './context.js';
+import { createMockSessionManager } from './test-utils/mock-factories.js';
+import { ErrorCodes } from './types/errors.js';
+import type { ToolContext } from '../types/http.js';
+
+function createMockContext(
+  options: {
+    hasActive?: boolean;
+    sessionId?: string;
+    environmentMode?: 'e2e' | 'prod';
+  } = {},
+): ToolContext {
+  return {
+    sessionManager: createMockSessionManager(options),
+    page: {} as ToolContext['page'],
+    refMap: new Map(),
+    workflowContext: {},
+    knowledgeStore: {},
+  } as unknown as ToolContext;
+}
+
+describe('setContextTool', () => {
+  it('switches context from e2e to prod', async () => {
+    const context = createMockContext({ environmentMode: 'e2e' });
+    vi.mocked(context.sessionManager.getContextInfo).mockReturnValue({
+      currentContext: 'prod',
+      hasActiveSession: false,
+      sessionId: null,
+      capabilities: { available: ['build', 'fixture'] },
+      canSwitchContext: true,
+    });
+
+    const result = await setContextTool({ context: 'prod' }, context);
+
+    expect(result.ok).toBe(true);
+    if (result.ok) {
+      expect(result.result.previousContext).toBe('e2e');
+      expect(result.result.newContext).toBe('prod');
+      expect(result.result.availableCapabilities).toStrictEqual([
+        'build',
+        'fixture',
+      ]);
+    }
+    expect(context.sessionManager.setContext).toHaveBeenCalledWith(
+      'prod',
+      undefined,
+    );
+  });
+
+  it('forwards context options to session manager', async () => {
+    const context = createMockContext({ environmentMode: 'e2e' });
+    vi.mocked(context.sessionManager.getContextInfo).mockReturnValue({
+      currentContext: 'e2e',
+      hasActiveSession: false,
+      sessionId: null,
+      capabilities: { available: ['build', 'fixture', 'chain'] },
+      canSwitchContext: true,
+    });
+
+    const contextOptions = {
+      mockServer: {
+        enabled: true,
+        port: 18000,
+      },
+    };
+
+    const result = await setContextTool(
+      {
+        context: 'e2e',
+        options: contextOptions,
+      },
+      context,
+    );
+
+    expect(result.ok).toBe(true);
+    expect(context.sessionManager.setContext).toHaveBeenCalledWith(
+      'e2e',
+      contextOptions,
+    );
+  });
+
+  it('switches context from prod to e2e', async () => {
+    const context = createMockContext({ environmentMode: 'prod' });
+    vi.mocked(context.sessionManager.getContextInfo).mockReturnValue({
+      currentContext: 'e2e',
+      hasActiveSession: false,
+      sessionId: null,
+      capabilities: { available: ['build', 'fixture', 'chain', 'seeding'] },
+      canSwitchContext: true,
+    });
+
+    const result = await setContextTool({ context: 'e2e' }, context);
+
+    expect(result.ok).toBe(true);
+    if (result.ok) {
+      expect(result.result.previousContext).toBe('prod');
+      expect(result.result.newContext).toBe('e2e');
+      expect(result.result.availableCapabilities).toStrictEqual([
+        'build',
+        'fixture',
+        'chain',
+        'seeding',
+      ]);
+    }
+  });
+
+  it('classifies context switch blocked errors', async () => {
+    const context = createMockContext({ environmentMode: 'e2e' });
+    vi.mocked(context.sessionManager.setContext).mockImplementation(() => {
+      throw new Error(ErrorCodes.MM_CONTEXT_SWITCH_BLOCKED);
+    });
+
+    const result = await setContextTool({ context: 'prod' }, context);
+
+    expect(result.ok).toBe(false);
+    if (!result.ok) {
+      expect(result.error.code).toBe(ErrorCodes.MM_CONTEXT_SWITCH_BLOCKED);
+      expect(result.error.message).toBe(ErrorCodes.MM_CONTEXT_SWITCH_BLOCKED);
+    }
+  });
+
+  it('classifies generic context errors', async () => {
+    const context = createMockContext({ environmentMode: 'e2e' });
+    vi.mocked(context.sessionManager.setContext).mockImplementation(() => {
+      throw new Error('Unknown error');
+    });
+
+    const result = await setContextTool({ context: 'prod' }, context);
+
+    expect(result.ok).toBe(false);
+    if (!result.ok) {
+      expect(result.error.code).toBe(ErrorCodes.MM_SET_CONTEXT_FAILED);
+      expect(result.error.message).toContain('Context switch failed');
+    }
+  });
+});
+
+describe('getContextTool', () => {
+  it('returns context info when getContextInfo is available', async () => {
+    const context = createMockContext({
+      hasActive: true,
+      sessionId: 'test-session-123',
+      environmentMode: 'e2e',
+    });
+    vi.mocked(context.sessionManager.getContextInfo).mockReturnValue({
+      currentContext: 'e2e',
+      hasActiveSession: true,
+      sessionId: 'test-session-123',
+      capabilities: { available: ['build', 'fixture', 'chain'] },
+      canSwitchContext: false,
+    });
+
+    const result = await getContextTool({}, context);
+
+    expect(result.ok).toBe(true);
+    if (result.ok) {
+      expect(result.result.currentContext).toBe('e2e');
+      expect(result.result.hasActiveSession).toBe(true);
+      expect(result.result.sessionId).toBe('test-session-123');
+      expect(result.result.capabilities.available).toStrictEqual([
+        'build',
+        'fixture',
+        'chain',
+      ]);
+      expect(result.result.canSwitchContext).toBe(false);
+    }
+  });
+});
diff --git a/src/tools/context.ts b/src/tools/context.ts
new file mode 100644
index 0000000..65f501d
--- /dev/null
+++ b/src/tools/context.ts
@@ -0,0 +1,55 @@
+import { classifyContextError } from './error-classification.js';
+import type { SetContextInput } from './types/tool-inputs.js';
+import type {
+  SetContextResult,
+  GetContextResult,
+} from './types/tool-outputs.js';
+import { createToolError, createToolSuccess } from './utils.js';
+import type { ToolContext, ToolResponse } from '../types/http.js';
+
+export type { SetContextInput } from './types/tool-inputs.js';
+export type {
+  SetContextResult,
+  GetContextResult,
+} from './types/tool-outputs.js';
+
+/**
+ * Switches the session environment context between e2e and prod modes.
+ *
+ * @param input - The target context and optional configuration.
+ * @param context - The tool execution context.
+ * @returns The previous and new context with available capabilities.
+ */
+export async function setContextTool(
+  input: SetContextInput,
+  context: ToolContext,
+): Promise<ToolResponse<SetContextResult>> {
+  try {
+    const previousContext = context.sessionManager.getEnvironmentMode();
+    context.sessionManager.setContext(input.context, input.options);
+    const info = context.sessionManager.getContextInfo();
+
+    return createToolSuccess({
+      previousContext,
+      newContext: input.context,
+      availableCapabilities: info.capabilities.available,
+    });
+  } catch (error) {
+    const errorInfo = classifyContextError(error);
+    return createToolError(errorInfo.code, errorInfo.message);
+  }
+}
+
+/**
+ * Retrieves the current session context, capabilities, and status.
+ *
+ * @param _input - Unused input parameters.
+ * @param context - The tool execution context.
+ * @returns The current context information.
+ */
+export async function getContextTool(
+  _input: Record<string, never>,
+  context: ToolContext,
+): Promise<ToolResponse<GetContextResult>> {
+  return createToolSuccess(context.sessionManager.getContextInfo());
+}
diff --git a/src/mcp-server/tools/discovery-tools.test.ts b/src/tools/discovery-tools.test.ts
similarity index 58%
rename from src/mcp-server/tools/discovery-tools.test.ts
rename to src/tools/discovery-tools.test.ts
index 8b5a248..683a7af 100644
--- a/src/mcp-server/tools/discovery-tools.test.ts
+++ b/src/tools/discovery-tools.test.ts
@@ -8,19 +8,18 @@
  */
 
 import type { Page } from '@playwright/test';
-import { describe, it, expect, vi, beforeEach } from 'vitest';
+import { beforeEach, describe, expect, it, vi } from 'vitest';
 
 import {
-  handleListTestIds,
-  handleAccessibilitySnapshot,
-  handleDescribeScreen,
+  accessibilitySnapshotTool,
+  describeScreenTool,
+  listTestIdsTool,
 } from './discovery-tools.js';
-import { ScreenshotResult } from '../../capabilities/types.js';
-import * as discoveryModule from '../discovery.js';
-import * as knowledgeStoreModule from '../knowledge-store.js';
-import * as sessionManagerModule from '../session-manager.js';
-import { createMockSessionManager } from '../test-utils/mock-factories.js';
-import type { TestIdItem, A11yNodeTrimmed } from '../types';
+import { createMockSessionManager } from './test-utils/mock-factories.js';
+import type { A11yNodeTrimmed, TestIdItem } from './types';
+import { ErrorCodes } from './types/errors.js';
+import * as discoveryModule from './utils/discovery.js';
+import type { ToolContext } from '../types/http.js';
 
 function createMockPage(): Page {
   return {
@@ -28,12 +27,16 @@ function createMockPage(): Page {
   } as unknown as Page;
 }
 
-describe('discovery-tools', () => {
-  beforeEach(() => {
-    vi.clearAllMocks();
+function createMockContext(
+  options: {
+    hasActive?: boolean;
+  } = {},
+): ToolContext {
+  const { hasActive = true } = options;
 
-    const mockSessionManager = createMockSessionManager({
-      hasActive: true,
+  return {
+    sessionManager: createMockSessionManager({
+      hasActive,
       sessionId: 'test-session-123',
       sessionMetadata: {
         schemaVersion: 1,
@@ -43,34 +46,27 @@ describe('discovery-tools', () => {
         flowTags: ['discovery'],
         tags: [],
         launch: {
-          stateMode: 'default' as const,
+          stateMode: 'default',
         },
       },
-    });
-
-    vi.spyOn(mockSessionManager, 'getPage').mockReturnValue(createMockPage());
-
-    vi.spyOn(sessionManagerModule, 'getSessionManager').mockReturnValue(
-      mockSessionManager,
-    );
-
-    vi.spyOn(knowledgeStoreModule, 'knowledgeStore', 'get').mockReturnValue({
-      recordStep: vi.fn().mockResolvedValue(undefined),
-      getLastSteps: vi.fn().mockResolvedValue([]),
-      searchSteps: vi.fn().mockResolvedValue([]),
-      summarizeSession: vi.fn().mockResolvedValue({
-        sessionId: 'test-session-123',
-        stepCount: 0,
-        recipe: [],
-      }),
-      listSessions: vi.fn().mockResolvedValue([]),
+    }),
+    page: createMockPage(),
+    refMap: new Map(),
+    workflowContext: {},
+    knowledgeStore: {
       generatePriorKnowledge: vi.fn().mockResolvedValue(undefined),
-      writeSessionMetadata: vi.fn().mockResolvedValue('test-session-123'),
-    } as any);
+    },
+  } as unknown as ToolContext;
+}
+
+describe('discovery-tools', () => {
+  beforeEach(() => {
+    vi.clearAllMocks();
   });
 
-  describe('handleListTestIds', () => {
+  describe('listTestIdsTool', () => {
     it('returns list of test IDs with default limit', async () => {
+      const context = createMockContext();
       const mockItems: TestIdItem[] = [
         { testId: 'button-1', tag: 'button', text: 'Click', visible: true },
         { testId: 'input-1', tag: 'input', visible: true },
@@ -84,24 +80,24 @@ describe('discovery-tools', () => {
         },
       );
 
-      const result = await handleListTestIds({});
+      const result = await listTestIdsTool({}, context);
 
       expect(result.ok).toBe(true);
       if (result.ok) {
         expect(result.result.items).toStrictEqual(mockItems);
-        expect(discoveryModule.collectTestIds).toHaveBeenCalledWith(
-          expect.anything(),
-          150,
-        );
       }
+      expect(discoveryModule.collectTestIds).toHaveBeenCalledWith(
+        context.page,
+        150,
+      );
     });
 
     it('respects custom limit', async () => {
-      const mockItems: TestIdItem[] = [
-        { testId: 'item-1', tag: 'div', visible: true },
-      ];
+      const context = createMockContext();
 
-      vi.spyOn(discoveryModule, 'collectTestIds').mockResolvedValue(mockItems);
+      vi.spyOn(discoveryModule, 'collectTestIds').mockResolvedValue([
+        { testId: 'item-1', tag: 'div', visible: true },
+      ]);
       vi.spyOn(discoveryModule, 'collectTrimmedA11ySnapshot').mockResolvedValue(
         {
           nodes: [],
@@ -109,18 +105,17 @@ describe('discovery-tools', () => {
         },
       );
 
-      const result = await handleListTestIds({ limit: 50 });
+      const result = await listTestIdsTool({ limit: 50 }, context);
 
       expect(result.ok).toBe(true);
-      if (result.ok) {
-        expect(discoveryModule.collectTestIds).toHaveBeenCalledWith(
-          expect.anything(),
-          50,
-        );
-      }
+      expect(discoveryModule.collectTestIds).toHaveBeenCalledWith(
+        context.page,
+        50,
+      );
     });
 
     it('updates refMap in session manager', async () => {
+      const context = createMockContext();
       const mockRefMap = new Map([['e1', 'role=button[name="Submit"]']]);
 
       vi.spyOn(discoveryModule, 'collectTestIds').mockResolvedValue([]);
@@ -131,53 +126,30 @@ describe('discovery-tools', () => {
         },
       );
 
-      const sessionManager = sessionManagerModule.getSessionManager();
+      await listTestIdsTool({}, context);
 
-      await handleListTestIds({});
-
-      expect(sessionManager.setRefMap).toHaveBeenCalledWith(mockRefMap);
-    });
-
-    it('records step to knowledge store', async () => {
-      const mockItems: TestIdItem[] = [
-        { testId: 'test-1', tag: 'button', visible: true },
-      ];
-
-      vi.spyOn(discoveryModule, 'collectTestIds').mockResolvedValue(mockItems);
-      vi.spyOn(discoveryModule, 'collectTrimmedA11ySnapshot').mockResolvedValue(
-        {
-          nodes: [],
-          refMap: new Map(),
-        },
-      );
-
-      const { knowledgeStore } = knowledgeStoreModule;
-
-      await handleListTestIds({});
-
-      expect(knowledgeStore.recordStep).toHaveBeenCalled();
+      expect(context.sessionManager.setRefMap).toHaveBeenCalledWith(mockRefMap);
     });
 
     it('returns error when no active session', async () => {
-      const mockSessionManager = createMockSessionManager({ hasActive: false });
-      vi.spyOn(sessionManagerModule, 'getSessionManager').mockReturnValue(
-        mockSessionManager,
-      );
+      const context = createMockContext({ hasActive: false });
 
-      const result = await handleListTestIds({});
+      const result = await listTestIdsTool({}, context);
 
       expect(result.ok).toBe(false);
       if (!result.ok) {
-        expect(result.error.code).toBe('MM_NO_ACTIVE_SESSION');
+        expect(result.error.code).toBe(ErrorCodes.MM_NO_ACTIVE_SESSION);
       }
     });
 
     it('handles discovery errors', async () => {
+      const context = createMockContext();
+
       vi.spyOn(discoveryModule, 'collectTestIds').mockRejectedValue(
         new Error('Page closed'),
       );
 
-      const result = await handleListTestIds({});
+      const result = await listTestIdsTool({}, context);
 
       expect(result.ok).toBe(false);
       if (!result.ok) {
@@ -186,8 +158,9 @@ describe('discovery-tools', () => {
     });
   });
 
-  describe('handleAccessibilitySnapshot', () => {
+  describe('accessibilitySnapshotTool', () => {
     it('returns accessibility tree with refs', async () => {
+      const context = createMockContext();
       const mockNodes: A11yNodeTrimmed[] = [
         { ref: 'e1', role: 'button', name: 'Submit', path: [] },
         { ref: 'e2', role: 'link', name: 'Cancel', path: [] },
@@ -205,7 +178,7 @@ describe('discovery-tools', () => {
       );
       vi.spyOn(discoveryModule, 'collectTestIds').mockResolvedValue([]);
 
-      const result = await handleAccessibilitySnapshot({});
+      const result = await accessibilitySnapshotTool({}, context);
 
       expect(result.ok).toBe(true);
       if (result.ok) {
@@ -214,6 +187,8 @@ describe('discovery-tools', () => {
     });
 
     it('uses root selector when provided', async () => {
+      const context = createMockContext();
+
       vi.spyOn(discoveryModule, 'collectTrimmedA11ySnapshot').mockResolvedValue(
         {
           nodes: [],
@@ -222,15 +197,16 @@ describe('discovery-tools', () => {
       );
       vi.spyOn(discoveryModule, 'collectTestIds').mockResolvedValue([]);
 
-      await handleAccessibilitySnapshot({ rootSelector: '.modal' });
+      await accessibilitySnapshotTool({ rootSelector: '.modal' }, context);
 
       expect(discoveryModule.collectTrimmedA11ySnapshot).toHaveBeenCalledWith(
-        expect.anything(),
+        context.page,
         '.modal',
       );
     });
 
     it('updates refMap in session manager', async () => {
+      const context = createMockContext();
       const mockRefMap = new Map([['e1', 'role=button[name="OK"]']]);
 
       vi.spyOn(discoveryModule, 'collectTrimmedA11ySnapshot').mockResolvedValue(
@@ -241,14 +217,14 @@ describe('discovery-tools', () => {
       );
       vi.spyOn(discoveryModule, 'collectTestIds').mockResolvedValue([]);
 
-      const sessionManager = sessionManagerModule.getSessionManager();
-
-      await handleAccessibilitySnapshot({});
+      await accessibilitySnapshotTool({}, context);
 
-      expect(sessionManager.setRefMap).toHaveBeenCalledWith(mockRefMap);
+      expect(context.sessionManager.setRefMap).toHaveBeenCalledWith(mockRefMap);
     });
 
-    it('records step to knowledge store', async () => {
+    it('collects test ids with observation limit', async () => {
+      const context = createMockContext();
+
       vi.spyOn(discoveryModule, 'collectTrimmedA11ySnapshot').mockResolvedValue(
         {
           nodes: [],
@@ -257,33 +233,33 @@ describe('discovery-tools', () => {
       );
       vi.spyOn(discoveryModule, 'collectTestIds').mockResolvedValue([]);
 
-      const { knowledgeStore } = knowledgeStoreModule;
+      await accessibilitySnapshotTool({}, context);
 
-      await handleAccessibilitySnapshot({});
-
-      expect(knowledgeStore.recordStep).toHaveBeenCalled();
+      expect(discoveryModule.collectTestIds).toHaveBeenCalledWith(
+        context.page,
+        50,
+      );
     });
 
     it('returns error when no active session', async () => {
-      const mockSessionManager = createMockSessionManager({ hasActive: false });
-      vi.spyOn(sessionManagerModule, 'getSessionManager').mockReturnValue(
-        mockSessionManager,
-      );
+      const context = createMockContext({ hasActive: false });
 
-      const result = await handleAccessibilitySnapshot({});
+      const result = await accessibilitySnapshotTool({}, context);
 
       expect(result.ok).toBe(false);
       if (!result.ok) {
-        expect(result.error.code).toBe('MM_NO_ACTIVE_SESSION');
+        expect(result.error.code).toBe(ErrorCodes.MM_NO_ACTIVE_SESSION);
       }
     });
 
     it('handles discovery errors', async () => {
+      const context = createMockContext();
+
       vi.spyOn(discoveryModule, 'collectTrimmedA11ySnapshot').mockRejectedValue(
         new Error('Discovery failed'),
       );
 
-      const result = await handleAccessibilitySnapshot({});
+      const result = await accessibilitySnapshotTool({}, context);
 
       expect(result.ok).toBe(false);
       if (!result.ok) {
@@ -292,8 +268,9 @@ describe('discovery-tools', () => {
     });
   });
 
-  describe('handleDescribeScreen', () => {
+  describe('describeScreenTool', () => {
     it('returns comprehensive screen state', async () => {
+      const context = createMockContext();
       const mockTestIds: TestIdItem[] = [
         { testId: 'button-1', tag: 'button', visible: true },
       ];
@@ -311,7 +288,7 @@ describe('discovery-tools', () => {
         },
       );
 
-      const result = await handleDescribeScreen({});
+      const result = await describeScreenTool({}, context);
 
       expect(result.ok).toBe(true);
       if (result.ok) {
@@ -323,6 +300,8 @@ describe('discovery-tools', () => {
     });
 
     it('includes screenshot when requested', async () => {
+      const context = createMockContext();
+
       vi.spyOn(discoveryModule, 'collectTestIds').mockResolvedValue([]);
       vi.spyOn(discoveryModule, 'collectTrimmedA11ySnapshot').mockResolvedValue(
         {
@@ -330,19 +309,20 @@ describe('discovery-tools', () => {
           refMap: new Map(),
         },
       );
-
-      const sessionManager = sessionManagerModule.getSessionManager();
-      vi.spyOn(sessionManager, 'screenshot').mockResolvedValue({
+      vi.mocked(context.sessionManager.screenshot).mockResolvedValue({
         path: '/path/to/screenshot.png',
         width: 1280,
         height: 720,
         base64: 'base64data',
       });
 
-      const result = await handleDescribeScreen({
-        includeScreenshot: true,
-        screenshotName: 'test-screen',
-      });
+      const result = await describeScreenTool(
+        {
+          includeScreenshot: true,
+          screenshotName: 'test-screen',
+        },
+        context,
+      );
 
       expect(result.ok).toBe(true);
       if (result.ok) {
@@ -352,14 +332,16 @@ describe('discovery-tools', () => {
           height: 720,
           base64: null,
         });
-        expect(sessionManager.screenshot).toHaveBeenCalledWith({
-          name: 'test-screen',
-          fullPage: true,
-        });
       }
+      expect(context.sessionManager.screenshot).toHaveBeenCalledWith({
+        name: 'test-screen',
+        fullPage: true,
+      });
     });
 
     it('includes base64 in screenshot when requested', async () => {
+      const context = createMockContext();
+
       vi.spyOn(discoveryModule, 'collectTestIds').mockResolvedValue([]);
       vi.spyOn(discoveryModule, 'collectTrimmedA11ySnapshot').mockResolvedValue(
         {
@@ -367,19 +349,20 @@ describe('discovery-tools', () => {
           refMap: new Map(),
         },
       );
-
-      const sessionManager = sessionManagerModule.getSessionManager();
-      vi.spyOn(sessionManager, 'screenshot').mockResolvedValue({
+      vi.mocked(context.sessionManager.screenshot).mockResolvedValue({
         path: '/path/to/screenshot.png',
         width: 1280,
         height: 720,
         base64: 'base64data',
       });
 
-      const result = await handleDescribeScreen({
-        includeScreenshot: true,
-        includeScreenshotBase64: true,
-      });
+      const result = await describeScreenTool(
+        {
+          includeScreenshot: true,
+          includeScreenshotBase64: true,
+        },
+        context,
+      );
 
       expect(result.ok).toBe(true);
       if (result.ok) {
@@ -388,6 +371,8 @@ describe('discovery-tools', () => {
     });
 
     it('uses default screenshot name when not provided', async () => {
+      const context = createMockContext();
+
       vi.spyOn(discoveryModule, 'collectTestIds').mockResolvedValue([]);
       vi.spyOn(discoveryModule, 'collectTrimmedA11ySnapshot').mockResolvedValue(
         {
@@ -396,30 +381,36 @@ describe('discovery-tools', () => {
         },
       );
 
-      const sessionManager = sessionManagerModule.getSessionManager();
-      const mockedScreenshot = vi
-        .spyOn(sessionManager, 'screenshot')
-        .mockResolvedValue({
-          path: '/path/to/screenshot.png',
-          width: 1280,
-          height: 720,
-        } as ScreenshotResult);
+      await describeScreenTool({ includeScreenshot: true }, context);
 
-      await handleDescribeScreen({ includeScreenshot: true });
-
-      expect(mockedScreenshot).toHaveBeenCalledWith({
+      expect(context.sessionManager.screenshot).toHaveBeenCalledWith({
         name: 'describe-screen',
         fullPage: true,
       });
     });
 
     it('generates prior knowledge from context', async () => {
+      const context = createMockContext();
       const mockTestIds: TestIdItem[] = [
         { testId: 'send-btn', tag: 'button', visible: true },
       ];
       const mockNodes: A11yNodeTrimmed[] = [
         { ref: 'e1', role: 'button', name: 'Send', path: [] },
       ];
+      const mockPriorKnowledge = {
+        schemaVersion: 1 as const,
+        generatedAt: '2026-02-04T00:00:00.000Z',
+        query: {
+          currentScreen: 'home',
+          currentUrl: 'chrome-extension://ext-123/home.html',
+          visibleTestIds: mockTestIds,
+          a11yNodes: mockNodes,
+          currentSessionFlowTags: ['discovery'],
+        },
+        relatedSessions: [],
+        similarSteps: [],
+        suggestedNextActions: [],
+      };
 
       vi.spyOn(discoveryModule, 'collectTestIds').mockResolvedValue(
         mockTestIds,
@@ -430,37 +421,31 @@ describe('discovery-tools', () => {
           refMap: new Map([['e1', 'role=button[name="Send"]']]),
         },
       );
+      vi.mocked(
+        context.knowledgeStore.generatePriorKnowledge,
+      ).mockResolvedValue(mockPriorKnowledge as any);
 
-      const mockPriorKnowledge = {
-        version: 1 as const,
-        hints: [
-          { type: 'similar_flow' as const, content: 'Previous send flow' },
-        ],
-      };
-
-      const { knowledgeStore } = knowledgeStoreModule;
-      vi.spyOn(knowledgeStore, 'generatePriorKnowledge').mockResolvedValue(
-        mockPriorKnowledge as any,
-      );
-
-      const result = await handleDescribeScreen({});
+      const result = await describeScreenTool({}, context);
 
       expect(result.ok).toBe(true);
       if (result.ok) {
         expect(result.result.priorKnowledge).toStrictEqual(mockPriorKnowledge);
-        expect(knowledgeStore.generatePriorKnowledge).toHaveBeenCalledWith(
-          expect.objectContaining({
-            currentScreen: 'home',
-            visibleTestIds: mockTestIds,
-            a11yNodes: mockNodes,
-            currentSessionFlowTags: ['discovery'],
-          }),
-          'test-session-123',
-        );
       }
+      expect(
+        context.knowledgeStore.generatePriorKnowledge,
+      ).toHaveBeenCalledWith(
+        expect.objectContaining({
+          currentScreen: 'home',
+          visibleTestIds: mockTestIds,
+          a11yNodes: mockNodes,
+          currentSessionFlowTags: ['discovery'],
+        }),
+        'test-session-123',
+      );
     });
 
     it('updates refMap in session manager', async () => {
+      const context = createMockContext();
       const mockRefMap = new Map([['e1', 'role=button[name="OK"]']]);
 
       vi.spyOn(discoveryModule, 'collectTestIds').mockResolvedValue([]);
@@ -471,49 +456,30 @@ describe('discovery-tools', () => {
         },
       );
 
-      const sessionManager = sessionManagerModule.getSessionManager();
-
-      await handleDescribeScreen({});
+      await describeScreenTool({}, context);
 
-      expect(sessionManager.setRefMap).toHaveBeenCalledWith(mockRefMap);
-    });
-
-    it('records step to knowledge store', async () => {
-      vi.spyOn(discoveryModule, 'collectTestIds').mockResolvedValue([]);
-      vi.spyOn(discoveryModule, 'collectTrimmedA11ySnapshot').mockResolvedValue(
-        {
-          nodes: [],
-          refMap: new Map(),
-        },
-      );
-
-      const { knowledgeStore } = knowledgeStoreModule;
-
-      await handleDescribeScreen({});
-
-      expect(knowledgeStore.recordStep).toHaveBeenCalled();
+      expect(context.sessionManager.setRefMap).toHaveBeenCalledWith(mockRefMap);
     });
 
     it('returns error when no active session', async () => {
-      const mockSessionManager = createMockSessionManager({ hasActive: false });
-      vi.spyOn(sessionManagerModule, 'getSessionManager').mockReturnValue(
-        mockSessionManager,
-      );
+      const context = createMockContext({ hasActive: false });
 
-      const result = await handleDescribeScreen({});
+      const result = await describeScreenTool({}, context);
 
       expect(result.ok).toBe(false);
       if (!result.ok) {
-        expect(result.error.code).toBe('MM_NO_ACTIVE_SESSION');
+        expect(result.error.code).toBe(ErrorCodes.MM_NO_ACTIVE_SESSION);
       }
     });
 
     it('handles discovery errors', async () => {
+      const context = createMockContext();
+
       vi.spyOn(discoveryModule, 'collectTestIds').mockRejectedValue(
         new Error('Page closed'),
       );
 
-      const result = await handleDescribeScreen({});
+      const result = await describeScreenTool({}, context);
 
       expect(result.ok).toBe(false);
       if (!result.ok) {
diff --git a/src/tools/discovery-tools.ts b/src/tools/discovery-tools.ts
new file mode 100644
index 0000000..fba199a
--- /dev/null
+++ b/src/tools/discovery-tools.ts
@@ -0,0 +1,155 @@
+import { classifyDiscoveryError } from './error-classification.js';
+import type {
+  AccessibilitySnapshotInput,
+  AccessibilitySnapshotResult,
+  DescribeScreenInput,
+  DescribeScreenResult,
+  ListTestIdsInput,
+  ListTestIdsResult,
+  PriorKnowledgeContext,
+} from './types';
+import {
+  DEFAULT_TESTID_LIMIT,
+  OBSERVATION_TESTID_LIMIT,
+} from './utils/constants.js';
+import {
+  collectTestIds,
+  collectTrimmedA11ySnapshot,
+} from './utils/discovery.js';
+import {
+  createToolError,
+  createToolSuccess,
+  requireActiveSession,
+} from './utils.js';
+import type { ToolContext, ToolResponse } from '../types/http.js';
+
+/**
+ * Collects visible test IDs from the current page.
+ *
+ * @param input - The test ID collection options including limit.
+ * @param context - The tool execution context.
+ * @returns The list of discovered test ID items.
+ */
+export async function listTestIdsTool(
+  input: ListTestIdsInput,
+  context: ToolContext,
+): Promise<ToolResponse<ListTestIdsResult>> {
+  const missingSession = requireActiveSession<ListTestIdsResult>(context);
+  if (missingSession) {
+    return missingSession;
+  }
+
+  const limit = input.limit ?? DEFAULT_TESTID_LIMIT;
+
+  try {
+    const items = await collectTestIds(context.page, limit);
+    const { refMap } = await collectTrimmedA11ySnapshot(context.page);
+
+    context.sessionManager.setRefMap(refMap);
+
+    return createToolSuccess({ items });
+  } catch (error) {
+    const errorInfo = classifyDiscoveryError(error);
+    return createToolError(errorInfo.code, errorInfo.message);
+  }
+}
+
+/**
+ * Captures a trimmed accessibility tree snapshot of the current page.
+ *
+ * @param input - The snapshot options including optional root selector.
+ * @param context - The tool execution context.
+ * @returns The accessibility snapshot nodes.
+ */
+export async function accessibilitySnapshotTool(
+  input: AccessibilitySnapshotInput,
+  context: ToolContext,
+): Promise<ToolResponse<AccessibilitySnapshotResult>> {
+  const missingSession =
+    requireActiveSession<AccessibilitySnapshotResult>(context);
+  if (missingSession) {
+    return missingSession;
+  }
+
+  try {
+    const { nodes, refMap } = await collectTrimmedA11ySnapshot(
+      context.page,
+      input.rootSelector,
+    );
+
+    context.sessionManager.setRefMap(refMap);
+    await collectTestIds(context.page, OBSERVATION_TESTID_LIMIT);
+
+    return createToolSuccess({ nodes });
+  } catch (error) {
+    const errorInfo = classifyDiscoveryError(error);
+    return createToolError(errorInfo.code, errorInfo.message);
+  }
+}
+
+/**
+ * Captures a full screen description including state, test IDs, a11y, and prior knowledge.
+ *
+ * @param input - The describe-screen options including screenshot flags.
+ * @param context - The tool execution context.
+ * @returns The composite screen description result.
+ */
+export async function describeScreenTool(
+  input: DescribeScreenInput,
+  context: ToolContext,
+): Promise<ToolResponse<DescribeScreenResult>> {
+  const missingSession = requireActiveSession<DescribeScreenResult>(context);
+  if (missingSession) {
+    return missingSession;
+  }
+
+  try {
+    const state = await context.sessionManager.getExtensionState();
+    const testIds = await collectTestIds(context.page, DEFAULT_TESTID_LIMIT);
+    const { nodes, refMap } = await collectTrimmedA11ySnapshot(context.page);
+
+    context.sessionManager.setRefMap(refMap);
+
+    let screenshot: DescribeScreenResult['screenshot'] = null;
+
+    if (input.includeScreenshot) {
+      const screenshotName = input.screenshotName ?? 'describe-screen';
+      const result = await context.sessionManager.screenshot({
+        name: screenshotName,
+        fullPage: true,
+      });
+
+      screenshot = {
+        path: result.path,
+        width: result.width,
+        height: result.height,
+        base64: input.includeScreenshotBase64 ? result.base64 : null,
+      };
+    }
+
+    const sessionMetadata = context.sessionManager.getSessionMetadata();
+    const priorKnowledgeContext: PriorKnowledgeContext = {
+      currentScreen: state.currentScreen,
+      currentUrl: state.currentUrl,
+      visibleTestIds: testIds,
+      a11yNodes: nodes,
+      currentSessionFlowTags: sessionMetadata?.flowTags,
+    };
+
+    const priorKnowledge = await context.knowledgeStore.generatePriorKnowledge(
+      priorKnowledgeContext,
+      context.sessionManager.getSessionId(),
+    );
+
+    return createToolSuccess({
+      state,
+      testIds: { items: testIds },
+      a11y: { nodes },
+      screenshot,
+      priorKnowledge,
+    });
+  } catch (error) {
+    const errorInfo = classifyDiscoveryError(error);
+    return createToolError(errorInfo.code, errorInfo.message);
+  }
+}
diff --git a/src/mcp-server/tools/error-classification.test.ts b/src/tools/error-classification.test.ts
similarity index 99%
rename from src/mcp-server/tools/error-classification.test.ts
rename to src/tools/error-classification.test.ts
index b9ba1bb..e141e12 100644
--- a/src/mcp-server/tools/error-classification.test.ts
+++ b/src/tools/error-classification.test.ts
@@ -20,8 +20,8 @@ import {
   classifyStateError,
   classifySeedingError,
   classifyContextError,
-} from './error-classification';
-import { ErrorCodes } from '../types';
+} from './error-classification.js';
+import { ErrorCodes } from './types';
 
 describe('error-classification', () => {
   describe('isPageClosedError', () => {
diff --git a/src/mcp-server/tools/error-classification.ts b/src/tools/error-classification.ts
similarity index 99%
rename from src/mcp-server/tools/error-classification.ts
rename to src/tools/error-classification.ts
index c424d91..9b844f3 100644
--- a/src/mcp-server/tools/error-classification.ts
+++ b/src/tools/error-classification.ts
@@ -5,7 +5,7 @@
  * based on error message patterns.
  */
 
-import { ErrorCodes } from '../types';
+import { ErrorCodes } from './types';
 import { extractErrorMessage } from '../utils';
 
 const ERROR_PATTERNS = {
diff --git a/src/tools/index.ts b/src/tools/index.ts
new file mode 100644
index 0000000..c75bad2
--- /dev/null
+++ b/src/tools/index.ts
@@ -0,0 +1,15 @@
+export * from './batch.js';
+export * from './build.js';
+export * from './cleanup.js';
+export * from './clipboard.js';
+export * from './context.js';
+export * from './discovery-tools.js';
+export * from './interaction.js';
+export * from './knowledge.js';
+export * from './launch.js';
+export * from './navigation.js';
+export * from './registry.js';
+export * from './screenshot.js';
+export * from './seeding.js';
+export * from './state.js';
+export * from './utils.js';
diff --git a/src/tools/interaction.test.ts b/src/tools/interaction.test.ts
new file mode 100644
index 0000000..911059a
--- /dev/null
+++ b/src/tools/interaction.test.ts
@@ -0,0 +1,660 @@
+/**
+ * Unit tests for interaction tool handlers.
+ *
+ * Tests handleClick, handleType, and handleWaitFor with various target types,
+ * error scenarios, and page closure detection.
+ */
+
+import { describe, it, expect, vi, afterEach } from 'vitest';
+
+import { clickTool, typeTool, waitForTool } from './interaction.js';
+import { createMockSessionManager } from './test-utils/mock-factories.js';
+import { ErrorCodes } from './types/errors.js';
+import * as discoveryModule from './utils/discovery.js';
+import * as targetsModule from './utils/targets.js';
+import type { ToolContext } from '../types/http.js';
+
+function createMockLocator() {
+  return {
+    click: vi.fn().mockResolvedValue(undefined),
+    fill: vi.fn().mockResolvedValue(undefined),
+    waitFor: vi.fn().mockResolvedValue(undefined),
+  };
+}
+
+function createMockContext(
+  options: {
+    hasActive?: boolean;
+    page?: object;
+    refMap?: Map<string, string>;
+  } = {},
+): ToolContext {
+  return {
+    sessionManager: createMockSessionManager({
+      hasActive: options.hasActive ?? true,
+    }),
+    page: (options.page ?? {}) as ToolContext['page'],
+    refMap: options.refMap ?? new Map(),
+    workflowContext: {},
+    knowledgeStore: {},
+  } as unknown as ToolContext;
+}
+
+describe('interaction', () => {
+  afterEach(() => {
+    vi.restoreAllMocks();
+  });
+
+  describe('clickTool', () => {
+    it('clicks element by testId', async () => {
+      const page = {};
+      const locator = createMockLocator();
+      const context = createMockContext({ page });
+
+      vi.spyOn(discoveryModule, 'waitForTarget').mockResolvedValue(
+        locator as any,
+      );
+
+      const result = await clickTool({ testId: 'my-button' }, context);
+
+      expect(result.ok).toBe(true);
+      if (result.ok) {
+        expect(result.result.clicked).toBe(true);
+        expect(result.result.target).toBe('testId:my-button');
+      }
+      expect(discoveryModule.waitForTarget).toHaveBeenCalledWith(
+        page,
+        'testId',
+        'my-button',
+        context.refMap,
+        15000,
+      );
+      expect(locator.click).toHaveBeenCalled();
+    });
+
+    it('uses custom timeout when provided', async () => {
+      const page = {};
+      const locator = createMockLocator();
+      const context = createMockContext({ page });
+
+      vi.spyOn(discoveryModule, 'waitForTarget').mockResolvedValue(
+        locator as any,
+      );
+
+      await clickTool({ testId: 'my-button', timeoutMs: 5000 }, context);
+
+      expect(discoveryModule.waitForTarget).toHaveBeenCalledWith(
+        page,
+        'testId',
+        'my-button',
+        context.refMap,
+        5000,
+      );
+    });
+
+    it('clicks element by CSS selector', async () => {
+      const page = {};
+      const locator = createMockLocator();
+      const context = createMockContext({ page });
+
+      vi.spyOn(discoveryModule, 'waitForTarget').mockResolvedValue(
+        locator as any,
+      );
+
+      const result = await clickTool({ selector: 'button.primary' }, context);
+
+      expect(result.ok).toBe(true);
+      if (result.ok) {
+        expect(result.result.clicked).toBe(true);
+        expect(result.result.target).toBe('selector:button.primary');
+      }
+      expect(discoveryModule.waitForTarget).toHaveBeenCalledWith(
+        page,
+        'selector',
+        'button.primary',
+        context.refMap,
+        15000,
+      );
+    });
+
+    it('clicks element by accessibility reference', async () => {
+      const page = {};
+      const locator = createMockLocator();
+      const refMap = new Map([['e5', 'button[aria-label="Submit"]']]);
+      const context = createMockContext({ page, refMap });
+
+      vi.spyOn(discoveryModule, 'waitForTarget').mockResolvedValue(
+        locator as any,
+      );
+
+      const result = await clickTool({ a11yRef: 'e5' }, context);
+
+      expect(result.ok).toBe(true);
+      if (result.ok) {
+        expect(result.result.clicked).toBe(true);
+        expect(result.result.target).toBe('a11yRef:e5');
+      }
+      expect(discoveryModule.waitForTarget).toHaveBeenCalledWith(
+        page,
+        'a11yRef',
+        'e5',
+        refMap,
+        15000,
+      );
+    });
+
+    it('returns error when no target specified', async () => {
+      const result = await clickTool({} as any, createMockContext());
+
+      expect(result.ok).toBe(false);
+      if (!result.ok) {
+        expect(result.error.code).toBe(ErrorCodes.MM_INVALID_INPUT);
+        expect(result.error.message).toContain('Exactly one');
+      }
+    });
+
+    it('returns error when multiple targets specified', async () => {
+      const result = await clickTool(
+        { testId: 'button', selector: '.button' } as any,
+        createMockContext(),
+      );
+
+      expect(result.ok).toBe(false);
+      if (!result.ok) {
+        expect(result.error.code).toBe(ErrorCodes.MM_INVALID_INPUT);
+        expect(result.error.message).toContain('Exactly one');
+      }
+    });
+
+    it('returns error when validation result is invalid but not caught by isInvalidTargetSelection', async () => {
+      vi.spyOn(targetsModule, 'validateTargetSelection').mockReturnValue({
+        valid: true,
+      } as any);
+
+      const result = await clickTool({ testId: 'button' }, createMockContext());
+
+      expect(result.ok).toBe(false);
+      if (!result.ok) {
+        expect(result.error.code).toBe(ErrorCodes.MM_INVALID_INPUT);
+        expect(result.error.message).toBe('Invalid target selection');
+      }
+    });
+
+    it('handles page closure gracefully', async () => {
+      const locator = createMockLocator();
+      locator.click.mockRejectedValue(
+        new Error('Target page, context or browser has been closed'),
+      );
+      const context = createMockContext();
+
+      vi.spyOn(discoveryModule, 'waitForTarget').mockResolvedValue(
+        locator as any,
+      );
+
+      const result = await clickTool({ testId: 'close-btn' }, context);
+
+      expect(result.ok).toBe(true);
+      if (result.ok) {
+        expect(result.result.clicked).toBe(true);
+        expect(result.result.pageClosedAfterClick).toBe(true);
+        expect(result.result.target).toBe('testId:close-btn');
+      }
+    });
+
+    it('handles browser closed error gracefully', async () => {
+      const locator = createMockLocator();
+      locator.click.mockRejectedValue(new Error('browser has been closed'));
+      const context = createMockContext();
+
+      vi.spyOn(discoveryModule, 'waitForTarget').mockResolvedValue(
+        locator as any,
+      );
+
+      const result = await clickTool({ testId: 'close-btn' }, context);
+
+      expect(result.ok).toBe(true);
+      if (result.ok) {
+        expect(result.result.pageClosedAfterClick).toBe(true);
+      }
+    });
+
+    it('returns error when click fails with non-closure error', async () => {
+      const locator = createMockLocator();
+      locator.click.mockRejectedValue(new Error('Element is not clickable'));
+      const context = createMockContext();
+
+      vi.spyOn(discoveryModule, 'waitForTarget').mockResolvedValue(
+        locator as any,
+      );
+
+      const result = await clickTool({ testId: 'my-button' }, context);
+
+      expect(result.ok).toBe(false);
+      if (!result.ok) {
+        expect(result.error.code).toBe(ErrorCodes.MM_CLICK_FAILED);
+      }
+    });
+
+    it('returns error when element not found', async () => {
+      const context = createMockContext();
+
+      vi.spyOn(discoveryModule, 'waitForTarget').mockRejectedValue(
+        new Error('Timeout waiting for element'),
+      );
+
+      const result = await clickTool({ testId: 'nonexistent' }, context);
+
+      expect(result.ok).toBe(false);
+      if (!result.ok) {
+        expect(result.error.code).toBe(ErrorCodes.MM_WAIT_TIMEOUT);
+      }
+    });
+
+    it('returns error when no session active', async () => {
+      const result = await clickTool(
+        { testId: 'my-button' },
+        createMockContext({ hasActive: false }),
+      );
+
+      expect(result.ok).toBe(false);
+      if (!result.ok) {
+        expect(result.error.code).toBe(ErrorCodes.MM_NO_ACTIVE_SESSION);
+      }
+    });
+  });
+
+  describe('typeTool', () => {
+    it('types text into element by testId', async () => {
+      const page = {};
+      const locator = createMockLocator();
+      const context = createMockContext({ page });
+
+      vi.spyOn(discoveryModule, 'waitForTarget').mockResolvedValue(
+        locator as any,
+      );
+
+      const result = await typeTool(
+        { testId: 'amount-input', text: '0.5' },
+        context,
+      );
+
+      expect(result.ok).toBe(true);
+      if (result.ok) {
+        expect(result.result.typed).toBe(true);
+        expect(result.result.target).toBe('testId:amount-input');
+        expect(result.result.textLength).toBe(3);
+      }
+      expect(discoveryModule.waitForTarget).toHaveBeenCalledWith(
+        page,
+        'testId',
+        'amount-input',
+        context.refMap,
+        15000,
+      );
+      expect(locator.fill).toHaveBeenCalledWith('0.5');
+    });
+
+    it('uses custom timeout when provided', async () => {
+      const page = {};
+      const locator = createMockLocator();
+      const context = createMockContext({ page });
+
+      vi.spyOn(discoveryModule, 'waitForTarget').mockResolvedValue(
+        locator as any,
+      );
+
+      await typeTool(
+        { testId: 'input', text: 'test', timeoutMs: 3000 },
+        context,
+      );
+
+      expect(discoveryModule.waitForTarget).toHaveBeenCalledWith(
+        page,
+        'testId',
+        'input',
+        context.refMap,
+        3000,
+      );
+    });
+
+    it('types text into element by CSS selector', async () => {
+      const locator = createMockLocator();
+      const context = createMockContext();
+
+      vi.spyOn(discoveryModule, 'waitForTarget').mockResolvedValue(
+        locator as any,
+      );
+
+      const result = await typeTool(
+        { selector: 'input[name="email"]', text: 'test@example.com' },
+        context,
+      );
+
+      expect(result.ok).toBe(true);
+      if (result.ok) {
+        expect(result.result.typed).toBe(true);
+        expect(result.result.target).toBe('selector:input[name="email"]');
+        expect(result.result.textLength).toBe(16);
+      }
+      expect(locator.fill).toHaveBeenCalledWith('test@example.com');
+    });
+
+    it('types text into element by accessibility reference', async () => {
+      const page = {};
+      const locator = createMockLocator();
+      const refMap = new Map([['e3', 'input[aria-label="Amount"]']]);
+      const context = createMockContext({ page, refMap });
+
+      vi.spyOn(discoveryModule, 'waitForTarget').mockResolvedValue(
+        locator as any,
+      );
+
+      const result = await typeTool({ a11yRef: 'e3', text: '100' }, context);
+
+      expect(result.ok).toBe(true);
+      if (result.ok) {
+        expect(result.result.typed).toBe(true);
+        expect(result.result.target).toBe('a11yRef:e3');
+        expect(result.result.textLength).toBe(3);
+      }
+      expect(discoveryModule.waitForTarget).toHaveBeenCalledWith(
+        page,
+        'a11yRef',
+        'e3',
+        refMap,
+        15000,
+      );
+    });
+
+    it('types empty string and reports zero length', async () => {
+      const locator = createMockLocator();
+      const context = createMockContext();
+
+      vi.spyOn(discoveryModule, 'waitForTarget').mockResolvedValue(
+        locator as any,
+      );
+
+      const result = await typeTool({ testId: 'input', text: '' }, context);
+
+      expect(result.ok).toBe(true);
+      if (result.ok) {
+        expect(result.result.typed).toBe(true);
+        expect(result.result.textLength).toBe(0);
+      }
+      expect(locator.fill).toHaveBeenCalledWith('');
+    });
+
+    it('returns error when no target specified', async () => {
+      const result = await typeTool(
+        { text: 'test' } as any,
+        createMockContext(),
+      );
+
+      expect(result.ok).toBe(false);
+      if (!result.ok) {
+        expect(result.error.code).toBe(ErrorCodes.MM_INVALID_INPUT);
+        expect(result.error.message).toContain('Exactly one');
+      }
+    });
+
+    it('returns error when multiple targets specified', async () => {
+      const result = await typeTool(
+        { testId: 'input', selector: 'input', text: 'test' } as any,
+        createMockContext(),
+      );
+
+      expect(result.ok).toBe(false);
+      if (!result.ok) {
+        expect(result.error.code).toBe(ErrorCodes.MM_INVALID_INPUT);
+        expect(result.error.message).toContain('Exactly one');
+      }
+    });
+
+    it('returns error when validation result is invalid but not caught by isInvalidTargetSelection', async () => {
+      vi.spyOn(targetsModule, 'validateTargetSelection').mockReturnValue({
+        valid: true,
+      } as any);
+
+      const result = await typeTool(
+        { testId: 'input', text: 'test' },
+        createMockContext(),
+      );
+
+      expect(result.ok).toBe(false);
+      if (!result.ok) {
+        expect(result.error.code).toBe(ErrorCodes.MM_INVALID_INPUT);
+        expect(result.error.message).toBe('Invalid target selection');
+      }
+    });
+
+    it('returns error when fill fails', async () => {
+      const locator = createMockLocator();
+      locator.fill.mockRejectedValue(new Error('Element is not editable'));
+      const context = createMockContext();
+
+      vi.spyOn(discoveryModule, 'waitForTarget').mockResolvedValue(
+        locator as any,
+      );
+
+      const result = await typeTool({ testId: 'input', text: 'test' }, context);
+
+      expect(result.ok).toBe(false);
+      if (!result.ok) {
+        expect(result.error.code).toBe(ErrorCodes.MM_TYPE_FAILED);
+      }
+    });
+
+    it('returns error when element not found', async () => {
+      const context = createMockContext();
+
+      vi.spyOn(discoveryModule, 'waitForTarget').mockRejectedValue(
+        new Error('Timeout waiting for element'),
+      );
+
+      const result = await typeTool(
+        { testId: 'nonexistent', text: 'test' },
+        context,
+      );
+
+      expect(result.ok).toBe(false);
+      if (!result.ok) {
+        expect(result.error.code).toBe(ErrorCodes.MM_WAIT_TIMEOUT);
+      }
+    });
+
+    it('returns error when no session active', async () => {
+      const result = await typeTool(
+        { testId: 'input', text: 'test' },
+        createMockContext({ hasActive: false }),
+      );
+
+      expect(result.ok).toBe(false);
+      if (!result.ok) {
+        expect(result.error.code).toBe(ErrorCodes.MM_NO_ACTIVE_SESSION);
+      }
+    });
+  });
+
+  describe('waitForTool', () => {
+    it('waits for element by testId', async () => {
+      const page = {};
+      const locator = createMockLocator();
+      const context = createMockContext({ page });
+
+      vi.spyOn(discoveryModule, 'waitForTarget').mockResolvedValue(
+        locator as any,
+      );
+
+      const result = await waitForTool({ testId: 'loading-spinner' }, context);
+
+      expect(result.ok).toBe(true);
+      if (result.ok) {
+        expect(result.result.found).toBe(true);
+        expect(result.result.target).toBe('testId:loading-spinner');
+      }
+      expect(discoveryModule.waitForTarget).toHaveBeenCalledWith(
+        page,
+        'testId',
+        'loading-spinner',
+        context.refMap,
+        15000,
+      );
+    });
+
+    it('uses custom timeout when provided', async () => {
+      const page = {};
+      const locator = createMockLocator();
+      const context = createMockContext({ page });
+
+      vi.spyOn(discoveryModule, 'waitForTarget').mockResolvedValue(
+        locator as any,
+      );
+
+      await waitForTool({ testId: 'element', timeoutMs: 30000 }, context);
+
+      expect(discoveryModule.waitForTarget).toHaveBeenCalledWith(
+        page,
+        'testId',
+        'element',
+        context.refMap,
+        30000,
+      );
+    });
+
+    it('waits for element by CSS selector', async () => {
+      const page = {};
+      const locator = createMockLocator();
+      const context = createMockContext({ page });
+
+      vi.spyOn(discoveryModule, 'waitForTarget').mockResolvedValue(
+        locator as any,
+      );
+
+      const result = await waitForTool(
+        { selector: '.success-message' },
+        context,
+      );
+
+      expect(result.ok).toBe(true);
+      if (result.ok) {
+        expect(result.result.found).toBe(true);
+        expect(result.result.target).toBe('selector:.success-message');
+      }
+      expect(discoveryModule.waitForTarget).toHaveBeenCalledWith(
+        page,
+        'selector',
+        '.success-message',
+        context.refMap,
+        15000,
+      );
+    });
+
+    it('waits for element by accessibility reference', async () => {
+      const page = {};
+      const locator = createMockLocator();
+      const refMap = new Map([['e10', 'button[aria-label="Confirm"]']]);
+      const context = createMockContext({ page, refMap });
+
+      vi.spyOn(discoveryModule, 'waitForTarget').mockResolvedValue(
+        locator as any,
+      );
+
+      const result = await waitForTool({ a11yRef: 'e10' }, context);
+
+      expect(result.ok).toBe(true);
+      if (result.ok) {
+        expect(result.result.found).toBe(true);
+        expect(result.result.target).toBe('a11yRef:e10');
+      }
+      expect(discoveryModule.waitForTarget).toHaveBeenCalledWith(
+        page,
+        'a11yRef',
+        'e10',
+        refMap,
+        15000,
+      );
+    });
+
+    it('returns error when no target specified', async () => {
+      const result = await waitForTool({} as any, createMockContext());
+
+      expect(result.ok).toBe(false);
+      if (!result.ok) {
+        expect(result.error.code).toBe(ErrorCodes.MM_INVALID_INPUT);
+        expect(result.error.message).toContain('Exactly one');
+      }
+    });
+
+    it('returns error when multiple targets specified', async () => {
+      const result = await waitForTool(
+        { testId: 'element', selector: '.element' } as any,
+        createMockContext(),
+      );
+
+      expect(result.ok).toBe(false);
+      if (!result.ok) {
+        expect(result.error.code).toBe(ErrorCodes.MM_INVALID_INPUT);
+        expect(result.error.message).toContain('Exactly one');
+      }
+    });
+
+    it('returns error when validation result is invalid but not caught by isInvalidTargetSelection', async () => {
+      vi.spyOn(targetsModule, 'validateTargetSelection').mockReturnValue({
+        valid: true,
+      } as any);
+
+      const result = await waitForTool(
+        { testId: 'element' },
+        createMockContext(),
+      );
+
+      expect(result.ok).toBe(false);
+      if (!result.ok) {
+        expect(result.error.code).toBe(ErrorCodes.MM_INVALID_INPUT);
+        expect(result.error.message).toBe('Invalid target selection');
+      }
+    });
+
+    it('returns error when element not found within timeout', async () => {
+      const context = createMockContext();
+
+      vi.spyOn(discoveryModule, 'waitForTarget').mockRejectedValue(
+        new Error('Timeout 15000ms exceeded'),
+      );
+
+      const result = await waitForTool({ testId: 'nonexistent' }, context);
+
+      expect(result.ok).toBe(false);
+      if (!result.ok) {
+        expect(result.error.code).toBe(ErrorCodes.MM_WAIT_TIMEOUT);
+      }
+    });
+
+    it('returns error when page closed during wait', async () => {
+      const context = createMockContext();
+
+      vi.spyOn(discoveryModule, 'waitForTarget').mockRejectedValue(
+        new Error('Target page has been closed'),
+      );
+
+      const result = await waitForTool({ testId: 'element' }, context);
+
+      expect(result.ok).toBe(false);
+      if (!result.ok) {
+        expect(result.error.code).toBe(ErrorCodes.MM_WAIT_TIMEOUT);
+      }
+    });
+
+    it('returns error when no session active', async () => {
+      const result = await waitForTool(
+        { testId: 'element' },
+        createMockContext({ hasActive: false }),
+      );
+
+      expect(result.ok).toBe(false);
+      if (!result.ok) {
+        expect(result.error.code).toBe(ErrorCodes.MM_NO_ACTIVE_SESSION);
+      }
+    });
+  });
+});
diff --git a/src/tools/interaction.ts b/src/tools/interaction.ts
new file mode 100644
index 0000000..6e7fabe
--- /dev/null
+++ b/src/tools/interaction.ts
@@ -0,0 +1,197 @@
+import {
+  classifyClickError,
+  classifyTypeError,
+  classifyWaitError,
+  isPageClosedError,
+} from './error-classification.js';
+import type {
+  ClickInput,
+  ClickResult,
+  TypeInput,
+  TypeResult,
+  WaitForInput,
+  WaitForResult,
+} from './types';
+import { ErrorCodes } from './types';
+import { DEFAULT_INTERACTION_TIMEOUT_MS } from './utils/constants.js';
+import { waitForTarget } from './utils/discovery.js';
+import { validateTargetSelection } from './utils/targets.js';
+import {
+  isInvalidTargetSelection,
+  isValidTargetSelection,
+} from './utils/type-guards.js';
+import {
+  createToolError,
+  createToolSuccess,
+  requireActiveSession,
+} from './utils.js';
+import type { ToolContext, ToolResponse } from '../types/http.js';
+
+/**
+ * Clicks an element identified by ref, test ID, or selector.
+ *
+ * @param input - The click target and timeout options.
+ * @param context - The tool execution context.
+ * @returns The click operation result.
+ */
+export async function clickTool(
+  input: ClickInput,
+  context: ToolContext,
+): Promise<ToolResponse<ClickResult>> {
+  const missingSession = requireActiveSession<ClickResult>(context);
+  if (missingSession) {
+    return missingSession;
+  }
+
+  const timeoutMs = input.timeoutMs ?? DEFAULT_INTERACTION_TIMEOUT_MS;
+  const validation = validateTargetSelection(input);
+
+  if (isInvalidTargetSelection(validation)) {
+    return createToolError(ErrorCodes.MM_INVALID_INPUT, validation.error);
+  }
+
+  if (!isValidTargetSelection(validation)) {
+    return createToolError(
+      ErrorCodes.MM_INVALID_INPUT,
+      'Invalid target selection',
+    );
+  }
+
+  const { type: targetType, value: targetValue } = validation;
+
+  try {
+    const locator = await waitForTarget(
+      context.page,
+      targetType,
+      targetValue,
+      context.refMap,
+      timeoutMs,
+    );
+
+    try {
+      await locator.click();
+      return createToolSuccess({
+        clicked: true,
+        target: `${targetType}:${targetValue}`,
+      });
+    } catch (clickError) {
+      if (isPageClosedError(clickError)) {
+        return createToolSuccess({
+          clicked: true,
+          target: `${targetType}:${targetValue}`,
+          pageClosedAfterClick: true,
+        });
+      }
+
+      throw clickError;
+    }
+  } catch (error) {
+    const errorInfo = classifyClickError(error);
+    return createToolError(errorInfo.code, errorInfo.message);
+  }
+}
+
+/**
+ * Types text into an element identified by ref, test ID, or selector.
+ *
+ * @param input - The type target, text content, and timeout options.
+ * @param context - The tool execution context.
+ * @returns The type operation result.
+ */
+export async function typeTool(
+  input: TypeInput,
+  context: ToolContext,
+): Promise<ToolResponse<TypeResult>> {
+  const missingSession = requireActiveSession<TypeResult>(context);
+  if (missingSession) {
+    return missingSession;
+  }
+
+  const timeoutMs = input.timeoutMs ?? DEFAULT_INTERACTION_TIMEOUT_MS;
+  const validation = validateTargetSelection(input);
+
+  if (isInvalidTargetSelection(validation)) {
+    return createToolError(ErrorCodes.MM_INVALID_INPUT, validation.error);
+  }
+
+  if (!isValidTargetSelection(validation)) {
+    return createToolError(
+      ErrorCodes.MM_INVALID_INPUT,
+      'Invalid target selection',
+    );
+  }
+
+  const { type: targetType, value: targetValue } = validation;
+
+  try {
+    const locator = await waitForTarget(
+      context.page,
+      targetType,
+      targetValue,
+      context.refMap,
+      timeoutMs,
+    );
+
+    await locator.fill(input.text);
+
+    return createToolSuccess({
+      typed: true,
+      target: `${targetType}:${targetValue}`,
+      textLength: input.text.length,
+    });
+  } catch (error) {
+    const errorInfo = classifyTypeError(error);
+    return createToolError(errorInfo.code, errorInfo.message);
+  }
+}
+
+/**
+ * Waits for an element to appear on the page within a timeout.
+ *
+ * @param input - The wait target and timeout options.
+ * @param context - The tool execution context.
+ * @returns The wait result indicating whether the element was found.
+ */
+export async function waitForTool(
+  input: WaitForInput,
+  context: ToolContext,
+): Promise<ToolResponse<WaitForResult>> {
+  const missingSession = requireActiveSession<WaitForResult>(context);
+  if (missingSession) {
+    return missingSession;
+  }
+
+  const timeoutMs = input.timeoutMs ?? DEFAULT_INTERACTION_TIMEOUT_MS;
+  const validation = validateTargetSelection(input);
+
+  if (isInvalidTargetSelection(validation)) {
+    return createToolError(ErrorCodes.MM_INVALID_INPUT, validation.error);
+  }
+
+  if (!isValidTargetSelection(validation)) {
+    return createToolError(
+      ErrorCodes.MM_INVALID_INPUT,
+      'Invalid target selection',
+    );
+  }
+
+  const { type: targetType, value: targetValue } = validation;
+
+  try {
+    await waitForTarget(
+      context.page,
+      targetType,
+      targetValue,
+      context.refMap,
+      timeoutMs,
+    );
+
+    return createToolSuccess({
+      found: true,
+      target: `${targetType}:${targetValue}`,
+    });
+  } catch (error) {
+    const errorInfo = classifyWaitError(error);
+    return createToolError(errorInfo.code, errorInfo.message);
+  }
+}
diff --git a/src/mcp-server/tools/knowledge.test.ts b/src/tools/knowledge.test.ts
similarity index 53%
rename from src/mcp-server/tools/knowledge.test.ts
rename to src/tools/knowledge.test.ts
index afb0233..5eaa60c 100644
--- a/src/mcp-server/tools/knowledge.test.ts
+++ b/src/tools/knowledge.test.ts
@@ -5,25 +5,21 @@
  * summarize, and session listing with various filter combinations.
  */
 
-import { describe, it, expect, vi, beforeEach, afterEach } from 'vitest';
+import { describe, it, expect, vi, beforeEach } from 'vitest';
 
 import {
-  handleKnowledgeLast,
-  handleKnowledgeSearch,
-  handleKnowledgeSummarize,
-  handleKnowledgeSessions,
+  knowledgeLastTool,
+  knowledgeSearchTool,
+  knowledgeSummarizeTool,
+  knowledgeSessionsTool,
 } from './knowledge.js';
-import * as knowledgeStoreModule from '../knowledge-store.js';
-import * as sessionManagerModule from '../session-manager.js';
-import { createMockSessionManager } from '../test-utils';
-import { ErrorCodes } from '../types/errors.js';
+import { createMockSessionManager } from './test-utils/mock-factories.js';
+import { ErrorCodes } from './types/errors.js';
+import type { ToolContext } from '../types/http.js';
 
-describe('knowledge', () => {
-  let mockSessionManager: ReturnType<typeof createMockSessionManager>;
-  let mockKnowledgeStore: any;
-
-  beforeEach(() => {
-    mockSessionManager = createMockSessionManager({
+function createMockContext(): ToolContext {
+  return {
+    sessionManager: createMockSessionManager({
       hasActive: true,
       sessionId: 'test-session-123',
       sessionMetadata: {
@@ -34,14 +30,11 @@ describe('knowledge', () => {
         tags: [],
         launch: { stateMode: 'default' },
       },
-    });
-    vi.spyOn(sessionManagerModule, 'getSessionManager').mockReturnValue(
-      mockSessionManager,
-    );
-
-    // Mock knowledge store to prevent "not initialized" errors
-    mockKnowledgeStore = {
-      recordStep: vi.fn().mockResolvedValue(undefined),
+    }),
+    page: {},
+    refMap: new Map(),
+    workflowContext: {},
+    knowledgeStore: {
       getLastSteps: vi.fn().mockResolvedValue([]),
       searchSteps: vi.fn().mockResolvedValue([]),
       summarizeSession: vi.fn().mockResolvedValue({
@@ -50,59 +43,71 @@ describe('knowledge', () => {
         recipe: [],
       }),
       listSessions: vi.fn().mockResolvedValue([]),
-      generatePriorKnowledge: vi.fn().mockResolvedValue(undefined),
-      writeSessionMetadata: vi.fn().mockResolvedValue('test-session'),
-    };
-    vi.spyOn(knowledgeStoreModule, 'knowledgeStore', 'get').mockReturnValue(
-      mockKnowledgeStore,
-    );
-  });
+    },
+  } as unknown as ToolContext;
+}
+
+describe('knowledge', () => {
+  let context: ToolContext;
 
-  afterEach(() => {
-    vi.restoreAllMocks();
+  beforeEach(() => {
+    context = createMockContext();
   });
 
-  describe('handleKnowledgeLast', () => {
+  describe('knowledgeLastTool', () => {
     it('retrieves last N steps with default parameters', async () => {
-      // Arrange
       const mockSteps = [
-        { timestamp: '2026-02-04T10:00:00Z', tool: 'mm_click', screen: 'home' },
-        { timestamp: '2026-02-04T10:01:00Z', tool: 'mm_type', screen: 'home' },
+        {
+          timestamp: '2026-02-04T10:00:00Z',
+          tool: 'click',
+          screen: 'home',
+          snippet: 'Clicked send',
+        },
+        {
+          timestamp: '2026-02-04T10:01:00Z',
+          tool: 'type',
+          screen: 'home',
+          snippet: 'Entered amount',
+        },
       ];
-      mockKnowledgeStore.getLastSteps.mockResolvedValue(mockSteps);
+      vi.mocked(context.knowledgeStore.getLastSteps).mockResolvedValue(
+        mockSteps,
+      );
 
-      // Act
-      const result = await handleKnowledgeLast({});
+      const result = await knowledgeLastTool({}, context);
 
-      // Assert
       expect(result.ok).toBe(true);
       if (result.ok) {
         expect(result.result.steps).toStrictEqual(mockSteps);
       }
-      expect(mockKnowledgeStore.getLastSteps).toHaveBeenCalledWith(
-        20, // default n
-        'current', // default scope
+      expect(context.knowledgeStore.getLastSteps).toHaveBeenCalledWith(
+        20,
+        'current',
         'test-session-123',
-        undefined, // no filters
+        undefined,
       );
     });
 
     it('retrieves last N steps with custom n parameter', async () => {
-      // Arrange
       const mockSteps = [
-        { timestamp: '2026-02-04T10:00:00Z', tool: 'mm_click', screen: 'home' },
+        {
+          timestamp: '2026-02-04T10:00:00Z',
+          tool: 'click',
+          screen: 'home',
+          snippet: 'Clicked send',
+        },
       ];
-      mockKnowledgeStore.getLastSteps.mockResolvedValue(mockSteps);
+      vi.mocked(context.knowledgeStore.getLastSteps).mockResolvedValue(
+        mockSteps,
+      );
 
-      // Act
-      const result = await handleKnowledgeLast({ n: 5 });
+      const result = await knowledgeLastTool({ n: 5 }, context);
 
-      // Assert
       expect(result.ok).toBe(true);
       if (result.ok) {
         expect(result.result.steps).toStrictEqual(mockSteps);
       }
-      expect(mockKnowledgeStore.getLastSteps).toHaveBeenCalledWith(
+      expect(context.knowledgeStore.getLastSteps).toHaveBeenCalledWith(
         5,
         'current',
         'test-session-123',
@@ -111,18 +116,22 @@ describe('knowledge', () => {
     });
 
     it('retrieves steps with scope "all"', async () => {
-      // Arrange
       const mockSteps = [
-        { timestamp: '2026-02-04T10:00:00Z', tool: 'mm_click', screen: 'home' },
+        {
+          timestamp: '2026-02-04T10:00:00Z',
+          tool: 'click',
+          screen: 'home',
+          snippet: 'Clicked send',
+        },
       ];
-      mockKnowledgeStore.getLastSteps.mockResolvedValue(mockSteps);
+      vi.mocked(context.knowledgeStore.getLastSteps).mockResolvedValue(
+        mockSteps,
+      );
 
-      // Act
-      const result = await handleKnowledgeLast({ scope: 'all' });
+      const result = await knowledgeLastTool({ scope: 'all' }, context);
 
-      // Assert
       expect(result.ok).toBe(true);
-      expect(mockKnowledgeStore.getLastSteps).toHaveBeenCalledWith(
+      expect(context.knowledgeStore.getLastSteps).toHaveBeenCalledWith(
         20,
         'all',
         'test-session-123',
@@ -131,26 +140,30 @@ describe('knowledge', () => {
     });
 
     it('retrieves steps with filters', async () => {
-      // Arrange
       const mockSteps = [
-        { timestamp: '2026-02-04T10:00:00Z', tool: 'mm_click', screen: 'send' },
+        {
+          timestamp: '2026-02-04T10:00:00Z',
+          tool: 'click',
+          screen: 'send',
+          snippet: 'Clicked confirm',
+        },
       ];
-      mockKnowledgeStore.getLastSteps.mockResolvedValue(mockSteps);
       const filters = {
         flowTag: 'send',
         screen: 'send',
         sinceHours: 24,
       };
+      vi.mocked(context.knowledgeStore.getLastSteps).mockResolvedValue(
+        mockSteps,
+      );
 
-      // Act
-      const result = await handleKnowledgeLast({ n: 10, filters });
+      const result = await knowledgeLastTool({ n: 10, filters }, context);
 
-      // Assert
       expect(result.ok).toBe(true);
       if (result.ok) {
         expect(result.result.steps).toStrictEqual(mockSteps);
       }
-      expect(mockKnowledgeStore.getLastSteps).toHaveBeenCalledWith(
+      expect(context.knowledgeStore.getLastSteps).toHaveBeenCalledWith(
         10,
         'current',
         'test-session-123',
@@ -159,13 +172,10 @@ describe('knowledge', () => {
     });
 
     it('returns empty array when no steps found', async () => {
-      // Arrange
-      mockKnowledgeStore.getLastSteps.mockResolvedValue([]);
+      vi.mocked(context.knowledgeStore.getLastSteps).mockResolvedValue([]);
 
-      // Act
-      const result = await handleKnowledgeLast({ n: 10 });
+      const result = await knowledgeLastTool({ n: 10 }, context);
 
-      // Assert
       expect(result.ok).toBe(true);
       if (result.ok) {
         expect(result.result.steps).toStrictEqual([]);
@@ -173,15 +183,12 @@ describe('knowledge', () => {
     });
 
     it('returns error when knowledge store fails', async () => {
-      // Arrange
-      mockKnowledgeStore.getLastSteps.mockRejectedValue(
+      vi.mocked(context.knowledgeStore.getLastSteps).mockRejectedValue(
         new Error('Database connection failed'),
       );
 
-      // Act
-      const result = await handleKnowledgeLast({ n: 10 });
+      const result = await knowledgeLastTool({ n: 10 }, context);
 
-      // Assert
       expect(result.ok).toBe(false);
       if (!result.ok) {
         expect(result.error.code).toBe(ErrorCodes.MM_KNOWLEDGE_ERROR);
@@ -191,52 +198,60 @@ describe('knowledge', () => {
     });
   });
 
-  describe('handleKnowledgeSearch', () => {
+  describe('knowledgeSearchTool', () => {
     it('searches steps with default parameters', async () => {
-      // Arrange
       const mockMatches = [
-        { timestamp: '2026-02-04T10:00:00Z', tool: 'mm_click', screen: 'home' },
+        {
+          timestamp: '2026-02-04T10:00:00Z',
+          tool: 'click',
+          screen: 'home',
+          snippet: 'Clicked send',
+        },
       ];
-      mockKnowledgeStore.searchSteps.mockResolvedValue(mockMatches);
+      vi.mocked(context.knowledgeStore.searchSteps).mockResolvedValue(
+        mockMatches,
+      );
 
-      // Act
-      const result = await handleKnowledgeSearch({ query: 'mm_click' });
+      const result = await knowledgeSearchTool({ query: 'click' }, context);
 
-      // Assert
       expect(result.ok).toBe(true);
       if (result.ok) {
         expect(result.result.matches).toStrictEqual(mockMatches);
-        expect(result.result.query).toBe('mm_click');
+        expect(result.result.query).toBe('click');
       }
-      expect(mockKnowledgeStore.searchSteps).toHaveBeenCalledWith(
-        'mm_click',
-        20, // default limit
-        'all', // default scope
+      expect(context.knowledgeStore.searchSteps).toHaveBeenCalledWith(
+        'click',
+        20,
+        'all',
         'test-session-123',
-        undefined, // no filters
+        undefined,
       );
     });
 
     it('searches steps with custom limit', async () => {
-      // Arrange
       const mockMatches = [
-        { timestamp: '2026-02-04T10:00:00Z', tool: 'mm_type', screen: 'send' },
+        {
+          timestamp: '2026-02-04T10:00:00Z',
+          tool: 'type',
+          screen: 'send',
+          snippet: 'Entered recipient',
+        },
       ];
-      mockKnowledgeStore.searchSteps.mockResolvedValue(mockMatches);
+      vi.mocked(context.knowledgeStore.searchSteps).mockResolvedValue(
+        mockMatches,
+      );
 
-      // Act
-      const result = await handleKnowledgeSearch({
-        query: 'mm_type',
-        limit: 50,
-      });
+      const result = await knowledgeSearchTool(
+        { query: 'type', limit: 50 },
+        context,
+      );
 
-      // Assert
       expect(result.ok).toBe(true);
       if (result.ok) {
         expect(result.result.matches).toStrictEqual(mockMatches);
       }
-      expect(mockKnowledgeStore.searchSteps).toHaveBeenCalledWith(
-        'mm_type',
+      expect(context.knowledgeStore.searchSteps).toHaveBeenCalledWith(
+        'type',
         50,
         'all',
         'test-session-123',
@@ -245,22 +260,26 @@ describe('knowledge', () => {
     });
 
     it('searches steps with scope "current"', async () => {
-      // Arrange
       const mockMatches = [
-        { timestamp: '2026-02-04T10:00:00Z', tool: 'mm_click', screen: 'home' },
+        {
+          timestamp: '2026-02-04T10:00:00Z',
+          tool: 'click',
+          screen: 'home',
+          snippet: 'Clicked send',
+        },
       ];
-      mockKnowledgeStore.searchSteps.mockResolvedValue(mockMatches);
+      vi.mocked(context.knowledgeStore.searchSteps).mockResolvedValue(
+        mockMatches,
+      );
 
-      // Act
-      const result = await handleKnowledgeSearch({
-        query: 'mm_click',
-        scope: 'current',
-      });
+      const result = await knowledgeSearchTool(
+        { query: 'click', scope: 'current' },
+        context,
+      );
 
-      // Assert
       expect(result.ok).toBe(true);
-      expect(mockKnowledgeStore.searchSteps).toHaveBeenCalledWith(
-        'mm_click',
+      expect(context.knowledgeStore.searchSteps).toHaveBeenCalledWith(
+        'click',
         20,
         'current',
         'test-session-123',
@@ -269,30 +288,33 @@ describe('knowledge', () => {
     });
 
     it('searches steps with filters', async () => {
-      // Arrange
       const mockMatches = [
-        { timestamp: '2026-02-04T10:00:00Z', tool: 'mm_click', screen: 'send' },
+        {
+          timestamp: '2026-02-04T10:00:00Z',
+          tool: 'click',
+          screen: 'send',
+          snippet: 'Confirmed transaction',
+        },
       ];
-      mockKnowledgeStore.searchSteps.mockResolvedValue(mockMatches);
       const filters = {
         flowTag: 'send',
         tag: 'transaction',
         screen: 'send',
       };
+      vi.mocked(context.knowledgeStore.searchSteps).mockResolvedValue(
+        mockMatches,
+      );
 
-      // Act
-      const result = await handleKnowledgeSearch({
-        query: 'confirm',
-        limit: 10,
-        filters,
-      });
+      const result = await knowledgeSearchTool(
+        { query: 'confirm', limit: 10, filters },
+        context,
+      );
 
-      // Assert
       expect(result.ok).toBe(true);
       if (result.ok) {
         expect(result.result.matches).toStrictEqual(mockMatches);
       }
-      expect(mockKnowledgeStore.searchSteps).toHaveBeenCalledWith(
+      expect(context.knowledgeStore.searchSteps).toHaveBeenCalledWith(
         'confirm',
         10,
         'all',
@@ -302,13 +324,13 @@ describe('knowledge', () => {
     });
 
     it('returns empty array when no matches found', async () => {
-      // Arrange
-      mockKnowledgeStore.searchSteps.mockResolvedValue([]);
+      vi.mocked(context.knowledgeStore.searchSteps).mockResolvedValue([]);
 
-      // Act
-      const result = await handleKnowledgeSearch({ query: 'nonexistent' });
+      const result = await knowledgeSearchTool(
+        { query: 'nonexistent' },
+        context,
+      );
 
-      // Assert
       expect(result.ok).toBe(true);
       if (result.ok) {
         expect(result.result.matches).toStrictEqual([]);
@@ -317,15 +339,12 @@ describe('knowledge', () => {
     });
 
     it('returns error when search fails', async () => {
-      // Arrange
-      mockKnowledgeStore.searchSteps.mockRejectedValue(
+      vi.mocked(context.knowledgeStore.searchSteps).mockRejectedValue(
         new Error('Search index corrupted'),
       );
 
-      // Act
-      const result = await handleKnowledgeSearch({ query: 'test' });
+      const result = await knowledgeSearchTool({ query: 'test' }, context);
 
-      // Assert
       expect(result.ok).toBe(false);
       if (!result.ok) {
         expect(result.error.code).toBe(ErrorCodes.MM_KNOWLEDGE_ERROR);
@@ -335,109 +354,106 @@ describe('knowledge', () => {
     });
   });
 
-  describe('handleKnowledgeSummarize', () => {
+  describe('knowledgeSummarizeTool', () => {
     it('summarizes current session by default', async () => {
-      // Arrange
       const mockSummary = {
         sessionId: 'test-session-123',
         stepCount: 5,
         recipe: [
-          { stepNumber: 1, tool: 'mm_click', notes: 'Clicked send button' },
-          { stepNumber: 2, tool: 'mm_type', notes: 'Entered amount' },
+          { stepNumber: 1, tool: 'click', notes: 'Clicked send button' },
+          { stepNumber: 2, tool: 'type', notes: 'Entered amount' },
         ],
       };
-      mockKnowledgeStore.summarizeSession.mockResolvedValue(mockSummary);
+      vi.mocked(context.knowledgeStore.summarizeSession).mockResolvedValue(
+        mockSummary,
+      );
 
-      // Act
-      const result = await handleKnowledgeSummarize({});
+      const result = await knowledgeSummarizeTool({}, context);
 
-      // Assert
       expect(result.ok).toBe(true);
       if (result.ok) {
         expect(result.result).toStrictEqual(mockSummary);
       }
-      expect(mockKnowledgeStore.summarizeSession).toHaveBeenCalledWith(
+      expect(context.knowledgeStore.summarizeSession).toHaveBeenCalledWith(
         'test-session-123',
       );
     });
 
     it('summarizes current session with scope "current"', async () => {
-      // Arrange
       const mockSummary = {
         sessionId: 'test-session-123',
         stepCount: 3,
         recipe: [],
       };
-      mockKnowledgeStore.summarizeSession.mockResolvedValue(mockSummary);
+      vi.mocked(context.knowledgeStore.summarizeSession).mockResolvedValue(
+        mockSummary,
+      );
 
-      // Act
-      const result = await handleKnowledgeSummarize({ scope: 'current' });
+      const result = await knowledgeSummarizeTool(
+        { scope: 'current' },
+        context,
+      );
 
-      // Assert
       expect(result.ok).toBe(true);
       if (result.ok) {
         expect(result.result).toStrictEqual(mockSummary);
       }
-      expect(mockKnowledgeStore.summarizeSession).toHaveBeenCalledWith(
+      expect(context.knowledgeStore.summarizeSession).toHaveBeenCalledWith(
         'test-session-123',
       );
     });
 
     it('summarizes specific session by sessionId', async () => {
-      // Arrange
       const mockSummary = {
         sessionId: 'other-session-456',
         stepCount: 10,
-        recipe: [
-          { stepNumber: 1, tool: 'mm_launch', notes: 'Launched browser' },
-        ],
+        recipe: [{ stepNumber: 1, tool: 'launch', notes: 'Launched browser' }],
       };
-      mockKnowledgeStore.summarizeSession.mockResolvedValue(mockSummary);
+      vi.mocked(context.knowledgeStore.summarizeSession).mockResolvedValue(
+        mockSummary,
+      );
 
-      // Act
-      const result = await handleKnowledgeSummarize({
-        sessionId: 'other-session-456',
-      });
+      const result = await knowledgeSummarizeTool(
+        { sessionId: 'other-session-456' },
+        context,
+      );
 
-      // Assert
       expect(result.ok).toBe(true);
       if (result.ok) {
         expect(result.result).toStrictEqual(mockSummary);
       }
-      expect(mockKnowledgeStore.summarizeSession).toHaveBeenCalledWith(
+      expect(context.knowledgeStore.summarizeSession).toHaveBeenCalledWith(
         'other-session-456',
       );
     });
 
     it('summarizes session with scope object containing sessionId', async () => {
-      // Arrange
       const mockSummary = {
         sessionId: 'scoped-session-789',
         stepCount: 7,
         recipe: [],
       };
-      mockKnowledgeStore.summarizeSession.mockResolvedValue(mockSummary);
+      vi.mocked(context.knowledgeStore.summarizeSession).mockResolvedValue(
+        mockSummary,
+      );
 
-      // Act
-      const result = await handleKnowledgeSummarize({
-        scope: { sessionId: 'scoped-session-789' },
-      });
+      const result = await knowledgeSummarizeTool(
+        { scope: { sessionId: 'scoped-session-789' } },
+        context,
+      );
 
-      // Assert
       expect(result.ok).toBe(true);
       if (result.ok) {
         expect(result.result).toStrictEqual(mockSummary);
       }
-      expect(mockKnowledgeStore.summarizeSession).toHaveBeenCalledWith(
+      expect(context.knowledgeStore.summarizeSession).toHaveBeenCalledWith(
         'scoped-session-789',
       );
     });
 
     it('returns error when scope is "all"', async () => {
-      // Act
-      const result = await handleKnowledgeSummarize({ scope: 'all' });
+      const result = await knowledgeSummarizeTool({ scope: 'all' }, context);
 
-      // Assert
       expect(result.ok).toBe(false);
       if (!result.ok) {
         expect(result.error.code).toBe(ErrorCodes.MM_INVALID_INPUT);
@@ -446,13 +462,10 @@ describe('knowledge', () => {
     });
 
     it('returns error when no sessionId can be determined', async () => {
-      // Arrange
-      vi.spyOn(mockSessionManager, 'getSessionId').mockReturnValue(undefined);
+      vi.mocked(context.sessionManager.getSessionId).mockReturnValue(undefined);
 
-      // Act
-      const result = await handleKnowledgeSummarize({});
+      const result = await knowledgeSummarizeTool({}, context);
 
-      // Assert
       expect(result.ok).toBe(false);
       if (!result.ok) {
         expect(result.error.code).toBe(ErrorCodes.MM_INVALID_INPUT);
@@ -461,17 +474,15 @@ describe('knowledge', () => {
     });
 
     it('returns error when summarize fails', async () => {
-      // Arrange
-      mockKnowledgeStore.summarizeSession.mockRejectedValue(
+      vi.mocked(context.knowledgeStore.summarizeSession).mockRejectedValue(
         new Error('Session not found'),
       );
 
-      // Act
-      const result = await handleKnowledgeSummarize({
-        sessionId: 'nonexistent-session',
-      });
+      const result = await knowledgeSummarizeTool(
+        { sessionId: 'nonexistent-session' },
+        context,
+      );
 
-      // Assert
       expect(result.ok).toBe(false);
       if (!result.ok) {
         expect(result.error.code).toBe(ErrorCodes.MM_KNOWLEDGE_ERROR);
@@ -481,9 +492,8 @@ describe('knowledge', () => {
     });
   });
 
-  describe('handleKnowledgeSessions', () => {
+  describe('knowledgeSessionsTool', () => {
     it('lists sessions with default limit', async () => {
-      // Arrange
       const mockSessions = [
         {
           sessionId: 'session-1',
@@ -499,24 +509,23 @@ describe('knowledge', () => {
           tags: ['test'],
         },
       ];
-      mockKnowledgeStore.listSessions.mockResolvedValue(mockSessions);
+      vi.mocked(context.knowledgeStore.listSessions).mockResolvedValue(
+        mockSessions,
+      );
 
-      // Act
-      const result = await handleKnowledgeSessions({});
+      const result = await knowledgeSessionsTool({}, context);
 
-      // Assert
       expect(result.ok).toBe(true);
       if (result.ok) {
         expect(result.result.sessions).toStrictEqual(mockSessions);
       }
-      expect(mockKnowledgeStore.listSessions).toHaveBeenCalledWith(
-        10, // default limit
-        undefined, // no filters
+      expect(context.knowledgeStore.listSessions).toHaveBeenCalledWith(
+        10,
+        undefined,
       );
     });
 
     it('lists sessions with custom limit', async () => {
-      // Arrange
       const mockSessions = [
         {
           sessionId: 'session-1',
@@ -525,24 +534,23 @@ describe('knowledge', () => {
           tags: [],
         },
       ];
-      mockKnowledgeStore.listSessions.mockResolvedValue(mockSessions);
+      vi.mocked(context.knowledgeStore.listSessions).mockResolvedValue(
+        mockSessions,
+      );
 
-      // Act
-      const result = await handleKnowledgeSessions({ limit: 25 });
+      const result = await knowledgeSessionsTool({ limit: 25 }, context);
 
-      // Assert
       expect(result.ok).toBe(true);
       if (result.ok) {
         expect(result.result.sessions).toStrictEqual(mockSessions);
       }
-      expect(mockKnowledgeStore.listSessions).toHaveBeenCalledWith(
+      expect(context.knowledgeStore.listSessions).toHaveBeenCalledWith(
         25,
         undefined,
       );
     });
 
     it('lists sessions with filters', async () => {
-      // Arrange
       const mockSessions = [
         {
           sessionId: 'session-1',
@@ -551,31 +559,34 @@ describe('knowledge', () => {
           tags: [],
         },
       ];
-      mockKnowledgeStore.listSessions.mockResolvedValue(mockSessions);
       const filters = {
         flowTag: 'send',
         sinceHours: 48,
       };
+      vi.mocked(context.knowledgeStore.listSessions).mockResolvedValue(
+        mockSessions,
+      );
 
-      // Act
-      const result = await handleKnowledgeSessions({ limit: 20, filters });
+      const result = await knowledgeSessionsTool(
+        { limit: 20, filters },
+        context,
+      );
 
-      // Assert
       expect(result.ok).toBe(true);
       if (result.ok) {
         expect(result.result.sessions).toStrictEqual(mockSessions);
       }
-      expect(mockKnowledgeStore.listSessions).toHaveBeenCalledWith(20, filters);
+      expect(context.knowledgeStore.listSessions).toHaveBeenCalledWith(
+        20,
+        filters,
+      );
     });
 
     it('returns empty array when no sessions found', async () => {
-      // Arrange
-      mockKnowledgeStore.listSessions.mockResolvedValue([]);
+      vi.mocked(context.knowledgeStore.listSessions).mockResolvedValue([]);
 
-      // Act
-      const result = await handleKnowledgeSessions({ limit: 10 });
+      const result = await knowledgeSessionsTool({ limit: 10 }, context);
 
-      // Assert
       expect(result.ok).toBe(true);
       if (result.ok) {
         expect(result.result.sessions).toStrictEqual([]);
@@ -583,15 +594,12 @@ describe('knowledge', () => {
     });
 
     it('returns error when listing fails', async () => {
-      // Arrange
-      mockKnowledgeStore.listSessions.mockRejectedValue(
+      vi.mocked(context.knowledgeStore.listSessions).mockRejectedValue(
         new Error('Database unavailable'),
       );
 
-      // Act
-      const result = await handleKnowledgeSessions({});
+      const result = await knowledgeSessionsTool({}, context);
 
-      // Assert
       expect(result.ok).toBe(false);
       if (!result.ok) {
         expect(result.error.code).toBe(ErrorCodes.MM_KNOWLEDGE_ERROR);
diff --git a/src/tools/knowledge.ts b/src/tools/knowledge.ts
new file mode 100644
index 0000000..27a3939
--- /dev/null
+++ b/src/tools/knowledge.ts
@@ -0,0 +1,164 @@
+import { extractErrorMessage } from '../utils';
+import type {
+  KnowledgeLastInput,
+  KnowledgeLastResult,
+  KnowledgeScope,
+  KnowledgeSearchInput,
+  KnowledgeSearchResult,
+  KnowledgeSessionsInput,
+  KnowledgeSessionsResult,
+  KnowledgeSummarizeInput,
+  KnowledgeSummarizeResult,
+} from './types';
+import { ErrorCodes } from './types';
+import { createToolError, createToolSuccess } from './utils.js';
+import type { ToolContext, ToolResponse } from '../types/http.js';
+
+/**
+ * Retrieves the most recent knowledge steps from the store.
+ *
+ * @param input - The step retrieval options including count and scope.
+ * @param context - The tool execution context.
+ * @returns The retrieved knowledge steps.
+ */
+export async function knowledgeLastTool(
+  input: KnowledgeLastInput,
+  context: ToolContext,
+): Promise<ToolResponse<KnowledgeLastResult>> {
+  const sessionId = context.sessionManager.getSessionId();
+  const nSteps = input.n ?? 20;
+  const scope: KnowledgeScope = input.scope ?? 'current';
+
+  try {
+    const steps = await context.knowledgeStore.getLastSteps(
+      nSteps,
+      scope,
+      sessionId,
+      input.filters,
+    );
+
+    return createToolSuccess({ steps });
+  } catch (error) {
+    return createToolError(
+      ErrorCodes.MM_KNOWLEDGE_ERROR,
+      `Failed to retrieve steps: ${extractErrorMessage(error)}`,
+    );
+  }
+}
+
+/**
+ * Searches knowledge steps by query string.
+ *
+ * @param input - The search query, limit, scope, and filters.
+ * @param context - The tool execution context.
+ * @returns The matching knowledge steps and query.
+ */
+export async function knowledgeSearchTool(
+  input: KnowledgeSearchInput,
+  context: ToolContext,
+): Promise<ToolResponse<KnowledgeSearchResult>> {
+  const sessionId = context.sessionManager.getSessionId();
+  const limit = input.limit ?? 20;
+  const scope: KnowledgeScope = input.scope ?? 'all';
+
+  try {
+    const matches = await context.knowledgeStore.searchSteps(
+      input.query,
+      limit,
+      scope,
+      sessionId,
+      input.filters,
+    );
+
+    return createToolSuccess({
+      matches,
+      query: input.query,
+    });
+  } catch (error) {
+    return createToolError(
+      ErrorCodes.MM_KNOWLEDGE_ERROR,
+      `Search failed: ${extractErrorMessage(error)}`,
+    );
+  }
+}
+
+/**
+ * Generates a summary of a knowledge session.
+ *
+ * @param input - The session ID or scope to summarize.
+ * @param context - The tool execution context.
+ * @returns The session summary.
+ */
+export async function knowledgeSummarizeTool(
+  input: KnowledgeSummarizeInput,
+  context: ToolContext,
+): Promise<ToolResponse<KnowledgeSummarizeResult>> {
+  const currentSessionId = context.sessionManager.getSessionId();
+
+  let targetSessionId: string | undefined;
+
+  if (input.sessionId) {
+    targetSessionId = input.sessionId;
+  } else if (input.scope) {
+    if (input.scope === 'all') {
+      return createToolError(
+        ErrorCodes.MM_INVALID_INPUT,
+        'Cannot summarize all sessions. Use scope="current" or provide a specific sessionId.',
+      );
+    }
+
+    if (input.scope === 'current') {
+      targetSessionId = currentSessionId;
+    } else if (typeof input.scope === 'object' && 'sessionId' in input.scope) {
+      targetSessionId = input.scope.sessionId;
+    }
+  } else {
+    targetSessionId = currentSessionId;
+  }
+
+  if (!targetSessionId) {
+    return createToolError(
+      ErrorCodes.MM_INVALID_INPUT,
+      'No sessionId provided and no active session',
+    );
+  }
+
+  try {
+    const summary =
+      await context.knowledgeStore.summarizeSession(targetSessionId);
+    return createToolSuccess(summary);
+  } catch (error) {
+    return createToolError(
+      ErrorCodes.MM_KNOWLEDGE_ERROR,
+      `Summarize failed: ${extractErrorMessage(error)}`,
+    );
+  }
+}
+
+/**
+ * Lists available knowledge sessions with optional filters.
+ *
+ * @param input - The listing options including limit and filters.
+ * @param context - The tool execution context.
+ * @returns The list of knowledge sessions.
+ */
+export async function knowledgeSessionsTool(
+  input: KnowledgeSessionsInput,
+  context: ToolContext,
+): Promise<ToolResponse<KnowledgeSessionsResult>> {
+  const limit = input.limit ?? 10;
+
+  try {
+    const sessions = await context.knowledgeStore.listSessions(
+      limit,
+      input.filters,
+    );
+
+    return createToolSuccess({ sessions });
+  } catch (error) {
+    return createToolError(
+      ErrorCodes.MM_KNOWLEDGE_ERROR,
+      `Failed to list sessions: ${extractErrorMessage(error)}`,
+    );
+  }
+}
diff --git a/src/tools/launch.test.ts b/src/tools/launch.test.ts
new file mode 100644
index 0000000..8cf0915
--- /dev/null
+++ b/src/tools/launch.test.ts
@@ -0,0 +1,252 @@
+/**
+ * Unit tests for launch tool handler.
+ *
+ * Tests session launch with various states and error scenarios.
+ */
+
+import { describe, it, expect, vi, beforeEach } from 'vitest';
+
+import { launchTool } from './launch.js';
+import type { LaunchInput } from './types';
+import type { ExtensionState } from '../capabilities/types.js';
+import type { SessionLaunchResult } from '../server/session-manager.js';
+import { createMockSessionManager } from './test-utils/mock-factories.js';
+import { ErrorCodes } from './types/errors.js';
+import type { ToolContext } from '../types/http.js';
+
+function createMockContext(
+  options: {
+    hasActive?: boolean;
+    launchResult?: SessionLaunchResult;
+    environmentMode?: 'e2e' | 'prod';
+  } = {},
+): ToolContext {
+  return {
+    sessionManager: createMockSessionManager(options),
+    page: {} as ToolContext['page'],
+    refMap: new Map(),
+    workflowContext: {},
+    knowledgeStore: {
+      writeSessionMetadata: vi.fn().mockResolvedValue('test-session-123'),
+    },
+  } as unknown as ToolContext;
+}
+
+describe('launchTool', () => {
+  beforeEach(() => {
+    vi.clearAllMocks();
+  });
+
+  describe('successful launch', () => {
+    it('returns session info on successful launch', async () => {
+      const mockState: ExtensionState = {
+        isLoaded: true,
+        currentUrl: 'chrome-extension://ext-123/home.html',
+        extensionId: 'ext-123',
+        isUnlocked: false,
+        currentScreen: 'home',
+        accountAddress: null,
+        networkName: null,
+        chainId: null,
+        balance: null,
+      };
+
+      const mockLaunchResult: SessionLaunchResult = {
+        sessionId: 'test-session-123',
+        extensionId: 'ext-123',
+        state: mockState,
+      };
+
+      const context = createMockContext({
+        hasActive: false,
+        launchResult: mockLaunchResult,
+      });
+      const input: LaunchInput = { stateMode: 'default' };
+
+      const result = await launchTool(input, context);
+
+      expect(result.ok).toBe(true);
+      if (result.ok) {
+        expect(result.result.sessionId).toBe('test-session-123');
+        expect(result.result.extensionId).toBe('ext-123');
+        expect(result.result.state).toStrictEqual(mockState);
+      }
+      expect(context.sessionManager.launch).toHaveBeenCalledWith(input);
+    });
+
+    it('includes prerequisites in prod mode', async () => {
+      const mockState: ExtensionState = {
+        isLoaded: true,
+        currentUrl: 'chrome-extension://ext-456/home.html',
+        extensionId: 'ext-456',
+        isUnlocked: true,
+        currentScreen: 'home',
+        accountAddress: '0x1234',
+        networkName: 'Ethereum Mainnet',
+        chainId: 1,
+        balance: '10 ETH',
+      };
+
+      const mockLaunchResult: SessionLaunchResult = {
+        sessionId: 'prod-session-456',
+        extensionId: 'ext-456',
+        state: mockState,
+      };
+
+      const context = createMockContext({
+        hasActive: false,
+        launchResult: mockLaunchResult,
+        environmentMode: 'prod',
+      });
+
+      const result = await launchTool({ stateMode: 'default' }, context);
+
+      expect(result.ok).toBe(true);
+      if (result.ok) {
+        expect(result.result.prerequisites).toBeDefined();
+        expect(result.result.prerequisites).toHaveLength(3);
+        expect(result.result.prerequisites?.[0].step).toBe('Unlock Wallet');
+        expect(result.result.prerequisites?.[1].step).toBe('Configure Network');
+        expect(result.result.prerequisites?.[2].step).toBe('Set Up Accounts');
+      }
+    });
+
+    it('does not include prerequisites in e2e mode', async () => {
+      const mockState: ExtensionState = {
+        isLoaded: true,
+        currentUrl: 'chrome-extension://ext-123/home.html',
+        extensionId: 'ext-123',
+        isUnlocked: false,
+        currentScreen: 'home',
+        accountAddress: null,
+        networkName: null,
+        chainId: null,
+        balance: null,
+      };
+
+      const mockLaunchResult: SessionLaunchResult = {
+        sessionId: 'e2e-session-789',
+        extensionId: 'ext-123',
+        state: mockState,
+      };
+
+      const context = createMockContext({
+        hasActive: false,
+        launchResult: mockLaunchResult,
+        environmentMode: 'e2e',
+      });
+
+      const result = await launchTool({ stateMode: 'default' }, context);
+
+      expect(result.ok).toBe(true);
+      if (result.ok) {
+        expect(result.result.prerequisites).toBeUndefined();
+      }
+    });
+
+    it('passes through all launch input parameters', async () => {
+      const context = createMockContext({ hasActive: false });
+      const input: LaunchInput = {
+        stateMode: 'custom',
+        fixturePreset: 'test-preset',
+        autoBuild: false,
+        slowMo: 100,
+        goal: 'Test send flow',
+        flowTags: ['send', 'transaction'],
+        tags: ['smoke-test'],
+        seedContracts: ['hst', 'nfts'],
+        ports: {
+          anvil: 8546,
+          fixtureServer: 12346,
+        },
+      };
+
+      const result = await launchTool(input, context);
+
+      expect(result.ok).toBe(true);
+      expect(context.sessionManager.launch).toHaveBeenCalledWith(input);
+    });
+  });
+
+  describe('session already running', () => {
+    it('returns error when session already active', async () => {
+      const context = createMockContext({ hasActive: true });
+
+      const result = await launchTool({ stateMode: 'default' }, context);
+
+      expect(result.ok).toBe(false);
+      if (!result.ok) {
+        expect(result.error.code).toBe(ErrorCodes.MM_SESSION_ALREADY_RUNNING);
+        expect(result.error.message).toBe(
+          'A session is already running. Call cleanup first, or use --force.',
+        );
+      }
+      expect(context.sessionManager.launch).not.toHaveBeenCalled();
+    });
+  });
+
+  describe('launch failures', () => {
+    it('returns port conflict error for EADDRINUSE', async () => {
+      const context = createMockContext({ hasActive: false });
+      vi.spyOn(context.sessionManager, 'launch').mockRejectedValue(
+        new Error('listen EADDRINUSE: address already in use :::8545'),
+      );
+
+      const input: LaunchInput = { stateMode: 'default' };
+      const result = await launchTool(input, context);
+
+      expect(result.ok).toBe(false);
+      if (!result.ok) {
+        expect(result.error.code).toBe(ErrorCodes.MM_PORT_IN_USE);
+        expect(result.error.message).toContain('Port conflict');
+        expect(result.error.message).toContain('EADDRINUSE');
+      }
+    });
+
+    it('returns port conflict error for port keyword in message', async () => {
+      const context = createMockContext({ hasActive: false });
+      vi.spyOn(context.sessionManager, 'launch').mockRejectedValue(
+        new Error('port 8545 is already in use'),
+      );
+
+      const result = await launchTool({ stateMode: 'default' }, context);
+
+      expect(result.ok).toBe(false);
+      if (!result.ok) {
+        expect(result.error.code).toBe(ErrorCodes.MM_PORT_IN_USE);
+        expect(result.error.message).toContain('Port conflict');
+      }
+    });
+
+    it('returns generic launch failed error for other errors', async () => {
+      const context = createMockContext({ hasActive: false });
+      vi.spyOn(context.sessionManager, 'launch').mockRejectedValue(
+        new Error('Browser failed to start'),
+      );
+
+      const result = await launchTool({ stateMode: 'default' }, context);
+
+      expect(result.ok).toBe(false);
+      if (!result.ok) {
+        expect(result.error.code).toBe(ErrorCodes.MM_LAUNCH_FAILED);
+        expect(result.error.message).toContain('Launch failed');
+        expect(result.error.message).toContain('Browser failed to start');
+      }
+    });
+
+    it('handles non-Error exceptions', async () => {
+      const context = createMockContext({ hasActive: false });
+      vi.spyOn(context.sessionManager, 'launch').mockRejectedValue(
+        'string error',
+      );
+
+      const result = await launchTool({ stateMode: 'default' }, context);
+
+      expect(result.ok).toBe(false);
+      if (!result.ok) {
+        expect(result.error.code).toBe(ErrorCodes.MM_LAUNCH_FAILED);
+        expect(result.error.message).toContain('Launch failed');
+      }
+    });
+  });
+});
diff --git a/src/tools/launch.ts b/src/tools/launch.ts
new file mode 100644
index 0000000..c4cfc51
--- /dev/null
+++ b/src/tools/launch.ts
@@ -0,0 +1,72 @@
+import type { LaunchInput, LaunchPrerequisite, LaunchResult } from './types';
+import { ErrorCodes } from './types';
+import { createToolError, createToolSuccess } from './utils.js';
+import type { ToolContext, ToolResponse } from '../types/http.js';
+import { extractErrorMessage } from '../utils';
+
+const PROD_MODE_PREREQUISITES: LaunchPrerequisite[] = [
+  {
+    step: 'Unlock Wallet',
+    description:
+      'The wallet must be unlocked before interacting with it. Use the extension UI to enter your password.',
+  },
+  {
+    step: 'Configure Network',
+    description:
+      'Ensure the correct network is selected (e.g., Ethereum Mainnet, Sepolia, or custom network).',
+  },
+  {
+    step: 'Set Up Accounts',
+    description:
+      'Import or create accounts as needed. Ensure the active account has sufficient funds for transactions.',
+  },
+];
+
+/**
+ * Launches a new browser session with the configured extension.
+ *
+ * @param input - The launch configuration options.
+ * @param context - The tool execution context.
+ * @returns The launch result with session details and prerequisites.
+ */
+export async function launchTool(
+  input: LaunchInput,
+  context: ToolContext,
+): Promise<ToolResponse<LaunchResult>> {
+  const { sessionManager } = context;
+
+  try {
+    if (sessionManager.hasActiveSession()) {
+      if (input.force) {
+        await sessionManager.cleanup();
+      } else {
+        return createToolError(
+          ErrorCodes.MM_SESSION_ALREADY_RUNNING,
+          'A session is already running. Call cleanup first, or use --force.',
+        );
+      }
+    }
+
+    const result = await sessionManager.launch(input);
+    const isProdMode = sessionManager.getEnvironmentMode() === 'prod';
+
+    return createToolSuccess({
+      ...result,
+      ...(isProdMode && { prerequisites: PROD_MODE_PREREQUISITES }),
+    });
+  } catch (error) {
+    const message = extractErrorMessage(error);
+
+    if (message.includes('EADDRINUSE') || message.includes('port')) {
+      return createToolError(
+        ErrorCodes.MM_PORT_IN_USE,
+        `Port conflict: ${message}`,
+      );
+    }
+
+    return createToolError(
+      ErrorCodes.MM_LAUNCH_FAILED,
+      `Launch failed: ${message}`,
+    );
+  }
+}
diff --git a/src/tools/navigation.test.ts b/src/tools/navigation.test.ts
new file mode 100644
index 0000000..c3ef76b
--- /dev/null
+++ b/src/tools/navigation.test.ts
@@ -0,0 +1,471 @@
+/**
+ * Unit tests for navigation tool handlers.
+ *
+ * Tests handleNavigate, handleWaitForNotification, handleSwitchToTab, and handleCloseTab
+ * with various navigation targets, tab operations, and error scenarios.
+ */
+
+import { describe, it, expect, vi } from 'vitest';
+
+import {
+  navigateTool,
+  waitForNotificationTool,
+  switchToTabTool,
+  closeTabTool,
+} from './navigation.js';
+import { createMockSessionManager } from './test-utils/mock-factories.js';
+import { ErrorCodes } from './types/errors.js';
+import type { ToolContext } from '../types/http.js';
+
+function createMockPage(url = 'about:blank') {
+  return {
+    url: vi.fn().mockReturnValue(url),
+    bringToFront: vi.fn().mockResolvedValue(undefined),
+    close: vi.fn().mockResolvedValue(undefined),
+  };
+}
+
+function createMockContext(
+  options: {
+    hasActive?: boolean;
+    page?: ReturnType<typeof createMockPage>;
+    trackedPages?: { page: unknown; role: string; url: string }[];
+  } = {},
+): ToolContext {
+  const page = options.page ?? createMockPage();
+  const sessionManager = createMockSessionManager({
+    hasActive: options.hasActive ?? true,
+    trackedPages: options.trackedPages as never,
+  });
+
+  return {
+    sessionManager,
+    page: page as never,
+    refMap: new Map(),
+    workflowContext: {},
+    knowledgeStore: {},
+  } as unknown as ToolContext;
+}
+
+describe('navigation', () => {
+  describe('navigateTool', () => {
+    it('navigates to home screen', async () => {
+      const page = createMockPage('chrome-extension://ext-123/home.html');
+      const context = createMockContext({ page });
+
+      const result = await navigateTool({ screen: 'home' }, context);
+
+      expect(result.ok).toBe(true);
+      if (result.ok) {
+        expect(result.result.navigated).toBe(true);
+        expect(result.result.currentUrl).toBe(
+          'chrome-extension://ext-123/home.html',
+        );
+      }
+      expect(context.sessionManager.navigateToHome).toHaveBeenCalled();
+    });
+
+    it('navigates to settings screen', async () => {
+      const page = createMockPage('chrome-extension://ext-123/settings.html');
+      const context = createMockContext({ page });
+
+      const result = await navigateTool({ screen: 'settings' }, context);
+
+      expect(result.ok).toBe(true);
+      if (result.ok) {
+        expect(result.result.currentUrl).toBe(
+          'chrome-extension://ext-123/settings.html',
+        );
+      }
+      expect(context.sessionManager.navigateToSettings).toHaveBeenCalled();
+    });
+
+    it('navigates to notification screen', async () => {
+      const page = createMockPage(
+        'chrome-extension://ext-123/notification.html',
+      );
+      const context = createMockContext({ page });
+
+      const result = await navigateTool({ screen: 'notification' }, context);
+
+      expect(result.ok).toBe(true);
+      if (result.ok) {
+        expect(result.result.currentUrl).toBe(
+          'chrome-extension://ext-123/notification.html',
+        );
+      }
+      expect(context.sessionManager.navigateToNotification).toHaveBeenCalled();
+    });
+
+    it('navigates to a custom URL', async () => {
+      const page = createMockPage('https://app.uniswap.org');
+      const context = createMockContext({ page });
+      vi.spyOn(context.sessionManager, 'navigateToUrl').mockResolvedValue(
+        page as never,
+      );
+
+      const result = await navigateTool(
+        { screen: 'url', url: 'https://app.uniswap.org' },
+        context,
+      );
+
+      expect(result.ok).toBe(true);
+      if (result.ok) {
+        expect(result.result.currentUrl).toBe('https://app.uniswap.org');
+      }
+      expect(context.sessionManager.navigateToUrl).toHaveBeenCalledWith(
+        'https://app.uniswap.org',
+      );
+    });
+
+    it('returns error when URL is missing', async () => {
+      const context = createMockContext();
+
+      const result = await navigateTool({ screen: 'url' } as never, context);
+
+      expect(result.ok).toBe(false);
+      if (!result.ok) {
+        expect(result.error.code).toBe(ErrorCodes.MM_INVALID_INPUT);
+        expect(result.error.message).toContain('url is required');
+      }
+    });
+
+    it('returns error for unknown screen', async () => {
+      const context = createMockContext();
+
+      const result = await navigateTool(
+        { screen: 'invalid' } as never,
+        context,
+      );
+
+      expect(result.ok).toBe(false);
+      if (!result.ok) {
+        expect(result.error.code).toBe(ErrorCodes.MM_INVALID_INPUT);
+        expect(result.error.message).toContain('Unknown screen');
+      }
+    });
+
+    it('classifies navigation failures', async () => {
+      const context = createMockContext();
+      vi.spyOn(context.sessionManager, 'navigateToHome').mockRejectedValue(
+        new Error('Navigation failed'),
+      );
+
+      const result = await navigateTool({ screen: 'home' }, context);
+
+      expect(result.ok).toBe(false);
+      if (!result.ok) {
+        expect(result.error.code).toBe(ErrorCodes.MM_NAVIGATION_FAILED);
+      }
+    });
+
+    it('returns no active session error when session is missing', async () => {
+      const context = createMockContext({ hasActive: false });
+
+      const result = await navigateTool({ screen: 'home' }, context);
+
+      expect(result.ok).toBe(false);
+      if (!result.ok) {
+        expect(result.error.code).toBe(ErrorCodes.MM_NO_ACTIVE_SESSION);
+      }
+    });
+  });
+
+  describe('waitForNotificationTool', () => {
+    it('waits for notification popup with default timeout', async () => {
+      const notificationPage = createMockPage(
+        'chrome-extension://ext-123/notification.html',
+      );
+      const context = createMockContext();
+      vi.spyOn(
+        context.sessionManager,
+        'waitForNotificationPage',
+      ).mockResolvedValue(notificationPage as never);
+
+      const result = await waitForNotificationTool({}, context);
+
+      expect(result.ok).toBe(true);
+      if (result.ok) {
+        expect(result.result.found).toBe(true);
+        expect(result.result.pageUrl).toBe(
+          'chrome-extension://ext-123/notification.html',
+        );
+      }
+      expect(
+        context.sessionManager.waitForNotificationPage,
+      ).toHaveBeenCalledWith(15000);
+    });
+
+    it('uses custom timeout value', async () => {
+      const notificationPage = createMockPage(
+        'chrome-extension://ext-123/notification.html',
+      );
+      const context = createMockContext();
+      vi.spyOn(
+        context.sessionManager,
+        'waitForNotificationPage',
+      ).mockResolvedValue(notificationPage as never);
+
+      const result = await waitForNotificationTool(
+        { timeoutMs: 30000 },
+        context,
+      );
+
+      expect(result.ok).toBe(true);
+      expect(
+        context.sessionManager.waitForNotificationPage,
+      ).toHaveBeenCalledWith(30000);
+    });
+
+    it('classifies notification timeout errors', async () => {
+      const context = createMockContext();
+      vi.spyOn(
+        context.sessionManager,
+        'waitForNotificationPage',
+      ).mockRejectedValue(new Error('Timeout 15000ms exceeded'));
+
+      const result = await waitForNotificationTool({}, context);
+
+      expect(result.ok).toBe(false);
+      if (!result.ok) {
+        expect(result.error.code).toBe(ErrorCodes.MM_NOTIFICATION_TIMEOUT);
+      }
+    });
+
+    it('returns no active session error when session is missing', async () => {
+      const context = createMockContext({ hasActive: false });
+
+      const result = await waitForNotificationTool({}, context);
+
+      expect(result.ok).toBe(false);
+      if (!result.ok) {
+        expect(result.error.code).toBe(ErrorCodes.MM_NO_ACTIVE_SESSION);
+      }
+    });
+  });
+
+  describe('switchToTabTool', () => {
+    it('switches to tab by role', async () => {
+      const extensionPage = createMockPage(
+        'chrome-extension://ext-123/home.html',
+      );
+      const dappPage = createMockPage('https://app.uniswap.org');
+      const context = createMockContext({
+        page: extensionPage,
+        trackedPages: [
+          {
+            page: extensionPage,
+            role: 'extension',
+            url: 'chrome-extension://ext-123/home.html',
+          },
+          { page: dappPage, role: 'dapp', url: 'https://app.uniswap.org' },
+        ],
+      });
+
+      const result = await switchToTabTool({ role: 'dapp' }, context);
+
+      expect(result.ok).toBe(true);
+      if (result.ok) {
+        expect(result.result.switched).toBe(true);
+        expect(result.result.activeTab.role).toBe('dapp');
+        expect(result.result.activeTab.url).toBe('https://app.uniswap.org');
+      }
+      expect(dappPage.bringToFront).toHaveBeenCalled();
+      expect(context.sessionManager.setActivePage).toHaveBeenCalledWith(
+        dappPage,
+      );
+    });
+
+    it('switches to tab by URL prefix', async () => {
+      const extensionPage = createMockPage(
+        'chrome-extension://ext-123/home.html',
+      );
+      const dappPage = createMockPage('https://app.uniswap.org/swap');
+      const context = createMockContext({
+        page: extensionPage,
+        trackedPages: [
+          {
+            page: extensionPage,
+            role: 'extension',
+            url: 'chrome-extension://ext-123/home.html',
+          },
+          {
+            page: dappPage,
+            role: 'dapp',
+            url: 'https://app.uniswap.org/swap',
+          },
+        ],
+      });
+
+      const result = await switchToTabTool(
+        { url: 'https://app.uniswap.org' },
+        context,
+      );
+
+      expect(result.ok).toBe(true);
+      if (result.ok) {
+        expect(result.result.activeTab.url).toBe(
+          'https://app.uniswap.org/swap',
+        );
+      }
+      expect(dappPage.bringToFront).toHaveBeenCalled();
+    });
+
+    it('returns invalid input when neither role nor url is provided', async () => {
+      const context = createMockContext();
+
+      const result = await switchToTabTool({} as never, context);
+
+      expect(result.ok).toBe(false);
+      if (!result.ok) {
+        expect(result.error.code).toBe(ErrorCodes.MM_INVALID_INPUT);
+      }
+    });
+
+    it('returns tab not found when no matching tab exists', async () => {
+      const extensionPage = createMockPage(
+        'chrome-extension://ext-123/home.html',
+      );
+      const context = createMockContext({
+        trackedPages: [
+          {
+            page: extensionPage,
+            role: 'extension',
+            url: 'chrome-extension://ext-123/home.html',
+          },
+        ],
+      });
+
+      const result = await switchToTabTool({ role: 'dapp' }, context);
+
+      expect(result.ok).toBe(false);
+      if (!result.ok) {
+        expect(result.error.code).toBe(ErrorCodes.MM_TAB_NOT_FOUND);
+        expect(result.error.message).toContain('No tab found matching: dapp');
+      }
+    });
+  });
+
+  describe('closeTabTool', () => {
+    it('closes tab by role', async () => {
+      const extensionPage = createMockPage(
+        'chrome-extension://ext-123/home.html',
+      );
+      const dappPage = createMockPage('https://app.uniswap.org');
+      const context = createMockContext({
+        page: extensionPage,
+        trackedPages: [
+          {
+            page: extensionPage,
+            role: 'extension',
+            url: 'chrome-extension://ext-123/home.html',
+          },
+          { page: dappPage, role: 'dapp', url: 'https://app.uniswap.org' },
+        ],
+      });
+
+      const result = await closeTabTool({ role: 'dapp' }, context);
+
+      expect(result.ok).toBe(true);
+      if (result.ok) {
+        expect(result.result.closed).toBe(true);
+        expect(result.result.closedUrl).toBe('https://app.uniswap.org');
+      }
+      expect(dappPage.close).toHaveBeenCalled();
+    });
+
+    it('closes tab by URL prefix', async () => {
+      const extensionPage = createMockPage(
+        'chrome-extension://ext-123/home.html',
+      );
+      const dappPage = createMockPage('https://app.uniswap.org/swap');
+      const context = createMockContext({
+        page: extensionPage,
+        trackedPages: [
+          {
+            page: extensionPage,
+            role: 'extension',
+            url: 'chrome-extension://ext-123/home.html',
+          },
+          {
+            page: dappPage,
+            role: 'dapp',
+            url: 'https://app.uniswap.org/swap',
+          },
+        ],
+      });
+
+      const result = await closeTabTool(
+        { url: 'https://app.uniswap.org' },
+        context,
+      );
+
+      expect(result.ok).toBe(true);
+      if (result.ok) {
+        expect(result.result.closedUrl).toBe('https://app.uniswap.org/swap');
+      }
+      expect(dappPage.close).toHaveBeenCalled();
+    });
+
+    it('switches to extension tab when closing the active tab', async () => {
+      const extensionPage = createMockPage(
+        'chrome-extension://ext-123/home.html',
+      );
+      const dappPage = createMockPage('https://app.uniswap.org');
+      const context = createMockContext({
+        page: dappPage,
+        trackedPages: [
+          {
+            page: extensionPage,
+            role: 'extension',
+            url: 'chrome-extension://ext-123/home.html',
+          },
+          { page: dappPage, role: 'dapp', url: 'https://app.uniswap.org' },
+        ],
+      });
+
+      const result = await closeTabTool({ role: 'dapp' }, context);
+
+      expect(result.ok).toBe(true);
+      expect(extensionPage.bringToFront).toHaveBeenCalled();
+      expect(context.sessionManager.setActivePage).toHaveBeenCalledWith(
+        extensionPage,
+      );
+      expect(dappPage.close).toHaveBeenCalled();
+    });
+
+    it('returns invalid input when neither role nor url is provided', async () => {
+      const context = createMockContext();
+
+      const result = await closeTabTool({} as never, context);
+
+      expect(result.ok).toBe(false);
+      if (!result.ok) {
+        expect(result.error.code).toBe(ErrorCodes.MM_INVALID_INPUT);
+      }
+    });
+
+    it('returns tab not found when no matching tab exists', async () => {
+      const extensionPage = createMockPage(
+        'chrome-extension://ext-123/home.html',
+      );
+      const context = createMockContext({
+        trackedPages: [
+          {
+            page: extensionPage,
+            role: 'extension',
+            url: 'chrome-extension://ext-123/home.html',
+          },
+        ],
+      });
+
+      const result = await closeTabTool({ role: 'dapp' }, context);
+
+      expect(result.ok).toBe(false);
+      if (!result.ok) {
+        expect(result.error.code).toBe(ErrorCodes.MM_TAB_NOT_FOUND);
+        expect(result.error.message).toContain('No tab found matching: dapp');
+      }
+    });
+  });
+});
diff --git a/src/tools/navigation.ts b/src/tools/navigation.ts
new file mode 100644
index 0000000..5844a5f
--- /dev/null
+++ b/src/tools/navigation.ts
@@ -0,0 +1,247 @@
+import {
+  classifyNavigationError,
+  classifyNotificationError,
+  classifyTabError,
+} from './error-classification.js';
+import type {
+  CloseTabInput,
+  CloseTabResult,
+  NavigateInput,
+  NavigateResult,
+  SwitchToTabInput,
+  SwitchToTabResult,
+  WaitForNotificationInput,
+  WaitForNotificationResult,
+} from './types';
+import { ErrorCodes } from './types';
+import { DEFAULT_INTERACTION_TIMEOUT_MS } from './utils/constants.js';
+import {
+  createToolError,
+  createToolSuccess,
+  requireActiveSession,
+} from './utils.js';
+import type { ToolContext, ToolResponse } from '../types/http.js';
+
+/**
+ * Navigates the browser to a specified screen or URL.
+ *
+ * @param input - The navigation target screen and optional URL.
+ * @param context - The tool execution context.
+ * @returns The navigation result with the current URL.
+ */
+export async function navigateTool(
+  input: NavigateInput,
+  context: ToolContext,
+): Promise<ToolResponse<NavigateResult>> {
+  const missingSession = requireActiveSession<NavigateResult>(context);
+  if (missingSession) {
+    return missingSession;
+  }
+
+  if (input.screen === 'url' && !input.url) {
+    return createToolError(
+      ErrorCodes.MM_INVALID_INPUT,
+      'url is required when screen is "url"',
+    );
+  }
+
+  const validScreens = ['home', 'settings', 'url', 'notification'];
+  if (!validScreens.includes(input.screen)) {
+    return createToolError(
+      ErrorCodes.MM_INVALID_INPUT,
+      `Unknown screen: ${String(input.screen)}`,
+    );
+  }
+
+  try {
+    switch (input.screen) {
+      case 'home':
+        await context.sessionManager.navigateToHome();
+        break;
+      case 'settings':
+        await context.sessionManager.navigateToSettings();
+        break;
+      case 'url':
+        await context.sessionManager.navigateToUrl(input.url as string);
+        break;
+      case 'notification':
+        await context.sessionManager.navigateToNotification();
+        break;
+      default:
+        throw new Error(`Unsupported screen: ${String(input.screen)}`);
+    }
+
+    return createToolSuccess({
+      navigated: true,
+      currentUrl: context.page.url(),
+    });
+  } catch (error) {
+    const errorInfo = classifyNavigationError(error);
+    return createToolError(errorInfo.code, errorInfo.message);
+  }
+}
+
+/**
+ * Waits for a notification page to appear within a timeout.
+ *
+ * @param input - The notification wait options including timeout.
+ * @param context - The tool execution context.
+ * @returns The notification page URL when found.
+ */
+export async function waitForNotificationTool(
+  input: WaitForNotificationInput,
+  context: ToolContext,
+): Promise<ToolResponse<WaitForNotificationResult>> {
+  const missingSession =
+    requireActiveSession<WaitForNotificationResult>(context);
+  if (missingSession) {
+    return missingSession;
+  }
+
+  const timeoutMs = input.timeoutMs ?? DEFAULT_INTERACTION_TIMEOUT_MS;
+
+  try {
+    const notificationPage =
+      await context.sessionManager.waitForNotificationPage(timeoutMs);
+
+    return createToolSuccess({
+      found: true,
+      pageUrl: notificationPage.url(),
+    });
+  } catch (error) {
+    const errorInfo = classifyNotificationError(error);
+    return createToolError(errorInfo.code, errorInfo.message);
+  }
+}
+
+/**
+ * Switches the active page to a tab matching the given role or URL.
+ *
+ * @param input - The tab selection criteria (role or URL).
+ * @param context - The tool execution context.
+ * @returns The active tab info after switching.
+ */
+export async function switchToTabTool(
+  input: SwitchToTabInput,
+  context: ToolContext,
+): Promise<ToolResponse<SwitchToTabResult>> {
+  const missingSession = requireActiveSession<SwitchToTabResult>(context);
+  if (missingSession) {
+    return missingSession;
+  }
+
+  if (!input.role && !input.url) {
+    return createToolError(
+      ErrorCodes.MM_INVALID_INPUT,
+      'Either role or url must be provided',
+    );
+  }
+
+  try {
+    const trackedPages = context.sessionManager.getTrackedPages();
+    const targetPage = trackedPages.find((trackedPage) => {
+      if (input.role) {
+        return trackedPage.role === input.role;
+      }
+      if (input.url) {
+        return trackedPage.url.startsWith(input.url);
+      }
+      return false;
+    });
+
+    if (!targetPage) {
+      const availableTabs = trackedPages.map((trackedPage) => ({
+        role: trackedPage.role,
+        url: trackedPage.url,
+      }));
+      throw new Error(
+        `No tab found matching: ${input.role ?? input.url}. Available tabs: ${JSON.stringify(availableTabs)}`,
+      );
+    }
+
+    await targetPage.page.bringToFront();
+    context.sessionManager.setActivePage(targetPage.page);
+
+    const activeTabInfo = context.sessionManager
+      .getTrackedPages()
+      .find((trackedPage) => trackedPage.page === targetPage.page);
+
+    return createToolSuccess({
+      switched: true,
+      activeTab: {
+        role: activeTabInfo?.role ?? 'other',
+        url: targetPage.page.url(),
+      },
+    });
+  } catch (error) {
+    const errorInfo = classifyTabError(error);
+    return createToolError(errorInfo.code, errorInfo.message);
+  }
+}
+
+/**
+ * Closes a browser tab matching the given role or URL.
+ *
+ * @param input - The tab selection criteria (role or URL).
+ * @param context - The tool execution context.
+ * @returns The close result with the closed tab URL.
+ */
+export async function closeTabTool(
+  input: CloseTabInput,
+  context: ToolContext,
+): Promise<ToolResponse<CloseTabResult>> {
+  const missingSession = requireActiveSession<CloseTabResult>(context);
+  if (missingSession) {
+    return missingSession;
+  }
+
+  if (!input.role && !input.url) {
+    return createToolError(
+      ErrorCodes.MM_INVALID_INPUT,
+      'Either role or url must be provided',
+    );
+  }
+
+  try {
+    const trackedPages = context.sessionManager.getTrackedPages();
+    const targetPage = trackedPages.find((trackedPage) => {
+      if (input.role) {
+        return trackedPage.role === input.role;
+      }
+      if (input.url) {
+        return trackedPage.url.startsWith(input.url);
+      }
+      return false;
+    });
+
+    if (!targetPage) {
+      throw new Error(`No tab found matching: ${input.role ?? input.url}`);
+    }
+
+    const closedUrl = targetPage.url;
+
+    if (targetPage.page === context.page) {
+      const otherPages = trackedPages.filter(
+        (trackedPage) => trackedPage.page !== targetPage.page,
+      );
+      const fallbackPage =
+        otherPages.find((trackedPage) => trackedPage.role === 'extension') ??
+        otherPages[0];
+
+      if (fallbackPage) {
+        await fallbackPage.page.bringToFront();
+        context.sessionManager.setActivePage(fallbackPage.page);
+      }
+    }
+
+    await targetPage.page.close();
+
+    return createToolSuccess({
+      closed: true,
+      closedUrl,
+    });
+  } catch (error) {
+    const errorInfo = classifyTabError(error);
+    return createToolError(errorInfo.code, errorInfo.message);
+  }
+}
diff --git a/src/tools/registry.test.ts b/src/tools/registry.test.ts
new file mode 100644
index 0000000..345df07
--- /dev/null
+++ b/src/tools/registry.test.ts
@@ -0,0 +1,48 @@
+import { describe, expect, it } from 'vitest';
+
+import { toolRegistry } from './registry.js';
+
+describe('toolRegistry', () => {
+  it('has expected tool entries', () => {
+    const expectedTools = [
+      'build',
+      'launch',
+      'cleanup',
+      'click',
+      'type',
+      'navigate',
+      'screenshot',
+      'describe_screen',
+      'clipboard',
+      'run_steps',
+    ];
+
+    for (const toolName of expectedTools) {
+      expect(toolRegistry.has(toolName)).toBe(true);
+    }
+  });
+
+  it('returns a function for launch', () => {
+    expect(typeof toolRegistry.get('launch')).toBe('function');
+  });
+
+  it('returns undefined for a nonexistent tool', () => {
+    expect(toolRegistry.get('nonexistent')).toBeUndefined();
+  });
+
+  it('has the expected number of entries', () => {
+    expect(toolRegistry.size).toBe(27);
+  });
+
+  it('stores only functions as values', () => {
+    for (const handler of toolRegistry.values()) {
+      expect(typeof handler).toBe('function');
+    }
+  });
+
+  it('uses unprefixed keys', () => {
+    for (const key of toolRegistry.keys()) {
+      expect(key.startsWith('mm_')).toBe(false);
+    }
+  });
+});
diff --git a/src/tools/registry.ts b/src/tools/registry.ts
new file mode 100644
index 0000000..cecfe43
--- /dev/null
+++ b/src/tools/registry.ts
@@ -0,0 +1,67 @@
+import { runStepsTool } from './batch.js';
+import { buildTool } from './build.js';
+import { cleanupTool } from './cleanup.js';
+import { clipboardTool } from './clipboard.js';
+import { getContextTool, setContextTool } from './context.js';
+import {
+  accessibilitySnapshotTool,
+  describeScreenTool,
+  listTestIdsTool,
+} from './discovery-tools.js';
+import { clickTool, typeTool, waitForTool } from './interaction.js';
+import {
+  knowledgeLastTool,
+  knowledgeSearchTool,
+  knowledgeSessionsTool,
+  knowledgeSummarizeTool,
+} from './knowledge.js';
+import { launchTool } from './launch.js';
+import {
+  closeTabTool,
+  navigateTool,
+  switchToTabTool,
+  waitForNotificationTool,
+} from './navigation.js';
+import { screenshotTool } from './screenshot.js';
+import {
+  getContractAddressTool,
+  listContractsTool,
+  seedContractTool,
+  seedContractsTool,
+} from './seeding.js';
+import { getStateTool } from './state.js';
+import type { ToolFunction } from '../types/http.js';
+
+// holds tools with heterogeneous parameter types. TypeScript's contravariant
+// function parameters prevent assigning ToolFunction<SpecificInput, ...> to
+// ToolFunction<unknown, ...>, so `any` is the standard pattern for type-erased
+// function maps. Input safety is enforced at the Zod validation boundary.
+export const toolRegistry = new Map<string, ToolFunction<any, any>>([
+  ['build', buildTool],
+  ['launch', launchTool],
+  ['cleanup', cleanupTool],
+  ['get_state', getStateTool],
+  ['navigate', navigateTool],
+  ['wait_for_notification', waitForNotificationTool],
+  ['switch_to_tab', switchToTabTool],
+  ['close_tab', closeTabTool],
+  ['list_testids', listTestIdsTool],
+  ['accessibility_snapshot', accessibilitySnapshotTool],
+  ['describe_screen', describeScreenTool],
+  ['screenshot', screenshotTool],
+  ['click', clickTool],
+  ['type', typeTool],
+  ['wait_for', waitForTool],
+  ['knowledge_last', knowledgeLastTool],
+  ['knowledge_search', knowledgeSearchTool],
+  ['knowledge_summarize', knowledgeSummarizeTool],
+  ['knowledge_sessions', knowledgeSessionsTool],
+  ['seed_contract', seedContractTool],
+  ['seed_contracts', seedContractsTool],
+  ['get_contract_address', getContractAddressTool],
+  ['list_contracts', listContractsTool],
+  ['run_steps', runStepsTool],
+  ['set_context', setContextTool],
+  ['get_context', getContextTool],
+  ['clipboard', clipboardTool],
+]);
diff --git a/src/tools/screenshot.test.ts b/src/tools/screenshot.test.ts
new file mode 100644
index 0000000..994c269
--- /dev/null
+++ b/src/tools/screenshot.test.ts
@@ -0,0 +1,242 @@
+/**
+ * Unit tests for screenshot tool handler.
+ *
+ * Tests screenshotTool with various options including base64 encoding,
+ * selector scoping, and error handling.
+ */
+
+import { describe, it, expect, vi } from 'vitest';
+
+import { screenshotTool } from './screenshot.js';
+import { createMockSessionManager } from './test-utils/mock-factories.js';
+import { ErrorCodes } from './types/errors.js';
+import type { ToolContext } from '../types/http.js';
+
+function createMockContext(
+  options: {
+    hasActive?: boolean;
+  } = {},
+): ToolContext {
+  const { hasActive = true } = options;
+
+  return {
+    sessionManager: createMockSessionManager({ hasActive }),
+    page: {} as ToolContext['page'],
+    refMap: new Map(),
+    workflowContext: {},
+    knowledgeStore: {},
+  } as unknown as ToolContext;
+}
+
+describe('screenshotTool', () => {
+  describe('basic screenshot', () => {
+    it('captures full page screenshot by default', async () => {
+      const context = createMockContext();
+
+      vi.spyOn(context.sessionManager, 'screenshot').mockResolvedValue({
+        path: '/path/to/screenshot.png',
+        width: 1280,
+        height: 720,
+        base64: 'mock-base64',
+      });
+
+      const result = await screenshotTool({ name: 'test-screenshot' }, context);
+
+      expect(result.ok).toBe(true);
+      if (result.ok) {
+        expect(result.result.path).toBe('/path/to/screenshot.png');
+        expect(result.result.width).toBe(1280);
+        expect(result.result.height).toBe(720);
+        expect(result.result.base64).toBeUndefined();
+      }
+      expect(context.sessionManager.screenshot).toHaveBeenCalledWith({
+        name: 'test-screenshot',
+        fullPage: true,
+        selector: undefined,
+      });
+    });
+
+    it('captures viewport-only screenshot when fullPage is false', async () => {
+      const context = createMockContext();
+
+      vi.spyOn(context.sessionManager, 'screenshot').mockResolvedValue({
+        path: '/path/to/screenshot.png',
+        width: 1280,
+        height: 720,
+        base64: 'mock-base64',
+      });
+
+      const result = await screenshotTool(
+        {
+          name: 'viewport-screenshot',
+          fullPage: false,
+        },
+        context,
+      );
+
+      expect(result.ok).toBe(true);
+      expect(context.sessionManager.screenshot).toHaveBeenCalledWith({
+        name: 'viewport-screenshot',
+        fullPage: false,
+        selector: undefined,
+      });
+    });
+  });
+
+  describe('with base64 encoding', () => {
+    it('includes base64 when includeBase64 is true', async () => {
+      const context = createMockContext();
+
+      vi.spyOn(context.sessionManager, 'screenshot').mockResolvedValue({
+        path: '/path/to/screenshot.png',
+        width: 1280,
+        height: 720,
+        base64:
+          'iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNk+M9QDwADhgGAWjR9awAAAABJRU5ErkJggg==',
+      });
+
+      const result = await screenshotTool(
+        {
+          name: 'base64-screenshot',
+          includeBase64: true,
+        },
+        context,
+      );
+
+      expect(result.ok).toBe(true);
+      if (result.ok) {
+        expect(result.result.base64).toBe(
+          'iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNk+M9QDwADhgGAWjR9awAAAABJRU5ErkJggg==',
+        );
+      }
+    });
+
+    it('excludes base64 when includeBase64 is false', async () => {
+      const context = createMockContext();
+
+      vi.spyOn(context.sessionManager, 'screenshot').mockResolvedValue({
+        path: '/path/to/screenshot.png',
+        width: 1280,
+        height: 720,
+        base64:
+          'iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNk+M9QDwADhgGAWjR9awAAAABJRU5ErkJggg==',
+      });
+
+      const result = await screenshotTool(
+        {
+          name: 'no-base64-screenshot',
+          includeBase64: false,
+        },
+        context,
+      );
+
+      expect(result.ok).toBe(true);
+      if (result.ok) {
+        expect(result.result.base64).toBeUndefined();
+      }
+    });
+  });
+
+  describe('with selector scoping', () => {
+    it('captures screenshot of specific element', async () => {
+      const context = createMockContext();
+
+      vi.spyOn(context.sessionManager, 'screenshot').mockResolvedValue({
+        path: '/path/to/element-screenshot.png',
+        width: 400,
+        height: 200,
+        base64: 'mock-base64',
+      });
+
+      const result = await screenshotTool(
+        {
+          name: 'element-screenshot',
+          selector: '[data-testid="account-menu"]',
+        },
+        context,
+      );
+
+      expect(result.ok).toBe(true);
+      if (result.ok) {
+        expect(result.result.width).toBe(400);
+        expect(result.result.height).toBe(200);
+      }
+      expect(context.sessionManager.screenshot).toHaveBeenCalledWith({
+        name: 'element-screenshot',
+        fullPage: true,
+        selector: '[data-testid="account-menu"]',
+      });
+    });
+
+    it('combines selector with fullPage false', async () => {
+      const context = createMockContext();
+
+      vi.spyOn(context.sessionManager, 'screenshot').mockResolvedValue({
+        path: '/path/to/element-screenshot.png',
+        width: 400,
+        height: 200,
+        base64: 'mock-base64',
+      });
+
+      const result = await screenshotTool(
+        {
+          name: 'element-viewport-screenshot',
+          selector: '.modal-content',
+          fullPage: false,
+        },
+        context,
+      );
+
+      expect(result.ok).toBe(true);
+      expect(context.sessionManager.screenshot).toHaveBeenCalledWith({
+        name: 'element-viewport-screenshot',
+        fullPage: false,
+        selector: '.modal-content',
+      });
+    });
+  });
+
+  describe('error handling', () => {
+    it('returns error when no active session', async () => {
+      const context = createMockContext({ hasActive: false });
+
+      const result = await screenshotTool({ name: 'test-screenshot' }, context);
+
+      expect(result.ok).toBe(false);
+      if (!result.ok) {
+        expect(result.error.code).toBe(ErrorCodes.MM_NO_ACTIVE_SESSION);
+      }
+    });
+
+    it('returns error when screenshot fails', async () => {
+      const context = createMockContext();
+
+      vi.spyOn(context.sessionManager, 'screenshot').mockRejectedValue(
+        new Error('Screenshot failed'),
+      );
+
+      const result = await screenshotTool({ name: 'test-screenshot' }, context);
+
+      expect(result.ok).toBe(false);
+      if (!result.ok) {
+        expect(result.error.code).toBe(ErrorCodes.MM_SCREENSHOT_FAILED);
+        expect(result.error.message).toContain('Screenshot failed');
+      }
+    });
+
+    it('returns error when page is closed', async () => {
+      const context = createMockContext();
+
+      vi.spyOn(context.sessionManager, 'screenshot').mockRejectedValue(
+        new Error('Target page, context or browser has been closed'),
+      );
+
+      const result = await screenshotTool({ name: 'test-screenshot' }, context);
+
+      expect(result.ok).toBe(false);
+      if (!result.ok) {
+        expect(result.error.code).toBe(ErrorCodes.MM_PAGE_CLOSED);
+      }
+    });
+  });
+});
diff --git a/src/tools/screenshot.ts b/src/tools/screenshot.ts
new file mode 100644
index 0000000..5a842c4
--- /dev/null
+++ b/src/tools/screenshot.ts
@@ -0,0 +1,49 @@
+import { classifyScreenshotError } from './error-classification.js';
+import type { ScreenshotInput, ScreenshotToolResult } from './types';
+import {
+  createToolError,
+  createToolSuccess,
+  requireActiveSession,
+} from './utils.js';
+import type { ToolContext, ToolResponse } from '../types/http.js';
+
+/**
+ * Captures a screenshot of the current page.
+ *
+ * @param input - The screenshot options including name, selector, and base64 flag.
+ * @param context - The tool execution context.
+ * @returns The screenshot metadata and optional base64 data.
+ */
+export async function screenshotTool(
+  input: ScreenshotInput,
+  context: ToolContext,
+): Promise<ToolResponse<ScreenshotToolResult>> {
+  const missingSession = requireActiveSession<ScreenshotToolResult>(context);
+  if (missingSession) {
+    return missingSession;
+  }
+
+  try {
+    const screenshotName = input.name ?? `screenshot-${Date.now()}`;
+    const result = await context.sessionManager.screenshot({
+      name: screenshotName,
+      fullPage: input.fullPage ?? true,
+      selector: input.selector,
+    });
+
+    const response: ScreenshotToolResult = {
+      path: result.path,
+      width: result.width,
+      height: result.height,
+    };
+
+    if (input.includeBase64) {
+      response.base64 = result.base64;
+    }
+
+    return createToolSuccess(response);
+  } catch (error) {
+    const errorInfo = classifyScreenshotError(error);
+    return createToolError(errorInfo.code, errorInfo.message);
+  }
+}
diff --git a/src/tools/seeding.test.ts b/src/tools/seeding.test.ts
new file mode 100644
index 0000000..28cae73
--- /dev/null
+++ b/src/tools/seeding.test.ts
@@ -0,0 +1,346 @@
+/**
+ * Unit tests for seeding tool handlers.
+ *
+ * Tests contract deployment handlers including single/multiple contract deployment,
+ * address lookup, and contract listing with ContractSeedingCapability.
+ */
+
+import { describe, it, expect, vi } from 'vitest';
+
+import {
+  seedContractTool,
+  seedContractsTool,
+  getContractAddressTool,
+  listContractsTool,
+} from './seeding.js';
+import { createMockSessionManager } from './test-utils/mock-factories.js';
+import { ErrorCodes } from './types';
+import type { ContractSeedingCapability } from '../capabilities/types.js';
+import type { ToolContext } from '../types/http.js';
+
+function createMockSeedingCapability(): ContractSeedingCapability {
+  return {
+    deployContract: vi.fn(),
+    deployContracts: vi.fn(),
+    getContractAddress: vi.fn(),
+    listDeployedContracts: vi.fn(),
+    getAvailableContracts: vi.fn(),
+    clearRegistry: vi.fn(),
+    initialize: vi.fn(),
+  };
+}
+
+function createMockContext(
+  options: {
+    hasActive?: boolean;
+    workflowCapability?: ContractSeedingCapability;
+    sessionCapability?: ContractSeedingCapability;
+  } = {},
+): ToolContext {
+  const { hasActive = true, workflowCapability, sessionCapability } = options;
+
+  const sessionManager = createMockSessionManager({ hasActive });
+  sessionManager.getContractSeedingCapability.mockReturnValue(
+    sessionCapability,
+  );
+
+  return {
+    sessionManager,
+    page: {} as ToolContext['page'],
+    refMap: new Map(),
+    workflowContext: {
+      config: {
+        environment: 'e2e',
+        extensionName: 'MetaMask',
+      },
+      contractSeeding: workflowCapability,
+    },
+    knowledgeStore: {} as ToolContext['knowledgeStore'],
+    toolRegistry: new Map(),
+  } as unknown as ToolContext;
+}
+
+describe('seeding tools', () => {
+  describe('seedContractTool', () => {
+    it('deploys a single contract using workflowContext capability', async () => {
+      const deployedAt = new Date().toISOString();
+      const capability = createMockSeedingCapability();
+      vi.spyOn(capability, 'deployContract').mockResolvedValue({
+        name: 'hst',
+        address: '0x1234567890123456789012345678901234567890',
+        deployedAt,
+      });
+      const context = createMockContext({ workflowCapability: capability });
+
+      const result = await seedContractTool({ contractName: 'hst' }, context);
+
+      expect(result.ok).toBe(true);
+      if (result.ok) {
+        expect(result.result).toStrictEqual({
+          contractName: 'hst',
+          contractAddress: '0x1234567890123456789012345678901234567890',
+          deployedAt,
+        });
+      }
+      expect(capability.deployContract).toHaveBeenCalledWith('hst', {
+        hardfork: undefined,
+        deployerOptions: undefined,
+      });
+      expect(
+        context.sessionManager.getContractSeedingCapability,
+      ).not.toHaveBeenCalled();
+    });
+
+    it('falls back to session manager capability when workflowContext lacks one', async () => {
+      const deployedAt = new Date().toISOString();
+      const capability = createMockSeedingCapability();
+      vi.spyOn(capability, 'deployContract').mockResolvedValue({
+        name: 'nfts',
+        address: '0xabcdefabcdefabcdefabcdefabcdefabcdefabcd',
+        deployedAt,
+      });
+      const context = createMockContext({ sessionCapability: capability });
+
+      const result = await seedContractTool(
+        { contractName: 'nfts', hardfork: 'shanghai' },
+        context,
+      );
+
+      expect(result.ok).toBe(true);
+      expect(capability.deployContract).toHaveBeenCalledWith('nfts', {
+        hardfork: 'shanghai',
+        deployerOptions: undefined,
+      });
+      expect(
+        context.sessionManager.getContractSeedingCapability,
+      ).toHaveBeenCalled();
+    });
+
+    it('returns contract not found errors from deployment failures', async () => {
+      const capability = createMockSeedingCapability();
+      vi.spyOn(capability, 'deployContract').mockRejectedValue(
+        new Error('Contract not found: unknown'),
+      );
+      const context = createMockContext({ workflowCapability: capability });
+
+      const result = await seedContractTool({ contractName: 'hst' }, context);
+
+      expect(result.ok).toBe(false);
+      if (!result.ok) {
+        expect(result.error.code).toBe(ErrorCodes.MM_CONTRACT_NOT_FOUND);
+        expect(result.error.message).toContain('Contract not found');
+      }
+    });
+  });
+
+  describe('seedContractsTool', () => {
+    it('deploys multiple contracts and maps deployed and failed results', async () => {
+      const deployedAt = new Date().toISOString();
+      const capability = createMockSeedingCapability();
+      vi.spyOn(capability, 'deployContracts').mockResolvedValue({
+        deployed: [
+          {
+            name: 'hst',
+            address: '0x1234567890123456789012345678901234567890',
+            deployedAt,
+          },
+        ],
+        failed: [
+          {
+            name: 'nfts',
+            error: 'Contract deployment failed',
+          },
+        ],
+      });
+      const context = createMockContext({ workflowCapability: capability });
+
+      const result = await seedContractsTool(
+        { contracts: ['hst', 'nfts'] },
+        context,
+      );
+
+      expect(result.ok).toBe(true);
+      if (result.ok) {
+        expect(result.result).toStrictEqual({
+          deployed: [
+            {
+              contractName: 'hst',
+              contractAddress: '0x1234567890123456789012345678901234567890',
+              deployedAt,
+            },
+          ],
+          failed: [
+            {
+              contractName: 'nfts',
+              error: 'Contract deployment failed',
+            },
+          ],
+        });
+      }
+      expect(capability.deployContracts).toHaveBeenCalledWith(['hst', 'nfts'], {
+        hardfork: undefined,
+      });
+    });
+
+    it('returns seed failed errors for complete deployment failures', async () => {
+      const capability = createMockSeedingCapability();
+      vi.spyOn(capability, 'deployContracts').mockRejectedValue(
+        new Error('Anvil not running'),
+      );
+      const context = createMockContext({ workflowCapability: capability });
+
+      const result = await seedContractsTool({ contracts: ['hst'] }, context);
+
+      expect(result.ok).toBe(false);
+      if (!result.ok) {
+        expect(result.error.code).toBe(ErrorCodes.MM_SEED_FAILED);
+        expect(result.error.message).toContain('Anvil not running');
+      }
+    });
+  });
+
+  describe('getContractAddressTool', () => {
+    it('returns the contract address when found', async () => {
+      const capability = createMockSeedingCapability();
+      vi.spyOn(capability, 'getContractAddress').mockReturnValue(
+        '0x1234567890123456789012345678901234567890',
+      );
+      const context = createMockContext({ workflowCapability: capability });
+
+      const result = await getContractAddressTool(
+        { contractName: 'hst' },
+        context,
+      );
+
+      expect(result.ok).toBe(true);
+      if (result.ok) {
+        expect(result.result).toStrictEqual({
+          contractName: 'hst',
+          contractAddress: '0x1234567890123456789012345678901234567890',
+        });
+      }
+      expect(capability.getContractAddress).toHaveBeenCalledWith('hst');
+    });
+
+    it('returns null when the contract address is missing', async () => {
+      const capability = createMockSeedingCapability();
+      vi.spyOn(capability, 'getContractAddress').mockReturnValue(null);
+      const context = createMockContext({ workflowCapability: capability });
+
+      const result = await getContractAddressTool(
+        { contractName: 'nfts' },
+        context,
+      );
+
+      expect(result.ok).toBe(true);
+      if (result.ok) {
+        expect(result.result).toStrictEqual({
+          contractName: 'nfts',
+          contractAddress: null,
+        });
+      }
+    });
+
+    it('returns error when getContractAddress throws', async () => {
+      const capability = createMockSeedingCapability();
+      vi.spyOn(capability, 'getContractAddress').mockImplementation(() => {
+        throw new Error('Connection lost');
+      });
+      const context = createMockContext({ workflowCapability: capability });
+
+      const result = await getContractAddressTool(
+        { contractName: 'hst' },
+        context,
+      );
+
+      expect(result.ok).toBe(false);
+      if (!result.ok) {
+        expect(result.error.code).toBe(ErrorCodes.MM_SEED_FAILED);
+        expect(result.error.message).toContain('Connection lost');
+      }
+    });
+  });
+
+  describe('listContractsTool', () => {
+    it('returns the list of deployed contracts', async () => {
+      const deployedAt1 = new Date().toISOString();
+      const deployedAt2 = new Date(Date.now() + 1000).toISOString();
+      const capability = createMockSeedingCapability();
+      vi.spyOn(capability, 'listDeployedContracts').mockReturnValue([
+        {
+          name: 'hst',
+          address: '0x1234567890123456789012345678901234567890',
+          deployedAt: deployedAt1,
+        },
+        {
+          name: 'nfts',
+          address: '0xabcdefabcdefabcdefabcdefabcdefabcdefabcd',
+          deployedAt: deployedAt2,
+        },
+      ]);
+      const context = createMockContext({ workflowCapability: capability });
+
+      const result = await listContractsTool({}, context);
+
+      expect(result.ok).toBe(true);
+      if (result.ok) {
+        expect(result.result).toStrictEqual({
+          contracts: [
+            {
+              contractName: 'hst',
+              contractAddress: '0x1234567890123456789012345678901234567890',
+              deployedAt: deployedAt1,
+            },
+            {
+              contractName: 'nfts',
+              contractAddress: '0xabcdefabcdefabcdefabcdefabcdefabcdefabcd',
+              deployedAt: deployedAt2,
+            },
+          ],
+        });
+      }
+      expect(capability.listDeployedContracts).toHaveBeenCalled();
+    });
+
+    it('returns capability unavailable when no seeding capability exists', async () => {
+      const context = createMockContext();
+
+      const result = await listContractsTool({}, context);
+
+      expect(result.ok).toBe(false);
+      if (!result.ok) {
+        expect(result.error.code).toBe(ErrorCodes.MM_CAPABILITY_NOT_AVAILABLE);
+        expect(result.error.message).toContain(
+          'ContractSeedingCapability not available',
+        );
+      }
+    });
+
+    it('returns no active session when the session is missing', async () => {
+      const context = createMockContext({ hasActive: false });
+
+      const result = await listContractsTool({}, context);
+
+      expect(result.ok).toBe(false);
+      if (!result.ok) {
+        expect(result.error.code).toBe(ErrorCodes.MM_NO_ACTIVE_SESSION);
+      }
+    });
+
+    it('returns error when listDeployedContracts throws', async () => {
+      const capability = createMockSeedingCapability();
+      vi.spyOn(capability, 'listDeployedContracts').mockImplementation(() => {
+        throw new Error('Connection lost');
+      });
+      const context = createMockContext({ workflowCapability: capability });
+
+      const result = await listContractsTool({}, context);
+
+      expect(result.ok).toBe(false);
+      if (!result.ok) {
+        expect(result.error.code).toBe(ErrorCodes.MM_SEED_FAILED);
+        expect(result.error.message).toContain('Connection lost');
+      }
+    });
+  });
+});
diff --git a/src/tools/seeding.ts b/src/tools/seeding.ts
new file mode 100644
index 0000000..1a36cbb
--- /dev/null
+++ b/src/tools/seeding.ts
@@ -0,0 +1,187 @@
+import { classifySeedingError } from './error-classification.js';
+import type {
+  GetContractAddressInput,
+  GetContractAddressResult,
+  ListDeployedContractsInput,
+  ListDeployedContractsResult,
+  SeedContractInput,
+  SeedContractResult,
+  SeedContractsInput,
+  SeedContractsResult,
+} from './types';
+import { ErrorCodes } from './types';
+import {
+  createToolError,
+  createToolSuccess,
+  requireActiveSession,
+} from './utils.js';
+import type { ContractSeedingCapability } from '../capabilities/types.js';
+import type { ToolContext, ToolResponse } from '../types/http.js';
+
+/**
+ * Resolves the contract seeding capability or returns an error response.
+ *
+ * @param context - The tool execution context.
+ * @returns The seeding capability or an error response if unavailable.
+ */
+function getSeedingCapability(
+  context: ToolContext,
+): ContractSeedingCapability | ToolResponse<never> {
+  const missingSession = requireActiveSession<never>(context);
+  if (missingSession) {
+    return missingSession;
+  }
+
+  const capability =
+    context.workflowContext.contractSeeding ??
+    context.sessionManager.getContractSeedingCapability();
+
+  if (!capability) {
+    return createToolError(
+      ErrorCodes.MM_CAPABILITY_NOT_AVAILABLE,
+      'ContractSeedingCapability not available. The mm_seed_contract tool requires running in e2e mode with the MetaMask extension wrapper, which provides Anvil chain and contract deployment support.',
+    );
+  }
+
+  return capability;
+}
+
+/**
+ * Type guard that checks if the value is a ToolResponse rather than a capability.
+ *
+ * @param value - The capability or tool response to check.
+ * @returns True if the value is a ToolResponse.
+ */
+function isToolResponse(
+  value: ContractSeedingCapability | ToolResponse<never>,
+): value is ToolResponse<never> {
+  return 'ok' in value;
+}
+
+/**
+ * Deploys a single smart contract to the local Anvil chain.
+ *
+ * @param input - The contract name and deployment options.
+ * @param context - The tool execution context.
+ * @returns The deployed contract address and metadata.
+ */
+export async function seedContractTool(
+  input: SeedContractInput,
+  context: ToolContext,
+): Promise<ToolResponse<SeedContractResult>> {
+  const capability = getSeedingCapability(context);
+  if (isToolResponse(capability)) {
+    return capability;
+  }
+
+  try {
+    const deployed = await capability.deployContract(input.contractName, {
+      hardfork: input.hardfork,
+      deployerOptions: input.deployerOptions,
+    });
+
+    return createToolSuccess({
+      contractName: deployed.name,
+      contractAddress: deployed.address,
+      deployedAt: deployed.deployedAt,
+    });
+  } catch (error) {
+    const errorInfo = classifySeedingError(error);
+    return createToolError(errorInfo.code, errorInfo.message);
+  }
+}
+
+/**
+ * Deploys multiple smart contracts in batch to the local Anvil chain.
+ *
+ * @param input - The contract list and shared deployment options.
+ * @param context - The tool execution context.
+ * @returns The deployed and failed contract results.
+ */
+export async function seedContractsTool(
+  input: SeedContractsInput,
+  context: ToolContext,
+): Promise<ToolResponse<SeedContractsResult>> {
+  const capability = getSeedingCapability(context);
+  if (isToolResponse(capability)) {
+    return capability;
+  }
+
+  try {
+    const seedResult = await capability.deployContracts(input.contracts, {
+      hardfork: input.hardfork,
+    });
+
+    return createToolSuccess({
+      deployed: seedResult.deployed.map((deployedContract) => ({
+        contractName: deployedContract.name,
+        contractAddress: deployedContract.address,
+        deployedAt: deployedContract.deployedAt,
+      })),
+      failed: seedResult.failed.map((failedDeployment) => ({
+        contractName: failedDeployment.name,
+        error: failedDeployment.error,
+      })),
+    });
+  } catch (error) {
+    const errorInfo = classifySeedingError(error);
+    return createToolError(errorInfo.code, errorInfo.message);
+  }
+}
+
+/**
+ * Looks up the deployed address of a contract by name.
+ *
+ * @param input - The contract name to look up.
+ * @param context - The tool execution context.
+ * @returns The contract name and its deployed address.
+ */
+export async function getContractAddressTool(
+  input: GetContractAddressInput,
+  context: ToolContext,
+): Promise<ToolResponse<GetContractAddressResult>> {
+  const capability = getSeedingCapability(context);
+  if (isToolResponse(capability)) {
+    return capability;
+  }
+
+  try {
+    return createToolSuccess({
+      contractName: input.contractName,
+      contractAddress: capability.getContractAddress(input.contractName),
+    });
+  } catch (error) {
+    const errorInfo = classifySeedingError(error);
+    return createToolError(errorInfo.code, errorInfo.message);
+  }
+}
+
+/**
+ * Lists all currently deployed contracts.
+ *
+ * @param _input - Unused input parameters.
+ * @param context - The tool execution context.
+ * @returns The list of deployed contracts with addresses and timestamps.
+ */
+export async function listContractsTool(
+  _input: ListDeployedContractsInput,
+  context: ToolContext,
+): Promise<ToolResponse<ListDeployedContractsResult>> {
+  const capability = getSeedingCapability(context);
+  if (isToolResponse(capability)) {
+    return capability;
+  }
+
+  try {
+    return createToolSuccess({
+      contracts: capability.listDeployedContracts().map((deployedContract) => ({
+        contractName: deployedContract.name,
+        contractAddress: deployedContract.address,
+        deployedAt: deployedContract.deployedAt,
+      })),
+    });
+  } catch (error) {
+    const errorInfo = classifySeedingError(error);
+    return createToolError(errorInfo.code, errorInfo.message);
+  }
+}
diff --git a/src/tools/state.test.ts b/src/tools/state.test.ts
new file mode 100644
index 0000000..3969f5a
--- /dev/null
+++ b/src/tools/state.test.ts
@@ -0,0 +1,319 @@
+/**
+ * Unit tests for state tool handler.
+ *
+ * Tests handleGetState with various scenarios including state snapshot capability,
+ * tab tracking, and error handling.
+ */
+
+import { describe, it, expect, vi } from 'vitest';
+
+import { getStateTool } from './state.js';
+import type { StateSnapshotCapability } from '../capabilities/types.js';
+import { createMockSessionManager } from './test-utils/mock-factories.js';
+import type { MockSessionManagerOptions } from './test-utils/mock-factories.js';
+import { ErrorCodes } from './types/errors.js';
+import type { ToolContext } from '../types/http.js';
+
+function createMockPage(url = 'chrome-extension://ext-123/home.html') {
+  return {
+    url: vi.fn().mockReturnValue(url),
+  } as never;
+}
+
+function createMockContext(
+  options: MockSessionManagerOptions & {
+    page?: ReturnType<typeof createMockPage>;
+    stateSnapshotCapability?: StateSnapshotCapability;
+  } = {},
+): ToolContext & {
+  sessionManager: ReturnType<typeof createMockSessionManager>;
+} {
+  const page = createMockPage();
+  const sessionManager = createMockSessionManager(options);
+
+  sessionManager.getPage.mockReturnValue(options.page ?? page);
+  sessionManager.getStateSnapshotCapability.mockReturnValue(
+    options.stateSnapshotCapability,
+  );
+
+  return {
+    sessionManager,
+    page: options.page ?? page,
+    refMap: new Map(),
+    workflowContext: {},
+    knowledgeStore: {},
+  } as unknown as ToolContext & {
+    sessionManager: ReturnType<typeof createMockSessionManager>;
+  };
+}
+
+describe('getStateTool', () => {
+  describe('without state snapshot capability', () => {
+    it('returns extension state from session manager', async () => {
+      const page = createMockPage('chrome-extension://ext-123/home.html');
+      const context = createMockContext({
+        hasActive: true,
+        page,
+        extensionState: {
+          isLoaded: true,
+          currentUrl: 'chrome-extension://ext-123/home.html',
+          extensionId: 'ext-123',
+          isUnlocked: true,
+          currentScreen: 'home',
+          accountAddress: '0x1234567890123456789012345678901234567890',
+          networkName: 'Ethereum Mainnet',
+          chainId: 1,
+          balance: '1.5 ETH',
+        },
+        trackedPages: [
+          {
+            page,
+            role: 'extension',
+            url: 'chrome-extension://ext-123/home.html',
+          },
+        ],
+      });
+
+      const result = await getStateTool({}, context);
+
+      expect(result.ok).toBe(true);
+      if (result.ok) {
+        expect(result.result.state).toStrictEqual({
+          isLoaded: true,
+          currentUrl: 'chrome-extension://ext-123/home.html',
+          extensionId: 'ext-123',
+          isUnlocked: true,
+          currentScreen: 'home',
+          accountAddress: '0x1234567890123456789012345678901234567890',
+          networkName: 'Ethereum Mainnet',
+          chainId: 1,
+          balance: '1.5 ETH',
+        });
+        expect(result.result.tabs).toStrictEqual({
+          active: {
+            role: 'extension',
+            url: 'chrome-extension://ext-123/home.html',
+          },
+          tracked: [
+            {
+              role: 'extension',
+              url: 'chrome-extension://ext-123/home.html',
+            },
+          ],
+        });
+      }
+      expect(context.sessionManager.getExtensionState).toHaveBeenCalled();
+    });
+
+    it('includes multiple tracked pages in tabs', async () => {
+      const extensionPage = createMockPage(
+        'chrome-extension://ext-123/home.html',
+      );
+      const dappPage = createMockPage('https://app.uniswap.org');
+      const context = createMockContext({
+        hasActive: true,
+        page: extensionPage,
+        extensionState: {
+          isLoaded: true,
+          currentUrl: 'chrome-extension://ext-123/home.html',
+          extensionId: 'ext-123',
+          isUnlocked: true,
+          currentScreen: 'home',
+          accountAddress: '0x1234567890123456789012345678901234567890',
+          networkName: 'Ethereum Mainnet',
+          chainId: 1,
+          balance: '1.5 ETH',
+        },
+        trackedPages: [
+          {
+            page: extensionPage,
+            role: 'extension',
+            url: 'chrome-extension://ext-123/home.html',
+          },
+          {
+            page: dappPage,
+            role: 'dapp',
+            url: 'https://app.uniswap.org',
+          },
+        ],
+      });
+
+      const result = await getStateTool({}, context);
+
+      expect(result.ok).toBe(true);
+      if (result.ok) {
+        expect(result.result.tabs).toBeDefined();
+        expect(result.result.tabs?.tracked).toHaveLength(2);
+        expect(result.result.tabs?.tracked).toStrictEqual([
+          { role: 'extension', url: 'chrome-extension://ext-123/home.html' },
+          { role: 'dapp', url: 'https://app.uniswap.org' },
+        ]);
+      }
+    });
+
+    it('handles active page without tracked page info', async () => {
+      const page = createMockPage('chrome-extension://ext-123/home.html');
+      const context = createMockContext({
+        hasActive: true,
+        page,
+        extensionState: {
+          isLoaded: true,
+          currentUrl: 'chrome-extension://ext-123/home.html',
+          extensionId: 'ext-123',
+          isUnlocked: false,
+          currentScreen: 'home',
+          accountAddress: null,
+          networkName: null,
+          chainId: null,
+          balance: null,
+        },
+        trackedPages: [],
+      });
+
+      const result = await getStateTool({}, context);
+
+      expect(result.ok).toBe(true);
+      if (result.ok) {
+        expect(result.result.tabs).toBeDefined();
+        expect(result.result.tabs?.active.role).toBe('other');
+        expect(result.result.tabs?.active.url).toBe(
+          'chrome-extension://ext-123/home.html',
+        );
+      }
+    });
+  });
+
+  describe('with state snapshot capability', () => {
+    it('uses state snapshot capability when provided', async () => {
+      const page = createMockPage('chrome-extension://ext-123/home.html');
+      const stateSnapshotCapability: StateSnapshotCapability = {
+        getState: vi.fn().mockResolvedValue({
+          isLoaded: true,
+          currentUrl: 'chrome-extension://ext-123/home.html',
+          extensionId: 'ext-123',
+          isUnlocked: true,
+          currentScreen: 'home',
+          accountAddress: '0x1234567890123456789012345678901234567890',
+          networkName: 'Localhost 8545',
+          chainId: 1337,
+          balance: '25 ETH',
+        }),
+        detectCurrentScreen: vi.fn().mockResolvedValue('home'),
+      };
+      const context = createMockContext({
+        hasActive: true,
+        page,
+        sessionState: {
+          extensionId: 'ext-123',
+          ports: { anvil: 8545 },
+        } as never,
+        trackedPages: [
+          {
+            page,
+            role: 'extension',
+            url: 'chrome-extension://ext-123/home.html',
+          },
+        ],
+        stateSnapshotCapability,
+      });
+
+      const result = await getStateTool({}, context);
+
+      expect(result.ok).toBe(true);
+      if (result.ok) {
+        expect(result.result.state.chainId).toBe(1337);
+        expect(result.result.state.networkName).toBe('Localhost 8545');
+        expect(result.result.state.balance).toBe('25 ETH');
+      }
+      expect(stateSnapshotCapability.getState).toHaveBeenCalledWith(page, {
+        extensionId: 'ext-123',
+        chainId: 1337,
+      });
+      expect(context.sessionManager.getExtensionState).not.toHaveBeenCalled();
+    });
+
+    it('uses chainId 1 when anvil port not present', async () => {
+      const page = createMockPage('chrome-extension://ext-123/home.html');
+      const stateSnapshotCapability: StateSnapshotCapability = {
+        getState: vi.fn().mockResolvedValue({
+          isLoaded: true,
+          currentUrl: 'chrome-extension://ext-123/home.html',
+          extensionId: 'ext-123',
+          isUnlocked: true,
+          currentScreen: 'home',
+          accountAddress: '0x1234567890123456789012345678901234567890',
+          networkName: 'Ethereum Mainnet',
+          chainId: 1,
+          balance: '1.5 ETH',
+        }),
+        detectCurrentScreen: vi.fn().mockResolvedValue('home'),
+      };
+      const context = createMockContext({
+        hasActive: true,
+        page,
+        sessionState: {
+          extensionId: 'ext-123',
+          ports: {},
+        } as never,
+        trackedPages: [
+          {
+            page,
+            role: 'extension',
+            url: 'chrome-extension://ext-123/home.html',
+          },
+        ],
+        stateSnapshotCapability,
+      });
+
+      const result = await getStateTool({}, context);
+
+      expect(result.ok).toBe(true);
+      expect(stateSnapshotCapability.getState).toHaveBeenCalledWith(page, {
+        extensionId: 'ext-123',
+        chainId: 1,
+      });
+    });
+  });
+
+  describe('error handling', () => {
+    it('returns error when no active session', async () => {
+      const context = createMockContext({ hasActive: false });
+
+      const result = await getStateTool({}, context);
+
+      expect(result.ok).toBe(false);
+      if (!result.ok) {
+        expect(result.error.code).toBe(ErrorCodes.MM_NO_ACTIVE_SESSION);
+      }
+    });
+
+    it('returns error when getExtensionState fails', async () => {
+      const context = createMockContext({ hasActive: true });
+      context.sessionManager.getExtensionState.mockRejectedValue(
+        new Error('Failed to get state'),
+      );
+
+      const result = await getStateTool({}, context);
+
+      expect(result.ok).toBe(false);
+      if (!result.ok) {
+        expect(result.error.code).toBe(ErrorCodes.MM_STATE_FAILED);
+        expect(result.error.message).toContain('Failed to get state');
+      }
+    });
+
+    it('returns error when page is closed', async () => {
+      const context = createMockContext({ hasActive: true });
+      context.sessionManager.getExtensionState.mockRejectedValue(
+        new Error('Target page, context or browser has been closed'),
+      );
+
+      const result = await getStateTool({}, context);
+
+      expect(result.ok).toBe(false);
+      if (!result.ok) {
+        expect(result.error.code).toBe(ErrorCodes.MM_PAGE_CLOSED);
+      }
+    });
+  });
+});
diff --git a/src/tools/state.ts b/src/tools/state.ts
new file mode 100644
index 0000000..c974cce
--- /dev/null
+++ b/src/tools/state.ts
@@ -0,0 +1,88 @@
+import type { Page } from '@playwright/test';
+
+import { classifyStateError } from './error-classification.js';
+import type { GetStateResult } from './types';
+import {
+  createToolError,
+  createToolSuccess,
+  requireActiveSession,
+} from './utils.js';
+import type {
+  ExtensionState,
+  StateSnapshotCapability,
+} from '../capabilities/types.js';
+import type { ISessionManager } from '../server/session-manager.js';
+import type { ToolContext, ToolResponse } from '../types/http.js';
+
+/**
+ * Retrieves the extension state using the snapshot capability or session manager.
+ *
+ * @param page - The active Playwright page.
+ * @param sessionManager - The session manager instance.
+ * @param stateSnapshotCapability - Optional capability for direct state snapshots.
+ * @returns The current extension state.
+ */
+async function getState(
+  page: Page,
+  sessionManager: ISessionManager,
+  stateSnapshotCapability?: StateSnapshotCapability,
+): Promise<ExtensionState> {
+  if (stateSnapshotCapability) {
+    const extensionId = sessionManager.getSessionState()?.extensionId;
+    return stateSnapshotCapability.getState(page, {
+      extensionId,
+      chainId: sessionManager.getSessionState()?.ports?.anvil ? 1337 : 1,
+    });
+  }
+
+  return sessionManager.getExtensionState();
+}
+
+/**
+ * Retrieves the extension state and tracked tab information.
+ *
+ * @param _input - Unused input parameters.
+ * @param context - The tool execution context.
+ * @returns The extension state and tab details.
+ */
+export async function getStateTool(
+  _input: Record<string, never>,
+  context: ToolContext,
+): Promise<ToolResponse<GetStateResult>> {
+  const missingSession = requireActiveSession<GetStateResult>(context);
+  if (missingSession) {
+    return missingSession;
+  }
+
+  try {
+    const state = await getState(
+      context.page,
+      context.sessionManager,
+      context.workflowContext.stateSnapshot ??
+        context.sessionManager.getStateSnapshotCapability(),
+    );
+
+    const trackedPages = context.sessionManager.getTrackedPages();
+    const activePage = context.sessionManager.getPage();
+    const activeTabInfo = trackedPages.find(
+      (trackedPage) => trackedPage.page === activePage,
+    );
+
+    return createToolSuccess({
+      state,
+      tabs: {
+        active: {
+          role: activeTabInfo?.role ?? 'other',
+          url: activePage.url(),
+        },
+        tracked: trackedPages.map((trackedPage) => ({
+          role: trackedPage.role,
+          url: trackedPage.url,
+        })),
+      },
+    });
+  } catch (error) {
+    const errorInfo = classifyStateError(error);
+    return createToolError(errorInfo.code, errorInfo.message);
+  }
+}
diff --git a/src/mcp-server/test-utils/index.ts b/src/tools/test-utils/index.ts
similarity index 86%
rename from src/mcp-server/test-utils/index.ts
rename to src/tools/test-utils/index.ts
index 4959175..76200db 100644
--- a/src/mcp-server/test-utils/index.ts
+++ b/src/tools/test-utils/index.ts
@@ -13,5 +13,3 @@ export {
   type MockLocatorOptions,
   type MockBrowserContextOptions,
 } from './mock-playwright.js';
-
-export { flushPromises } from './flush-promises.js';
diff --git a/src/mcp-server/test-utils/mock-factories.test.ts b/src/tools/test-utils/mock-factories.test.ts
similarity index 99%
rename from src/mcp-server/test-utils/mock-factories.test.ts
rename to src/tools/test-utils/mock-factories.test.ts
index 9d490cf..0987102 100644
--- a/src/mcp-server/test-utils/mock-factories.test.ts
+++ b/src/tools/test-utils/mock-factories.test.ts
@@ -136,7 +136,7 @@ describe('mock-factories', () => {
     });
 
     it('allows customization via options', async () => {
-      const customSteps = [{ tool: 'mm_click', screen: 'home' }];
+      const customSteps = [{ tool: 'click', screen: 'home' }];
       const mock = createMockKnowledgeStore({
         lastSteps: customSteps,
       });
diff --git a/src/mcp-server/test-utils/mock-factories.ts b/src/tools/test-utils/mock-factories.ts
similarity index 96%
rename from src/mcp-server/test-utils/mock-factories.ts
rename to src/tools/test-utils/mock-factories.ts
index 8540852..f19ee1c 100644
--- a/src/mcp-server/test-utils/mock-factories.ts
+++ b/src/tools/test-utils/mock-factories.ts
@@ -11,8 +11,11 @@
 import { vi } from 'vitest';
 
 import type { ExtensionState } from '../../capabilities/types.js';
-import type { KnowledgeStore } from '../knowledge-store.js';
-import type { TrackedPage, SessionLaunchResult } from '../session-manager.js';
+import type { KnowledgeStore } from '../../knowledge-store/knowledge-store.js';
+import type {
+  TrackedPage,
+  SessionLaunchResult,
+} from '../../server/session-manager.js';
 import type { SessionState } from '../types/session.js';
 import type { SessionMetadata } from '../types/step-record.js';
 
@@ -130,6 +133,7 @@ export function createMockSessionManager(
     getStateSnapshotCapability: vi.fn().mockReturnValue(undefined),
 
     // Environment
+    setWorkflowContext: vi.fn(),
     getEnvironmentMode: vi
       .fn()
       .mockReturnValue(options.environmentMode ?? 'e2e'),
diff --git a/src/mcp-server/test-utils/mock-playwright.ts b/src/tools/test-utils/mock-playwright.ts
similarity index 100%
rename from src/mcp-server/test-utils/mock-playwright.ts
rename to src/tools/test-utils/mock-playwright.ts
diff --git a/src/mcp-server/types/discovery.ts b/src/tools/types/discovery.ts
similarity index 100%
rename from src/mcp-server/types/discovery.ts
rename to src/tools/types/discovery.ts
diff --git a/src/mcp-server/types/errors.ts b/src/tools/types/errors.ts
similarity index 100%
rename from src/mcp-server/types/errors.ts
rename to src/tools/types/errors.ts
diff --git a/src/mcp-server/types/index.ts b/src/tools/types/index.ts
similarity index 88%
rename from src/mcp-server/types/index.ts
rename to src/tools/types/index.ts
index 7a9edd6..e4c7ce1 100644
--- a/src/mcp-server/types/index.ts
+++ b/src/tools/types/index.ts
@@ -1,4 +1,3 @@
-export type * from './responses.js';
 export * from './errors.js';
 export * from './seeding.js';
 export type * from './tool-inputs.js';
diff --git a/src/mcp-server/types/knowledge.ts b/src/tools/types/knowledge.ts
similarity index 100%
rename from src/mcp-server/types/knowledge.ts
rename to src/tools/types/knowledge.ts
diff --git a/src/mcp-server/types/seeding.ts b/src/tools/types/seeding.ts
similarity index 100%
rename from src/mcp-server/types/seeding.ts
rename to src/tools/types/seeding.ts
diff --git a/src/mcp-server/types/session.ts b/src/tools/types/session.ts
similarity index 100%
rename from src/mcp-server/types/session.ts
rename to src/tools/types/session.ts
diff --git a/src/mcp-server/types/step-record.ts b/src/tools/types/step-record.ts
similarity index 100%
rename from src/mcp-server/types/step-record.ts
rename to src/tools/types/step-record.ts
diff --git a/src/mcp-server/types/tool-inputs.ts b/src/tools/types/tool-inputs.ts
similarity index 92%
rename from src/mcp-server/types/tool-inputs.ts
rename to src/tools/types/tool-inputs.ts
index 65bd1ac..211ee5c 100644
--- a/src/mcp-server/types/tool-inputs.ts
+++ b/src/tools/types/tool-inputs.ts
@@ -2,13 +2,6 @@ import type { SmartContractName } from './seeding.js';
 
 export type TabRole = 'extension' | 'notification' | 'dapp' | 'other';
 
-export type ObservationPolicyOverride = 'default' | 'none' | 'failures';
-
-export type HandlerOptions = {
-  signal?: AbortSignal;
-  observationPolicy?: ObservationPolicyOverride;
-};
-
 export type BuildInput = {
   buildType?: 'build:test';
   force?: boolean;
@@ -29,6 +22,7 @@ export type LaunchInput = {
   flowTags?: string[];
   tags?: string[];
   seedContracts?: SmartContractName[];
+  force?: boolean;
 };
 
 export type CleanupInput = {
@@ -59,7 +53,7 @@ export type DescribeScreenInput = {
 };
 
 export type ScreenshotInput = {
-  name: string;
+  name?: string;
   fullPage?: boolean;
   selector?: string;
   includeBase64?: boolean;
@@ -144,3 +138,8 @@ export type ClipboardInput = {
   action: 'write' | 'read';
   text?: string;
 };
+
+export type SetContextInput = {
+  context: 'e2e' | 'prod';
+  options?: Record<string, unknown>;
+};
diff --git a/src/mcp-server/types/tool-outputs.ts b/src/tools/types/tool-outputs.ts
similarity index 87%
rename from src/mcp-server/types/tool-outputs.ts
rename to src/tools/types/tool-outputs.ts
index 541bd51..969030a 100644
--- a/src/mcp-server/types/tool-outputs.ts
+++ b/src/tools/types/tool-outputs.ts
@@ -139,3 +139,19 @@ export type ClipboardResult = {
   success: boolean;
   text?: string;
 };
+
+export type SetContextResult = {
+  previousContext: 'e2e' | 'prod';
+  newContext: 'e2e' | 'prod';
+  availableCapabilities: string[];
+};
+
+export type GetContextResult = {
+  currentContext: 'e2e' | 'prod';
+  hasActiveSession: boolean;
+  sessionId: string | null;
+  capabilities: {
+    available: string[];
+  };
+  canSwitchContext: boolean;
+};
diff --git a/src/tools/utils.ts b/src/tools/utils.ts
new file mode 100644
index 0000000..6eae6a2
--- /dev/null
+++ b/src/tools/utils.ts
@@ -0,0 +1,53 @@
+import { ErrorCodes } from './types';
+import type { ToolContext, ToolResponse } from '../types/http.js';
+
+/**
+ * Wraps a result value in a successful tool response.
+ *
+ * @param result - The result payload to return.
+ * @returns A successful tool response containing the result.
+ */
+export function createToolSuccess<TResult>(
+  result: TResult,
+): ToolResponse<TResult> {
+  return { ok: true, result };
+}
+
+/**
+ * Wraps an error code and message in a failed tool response.
+ *
+ * @param code - The error code identifying the failure type.
+ * @param message - A human-readable error description.
+ * @returns A failed tool response containing the error.
+ */
+export function createToolError<TResult = never>(
+  code: string,
+  message: string,
+): ToolResponse<TResult> {
+  return {
+    ok: false,
+    error: {
+      code,
+      message,
+    },
+  };
+}
+
+/**
+ * Returns an error response if no active session exists.
+ *
+ * @param context - The tool execution context.
+ * @returns An error response when no session is active, or undefined.
+ */
+export function requireActiveSession<TResult>(
+  context: ToolContext,
+): ToolResponse<TResult> | undefined {
+  if (!context.sessionManager.hasActiveSession()) {
+    return createToolError(
+      ErrorCodes.MM_NO_ACTIVE_SESSION,
+      'No active session. Call launch first.',
+    );
+  }
+
+  return undefined;
+}
diff --git a/src/mcp-server/constants.ts b/src/tools/utils/constants.ts
similarity index 94%
rename from src/mcp-server/constants.ts
rename to src/tools/utils/constants.ts
index fd1404f..8adf3cd 100644
--- a/src/mcp-server/constants.ts
+++ b/src/tools/utils/constants.ts
@@ -1,5 +1,5 @@
 /**
- * Constants for MCP server tool operations.
+ * Constants for HTTP daemon tool operations.
  * Centralized to ensure consistency and easy tuning.
  */
 
diff --git a/src/mcp-server/discovery.test.ts b/src/tools/utils/discovery.test.ts
similarity index 100%
rename from src/mcp-server/discovery.test.ts
rename to src/tools/utils/discovery.test.ts
diff --git a/src/mcp-server/discovery.ts b/src/tools/utils/discovery.ts
similarity index 99%
rename from src/mcp-server/discovery.ts
rename to src/tools/utils/discovery.ts
index 5df6159..3063255 100644
--- a/src/mcp-server/discovery.ts
+++ b/src/tools/utils/discovery.ts
@@ -1,14 +1,14 @@
 import type { Page, Locator } from '@playwright/test';
 
 import { TEXT_PREVIEW_MAX_LENGTH } from './constants.js';
+import { debugWarn } from '../../utils';
 import type {
   TestIdItem,
   A11yNodeTrimmed,
   RawA11yNode,
   IncludedRole,
-} from './types';
-import { INCLUDED_ROLES } from './types';
-import { debugWarn } from './utils';
+} from '../types';
+import { INCLUDED_ROLES } from '../types';
 
 const INCLUDED_ROLES_SET = new Set<string>(INCLUDED_ROLES);
 
diff --git a/src/mcp-server/utils/targets.ts b/src/tools/utils/targets.ts
similarity index 100%
rename from src/mcp-server/utils/targets.ts
rename to src/tools/utils/targets.ts
diff --git a/src/mcp-server/utils/type-guards.test.ts b/src/tools/utils/type-guards.test.ts
similarity index 100%
rename from src/mcp-server/utils/type-guards.test.ts
rename to src/tools/utils/type-guards.test.ts
diff --git a/src/mcp-server/utils/type-guards.ts b/src/tools/utils/type-guards.ts
similarity index 100%
rename from src/mcp-server/utils/type-guards.ts
rename to src/tools/utils/type-guards.ts
diff --git a/src/types/http.ts b/src/types/http.ts
new file mode 100644
index 0000000..31c0640
--- /dev/null
+++ b/src/types/http.ts
@@ -0,0 +1,119 @@
+/**
+ * HTTP Server Type Definitions
+ *
+ * Types for standalone tool functions and HTTP response shapes.
+ */
+
+import type { Page } from '@playwright/test';
+
+import type { WorkflowContext } from '../capabilities/context.js';
+import type { KnowledgeStore } from '../knowledge-store/knowledge-store.js';
+import type { ISessionManager } from '../server/session-manager.js';
+
+/**
+ * Context passed to standalone tool functions.
+ *
+ * This context provides access to the session manager, current page,
+ * accessibility reference map, workflow capabilities, and knowledge store.
+ */
+export type ToolContext = {
+  /** Session manager for browser session control */
+  sessionManager: ISessionManager;
+  /** Current active Playwright page (lazy — throws if no session) */
+  get page(): Page;
+  /** Accessibility reference map (lazy — returns empty map if no session) */
+  get refMap(): Map<string, string>;
+  /** Workflow context with capabilities and environment config */
+  workflowContext: WorkflowContext;
+  /** Knowledge store for session history and prior knowledge */
+  knowledgeStore: KnowledgeStore;
+  /** Tool registry for batch execution (run_steps) */
+  toolRegistry: Map<string, ToolFunction<unknown, unknown>>;
+};
+
+/**
+ * Result shape for tool responses.
+ *
+ * @template T The type of the successful result
+ */
+// eslint-disable-next-line @typescript-eslint/naming-convention
+export type ToolResponse<T = unknown> =
+  | { ok: true; result: T }
+  | { ok: false; error: { code: string; message: string } };
+
+/**
+ * Standalone tool function signature.
+ *
+ * Tool functions receive parameters and a context, and return a ToolResponse.
+ *
+ * @template TParams The type of parameters the tool accepts
+ * @template TResult The type of the successful result
+ */
+export type ToolFunction<TParams = unknown, TResult = unknown> = (
+  params: TParams,
+  context: ToolContext,
+) => Promise<ToolResponse<TResult>>;
+
+/**
+ * Port configuration passed to contextFactory at runtime.
+ *
+ * These ports are used to configure test infrastructure (Anvil, fixture server, mock server).
+ */
+export type ContextFactoryOptions = {
+  /** Port configuration for test services */
+  ports: {
+    /** Anvil local chain port */
+    anvil: number;
+    /** Fixture server port */
+    fixture: number;
+    /** Mock server port */
+    mock: number;
+  };
+};
+
+/**
+ * Configuration for createServer().
+ *
+ * This configuration is used to initialize the HTTP server with
+ * session management, context factory, and optional settings.
+ */
+export type ServerConfig = {
+  /** Session manager instance */
+  sessionManager: ISessionManager;
+  /** Factory function to create workflow context */
+  contextFactory: (options: ContextFactoryOptions) => WorkflowContext;
+  /** Idle timeout for daemon auto-shutdown in milliseconds (default: 1_800_000 = 30 min) */
+  idleShutdownMs?: number;
+  /** Per-request execution timeout in milliseconds (default: 30_000) */
+  requestTimeoutMs?: number;
+  /** Path to log file (optional) */
+  logFilePath?: string;
+};
+
+/**
+ * Shape of the .mm-server daemon state file.
+ *
+ * This file is created when the daemon starts and contains
+ * the port, PID, and port configuration for the running server.
+ */
+export type DaemonState = {
+  /** HTTP server port */
+  port: number;
+  /** Process ID of the daemon */
+  pid: number;
+  /** ISO 8601 timestamp when daemon started */
+  startedAt: string;
+  /** Nonce for daemon identification */
+  nonce: string;
+  /** Package version of the daemon process (absent in state files written before version tracking) */
+  version?: string;
+  /** Port configuration for test services */
+  subPorts: {
+    /** Anvil local chain port */
+    anvil: number;
+    /** Fixture server port */
+    fixture: number;
+    /** Mock server port */
+    mock: number;
+  };
+};
diff --git a/src/mcp-server/utils/errors.ts b/src/utils/errors.ts
similarity index 100%
rename from src/mcp-server/utils/errors.ts
rename to src/utils/errors.ts
diff --git a/src/utils/index.ts b/src/utils/index.ts
index 6eae751..99c0548 100644
--- a/src/utils/index.ts
+++ b/src/utils/index.ts
@@ -7,3 +7,6 @@ export {
   waitForServiceReady,
   type WaitForServiceReadyOptions,
 } from './service-readiness.js';
+export { generateFilesafeTimestamp, generateSessionId } from './time.js';
+export { extractErrorMessage } from './errors.js';
+export { debugWarn } from './logger.js';
diff --git a/src/mcp-server/utils/logger.test.ts b/src/utils/logger.test.ts
similarity index 82%
rename from src/mcp-server/utils/logger.test.ts
rename to src/utils/logger.test.ts
index 9824771..37ce500 100644
--- a/src/mcp-server/utils/logger.test.ts
+++ b/src/utils/logger.test.ts
@@ -13,21 +13,21 @@ describe('debugWarn', () => {
     vi.resetModules();
   });
 
-  describe('when MCP_DEBUG is true', () => {
+  describe('when DEBUG is true', () => {
     it('logs warning with context and error message', async () => {
-      process.env.MCP_DEBUG = 'true';
+      process.env.DEBUG = 'true';
       vi.resetModules();
       const { debugWarn } = await import('./logger.js');
 
       debugWarn('test.context', new Error('test error'));
 
       expect(consoleWarnSpy).toHaveBeenCalledWith(
-        expect.stringContaining('[MCP:test.context]'),
+        expect.stringContaining('[Server:test.context] test error'),
       );
     });
 
     it('extracts error message from Error objects', async () => {
-      process.env.MCP_DEBUG = 'true';
+      process.env.DEBUG = 'true';
       vi.resetModules();
       const { debugWarn } = await import('./logger.js');
 
@@ -40,7 +40,7 @@ describe('debugWarn', () => {
     });
 
     it('handles string error messages', async () => {
-      process.env.MCP_DEBUG = 'true';
+      process.env.DEBUG = 'true';
       vi.resetModules();
       const { debugWarn } = await import('./logger.js');
 
@@ -52,7 +52,7 @@ describe('debugWarn', () => {
     });
 
     it('handles unknown error types', async () => {
-      process.env.MCP_DEBUG = 'true';
+      process.env.DEBUG = 'true';
       vi.resetModules();
       const { debugWarn } = await import('./logger.js');
 
@@ -62,9 +62,9 @@ describe('debugWarn', () => {
     });
   });
 
-  describe('when MCP_DEBUG is false or unset', () => {
+  describe('when DEBUG is false or unset', () => {
     it('does not log anything', async () => {
-      delete process.env.MCP_DEBUG;
+      delete process.env.DEBUG;
       vi.resetModules();
       const { debugWarn } = await import('./logger.js');
 
@@ -73,8 +73,8 @@ describe('debugWarn', () => {
       expect(consoleWarnSpy).not.toHaveBeenCalled();
     });
 
-    it('does not log when MCP_DEBUG is empty string', async () => {
-      process.env.MCP_DEBUG = '';
+    it('does not log when DEBUG is empty string', async () => {
+      process.env.DEBUG = '';
       vi.resetModules();
       const { debugWarn } = await import('./logger.js');
 
diff --git a/src/mcp-server/utils/logger.ts b/src/utils/logger.ts
similarity index 56%
rename from src/mcp-server/utils/logger.ts
rename to src/utils/logger.ts
index f363068..0f1c414 100644
--- a/src/mcp-server/utils/logger.ts
+++ b/src/utils/logger.ts
@@ -1,16 +1,16 @@
 import { extractErrorMessage } from './errors.js';
 
 /**
- * Debug logging for MCP server operations.
- * Enabled via MCP_DEBUG=true environment variable.
+ * Debug logging for server operations.
+ * Enabled via DEBUG=true environment variable.
  *
- * By default, logging is disabled to avoid polluting MCP protocol stdout.
+ * By default, logging is disabled to avoid noise in HTTP daemon logs.
  */
 
-const DEBUG = process.env.MCP_DEBUG === 'true';
+const DEBUG = process.env.DEBUG === 'true';
 
 /**
- * Log a debug warning message. Only outputs when MCP_DEBUG=true.
+ * Log a debug warning message. Only outputs when DEBUG=true.
  * Use this for caught errors that are intentionally suppressed.
  *
  * @param context - A short identifier for where the warning occurred (e.g., "discovery.collectTestIds")
@@ -19,6 +19,6 @@ const DEBUG = process.env.MCP_DEBUG === 'true';
 export function debugWarn(context: string, error: unknown): void {
   if (DEBUG) {
     const message = extractErrorMessage(error);
-    console.warn(`[MCP:${context}] ${message}`);
+    console.warn(`[Server:${context}] ${message}`);
   }
 }
diff --git a/src/mcp-server/utils/time.test.ts b/src/utils/time.test.ts
similarity index 100%
rename from src/mcp-server/utils/time.test.ts
rename to src/utils/time.test.ts
diff --git a/src/mcp-server/utils/time.ts b/src/utils/time.ts
similarity index 100%
rename from src/mcp-server/utils/time.ts
rename to src/utils/time.ts
diff --git a/src/mcp-server/schemas.test.ts b/src/validation/schemas.test.ts
similarity index 100%
rename from src/mcp-server/schemas.test.ts
rename to src/validation/schemas.test.ts
diff --git a/src/mcp-server/schemas.ts b/src/validation/schemas.ts
similarity index 97%
rename from src/mcp-server/schemas.ts
rename to src/validation/schemas.ts
index 489e6bf..58585c1 100644
--- a/src/mcp-server/schemas.ts
+++ b/src/validation/schemas.ts
@@ -1,6 +1,6 @@
 import { z } from 'zod';
 
-import { SMART_CONTRACT_NAMES, HARDFORKS } from './types/seeding.js';
+import { SMART_CONTRACT_NAMES, HARDFORKS } from '../tools/types/seeding.js';
 
 export const a11yRefPattern = z
   .string()
@@ -159,6 +159,10 @@ export const launchInputSchema = z.object({
     .array(z.enum(smartContractNames))
     .describe('Smart contracts to deploy on launch (before extension loads)')
     .optional(),
+  force: z
+    .boolean()
+    .default(false)
+    .describe('Force replace an existing active session (runs cleanup first)'),
 });
 
 export const cleanupInputSchema = z.object({
@@ -241,7 +245,11 @@ export const describeScreenInputSchema = z.object({
 });
 
 export const screenshotInputSchema = z.object({
-  name: z.string().min(1).describe('Screenshot filename (without extension)'),
+  name: z
+    .string()
+    .min(1)
+    .describe('Screenshot filename (without extension)')
+    .optional(),
   fullPage: z
     .boolean()
     .default(true)
diff --git a/src/version.ts b/src/version.ts
new file mode 100644
index 0000000..7ec097a
--- /dev/null
+++ b/src/version.ts
@@ -0,0 +1,2 @@
+// Keep in sync with package.json — used for daemon version tracking.
+export const PACKAGE_VERSION = '0.2.0';
diff --git a/vitest.config.mts b/vitest.config.mts
index 0470a62..a531075 100644
--- a/vitest.config.mts
+++ b/vitest.config.mts
@@ -27,7 +27,7 @@ export default defineConfig({
 
       // The files to exclude from the coverage report. Vitest excludes test
       // files by default, but not `test-d.ts` files.
-      exclude: ['src/**/*.test-d.ts', 'src/mcp-server/test-utils/'],
+      exclude: ['src/**/*.test-d.ts', 'src/tools/test-utils/'],
 
       // Coverage thresholds. If the coverage is below these thresholds, the
       // test will fail.
diff --git a/yarn.lock b/yarn.lock
index dfdade7..ee84dca 100644
--- a/yarn.lock
+++ b/yarn.lock
@@ -727,15 +727,6 @@ __metadata:
   languageName: node
   linkType: hard
 
-"@hono/node-server@npm:^1.19.9":
-  version: 1.19.9
-  resolution: "@hono/node-server@npm:1.19.9"
-  peerDependencies:
-    hono: ^4
-  checksum: 10/d4915c2e736ee1e3934b5538cde92b19914dc71346340528a04e4c7219afc7367965080cd1a5291ac9cbda7b0780b89b6ca93472a9418aa105d6d1183033dc8a
-  languageName: node
-  linkType: hard
-
 "@humanfs/core@npm:^0.19.1":
   version: 0.19.1
   resolution: "@humanfs/core@npm:0.19.1"
@@ -930,9 +921,9 @@ __metadata:
     "@metamask/eslint-config-nodejs": "npm:^15.0.0"
     "@metamask/eslint-config-typescript": "npm:^15.0.0"
     "@metamask/eslint-config-vitest": "npm:^15.0.0"
-    "@modelcontextprotocol/sdk": "npm:^1.26.0"
     "@playwright/test": "npm:^1.49.0"
     "@ts-bridge/cli": "npm:^0.6.3"
+    "@types/express": "npm:^5.0.6"
     "@types/node": "npm:^20.0.0"
     "@typescript-eslint/utils": "npm:^8.6.0"
     "@vitest/coverage-istanbul": "npm:^3.0.7"
@@ -947,6 +938,7 @@ __metadata:
     eslint-plugin-n: "npm:^17.10.3"
     eslint-plugin-prettier: "npm:^5.2.1"
     eslint-plugin-promise: "npm:^7.1.0"
+    express: "npm:^5.2.1"
     playwright: "npm:^1.49.0"
     prettier: "npm:^3.3.3"
     prettier-plugin-packagejson: "npm:^2.5.8"
@@ -960,6 +952,8 @@ __metadata:
   peerDependencies:
     "@playwright/test": ^1.49.0
     playwright: ^1.49.0
+  bin:
+    mm: ./dist/cli/mm.cjs
   languageName: unknown
   linkType: soft
 
@@ -1026,39 +1020,6 @@ __metadata:
   languageName: node
   linkType: hard
 
-"@modelcontextprotocol/sdk@npm:^1.26.0":
-  version: 1.26.0
-  resolution: "@modelcontextprotocol/sdk@npm:1.26.0"
-  dependencies:
-    "@hono/node-server": "npm:^1.19.9"
-    ajv: "npm:^8.17.1"
-    ajv-formats: "npm:^3.0.1"
-    content-type: "npm:^1.0.5"
-    cors: "npm:^2.8.5"
-    cross-spawn: "npm:^7.0.5"
-    eventsource: "npm:^3.0.2"
-    eventsource-parser: "npm:^3.0.0"
-    express: "npm:^5.2.1"
-    express-rate-limit: "npm:^8.2.1"
-    hono: "npm:^4.11.4"
-    jose: "npm:^6.1.3"
-    json-schema-typed: "npm:^8.0.2"
-    pkce-challenge: "npm:^5.0.0"
-    raw-body: "npm:^3.0.0"
-    zod: "npm:^3.25 || ^4.0"
-    zod-to-json-schema: "npm:^3.25.1"
-  peerDependencies:
-    "@cfworker/json-schema": ^4.1.1
-    zod: ^3.25 || ^4.0
-  peerDependenciesMeta:
-    "@cfworker/json-schema":
-      optional: true
-    zod:
-      optional: false
-  checksum: 10/a206b2a4d61a23be8b8f4c886528dd9348d11b17ce36013b350edf5c082b1c1f07941d52ea098f721daf3828085b6f6276bb844c484a0e9913edbc028517a3d5
-  languageName: node
-  linkType: hard
-
 "@napi-rs/wasm-runtime@npm:^0.2.11":
   version: 0.2.12
   resolution: "@napi-rs/wasm-runtime@npm:0.2.12"
@@ -1623,6 +1584,16 @@ __metadata:
   languageName: node
   linkType: hard
 
+"@types/body-parser@npm:*":
+  version: 1.19.6
+  resolution: "@types/body-parser@npm:1.19.6"
+  dependencies:
+    "@types/connect": "npm:*"
+    "@types/node": "npm:*"
+  checksum: 10/33041e88eae00af2cfa0827e951e5f1751eafab2a8b6fce06cd89ef368a988907996436b1325180edaeddd1c0c7d0d0d4c20a6c9ff294a91e0039a9db9e9b658
+  languageName: node
+  linkType: hard
+
 "@types/chai@npm:^5.2.2":
   version: 5.2.3
   resolution: "@types/chai@npm:5.2.3"
@@ -1633,6 +1604,15 @@ __metadata:
   languageName: node
   linkType: hard
 
+"@types/connect@npm:*":
+  version: 3.4.38
+  resolution: "@types/connect@npm:3.4.38"
+  dependencies:
+    "@types/node": "npm:*"
+  checksum: 10/7eb1bc5342a9604facd57598a6c62621e244822442976c443efb84ff745246b10d06e8b309b6e80130026a396f19bf6793b7cecd7380169f369dac3bfc46fb99
+  languageName: node
+  linkType: hard
+
 "@types/deep-eql@npm:*":
   version: 4.0.2
   resolution: "@types/deep-eql@npm:4.0.2"
@@ -1647,6 +1627,29 @@ __metadata:
   languageName: node
   linkType: hard
 
+"@types/express-serve-static-core@npm:^5.0.0":
+  version: 5.1.1
+  resolution: "@types/express-serve-static-core@npm:5.1.1"
+  dependencies:
+    "@types/node": "npm:*"
+    "@types/qs": "npm:*"
+    "@types/range-parser": "npm:*"
+    "@types/send": "npm:*"
+  checksum: 10/7f3d8cf7e68764c9f3e8f6a12825b69ccf5287347fc1c20b29803d4f08a4abc1153ae11d7258852c61aad50f62ef72d4c1b9c97092b0a90462c3dddec2f6026c
+  languageName: node
+  linkType: hard
+
+"@types/express@npm:^5.0.6":
+  version: 5.0.6
+  resolution: "@types/express@npm:5.0.6"
+  dependencies:
+    "@types/body-parser": "npm:*"
+    "@types/express-serve-static-core": "npm:^5.0.0"
+    "@types/serve-static": "npm:^2"
+  checksum: 10/da2cc3de1b1a4d7f20ed3fb6f0a8ee08e99feb3c2eb5a8d643db77017d8d0e70fee9e95da38a73f51bcdf5eda3bb6435073c0271dc04fb16fda92e55daf911fa
+  languageName: node
+  linkType: hard
+
 "@types/hast@npm:^3.0.0, @types/hast@npm:^3.0.4":
   version: 3.0.4
   resolution: "@types/hast@npm:3.0.4"
@@ -1656,6 +1659,13 @@ __metadata:
   languageName: node
   linkType: hard
 
+"@types/http-errors@npm:*":
+  version: 2.0.5
+  resolution: "@types/http-errors@npm:2.0.5"
+  checksum: 10/a88da669366bc483e8f3b3eb3d34ada5f8d13eeeef851b1204d77e2ba6fc42aba4566d877cca5c095204a3f4349b87fe397e3e21288837bdd945dd514120755b
+  languageName: node
+  linkType: hard
+
 "@types/json-schema@npm:^7.0.15":
   version: 7.0.15
   resolution: "@types/json-schema@npm:7.0.15"
@@ -1679,6 +1689,15 @@ __metadata:
   languageName: node
   linkType: hard
 
+"@types/node@npm:*":
+  version: 25.5.2
+  resolution: "@types/node@npm:25.5.2"
+  dependencies:
+    undici-types: "npm:~7.18.0"
+  checksum: 10/11782030f910ecf600cd537791980bd8b68496570ecd633d512d713b5b8a16ea3740fce85c82d0593305f809a7c205d7e86c07f179063fc98f014a7f9b013166
+  languageName: node
+  linkType: hard
+
 "@types/node@npm:^20.0.0":
   version: 20.19.31
   resolution: "@types/node@npm:20.19.31"
@@ -1695,6 +1714,39 @@ __metadata:
   languageName: node
   linkType: hard
 
+"@types/qs@npm:*":
+  version: 6.15.0
+  resolution: "@types/qs@npm:6.15.0"
+  checksum: 10/871162881f1c83e61d0c8c243c65549be5dddf33a6911f3324edeebd4087207b1174644da9a3afaa20cf494c5288d2a1ece09e10e4822f755339f14a05c339ea
+  languageName: node
+  linkType: hard
+
+"@types/range-parser@npm:*":
+  version: 1.2.7
+  resolution: "@types/range-parser@npm:1.2.7"
+  checksum: 10/95640233b689dfbd85b8c6ee268812a732cf36d5affead89e806fe30da9a430767af8ef2cd661024fd97e19d61f3dec75af2df5e80ec3bea000019ab7028629a
+  languageName: node
+  linkType: hard
+
+"@types/send@npm:*":
+  version: 1.2.1
+  resolution: "@types/send@npm:1.2.1"
+  dependencies:
+    "@types/node": "npm:*"
+  checksum: 10/81ef5790037ba1d2d458392e4241501f0f8b4838cc8797e169e179e099410e12069ec68e8dbd39211cb097c4a9b1ff1682dbcea897ab4ce21dad93438b862d27
+  languageName: node
+  linkType: hard
+
+"@types/serve-static@npm:^2":
+  version: 2.2.0
+  resolution: "@types/serve-static@npm:2.2.0"
+  dependencies:
+    "@types/http-errors": "npm:*"
+    "@types/node": "npm:*"
+  checksum: 10/f2bad1304c7d0d3b7221faff3e490c40129d3803f4fb1b2fb84f31f561071c5e6a4b876c41bbbe82d5645034eea936e946bcaaf993dac1093ce68b56effad6e0
+  languageName: node
+  linkType: hard
+
 "@types/unist@npm:*, @types/unist@npm:^3.0.0":
   version: 3.0.3
   resolution: "@types/unist@npm:3.0.3"
@@ -2218,20 +2270,6 @@ __metadata:
   languageName: node
   linkType: hard
 
-"ajv-formats@npm:^3.0.1":
-  version: 3.0.1
-  resolution: "ajv-formats@npm:3.0.1"
-  dependencies:
-    ajv: "npm:^8.0.0"
-  peerDependencies:
-    ajv: ^8.0.0
-  peerDependenciesMeta:
-    ajv:
-      optional: true
-  checksum: 10/5679b9f9ced9d0213a202a37f3aa91efcffe59a6de1a6e3da5c873344d3c161820a1f11cc29899661fee36271fd2895dd3851b6461c902a752ad661d1c1e8722
-  languageName: node
-  linkType: hard
-
 "ajv@npm:^6.12.4":
   version: 6.12.6
   resolution: "ajv@npm:6.12.6"
@@ -2244,18 +2282,6 @@ __metadata:
   languageName: node
   linkType: hard
 
-"ajv@npm:^8.0.0, ajv@npm:^8.17.1":
-  version: 8.17.1
-  resolution: "ajv@npm:8.17.1"
-  dependencies:
-    fast-deep-equal: "npm:^3.1.3"
-    fast-uri: "npm:^3.0.1"
-    json-schema-traverse: "npm:^1.0.0"
-    require-from-string: "npm:^2.0.2"
-  checksum: 10/ee3c62162c953e91986c838f004132b6a253d700f1e51253b99791e2dbfdb39161bc950ebdc2f156f8568035bb5ed8be7bd78289cd9ecbf3381fe8f5b82e3f33
-  languageName: node
-  linkType: hard
-
 "ansi-escapes@npm:^7.0.0":
   version: 7.2.0
   resolution: "ansi-escapes@npm:7.2.0"
@@ -2773,16 +2799,6 @@ __metadata:
   languageName: node
   linkType: hard
 
-"cors@npm:^2.8.5":
-  version: 2.8.6
-  resolution: "cors@npm:2.8.6"
-  dependencies:
-    object-assign: "npm:^4"
-    vary: "npm:^1"
-  checksum: 10/aa7174305b21ceb90f9c84f4eaa32f04432d333addbfdc0d1eb7310393c48902e5364aada5ac2f5d054528d63b3179238444475426fcb74e1e345077de485727
-  languageName: node
-  linkType: hard
-
 "cosmiconfig@npm:^7.1.0":
   version: 7.1.0
   resolution: "cosmiconfig@npm:7.1.0"
@@ -2803,7 +2819,7 @@ __metadata:
   languageName: node
   linkType: hard
 
-"cross-spawn@npm:^7.0.3, cross-spawn@npm:^7.0.5, cross-spawn@npm:^7.0.6":
+"cross-spawn@npm:^7.0.3, cross-spawn@npm:^7.0.6":
   version: 7.0.6
   resolution: "cross-spawn@npm:7.0.6"
   dependencies:
@@ -3623,22 +3639,6 @@ __metadata:
   languageName: node
   linkType: hard
 
-"eventsource-parser@npm:^3.0.0, eventsource-parser@npm:^3.0.1":
-  version: 3.0.6
-  resolution: "eventsource-parser@npm:3.0.6"
-  checksum: 10/febf7058b9c2168ecbb33e92711a1646e06bd1568f60b6eb6a01a8bf9f8fcd29cc8320d57247059cacf657a296280159f21306d2e3ff33309a9552b2ef889387
-  languageName: node
-  linkType: hard
-
-"eventsource@npm:^3.0.2":
-  version: 3.0.7
-  resolution: "eventsource@npm:3.0.7"
-  dependencies:
-    eventsource-parser: "npm:^3.0.1"
-  checksum: 10/e034915bc97068d1d38617951afd798e6776d6a3a78e36a7569c235b177c7afc2625c9fe82656f7341ab72c7eeecb3fd507b7f88e9328f2448872ff9c4742bb6
-  languageName: node
-  linkType: hard
-
 "execa@npm:^5.1.1":
   version: 5.1.1
   resolution: "execa@npm:5.1.1"
@@ -3679,17 +3679,6 @@ __metadata:
   languageName: node
   linkType: hard
 
-"express-rate-limit@npm:^8.2.1":
-  version: 8.2.1
-  resolution: "express-rate-limit@npm:8.2.1"
-  dependencies:
-    ip-address: "npm:10.0.1"
-  peerDependencies:
-    express: ">= 4.11"
-  checksum: 10/7cbf70df2e88e590e463d2d8f93380775b2ea181d97f2c50c2ff9f2c666c247f83109a852b21d9c99ccc5762119101f281f54a27252a2f1a0a918be6d71f955b
-  languageName: node
-  linkType: hard
-
 "express@npm:^5.2.1":
   version: 5.2.1
   resolution: "express@npm:5.2.1"
@@ -3754,13 +3743,6 @@ __metadata:
   languageName: node
   linkType: hard
 
-"fast-uri@npm:^3.0.1":
-  version: 3.1.0
-  resolution: "fast-uri@npm:3.1.0"
-  checksum: 10/818b2c96dc913bcf8511d844c3d2420e2c70b325c0653633f51821e4e29013c2015387944435cd0ef5322c36c9beecc31e44f71b257aeb8e0b333c1d62bb17c2
-  languageName: node
-  linkType: hard
-
 "fdir@npm:^6.4.4, fdir@npm:^6.5.0":
   version: 6.5.0
   resolution: "fdir@npm:6.5.0"
@@ -4166,13 +4148,6 @@ __metadata:
   languageName: node
   linkType: hard
 
-"hono@npm:^4.11.4":
-  version: 4.11.7
-  resolution: "hono@npm:4.11.7"
-  checksum: 10/16f5a715f70430bd4050b250207adf7c567774c1d91386d5454577fbc191fc4a50b912628845ce8392fae0e3fd9f364a947412961e3747a9f0b2f714790b738e
-  languageName: node
-  linkType: hard
-
 "hosted-git-info@npm:^9.0.0":
   version: 9.0.2
   resolution: "hosted-git-info@npm:9.0.2"
@@ -4313,13 +4288,6 @@ __metadata:
   languageName: node
   linkType: hard
 
-"ip-address@npm:10.0.1":
-  version: 10.0.1
-  resolution: "ip-address@npm:10.0.1"
-  checksum: 10/09731acda32cd8e14c46830c137e7e5940f47b36d63ffb87c737331270287d631cf25aa95570907a67d3f919fdb25f4470c404eda21e62f22e0a55927f4dd0fb
-  languageName: node
-  linkType: hard
-
 "ip-address@npm:^10.0.1":
   version: 10.1.0
   resolution: "ip-address@npm:10.1.0"
@@ -4496,13 +4464,6 @@ __metadata:
   languageName: node
   linkType: hard
 
-"jose@npm:^6.1.3":
-  version: 6.1.3
-  resolution: "jose@npm:6.1.3"
-  checksum: 10/9626c51e8c3792b505e954f3094698c182208617b62dfb27269230f31e57560b083985ed8128b8a9753aa92daf18d3a2341cc826d149503f14569abe87d42389
-  languageName: node
-  linkType: hard
-
 "js-tokens@npm:^4.0.0":
   version: 4.0.0
   resolution: "js-tokens@npm:4.0.0"
@@ -4584,20 +4545,6 @@ __metadata:
   languageName: node
   linkType: hard
 
-"json-schema-traverse@npm:^1.0.0":
-  version: 1.0.0
-  resolution: "json-schema-traverse@npm:1.0.0"
-  checksum: 10/02f2f466cdb0362558b2f1fd5e15cce82ef55d60cd7f8fa828cf35ba74330f8d767fcae5c5c2adb7851fa811766c694b9405810879bc4e1ddd78a7c0e03658ad
-  languageName: node
-  linkType: hard
-
-"json-schema-typed@npm:^8.0.2":
-  version: 8.0.2
-  resolution: "json-schema-typed@npm:8.0.2"
-  checksum: 10/fa866d1fe91e3a94aa4fe007861475cd03dcaf47b719861cab171ef2f8598478007c634d29ae45de94ee34ddff4e13414c63ea5ff06c5b868b613142c699d511
-  languageName: node
-  linkType: hard
-
 "json-stable-stringify-without-jsonify@npm:^1.0.1":
   version: 1.0.1
   resolution: "json-stable-stringify-without-jsonify@npm:1.0.1"
@@ -5214,7 +5161,7 @@ __metadata:
   languageName: node
   linkType: hard
 
-"object-assign@npm:^4, object-assign@npm:^4.0.1":
+"object-assign@npm:^4.0.1":
   version: 4.1.1
   resolution: "object-assign@npm:4.1.1"
   checksum: 10/fcc6e4ea8c7fe48abfbb552578b1c53e0d194086e2e6bbbf59e0a536381a292f39943c6e9628af05b5528aa5e3318bb30d6b2e53cadaf5b8fe9e12c4b69af23f
@@ -5476,13 +5423,6 @@ __metadata:
   languageName: node
   linkType: hard
 
-"pkce-challenge@npm:^5.0.0":
-  version: 5.0.1
-  resolution: "pkce-challenge@npm:5.0.1"
-  checksum: 10/51d11f68d5a78617cfb2e9c2706dadcc2cbe55ffb55b21d42a6ed848ac5159db2657bf6c966a5a414119aa839ceb64240afea35e9e1c06946b57606ed0b43789
-  languageName: node
-  linkType: hard
-
 "playwright-core@npm:1.58.1":
   version: 1.58.1
   resolution: "playwright-core@npm:1.58.1"
@@ -5630,7 +5570,7 @@ __metadata:
   languageName: node
   linkType: hard
 
-"raw-body@npm:^3.0.0, raw-body@npm:^3.0.1":
+"raw-body@npm:^3.0.1":
   version: 3.0.2
   resolution: "raw-body@npm:3.0.2"
   dependencies:
@@ -5691,13 +5631,6 @@ __metadata:
   languageName: node
   linkType: hard
 
-"require-from-string@npm:^2.0.2":
-  version: 2.0.2
-  resolution: "require-from-string@npm:2.0.2"
-  checksum: 10/839a3a890102a658f4cb3e7b2aa13a1f80a3a976b512020c3d1efc418491c48a886b6e481ea56afc6c4cb5eef678f23b2a4e70575e7534eccadf5e30ed2e56eb
-  languageName: node
-  linkType: hard
-
 "require-package-name@npm:^2.0.1":
   version: 2.0.1
   resolution: "require-package-name@npm:2.0.1"
@@ -6649,6 +6582,13 @@ __metadata:
   languageName: node
   linkType: hard
 
+"undici-types@npm:~7.18.0":
+  version: 7.18.2
+  resolution: "undici-types@npm:7.18.2"
+  checksum: 10/e61a5918f624d68420c3ca9d301e9f15b61cba6e97be39fe2ce266dd6151e4afe424d679372638826cb506be33952774e0424141200111a9857e464216c009af
+  languageName: node
+  linkType: hard
+
 "unicode-emoji-modifier-base@npm:^1.0.0":
   version: 1.0.0
   resolution: "unicode-emoji-modifier-base@npm:1.0.0"
@@ -6857,7 +6797,7 @@ __metadata:
   languageName: node
   linkType: hard
 
-"vary@npm:^1, vary@npm:^1.1.2":
+"vary@npm:^1.1.2":
   version: 1.1.2
   resolution: "vary@npm:1.1.2"
   checksum: 10/31389debef15a480849b8331b220782230b9815a8e0dbb7b9a8369559aed2e9a7800cd904d4371ea74f4c3527db456dc8e7ac5befce5f0d289014dbdf47b2242
@@ -7258,16 +7198,7 @@ __metadata:
   languageName: node
   linkType: hard
 
-"zod-to-json-schema@npm:^3.25.1":
-  version: 3.25.1
-  resolution: "zod-to-json-schema@npm:3.25.1"
-  peerDependencies:
-    zod: ^3.25 || ^4
-  checksum: 10/744dd370f4452c8db120de1475ea4d484a11df884c4636111d630e5e1351b8a7590d99cf14a2b9f21e7906f8b78721d958663a7973a40994e7d28770876674cc
-  languageName: node
-  linkType: hard
-
-"zod@npm:^3.25 || ^4.0, zod@npm:^4.3.5":
+"zod@npm:^4.3.5":
   version: 4.3.6
   resolution: "zod@npm:4.3.6"
   checksum: 10/25fc0f62e01b557b4644bf0b393bbaf47542ab30877c37837ea8caf314a8713d220c7d7fe51f68ffa72f0e1018ddfa34d96f1973d23033f5a2a5a9b6b9d9da01

From c15cf5ed51e647d1488b2f4fca19ebd02f1fcac1 Mon Sep 17 00:00:00 2001
From: cryptotavares <joao.tavares@consensys.net>
Date: Thu, 16 Apr 2026 10:09:50 +0100
Subject: [PATCH 02/36] feat: add category-based observation filtering for tool
 responses

Tools are now classified into categories (mutating, readonly, discovery,
batch) that control whether post-execution observations appear in HTTP
responses. Mutating tools always include observations, readonly and
discovery tools omit them, and batch (run_steps) respects the new
includeObservations parameter ('all', 'none', 'failures').
---
 README.md                        |  78 +++++------
 SKILL.md                         |  70 +++++++++-
 src/server/create-server.test.ts | 225 ++++++++++++++++++++++++++++++-
 src/server/create-server.ts      |  92 ++++++++++++-
 src/tools/batch.ts               |   3 -
 src/tools/registry.test.ts       |  36 ++++-
 src/tools/registry.ts            |  51 +++++++
 7 files changed, 506 insertions(+), 49 deletions(-)

diff --git a/README.md b/README.md
index 75bc819..2ed3b89 100644
--- a/README.md
+++ b/README.md
@@ -341,44 +341,44 @@ The daemon routes `POST /tool/:name` requests through the registry, applies Zod
 
 **Registered tools:**
 
-| Tool                     | Description                                                                                                                                                                                         |
-| ------------------------ | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| **Lifecycle**            |                                                                                                                                                                                                     |
-| `build`                  | Triggers an extension build using the configured `BuildCapability`. Accepts build type and force options.                                                                                           |
-| `launch`                 | Launches a new browser session with the configured extension. Supports state modes (`default`, `onboarding`, `custom`), fixture presets, goal/tag metadata, and optional contract seeding on start. |
-| `cleanup`                | Tears down the active browser session and cleans up all resources (browser, services, fixtures).                                                                                                    |
-| **Interaction**          |                                                                                                                                                                                                     |
-| `click`                  | Clicks an element identified by a11y ref, test ID, or CSS selector. Waits for the element to be visible before clicking.                                                                            |
-| `type`                   | Types text into an input element identified by a11y ref, test ID, or CSS selector. Uses Playwright's `fill()` for reliable input.                                                                   |
-| `wait_for`               | Waits for an element to become visible on the page within a configurable timeout.                                                                                                                   |
-| `clipboard`              | Reads from or writes to the system clipboard via Chrome DevTools Protocol. Useful for pasting seed phrases or copying addresses.                                                                    |
-| **Navigation**           |                                                                                                                                                                                                     |
-| `navigate`               | Navigates the browser to a named screen (`home`, `settings`, `notification`) or an arbitrary URL.                                                                                                   |
-| `switch_to_tab`          | Switches the active page to a tab matching a given role (e.g., `extension`, `dapp`) or URL prefix.                                                                                                  |
-| `close_tab`              | Closes a browser tab matching a given role or URL. Falls back to the extension tab if the active tab is closed.                                                                                     |
-| `wait_for_notification`  | Waits for the extension notification popup to appear within a timeout. Returns the notification page URL.                                                                                           |
-| **Discovery**            |                                                                                                                                                                                                     |
-| `describe_screen`        | Captures a comprehensive screen snapshot: extension state, visible test IDs, trimmed a11y tree with refs, optional screenshot, and prior knowledge from historical sessions.                        |
-| `accessibility_snapshot` | Captures a trimmed accessibility tree of the current page with deterministic refs (`e1`, `e2`, ...). Supports scoping to a root CSS selector.                                                       |
-| `list_testids`           | Collects all visible `data-testid` attributes from the current page with text previews and visibility status.                                                                                       |
-| **State**                |                                                                                                                                                                                                     |
-| `get_state`              | Retrieves the current extension state (URL, screen, network, balance, account) and tracked tab information.                                                                                         |
-| `get_context`            | Returns the current environment context (`e2e` or `prod`), session status, available capabilities, and whether context switching is allowed.                                                        |
-| `set_context`            | Switches the session environment between `e2e` and `prod` modes. Blocked while a session is active.                                                                                                 |
-| **Screenshots**          |                                                                                                                                                                                                     |
-| `screenshot`             | Captures a screenshot of the current page. Supports naming, full-page capture, scoping to a CSS selector, and optional base64 output.                                                               |
-| **Knowledge**            |                                                                                                                                                                                                     |
-| `knowledge_last`         | Retrieves the N most recent step records from the knowledge store, with optional scope and filter parameters.                                                                                       |
-| `knowledge_search`       | Searches step records by query string with token-based matching and synonym expansion. Scores results by relevance to screen, URL, test IDs, and a11y nodes.                                        |
-| `knowledge_summarize`    | Generates a recipe-style summary of a session's tool invocations, showing the step sequence with targets and outcomes.                                                                              |
-| `knowledge_sessions`     | Lists available knowledge sessions with metadata (goal, flow tags, timestamps), with optional filtering.                                                                                            |
-| **Contracts**            |                                                                                                                                                                                                     |
-| `seed_contract`          | Deploys a single smart contract to the local Anvil chain by name. Requires `ContractSeedingCapability`.                                                                                             |
-| `seed_contracts`         | Deploys multiple smart contracts in sequence. Returns both successful deployments and individual failures.                                                                                          |
-| `get_contract_address`   | Looks up the deployed address of a contract by name from the session's deployment registry.                                                                                                         |
-| `list_contracts`         | Lists all contracts deployed in the current session with addresses and deployment timestamps.                                                                                                       |
-| **Batching**             |                                                                                                                                                                                                     |
-| `run_steps`              | Executes a batch of tool invocations sequentially. Supports `stopOnError` to halt on first failure. Returns per-step results with timing.                                                           |
+| Tool                     | Description                                                                                                                                                                                                                                                                 |
+| ------------------------ | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| **Lifecycle**            |                                                                                                                                                                                                                                                                             |
+| `build`                  | Triggers an extension build using the configured `BuildCapability`. Accepts build type and force options.                                                                                                                                                                   |
+| `launch`                 | Launches a new browser session with the configured extension. Supports state modes (`default`, `onboarding`, `custom`), fixture presets, goal/tag metadata, and optional contract seeding on start.                                                                         |
+| `cleanup`                | Tears down the active browser session and cleans up all resources (browser, services, fixtures).                                                                                                                                                                            |
+| **Interaction**          |                                                                                                                                                                                                                                                                             |
+| `click`                  | Clicks an element identified by a11y ref, test ID, or CSS selector. Waits for the element to be visible before clicking.                                                                                                                                                    |
+| `type`                   | Types text into an input element identified by a11y ref, test ID, or CSS selector. Uses Playwright's `fill()` for reliable input.                                                                                                                                           |
+| `wait_for`               | Waits for an element to become visible on the page within a configurable timeout.                                                                                                                                                                                           |
+| `clipboard`              | Reads from or writes to the system clipboard via Chrome DevTools Protocol. Useful for pasting seed phrases or copying addresses.                                                                                                                                            |
+| **Navigation**           |                                                                                                                                                                                                                                                                             |
+| `navigate`               | Navigates the browser to a named screen (`home`, `settings`, `notification`) or an arbitrary URL.                                                                                                                                                                           |
+| `switch_to_tab`          | Switches the active page to a tab matching a given role (e.g., `extension`, `dapp`) or URL prefix.                                                                                                                                                                          |
+| `close_tab`              | Closes a browser tab matching a given role or URL. Falls back to the extension tab if the active tab is closed.                                                                                                                                                             |
+| `wait_for_notification`  | Waits for the extension notification popup to appear within a timeout. Returns the notification page URL.                                                                                                                                                                   |
+| **Discovery**            |                                                                                                                                                                                                                                                                             |
+| `describe_screen`        | Captures a comprehensive screen snapshot: extension state, visible test IDs, trimmed a11y tree with refs, optional screenshot, and prior knowledge from historical sessions.                                                                                                |
+| `accessibility_snapshot` | Captures a trimmed accessibility tree of the current page with deterministic refs (`e1`, `e2`, ...). Supports scoping to a root CSS selector.                                                                                                                               |
+| `list_testids`           | Collects all visible `data-testid` attributes from the current page with text previews and visibility status.                                                                                                                                                               |
+| **State**                |                                                                                                                                                                                                                                                                             |
+| `get_state`              | Retrieves the current extension state (URL, screen, network, balance, account) and tracked tab information.                                                                                                                                                                 |
+| `get_context`            | Returns the current environment context (`e2e` or `prod`), session status, available capabilities, and whether context switching is allowed.                                                                                                                                |
+| `set_context`            | Switches the session environment between `e2e` and `prod` modes. Blocked while a session is active.                                                                                                                                                                         |
+| **Screenshots**          |                                                                                                                                                                                                                                                                             |
+| `screenshot`             | Captures a screenshot of the current page. Supports naming, full-page capture, scoping to a CSS selector, and optional base64 output.                                                                                                                                       |
+| **Knowledge**            |                                                                                                                                                                                                                                                                             |
+| `knowledge_last`         | Retrieves the N most recent step records from the knowledge store, with optional scope and filter parameters.                                                                                                                                                               |
+| `knowledge_search`       | Searches step records by query string with token-based matching and synonym expansion. Scores results by relevance to screen, URL, test IDs, and a11y nodes.                                                                                                                |
+| `knowledge_summarize`    | Generates a recipe-style summary of a session's tool invocations, showing the step sequence with targets and outcomes.                                                                                                                                                      |
+| `knowledge_sessions`     | Lists available knowledge sessions with metadata (goal, flow tags, timestamps), with optional filtering.                                                                                                                                                                    |
+| **Contracts**            |                                                                                                                                                                                                                                                                             |
+| `seed_contract`          | Deploys a single smart contract to the local Anvil chain by name. Requires `ContractSeedingCapability`.                                                                                                                                                                     |
+| `seed_contracts`         | Deploys multiple smart contracts in sequence. Returns both successful deployments and individual failures.                                                                                                                                                                  |
+| `get_contract_address`   | Looks up the deployed address of a contract by name from the session's deployment registry.                                                                                                                                                                                 |
+| `list_contracts`         | Lists all contracts deployed in the current session with addresses and deployment timestamps.                                                                                                                                                                               |
+| **Batching**             |                                                                                                                                                                                                                                                                             |
+| `run_steps`              | Executes a batch of tool invocations sequentially. Supports `stopOnError` to halt on first failure and `includeObservations` (`'all'`, `'none'`, `'failures'`) to control whether post-execution observations appear in the response. Returns per-step results with timing. |
 
 ### Accessibility References
 
@@ -486,6 +486,8 @@ All responses follow a consistent shape:
 { ok: false, error: { code: string, message: string } }
 ```
 
+The `observations` field is included for **mutating** tools (click, type, navigate, launch, cleanup, build, etc.) and for `run_steps` when its `includeObservations` parameter is `'all'` (default) or `'failures'`. **Read-only** and **discovery** tools omit observations from the response.
+
 ## CLI Reference
 
 The `mm` CLI provides a unified interface for agents and developers. All commands communicate with the daemon over HTTP — the daemon is auto-started on `mm launch` if not already running.
diff --git a/SKILL.md b/SKILL.md
index a93f8c8..a73bc48 100644
--- a/SKILL.md
+++ b/SKILL.md
@@ -17,10 +17,76 @@ mm cleanup --shutdown      # 5. Clean up when done
 **Critical rules:**
 
 - **Always `describe-screen` before interacting.** Refs like `e1`, `e2` are ephemeral — they change after every action.
-- **Always `describe-screen` after interacting.** The screen state changed; your old refs are stale.
+- **Always `describe-screen` after interacting** — OR use inline `observations` from mutating tool responses. Mutating tools (click, type, navigate, etc.) return an `observations` object with fresh `state`, `testIds`, and `a11y` refs. You can use these refs directly for the next interaction without calling `describe-screen`. Call `describe-screen` when you need `priorKnowledge` or screenshots.
 - **One target per command.** Specify exactly ONE of: a11y ref (`e5`), testId, or CSS selector.
 - **Errors are structured.** Check the `error.code` field to decide recovery strategy (see Error Codes below).
 
+## Observation Behavior
+
+Tool responses include different data based on the tool's category:
+
+| Category      | Examples                                                          | Observations in response?                 |
+| ------------- | ----------------------------------------------------------------- | ----------------------------------------- |
+| **Mutating**  | click, type, navigate, launch, cleanup, build                     | Yes — `state` + `a11y` + `testIds`        |
+| **Read-only** | get*state, knowledge*\*, get_context, set_context                 | No — faster response                      |
+| **Discovery** | describe_screen, list_testids, accessibility_snapshot, screenshot | Data is already in `result`               |
+| **Batch**     | run_steps                                                         | Controlled by `includeObservations` param |
+
+### Using inline observations (mutating tools)
+
+After a mutating action, the response includes fresh screen state:
+
+```json
+{
+  "ok": true,
+  "result": { ... },
+  "observations": {
+    "state": { "screen": "send", "url": "...", "balance": "1.5 ETH" },
+    "testIds": ["send-amount-input", "send-button"],
+    "a11y": {
+      "nodes": [
+        { "ref": "e1", "role": "textbox", "name": "Amount" },
+        { "ref": "e2", "role": "button", "name": "Send" }
+      ]
+    }
+  }
+}
+```
+
+You can use the `ref` values from `observations.a11y.nodes` for the next interaction — no `describe-screen` needed.
+
+```bash
+mm click e3                 # mutating: response includes fresh observations
+# observations.a11y.nodes has updated refs — use them directly:
+mm type e1 "0.01"           # use ref from previous response
+```
+
+Call `describe-screen` explicitly when you need:
+
+- `priorKnowledge` (historical actions for this screen)
+- A screenshot via `includeScreenshot`
+- Full context after unexpected navigation
+
+### `run_steps` and `includeObservations`
+
+The `run_steps` tool collects observations once after all steps complete. Control inclusion with the `includeObservations` parameter:
+
+| Value             | Behavior                                      |
+| ----------------- | --------------------------------------------- |
+| `'all'` (default) | Always include final state observations       |
+| `'none'`          | Never include observations (fastest response) |
+| `'failures'`      | Include observations only if any step failed  |
+
+```json
+{
+  "steps": [
+    { "tool": "click", "args": { "a11yRef": "e3" } },
+    { "tool": "type", "args": { "a11yRef": "e5", "text": "0.01" } }
+  ],
+  "includeObservations": "failures"
+}
+```
+
 ## Commands
 
 ### Session Lifecycle
@@ -217,7 +283,7 @@ Executes multiple tool invocations in sequence from a JSON array. Each step spec
 mm run-steps '{"steps":[{"tool":"click","args":{"a11yRef":"e3"}},{"tool":"wait_for","args":{"a11yRef":"e5"}}]}'
 ```
 
-Supports `stopOnError` (halt on first failure) and returns per-step results with timing.
+Supports `stopOnError` (halt on first failure) and returns per-step results with timing. The `includeObservations` param controls whether final-state observations appear in the response: `'all'` (default), `'none'`, or `'failures'` (only on partial failure).
 
 ## Element Targeting
 
diff --git a/src/server/create-server.test.ts b/src/server/create-server.test.ts
index ee651fa..dc77c07 100644
--- a/src/server/create-server.test.ts
+++ b/src/server/create-server.test.ts
@@ -12,9 +12,11 @@ import {
   extractScreenshotInfo,
   extractToolOutcome,
   buildResponseBody,
+  shouldCollectObservations,
+  shouldIncludeObservationsInResponse,
 } from './create-server.js';
 import { readDaemonState } from './daemon-state.js';
-import type { DaemonState, ServerConfig } from '../types/http.js';
+import type { DaemonState, ServerConfig, ToolResponse } from '../types/http.js';
 import { PACKAGE_VERSION } from '../version.js';
 
 const tmpDir = path.join(os.tmpdir(), `mm-create-server-test-${Date.now()}`);
@@ -339,6 +341,122 @@ describe('buildResponseBody', () => {
   });
 });
 
+describe('shouldCollectObservations', () => {
+  it('returns true for mutating', () => {
+    expect(shouldCollectObservations('mutating')).toBe(true);
+  });
+
+  it('returns true for readonly (collected for knowledge store)', () => {
+    expect(shouldCollectObservations('readonly')).toBe(true);
+  });
+
+  it('returns true for discovery (collected for knowledge store)', () => {
+    expect(shouldCollectObservations('discovery')).toBe(true);
+  });
+
+  it('returns true for batch with default policy', () => {
+    expect(shouldCollectObservations('batch')).toBe(true);
+  });
+
+  it("returns true for batch with 'all' policy", () => {
+    expect(
+      shouldCollectObservations('batch', { includeObservations: 'all' }),
+    ).toBe(true);
+  });
+
+  it("returns false for batch with 'none' policy", () => {
+    expect(
+      shouldCollectObservations('batch', { includeObservations: 'none' }),
+    ).toBe(false);
+  });
+
+  it("returns true for batch with 'failures' policy", () => {
+    expect(
+      shouldCollectObservations('batch', { includeObservations: 'failures' }),
+    ).toBe(true);
+  });
+});
+
+describe('shouldIncludeObservationsInResponse', () => {
+  const okResult: ToolResponse = { ok: true, result: {} };
+  const failResult: ToolResponse = {
+    ok: false,
+    error: { code: 'ERR', message: 'fail' },
+  };
+  const summaryFailResult: ToolResponse = {
+    ok: true,
+    result: { summary: { ok: false } },
+  };
+
+  it('returns true for mutating', () => {
+    expect(shouldIncludeObservationsInResponse('mutating', okResult)).toBe(
+      true,
+    );
+  });
+
+  it('returns false for readonly', () => {
+    expect(shouldIncludeObservationsInResponse('readonly', okResult)).toBe(
+      false,
+    );
+  });
+
+  it('returns false for discovery', () => {
+    expect(shouldIncludeObservationsInResponse('discovery', okResult)).toBe(
+      false,
+    );
+  });
+
+  it("returns true for batch with 'all' (default)", () => {
+    expect(shouldIncludeObservationsInResponse('batch', okResult, {})).toBe(
+      true,
+    );
+  });
+
+  it("returns false for batch with 'none'", () => {
+    expect(
+      shouldIncludeObservationsInResponse('batch', okResult, {
+        includeObservations: 'none',
+      }),
+    ).toBe(false);
+  });
+
+  it("returns true for batch with 'failures' when tool failed", () => {
+    expect(
+      shouldIncludeObservationsInResponse('batch', failResult, {
+        includeObservations: 'failures',
+      }),
+    ).toBe(true);
+  });
+
+  it("returns true for batch with 'failures' when summary.ok is false", () => {
+    expect(
+      shouldIncludeObservationsInResponse('batch', summaryFailResult, {
+        includeObservations: 'failures',
+      }),
+    ).toBe(true);
+  });
+
+  it("returns false for batch with 'failures' when tool succeeded", () => {
+    const batchOk: ToolResponse = {
+      ok: true,
+      result: { summary: { ok: true } },
+    };
+    expect(
+      shouldIncludeObservationsInResponse('batch', batchOk, {
+        includeObservations: 'failures',
+      }),
+    ).toBe(false);
+  });
+
+  it("returns false for batch with 'failures' when summary is missing", () => {
+    expect(
+      shouldIncludeObservationsInResponse('batch', okResult, {
+        includeObservations: 'failures',
+      }),
+    ).toBe(false);
+  });
+});
+
 describe('createServer integration', () => {
   let server: ServerInstance;
   let state: DaemonState;
@@ -664,6 +782,111 @@ describe('createServer with active session', () => {
 
     expect(res.status).toBe(200);
   });
+
+  it('read-only tool response omits observations', async () => {
+    const res = await httpRequest(
+      `http://127.0.0.1:${state.port}/tool/get_state`,
+      {
+        method: 'POST',
+        headers: { 'Content-Type': 'application/json' },
+        body: JSON.stringify({}),
+      },
+    );
+    const body = (await res.json()) as { ok: boolean; observations?: unknown };
+
+    expect(res.status).toBe(200);
+    expect(body.observations).toBeUndefined();
+  });
+
+  it('mutating tool response includes observations with state, testIds, a11y', async () => {
+    const res = await httpRequest(`http://127.0.0.1:${state.port}/cleanup`, {
+      method: 'POST',
+      headers: { 'Content-Type': 'application/json' },
+      body: JSON.stringify({}),
+    });
+    const body = (await res.json()) as {
+      ok: boolean;
+      observations?: { state: unknown; testIds: unknown[]; a11y: unknown };
+    };
+
+    expect(res.status).toBe(200);
+    expect(body.observations).toBeDefined();
+    expect(body.observations?.state).toBeDefined();
+    expect(body.observations?.testIds).toBeDefined();
+    expect(body.observations?.a11y).toBeDefined();
+  });
+
+  it('playwright helpers called for read-only tools (knowledge store)', async () => {
+    const { collectTestIds, collectTrimmedA11ySnapshot } =
+      await import('../tools/utils/discovery.js');
+    const collectTestIdsSpy = vi.mocked(collectTestIds);
+    const collectA11ySpy = vi.mocked(collectTrimmedA11ySnapshot);
+
+    collectTestIdsSpy.mockClear();
+    collectA11ySpy.mockClear();
+
+    await httpRequest(`http://127.0.0.1:${state.port}/tool/get_state`, {
+      method: 'POST',
+      headers: { 'Content-Type': 'application/json' },
+      body: JSON.stringify({}),
+    });
+
+    expect(collectTestIdsSpy).toHaveBeenCalled();
+    expect(collectA11ySpy).toHaveBeenCalled();
+  });
+
+  it('observation Playwright helpers called for mutating tools', async () => {
+    const { collectTestIds, collectTrimmedA11ySnapshot } =
+      await import('../tools/utils/discovery.js');
+    const collectTestIdsSpy = vi.mocked(collectTestIds);
+    const collectA11ySpy = vi.mocked(collectTrimmedA11ySnapshot);
+
+    collectTestIdsSpy.mockClear();
+    collectA11ySpy.mockClear();
+
+    await httpRequest(`http://127.0.0.1:${state.port}/cleanup`, {
+      method: 'POST',
+      headers: { 'Content-Type': 'application/json' },
+      body: JSON.stringify({}),
+    });
+
+    expect(collectTestIdsSpy).toHaveBeenCalled();
+    expect(collectA11ySpy).toHaveBeenCalled();
+  });
+
+  it('recordStep is called for mutating tool routes', async () => {
+    const { KnowledgeStore } =
+      await import('../knowledge-store/knowledge-store.js');
+    const mockStore = vi.mocked(KnowledgeStore).mock.results.at(-1)?.value as {
+      recordStep: ReturnType<typeof vi.fn>;
+    };
+    mockStore.recordStep.mockClear();
+
+    await httpRequest(`http://127.0.0.1:${state.port}/cleanup`, {
+      method: 'POST',
+      headers: { 'Content-Type': 'application/json' },
+      body: JSON.stringify({}),
+    });
+
+    expect(mockStore.recordStep).toHaveBeenCalled();
+  });
+
+  it('recordStep is called for read-only tool routes', async () => {
+    const { KnowledgeStore } =
+      await import('../knowledge-store/knowledge-store.js');
+    const mockStore = vi.mocked(KnowledgeStore).mock.results.at(-1)?.value as {
+      recordStep: ReturnType<typeof vi.fn>;
+    };
+    mockStore.recordStep.mockClear();
+
+    await httpRequest(`http://127.0.0.1:${state.port}/tool/get_state`, {
+      method: 'POST',
+      headers: { 'Content-Type': 'application/json' },
+      body: JSON.stringify({}),
+    });
+
+    expect(mockStore.recordStep).toHaveBeenCalled();
+  });
 });
 
 describe('createServer with logging', () => {
diff --git a/src/server/create-server.ts b/src/server/create-server.ts
index bd51d13..a6256f1 100644
--- a/src/server/create-server.ts
+++ b/src/server/create-server.ts
@@ -13,7 +13,8 @@ import {
   KnowledgeStore,
   createDefaultObservation,
 } from '../knowledge-store/knowledge-store.js';
-import { toolRegistry } from '../tools/registry.js';
+import { toolRegistry, getToolCategory } from '../tools/registry.js';
+import type { ToolCategory } from '../tools/registry.js';
 import type {
   StepRecordObservation,
   StepRecordOutcome,
@@ -24,7 +25,12 @@ import {
   collectTestIds,
   collectTrimmedA11ySnapshot,
 } from '../tools/utils/discovery.js';
-import type { DaemonState, ServerConfig, ToolContext } from '../types/http.js';
+import type {
+  DaemonState,
+  ServerConfig,
+  ToolContext,
+  ToolResponse,
+} from '../types/http.js';
 import { extractErrorMessage } from '../utils/errors.js';
 import type { ToolName } from '../validation/schemas.js';
 import { toolSchemas } from '../validation/schemas.js';
@@ -163,6 +169,66 @@ export function buildResponseBody(
   return { ...(toolResult as Record<string, unknown>), observations };
 }
 
+/**
+ * Whether to run Playwright observation collection for this tool invocation.
+ *
+ * Observations are always collected for the knowledge store, regardless of
+ * whether they appear in the HTTP response. The only exception is batch
+ * with `'none'` policy, which skips collection entirely for best performance.
+ *
+ * @param category - The tool category to check.
+ * @param validatedInput - The validated input payload (checked for batch policy).
+ * @returns True if observations should be collected.
+ */
+export function shouldCollectObservations(
+  category: ToolCategory,
+  validatedInput?: Record<string, unknown>,
+): boolean {
+  if (category === 'batch') {
+    const policy =
+      (validatedInput as { includeObservations?: string })
+        ?.includeObservations ?? 'all';
+    return policy !== 'none';
+  }
+  return true;
+}
+
+/**
+ * Whether to include observations in the HTTP response.
+ *
+ * @param category - The tool category.
+ * @param toolResult - The result returned by the tool.
+ * @param validatedInput - The validated input payload (used for batch policy).
+ * @returns True if observations should be included in the response.
+ */
+export function shouldIncludeObservationsInResponse(
+  category: ToolCategory,
+  toolResult: ToolResponse,
+  validatedInput?: Record<string, unknown>,
+): boolean {
+  if (category === 'mutating') {
+    return true;
+  }
+  if (category === 'batch') {
+    const policy =
+      (validatedInput as { includeObservations?: string })
+        ?.includeObservations ?? 'all';
+    if (policy === 'none') {
+      return false;
+    }
+    if (policy === 'failures') {
+      if (!toolResult.ok) {
+        return true;
+      }
+      const result = toolResult.result as Record<string, unknown>;
+      const summary = result?.summary as Record<string, unknown> | undefined;
+      return summary !== undefined && !summary.ok;
+    }
+    return true; // 'all'
+  }
+  return false; // readonly, discovery
+}
+
 /**
  * Creates an HTTP daemon server for agent-driven browser testing.
  *
@@ -362,13 +428,21 @@ export function createServer(config: ServerConfig): ServerInstance {
     const startTime = Date.now();
     const currentWorkflowContext = workflowContext;
 
+    const category = getToolCategory(toolName);
+
     try {
       const { toolResult, observations } = await queue.enqueue(async () => {
         const context = buildToolContext(currentWorkflowContext);
         const result = await tool(validatedInput, context);
 
         let obs: StepRecordObservation | undefined;
-        if (config.sessionManager.hasActiveSession()) {
+        if (
+          shouldCollectObservations(
+            category,
+            validatedInput as Record<string, unknown>,
+          ) &&
+          config.sessionManager.hasActiveSession()
+        ) {
           try {
             const page = config.sessionManager.getPage();
             const state = await config.sessionManager.getExtensionState();
@@ -397,7 +471,17 @@ export function createServer(config: ServerConfig): ServerInstance {
         startTime,
       );
 
-      res.json(buildResponseBody(toolResult, observations));
+      const includeInResponse = shouldIncludeObservationsInResponse(
+        category,
+        toolResult,
+        validatedInput as Record<string, unknown>,
+      );
+      res.json(
+        buildResponseBody(
+          toolResult,
+          includeInResponse ? observations : undefined,
+        ),
+      );
     } catch (error) {
       await recordToolStep(
         toolName,
diff --git a/src/tools/batch.ts b/src/tools/batch.ts
index 70b9f02..40d5e02 100644
--- a/src/tools/batch.ts
+++ b/src/tools/batch.ts
@@ -31,7 +31,6 @@ export async function runStepsTool(
     );
   }
 
-  // TODO: implement observation policy filtering using input.includeObservations
   const { steps: stepInputs, stopOnError = false } = input;
   const stepResults: StepResult[] = [];
   let succeeded = 0;
@@ -116,8 +115,6 @@ export async function runStepsTool(
         },
       });
 
-      // TODO: implement observation policy filtering
-
       if (response.ok) {
         succeeded += 1;
       } else {
diff --git a/src/tools/registry.test.ts b/src/tools/registry.test.ts
index 345df07..7d0dc1d 100644
--- a/src/tools/registry.test.ts
+++ b/src/tools/registry.test.ts
@@ -1,6 +1,6 @@
 import { describe, expect, it } from 'vitest';
 
-import { toolRegistry } from './registry.js';
+import { toolRegistry, TOOL_CATEGORIES, getToolCategory } from './registry.js';
 
 describe('toolRegistry', () => {
   it('has expected tool entries', () => {
@@ -46,3 +46,37 @@ describe('toolRegistry', () => {
     }
   });
 });
+
+describe('TOOL_CATEGORIES and getToolCategory', () => {
+  it('every key in toolRegistry exists in TOOL_CATEGORIES', () => {
+    for (const key of toolRegistry.keys()) {
+      expect(TOOL_CATEGORIES).toHaveProperty(key);
+    }
+  });
+
+  it('every key in TOOL_CATEGORIES exists in toolRegistry', () => {
+    for (const key of Object.keys(TOOL_CATEGORIES)) {
+      expect(toolRegistry.has(key)).toBe(true);
+    }
+  });
+
+  it('getToolCategory returns mutating for nonexistent tool', () => {
+    expect(getToolCategory('nonexistent_tool')).toBe('mutating');
+  });
+
+  it('getToolCategory returns mutating for click', () => {
+    expect(getToolCategory('click')).toBe('mutating');
+  });
+
+  it('getToolCategory returns readonly for knowledge_last', () => {
+    expect(getToolCategory('knowledge_last')).toBe('readonly');
+  });
+
+  it('getToolCategory returns discovery for describe_screen', () => {
+    expect(getToolCategory('describe_screen')).toBe('discovery');
+  });
+
+  it('getToolCategory returns batch for run_steps', () => {
+    expect(getToolCategory('run_steps')).toBe('batch');
+  });
+});
diff --git a/src/tools/registry.ts b/src/tools/registry.ts
index cecfe43..dc376cb 100644
--- a/src/tools/registry.ts
+++ b/src/tools/registry.ts
@@ -65,3 +65,54 @@ export const toolRegistry = new Map<string, ToolFunction<any, any>>([
   ['get_context', getContextTool],
   ['clipboard', clipboardTool],
 ]);
+
+export type ToolCategory = 'mutating' | 'readonly' | 'discovery' | 'batch';
+
+export const TOOL_CATEGORIES: Record<string, ToolCategory> = {
+  // MUTATING (13)
+  click: 'mutating',
+  type: 'mutating',
+  navigate: 'mutating',
+  launch: 'mutating',
+  cleanup: 'mutating',
+  switch_to_tab: 'mutating',
+  close_tab: 'mutating',
+  clipboard: 'mutating',
+  build: 'mutating',
+  wait_for: 'mutating',
+  wait_for_notification: 'mutating',
+  seed_contract: 'mutating',
+  seed_contracts: 'mutating',
+  // READONLY (9)
+  knowledge_last: 'readonly',
+  knowledge_search: 'readonly',
+  knowledge_summarize: 'readonly',
+  knowledge_sessions: 'readonly',
+  get_state: 'readonly',
+  get_context: 'readonly',
+  // set_context is blocked while a session is active (MM_CONTEXT_SWITCH_BLOCKED),
+  // so Playwright observations would never be collected. Classified as readonly
+  // since it never runs in a state where page observations are meaningful.
+  set_context: 'readonly',
+  list_contracts: 'readonly',
+  get_contract_address: 'readonly',
+  // DISCOVERY (4)
+  describe_screen: 'discovery',
+  list_testids: 'discovery',
+  accessibility_snapshot: 'discovery',
+  screenshot: 'discovery',
+  // BATCH (1)
+  run_steps: 'batch',
+};
+
+/**
+ * Returns the category for a registered tool name.
+ * Unknown tools default to 'mutating' — the safe default that ensures
+ * new tools get observations until explicitly categorized.
+ *
+ * @param toolName - The registered tool name to look up.
+ * @returns The tool's category, or 'mutating' for unknown tools.
+ */
+export function getToolCategory(toolName: string): ToolCategory {
+  return TOOL_CATEGORIES[toolName] ?? 'mutating';
+}

From ccff6bb305c4c2595c0ae966ccc74d700dc458b8 Mon Sep 17 00:00:00 2001
From: cryptotavares <joao.tavares@consensys.net>
Date: Thu, 16 Apr 2026 11:17:41 +0100
Subject: [PATCH 03/36] fix: add missing return type annotation to CLI error
 handler

---
 README.md          |  27 +++++-----
 package.json       |   1 +
 src/cli/mm.test.ts | 123 +++++++++++++++++++++++++++++++++++----------
 src/cli/mm.ts      |  73 ++++++++++++++++++++-------
 src/index.ts       |   1 +
 yarn.lock          |  26 ++++++++--
 6 files changed, 188 insertions(+), 63 deletions(-)

diff --git a/README.md b/README.md
index 2ed3b89..e247d50 100644
--- a/README.md
+++ b/README.md
@@ -89,7 +89,7 @@ The global CLI can target any project via `--project` or `MM_PROJECT` (see [Proj
 
 ## Getting Started
 
-Consuming this package requires two things: a **daemon entry point** and a **`package.json` configuration**.
+Consuming this package requires two things: a **daemon entry point** and a **configuration file**.
 
 ### 1. Create a daemon entry point
 
@@ -109,21 +109,20 @@ server.start().then((state) => {
 });
 ```
 
-### 2. Configure `package.json`
-
-```json
-{
-  "mm": {
-    "daemon": "path/to/daemon.ts",
-    "runtime": "tsx"
-  },
-  "scripts": {
-    "mm:serve": "tsx path/to/daemon.ts"
-  }
-}
+### 2. Create a configuration file
+
+Create `mm-client-cli.config.ts` in your project root:
+
+```typescript
+export default {
+  daemon: 'path/to/daemon.ts',
+  runtime: 'tsx',
+};
 ```
 
-The `mm.daemon` field tells the CLI where the daemon entry point lives. The `mm.runtime` field specifies the TypeScript runner (defaults to `tsx`).
+The `daemon` field tells the CLI where the daemon entry point lives. The `runtime` field specifies the TypeScript runner (defaults to `tsx`).
+
+The CLI uses [cosmiconfig](https://github.com/cosmiconfig/cosmiconfig) for config discovery, so you can also use `mm-client-cli.config.js`, `.mm-client-clirc.json`, or other supported formats.
 
 ### 3. Use the CLI
 
diff --git a/package.json b/package.json
index 9c7e225..8221fe2 100644
--- a/package.json
+++ b/package.json
@@ -60,6 +60,7 @@
     "@isaacs/brace-expansion": "5.0.1"
   },
   "dependencies": {
+    "cosmiconfig": "^9.0.0",
     "express": "^5.2.1",
     "zod": "^4.3.5"
   },
diff --git a/src/cli/mm.test.ts b/src/cli/mm.test.ts
index ccc92e1..61c2e8e 100644
--- a/src/cli/mm.test.ts
+++ b/src/cli/mm.test.ts
@@ -2,6 +2,7 @@
 /* eslint-disable n/no-process-env */
 /* eslint-disable n/no-sync */
 /* eslint-disable require-atomic-updates */
+import { cosmiconfig } from 'cosmiconfig';
 import { existsSync } from 'node:fs';
 import * as fs from 'node:fs/promises';
 import * as path from 'node:path';
@@ -56,9 +57,7 @@ vi.mock('node:fs/promises', async (importOriginal) => {
     ...actual,
     realpath: vi.fn(async (p: string) => p),
     stat: vi.fn(async () => ({ isDirectory: () => true })),
-    readFile: vi.fn(async () =>
-      JSON.stringify({ mm: { daemon: './daemon.ts', runtime: 'tsx' } }),
-    ),
+    readFile: vi.fn(),
   };
 });
 
@@ -71,6 +70,14 @@ vi.mock('../server/daemon-state.js', () => ({
   releaseStartupLock: vi.fn(async () => {}),
 }));
 
+const mockSearch = vi.fn();
+
+vi.mock('cosmiconfig', () => ({
+  cosmiconfig: vi.fn(() => ({
+    search: mockSearch,
+  })),
+}));
+
 let exitSpy: MockInstance;
 let stderrSpy: MockInstance;
 let stdoutSpy: MockInstance;
@@ -78,6 +85,11 @@ let stdoutSpy: MockInstance;
 // eslint-disable-next-line vitest/require-top-level-describe
 beforeEach(() => {
   vi.clearAllMocks();
+  mockSearch.mockResolvedValue({
+    config: { daemon: './daemon.ts', runtime: 'tsx' },
+    filepath: '/mock/worktree/mm-client-cli.config.ts',
+    isEmpty: false,
+  });
   exitSpy = vi.spyOn(process, 'exit').mockImplementation((() => {
     throw new Error('process.exit');
   }) as never);
@@ -470,12 +482,12 @@ describe('shutdownDaemon', () => {
 });
 
 describe('readDaemonConfig', () => {
-  it('reads and parses mm config from package.json', async () => {
-    vi.mocked(fs.readFile).mockResolvedValueOnce(
-      JSON.stringify({
-        mm: { daemon: './my-daemon.ts', runtime: 'tsx' },
-      }),
-    );
+  it('reads and parses config from cosmiconfig', async () => {
+    mockSearch.mockResolvedValueOnce({
+      config: { daemon: './my-daemon.ts', runtime: 'tsx' },
+      filepath: '/project/mm-client-cli.config.ts',
+      isEmpty: false,
+    });
 
     const result = await readDaemonConfig('/project');
 
@@ -483,31 +495,69 @@ describe('readDaemonConfig', () => {
       daemonPath: './my-daemon.ts',
       runtime: 'tsx',
     });
+    expect(cosmiconfig).toHaveBeenCalledWith('mm-client-cli', {
+      searchPlaces: [
+        'mm-client-cli.config.ts',
+        'mm-client-cli.config.js',
+        'mm-client-cli.config.cjs',
+        'mm-client-cli.config.mjs',
+        '.mm-client-clirc',
+        '.mm-client-clirc.json',
+        '.mm-client-clirc.yaml',
+        '.mm-client-clirc.yml',
+        '.mm-client-clirc.js',
+        '.mm-client-clirc.ts',
+        '.mm-client-clirc.cjs',
+      ],
+      stopDir: '/project',
+    });
+    expect(mockSearch).toHaveBeenCalledWith('/project');
   });
 
   it('defaults runtime to tsx when not specified', async () => {
-    vi.mocked(fs.readFile).mockResolvedValueOnce(
-      JSON.stringify({ mm: { daemon: './d.ts' } }),
-    );
+    mockSearch.mockResolvedValueOnce({
+      config: { daemon: './d.ts' },
+      filepath: '/project/mm-client-cli.config.ts',
+      isEmpty: false,
+    });
 
     const result = await readDaemonConfig('/project');
 
     expect(result.runtime).toBe('tsx');
   });
 
-  it('exits when package.json cannot be read', async () => {
-    vi.mocked(fs.readFile).mockRejectedValueOnce(new Error('ENOENT'));
+  it('exits when no config file is found', async () => {
+    mockSearch.mockResolvedValueOnce(null);
+
+    await expect(readDaemonConfig('/project')).rejects.toThrowError(
+      'process.exit',
+    );
+    expect(stderrSpy).toHaveBeenCalledWith(
+      expect.stringContaining('No mm-client-cli config found'),
+    );
+  });
+
+  it('exits when config file is empty', async () => {
+    mockSearch.mockResolvedValueOnce({
+      config: undefined,
+      filepath: '/project/mm-client-cli.config.ts',
+      isEmpty: true,
+    });
 
     await expect(readDaemonConfig('/project')).rejects.toThrowError(
       'process.exit',
     );
     expect(stderrSpy).toHaveBeenCalledWith(
-      expect.stringContaining('Cannot read package.json'),
+      expect.stringContaining('No mm-client-cli config found'),
     );
   });
 
-  it('exits when mm.daemon is not configured', async () => {
-    vi.mocked(fs.readFile).mockResolvedValueOnce(JSON.stringify({}));
+  it('exits when daemon is not configured', async () => {
+    mockSearch.mockResolvedValueOnce({
+      config: { runtime: 'tsx' },
+      filepath: '/project/mm-client-cli.config.ts',
+      isEmpty: false,
+    });
 
     await expect(readDaemonConfig('/project')).rejects.toThrowError(
       'process.exit',
@@ -774,6 +824,21 @@ describe('sendRequest', () => {
     );
     expect(stderrSpy).toHaveBeenCalledWith('Error: Request failed\n');
   });
+
+  it('reaches the final fallback after repeated transient failures when exit does not throw', async () => {
+    vi.spyOn(globalThis, 'fetch').mockRejectedValue(new Error('ECONNREFUSED'));
+    exitSpy.mockRestore();
+    exitSpy = vi.spyOn(process, 'exit').mockImplementation((() => {
+      return undefined as never;
+    }) as never);
+
+    await sendRequest(3000, 'GET', '/health', null);
+
+    expect(stderrSpy).toHaveBeenCalledWith(
+      expect.stringContaining('request failed after 4 attempts'),
+    );
+    expect(process.exit).toHaveBeenCalledWith(1);
+  });
 });
 
 describe('routeCommand', () => {
@@ -1582,9 +1647,11 @@ describe('handleServe', () => {
     vi.mocked(readDaemonState).mockResolvedValueOnce(null);
 
     vi.mocked(existsSync).mockReturnValue(true);
-    vi.mocked(fs.readFile).mockResolvedValue(
-      JSON.stringify({ mm: { daemon: './daemon.ts', runtime: 'node' } }),
-    );
+    mockSearch.mockResolvedValueOnce({
+      config: { daemon: './daemon.ts', runtime: 'node' },
+      filepath: '/root/mm-client-cli.config.ts',
+      isEmpty: false,
+    });
 
     const mockState = {
       port: 4000,
@@ -1630,9 +1697,11 @@ describe('handleServe', () => {
     vi.mocked(isDaemonAlive).mockResolvedValueOnce(false);
 
     vi.mocked(existsSync).mockReturnValue(true);
-    vi.mocked(fs.readFile).mockResolvedValue(
-      JSON.stringify({ mm: { daemon: './d.ts', runtime: 'node' } }),
-    );
+    mockSearch.mockResolvedValueOnce({
+      config: { daemon: './d.ts', runtime: 'node' },
+      filepath: '/root/mm-client-cli.config.ts',
+      isEmpty: false,
+    });
 
     const { spawn } = await import('node:child_process');
     vi.mocked(spawn).mockReturnValue({
@@ -1692,9 +1761,11 @@ describe('autoStartDaemon', () => {
     vi.mocked(readDaemonState).mockResolvedValueOnce(null);
 
     vi.mocked(existsSync).mockReturnValue(true);
-    vi.mocked(fs.readFile).mockResolvedValue(
-      JSON.stringify({ mm: { daemon: './daemon.ts', runtime: 'node' } }),
-    );
+    mockSearch.mockResolvedValueOnce({
+      config: { daemon: './daemon.ts', runtime: 'node' },
+      filepath: '/root/mm-client-cli.config.ts',
+      isEmpty: false,
+    });
 
     const mockState = {
       port: 3000,
diff --git a/src/cli/mm.ts b/src/cli/mm.ts
index eeadcbd..ec5949a 100644
--- a/src/cli/mm.ts
+++ b/src/cli/mm.ts
@@ -1,4 +1,5 @@
 #!/usr/bin/env node
+import { cosmiconfig } from 'cosmiconfig';
 import { execSync, spawn } from 'node:child_process';
 import { existsSync } from 'node:fs';
 import * as fs from 'node:fs/promises';
@@ -27,6 +28,18 @@ const DAEMON_POLL_INTERVAL_MS = 200;
 const DAEMON_POLL_MAX_ATTEMPTS = 50; // 50 * 200ms = 10s
 const SEND_MAX_RETRIES = 3;
 const SEND_RETRY_BASE_DELAY_MS = 200;
+const CONFIG_MODULE_NAME = 'mm-client-cli';
+
+/**
+ * Configuration shape for mm-client-cli config files.
+ * Used in mm-client-cli.config.ts or equivalent.
+ */
+export type MmClientCliConfig = {
+  /** Path to the daemon entry point (TypeScript or JavaScript file). */
+  daemon: string;
+  /** TypeScript runner to use. Defaults to 'tsx'. */
+  runtime?: string;
+};
 
 type DaemonConfig = {
   daemonPath: string;
@@ -506,6 +519,7 @@ export async function routeCommand(
           process.stderr.write(`Error: invalid JSON — ${error.message}\n`);
           process.exit(1);
         }
+        /* istanbul ignore next -- non-SyntaxError path depends on delegated failures */
         throw error;
       }
       break;
@@ -752,7 +766,10 @@ export async function handleServe(
 }
 
 /**
- * Reads the daemon configuration from the worktree package.json.
+ * Reads the daemon configuration using cosmiconfig file discovery.
+ *
+ * Searches for configuration files (e.g., mm-client-cli.config.ts)
+ * starting from the worktree root directory.
  *
  * @param worktreeRoot - The git worktree root directory.
  * @returns The daemon path and runtime configuration.
@@ -760,29 +777,43 @@ export async function handleServe(
 export async function readDaemonConfig(
   worktreeRoot: string,
 ): Promise<DaemonConfig> {
-  const pkgPath = path.join(worktreeRoot, 'package.json');
-  let content: string;
-  try {
-    content = await fs.readFile(pkgPath, 'utf-8');
-  } catch {
-    process.stderr.write(`Error: Cannot read package.json at ${pkgPath}\n`);
+  const explorer = cosmiconfig(CONFIG_MODULE_NAME, {
+    searchPlaces: [
+      `${CONFIG_MODULE_NAME}.config.ts`,
+      `${CONFIG_MODULE_NAME}.config.js`,
+      `${CONFIG_MODULE_NAME}.config.cjs`,
+      `${CONFIG_MODULE_NAME}.config.mjs`,
+      `.${CONFIG_MODULE_NAME}rc`,
+      `.${CONFIG_MODULE_NAME}rc.json`,
+      `.${CONFIG_MODULE_NAME}rc.yaml`,
+      `.${CONFIG_MODULE_NAME}rc.yml`,
+      `.${CONFIG_MODULE_NAME}rc.js`,
+      `.${CONFIG_MODULE_NAME}rc.ts`,
+      `.${CONFIG_MODULE_NAME}rc.cjs`,
+    ],
+    stopDir: worktreeRoot,
+  });
+
+  const result = await explorer.search(worktreeRoot);
+
+  if (!result || result.isEmpty) {
+    process.stderr.write(
+      `Error: No mm-client-cli config found. Create ${CONFIG_MODULE_NAME}.config.ts in your project root.\n`,
+    );
     process.exit(1);
   }
 
-  const pkgJson = JSON.parse(content) as Record<string, unknown>;
-  const mmConfig = pkgJson.mm as
-    | { daemon?: string; runtime?: string }
-    | undefined;
-  if (!mmConfig?.daemon) {
+  const config = result.config as MmClientCliConfig;
+  if (!config.daemon) {
     process.stderr.write(
-      'Error: No daemon entry point configured. Add `mm.daemon` to package.json.\n',
+      `Error: No daemon entry point configured. Add 'daemon' to ${result.filepath}.\n`,
     );
     process.exit(1);
   }
 
   return {
-    daemonPath: mmConfig.daemon,
-    runtime: mmConfig.runtime ?? 'tsx',
+    daemonPath: config.daemon,
+    runtime: config.runtime ?? 'tsx',
   };
 }
 
@@ -1020,10 +1051,14 @@ Examples:
 `);
 }
 
+/* istanbul ignore next -- CLI entry point, tested via exported functions */
+/* istanbul ignore next -- top-level fatal handler is not exercised in tests */
+const handleFatalCliError = (error: unknown): void => {
+  process.stderr.write(`Fatal: ${String(error)}\n`);
+  process.exit(1);
+};
+
 /* istanbul ignore next -- CLI entry point, tested via exported functions */
 if (process.env.VITEST === undefined) {
-  main().catch((error: unknown) => {
-    process.stderr.write(`Fatal: ${String(error)}\n`);
-    process.exit(1);
-  });
+  main().catch(handleFatalCliError);
 }
diff --git a/src/index.ts b/src/index.ts
index 0025aff..5e60e93 100644
--- a/src/index.ts
+++ b/src/index.ts
@@ -22,6 +22,7 @@ export * from './tools/types';
 
 // HTTP Server Types
 export type * from './types/http.js';
+export type { MmClientCliConfig } from './cli/mm.js';
 export * from './tools/registry.js';
 
 // Server utilities
diff --git a/yarn.lock b/yarn.lock
index ee84dca..4ecedee 100644
--- a/yarn.lock
+++ b/yarn.lock
@@ -929,6 +929,7 @@ __metadata:
     "@vitest/coverage-istanbul": "npm:^3.0.7"
     "@vitest/eslint-plugin": "npm:^1.1.4"
     "@yarnpkg/types": "npm:^4.0.0-rc.52"
+    cosmiconfig: "npm:^9.0.0"
     depcheck: "npm:^1.4.3"
     eslint: "npm:^9.11.0"
     eslint-config-prettier: "npm:^9.1.0"
@@ -2812,6 +2813,23 @@ __metadata:
   languageName: node
   linkType: hard
 
+"cosmiconfig@npm:^9.0.0":
+  version: 9.0.1
+  resolution: "cosmiconfig@npm:9.0.1"
+  dependencies:
+    env-paths: "npm:^2.2.1"
+    import-fresh: "npm:^3.3.0"
+    js-yaml: "npm:^4.1.0"
+    parse-json: "npm:^5.2.0"
+  peerDependencies:
+    typescript: ">=4.9.5"
+  peerDependenciesMeta:
+    typescript:
+      optional: true
+  checksum: 10/89fcac84d062f0710091bb2d6a6175bcde22f5448877db9c43429694408191d3d4e215193b3ac4d54f7f89ef188d55cd481c7a2295b0dc572e65b528bf6fec01
+  languageName: node
+  linkType: hard
+
 "create-require@npm:^1.1.0":
   version: 1.1.1
   resolution: "create-require@npm:1.1.1"
@@ -3061,7 +3079,7 @@ __metadata:
   languageName: node
   linkType: hard
 
-"env-paths@npm:^2.2.0":
+"env-paths@npm:^2.2.0, env-paths@npm:^2.2.1":
   version: 2.2.1
   resolution: "env-paths@npm:2.2.1"
   checksum: 10/65b5df55a8bab92229ab2b40dad3b387fad24613263d103a97f91c9fe43ceb21965cd3392b1ccb5d77088021e525c4e0481adb309625d0cb94ade1d1fb8dc17e
@@ -4250,7 +4268,7 @@ __metadata:
   languageName: node
   linkType: hard
 
-"import-fresh@npm:^3.2.1":
+"import-fresh@npm:^3.2.1, import-fresh@npm:^3.3.0":
   version: 3.3.1
   resolution: "import-fresh@npm:3.3.1"
   dependencies:
@@ -4490,7 +4508,7 @@ __metadata:
   languageName: node
   linkType: hard
 
-"js-yaml@npm:^4.1.1":
+"js-yaml@npm:^4.1.0, js-yaml@npm:^4.1.1":
   version: 4.1.1
   resolution: "js-yaml@npm:4.1.1"
   dependencies:
@@ -5277,7 +5295,7 @@ __metadata:
   languageName: node
   linkType: hard
 
-"parse-json@npm:^5.0.0":
+"parse-json@npm:^5.0.0, parse-json@npm:^5.2.0":
   version: 5.2.0
   resolution: "parse-json@npm:5.2.0"
   dependencies:

From 845ffc858db23b8c0d3c27e9e05674f6bc16af25 Mon Sep 17 00:00:00 2001
From: cryptotavares <joao.tavares@consensys.net>
Date: Thu, 16 Apr 2026 12:08:20 +0100
Subject: [PATCH 04/36] refactor: read package version from package.json
 instead of hardcoded constant

Remove src/version.ts which required manual sync with package.json.
All version consumers now import directly from package.json,
with ts-bridge handling the JSON import for both CJS and ESM outputs.
---
 src/cli/mm.ts                    | 4 ++--
 src/index.ts                     | 3 ---
 src/server/create-server.test.ts | 4 ++--
 src/server/create-server.ts      | 4 ++--
 src/server/daemon-state.test.ts  | 6 +++---
 src/server/daemon-state.ts       | 4 ++--
 src/version.ts                   | 2 --
 7 files changed, 11 insertions(+), 16 deletions(-)
 delete mode 100644 src/version.ts

diff --git a/src/cli/mm.ts b/src/cli/mm.ts
index ec5949a..e9cc384 100644
--- a/src/cli/mm.ts
+++ b/src/cli/mm.ts
@@ -5,6 +5,7 @@ import { existsSync } from 'node:fs';
 import * as fs from 'node:fs/promises';
 import * as path from 'node:path';
 
+import pkg from '../../package.json';
 import {
   acquireStartupLock,
   isDaemonAlive,
@@ -14,7 +15,6 @@ import {
   removeDaemonState,
 } from '../server/daemon-state.js';
 import type { DaemonState } from '../types/http.js';
-import { PACKAGE_VERSION } from '../version.js';
 
 const COMMAND_TIMEOUTS_MS: Record<string, number> = {
   launch: 120_000,
@@ -656,7 +656,7 @@ export async function discoverDaemon(
       }
 
       process.stderr.write(
-        `Daemon version mismatch (running: ${state.version ?? 'unknown'}, cli: ${PACKAGE_VERSION}). Restarting...\n`,
+        `Daemon version mismatch (running: ${state.version ?? 'unknown'}, cli: ${pkg.version}). Restarting...\n`,
       );
       await shutdownDaemon(worktreeRoot, state);
       state = null;
diff --git a/src/index.ts b/src/index.ts
index 5e60e93..2275fbe 100644
--- a/src/index.ts
+++ b/src/index.ts
@@ -42,6 +42,3 @@ export * from './launcher/retry.js';
 
 // Error classification
 export * from './tools/error-classification.js';
-
-// Version
-export * from './version.js';
diff --git a/src/server/create-server.test.ts b/src/server/create-server.test.ts
index dc77c07..ff00fba 100644
--- a/src/server/create-server.test.ts
+++ b/src/server/create-server.test.ts
@@ -16,8 +16,8 @@ import {
   shouldIncludeObservationsInResponse,
 } from './create-server.js';
 import { readDaemonState } from './daemon-state.js';
+import pkg from '../../package.json';
 import type { DaemonState, ServerConfig, ToolResponse } from '../types/http.js';
-import { PACKAGE_VERSION } from '../version.js';
 
 const tmpDir = path.join(os.tmpdir(), `mm-create-server-test-${Date.now()}`);
 
@@ -545,7 +545,7 @@ describe('createServer integration', () => {
     expect(daemonState).not.toBeNull();
     expect(daemonState?.port).toBe(state.port);
     expect(daemonState?.nonce).toBe(state.nonce);
-    expect(daemonState?.version).toBe(PACKAGE_VERSION);
+    expect(daemonState?.version).toBe(pkg.version);
   });
 
   it('passes workflow context to session manager on start', async () => {
diff --git a/src/server/create-server.ts b/src/server/create-server.ts
index a6256f1..201198a 100644
--- a/src/server/create-server.ts
+++ b/src/server/create-server.ts
@@ -7,6 +7,7 @@ import * as http from 'node:http';
 import { writeDaemonState, removeDaemonState } from './daemon-state.js';
 import { allocatePort } from './port-allocator.js';
 import { RequestQueue } from './request-queue.js';
+import pkg from '../../package.json';
 import type { WorkflowContext } from '../capabilities/context.js';
 import type { ExtensionState } from '../capabilities/types.js';
 import {
@@ -34,7 +35,6 @@ import type {
 import { extractErrorMessage } from '../utils/errors.js';
 import type { ToolName } from '../validation/schemas.js';
 import { toolSchemas } from '../validation/schemas.js';
-import { PACKAGE_VERSION } from '../version.js';
 
 /**
  * Extracts target selection fields from a tool's validated input.
@@ -597,7 +597,7 @@ export function createServer(config: ServerConfig): ServerInstance {
         pid: process.pid,
         startedAt,
         nonce,
-        version: PACKAGE_VERSION,
+        version: pkg.version,
         subPorts,
       };
 
diff --git a/src/server/daemon-state.test.ts b/src/server/daemon-state.test.ts
index 43c4cec..847cac1 100644
--- a/src/server/daemon-state.test.ts
+++ b/src/server/daemon-state.test.ts
@@ -14,8 +14,8 @@ import {
   isDaemonVersionMatch,
   generateNonce,
 } from './daemon-state.js';
+import pkg from '../../package.json';
 import type { DaemonState } from '../types/http.js';
-import { PACKAGE_VERSION } from '../version.js';
 
 const tmpDir = path.join(os.tmpdir(), `mm-daemon-state-test-${Date.now()}`);
 
@@ -24,7 +24,7 @@ const mockState: DaemonState = {
   pid: process.pid,
   startedAt: new Date().toISOString(),
   nonce: 'test-nonce-abc',
-  version: PACKAGE_VERSION,
+  version: pkg.version,
   subPorts: { anvil: 8545, fixture: 8546, mock: 8547 },
 };
 
@@ -198,7 +198,7 @@ describe('daemon-state', () => {
   });
 
   describe('isDaemonVersionMatch', () => {
-    it('returns true when version matches PACKAGE_VERSION', () => {
+    it('returns true when version matches package.json version', () => {
       expect(isDaemonVersionMatch(mockState)).toBe(true);
     });
 
diff --git a/src/server/daemon-state.ts b/src/server/daemon-state.ts
index c4e7285..d3cfe9a 100644
--- a/src/server/daemon-state.ts
+++ b/src/server/daemon-state.ts
@@ -3,8 +3,8 @@ import { constants } from 'node:fs';
 import * as fs from 'node:fs/promises';
 import * as path from 'node:path';
 
+import pkg from '../../package.json';
 import type { DaemonState } from '../types/http.js';
-import { PACKAGE_VERSION } from '../version.js';
 
 const DAEMON_STATE_FILE = '.mm-server';
 const DAEMON_STATE_TMP_FILE = '.mm-server.tmp';
@@ -109,7 +109,7 @@ export async function isDaemonAlive(state: DaemonState): Promise<boolean> {
  * @returns Whether the versions match.
  */
 export function isDaemonVersionMatch(state: DaemonState): boolean {
-  return state.version === PACKAGE_VERSION;
+  return state.version === pkg.version;
 }
 
 /**
diff --git a/src/version.ts b/src/version.ts
deleted file mode 100644
index 7ec097a..0000000
--- a/src/version.ts
+++ /dev/null
@@ -1,2 +0,0 @@
-// Keep in sync with package.json — used for daemon version tracking.
-export const PACKAGE_VERSION = '0.2.0';

From 072f40ada396533c094a93d26c023cf563cb2d9b Mon Sep 17 00:00:00 2001
From: cryptotavares <joao.tavares@consensys.net>
Date: Thu, 16 Apr 2026 16:41:13 +0100
Subject: [PATCH 05/36] refactor: delegate sub-service port allocation to
 consumer contextFactory

- Remove ContextFactoryOptions; contextFactory now takes no arguments
- Support sync and async contextFactory return values
- Add PortMap type and optional allocatedPorts to WorkflowContext
- Validate contextFactory return shape at runtime (config.environment, allocatedPorts)
- Add startup error handling with cleanup rollback on post-contextFactory failures
- Replace hardcoded { anvil, fixture, mock } subPorts with consumer-defined PortMap
- Export allocatePort() utility for consumer convenience
- Update DaemonState.subPorts to use PortMap
- Add comprehensive tests for contextFactory error paths and /status port reporting
- Add edge-case test coverage for batch, launch, screenshot, seeding, discovery, and navigate schema
- Update coverage thresholds to reflect new test additions
---
 README.md                         |  56 ++++--
 src/capabilities/context.test.ts  |  31 ++++
 src/capabilities/context.ts       |   5 +
 src/server/create-server.test.ts  | 298 +++++++++++++++++++++++++++++-
 src/server/create-server.ts       | 210 ++++++++++++---------
 src/server/daemon-state.test.ts   |   2 +-
 src/tools/batch.test.ts           |  23 +++
 src/tools/launch.test.ts          |  14 ++
 src/tools/screenshot.test.ts      |  19 ++
 src/tools/seeding.test.ts         |  36 ++++
 src/tools/utils/discovery.test.ts |  11 ++
 src/types/http.ts                 |  34 +---
 src/validation/schemas.test.ts    |  45 +++++
 vitest.config.mts                 |   8 +-
 14 files changed, 639 insertions(+), 153 deletions(-)

diff --git a/README.md b/README.md
index e247d50..1d2b00f 100644
--- a/README.md
+++ b/README.md
@@ -95,13 +95,26 @@ Consuming this package requires two things: a **daemon entry point** and a **con
 
 ```typescript
 // daemon.ts
-import { createServer } from '@metamask/client-mcp-core';
+import { createServer, allocatePort } from '@metamask/client-mcp-core';
 import { MySessionManager } from './my-session-manager';
 import { createMyContext } from './my-context';
 
 const server = createServer({
   sessionManager: new MySessionManager(),
-  contextFactory: (options) => createMyContext({ ports: options.ports }),
+  contextFactory: async () => {
+    // Consumer owns port allocation — use the allocatePort() helper
+    // or any other strategy that fits your infrastructure.
+    const anvil = await allocatePort();
+    const fixture = await allocatePort();
+    await Promise.all([
+      new Promise<void>((r) => anvil.server.close(() => r())),
+      new Promise<void>((r) => fixture.server.close(() => r())),
+    ]);
+
+    return createMyContext({
+      ports: { anvil: anvil.port, fixture: fixture.port },
+    });
+  },
 });
 
 server.start().then((state) => {
@@ -151,7 +164,7 @@ mm launch
 The architecture relies on a persistent background HTTP daemon that manages the browser lifecycle:
 
 - **Worktree Isolation**: Each git worktree runs its own daemon instance, tracked via a `.mm-server` state file in the project root. This allows parallel work across branches.
-- **Port Allocation**: The daemon automatically allocates ports for the HTTP server and test infrastructure (Anvil, fixture server, mock server) to avoid conflicts.
+- **Port Allocation**: The daemon allocates its own HTTP port automatically. Sub-service ports (Anvil, fixture server, etc.) are allocated by the consumer's `contextFactory` and reported back via `allocatedPorts`. The `allocatePort()` helper is exported for convenience.
 - **Auto-Start**: The daemon starts automatically on `mm launch` if not already running, and shuts down after a period of inactivity (default: 30 minutes).
 - **Request Serialization**: A `RequestQueue` (async mutex) ensures only one tool executes at a time, preventing race conditions on shared browser state.
 - **Health Checks**: Each daemon generates a unique nonce on startup. The CLI verifies daemon identity via `GET /health` to detect stale `.mm-server` files from crashed processes.
@@ -221,19 +234,24 @@ type WorkflowContext = {
   stateSnapshot?: StateSnapshotCapability;
   mockServer?: MockServerCapability;
   config: EnvironmentConfig;
+  allocatedPorts?: PortMap; // reported to /status and persisted in .mm-server
 };
 ```
 
-Capabilities are created by the consumer's `contextFactory` function, which receives allocated port numbers:
+Capabilities are created by the consumer's `contextFactory` function. The factory is responsible for allocating any sub-service ports it needs (the `allocatePort()` helper is exported for convenience):
 
 ```typescript
-function createMyContext(options: {
-  ports: { anvil: number; fixture: number; mock: number };
-}): WorkflowContext {
+async function createMyContext(options: {
+  ports: { anvil: number; fixture: number };
+}): Promise<WorkflowContext> {
   return {
     build: new MyBuildCapability(),
     fixture: new MyFixtureCapability(options.ports.fixture),
     chain: new MyChainCapability(options.ports.anvil),
+    allocatedPorts: {
+      anvil: options.ports.anvil,
+      fixture: options.ports.fixture,
+    },
     config: {
       environment: 'e2e',
       extensionName: 'MyExtension',
@@ -441,26 +459,24 @@ The `createServer()` function accepts a `ServerConfig` object:
 type ServerConfig = {
   /** Session manager instance (required) */
   sessionManager: ISessionManager;
-  /** Factory function to create workflow context (required) */
-  contextFactory: (options: ContextFactoryOptions) => WorkflowContext;
-  /** Idle timeout in milliseconds (optional, defaults to 30000) */
-  idleTimeoutMs?: number;
+  /** Factory function to create workflow context (may be sync or async) */
+  contextFactory: () => WorkflowContext | Promise<WorkflowContext>;
+  /** Idle timeout in milliseconds (optional, defaults to 1_800_000 = 30 min) */
+  idleShutdownMs?: number;
+  /** Per-request execution timeout in milliseconds (default: 30_000) */
+  requestTimeoutMs?: number;
   /** Path to log file (optional) */
   logFilePath?: string;
 };
-
-type ContextFactoryOptions = {
-  ports: {
-    anvil: number;
-    fixture: number;
-    mock: number;
-  };
-};
 ```
 
+The `contextFactory` is called once during `start()`. It is responsible for allocating any sub-service ports and returning a `WorkflowContext`. The core validates the returned shape at runtime — `config.environment` must be a string and every value in `allocatedPorts` (if provided) must be a finite number.
+
+The `allocatePort()` utility is exported as a convenience for consumers who need ephemeral port allocation inside their factory.
+
 The returned `ServerInstance` exposes:
 
-- `start(): Promise<DaemonState>` — Allocates ports, starts HTTP server, writes `.mm-server` state, sets up idle timeout and signal handlers.
+- `start(): Promise<DaemonState>` — Calls `contextFactory`, starts HTTP server, writes `.mm-server` state, sets up idle timeout and signal handlers.
 - `stop(): Promise<void>` — Stops accepting connections, cleans up session, removes `.mm-server` state.
 
 ## HTTP API
diff --git a/src/capabilities/context.test.ts b/src/capabilities/context.test.ts
index 772c25a..e953b93 100644
--- a/src/capabilities/context.test.ts
+++ b/src/capabilities/context.test.ts
@@ -434,3 +434,34 @@ describe('hasCapability', () => {
     expect(hasCapability(context, 'mockServer')).toBe(true);
   });
 });
+
+describe('WorkflowContext with allocatedPorts', () => {
+  it('accepts allocatedPorts with port mappings', () => {
+    const context: WorkflowContext = {
+      config: {
+        environment: 'e2e',
+        extensionName: 'MetaMask',
+      },
+      allocatedPorts: {
+        anvil: 3000,
+        fixture: 4000,
+      },
+    };
+
+    expect(context.allocatedPorts).toStrictEqual({
+      anvil: 3000,
+      fixture: 4000,
+    });
+  });
+
+  it('allows WorkflowContext without allocatedPorts (field is optional)', () => {
+    const context: WorkflowContext = {
+      config: {
+        environment: 'e2e',
+        extensionName: 'MetaMask',
+      },
+    };
+
+    expect(context.allocatedPorts).toBeUndefined();
+  });
+});
diff --git a/src/capabilities/context.ts b/src/capabilities/context.ts
index e5f381a..25b4d9e 100644
--- a/src/capabilities/context.ts
+++ b/src/capabilities/context.ts
@@ -7,6 +7,9 @@ import type {
   MockServerCapability,
 } from './types.js';
 
+/** Sparse port-name → port-number map. `Partial` ensures lookups resolve to `number | undefined`. */
+export type PortMap = Partial<Record<string, number>>;
+
 /**
  * Environment mode discriminator.
  * - 'e2e': End-to-end testing environment with local chain, fixtures, and contract seeding
@@ -110,6 +113,8 @@ export type WorkflowContext = {
   stateSnapshot?: StateSnapshotCapability;
   mockServer?: MockServerCapability;
   config: EnvironmentConfig;
+  /** Port metadata reported back to core from the contextFactory. Used for DaemonState persistence and /status endpoint. */
+  allocatedPorts?: PortMap;
 };
 
 /**
diff --git a/src/server/create-server.test.ts b/src/server/create-server.test.ts
index ff00fba..daad92d 100644
--- a/src/server/create-server.test.ts
+++ b/src/server/create-server.test.ts
@@ -17,6 +17,7 @@ import {
 } from './create-server.js';
 import { readDaemonState } from './daemon-state.js';
 import pkg from '../../package.json';
+import type { PortMap, WorkflowContext } from '../capabilities/context.js';
 import type { DaemonState, ServerConfig, ToolResponse } from '../types/http.js';
 
 const tmpDir = path.join(os.tmpdir(), `mm-create-server-test-${Date.now()}`);
@@ -102,8 +103,10 @@ function buildConfig(overrides: Partial<ServerConfig> = {}): ServerConfig {
   return {
     sessionManager:
       createMockSessionManager() as unknown as ServerConfig['sessionManager'],
-    contextFactory: () =>
-      ({}) as unknown as ReturnType<ServerConfig['contextFactory']>,
+    contextFactory: async () =>
+      ({
+        config: { environment: 'prod', extensionName: 'Test Extension' },
+      }) satisfies WorkflowContext,
     ...overrides,
   };
 }
@@ -488,13 +491,13 @@ describe('createServer integration', () => {
     const res = await httpRequest(`http://127.0.0.1:${state.port}/status`);
     const body = (await res.json()) as {
       daemon: { pid: number; port: number };
-      ports: Record<string, number>;
+      ports: PortMap;
     };
 
     expect(res.status).toBe(200);
     expect(body.daemon.pid).toBe(process.pid);
     expect(body.daemon.port).toBe(state.port);
-    expect(body.ports).toBeDefined();
+    expect(body.ports).toStrictEqual({});
   });
 
   it('pOST /launch delegates to session manager', async () => {
@@ -551,15 +554,14 @@ describe('createServer integration', () => {
   it('passes workflow context to session manager on start', async () => {
     await server.stop();
 
-    const workflowContext = { config: { environment: 'e2e' as const } };
+    const workflowContext: WorkflowContext = {
+      config: { environment: 'e2e', extensionName: 'Test Extension' },
+    };
     const mockSM = createMockSessionManager();
     const customServer = createServer(
       buildConfig({
         sessionManager: mockSM as unknown as ServerConfig['sessionManager'],
-        contextFactory: () =>
-          workflowContext as unknown as ReturnType<
-            ServerConfig['contextFactory']
-          >,
+        contextFactory: vi.fn().mockResolvedValue(workflowContext),
       }),
     );
 
@@ -568,6 +570,203 @@ describe('createServer integration', () => {
     await customServer.stop();
   });
 
+  it('fails startup when contextFactory rejects', async () => {
+    await server.stop();
+
+    const customServer = createServer(
+      buildConfig({
+        contextFactory: vi
+          .fn<ServerConfig['contextFactory']>()
+          .mockRejectedValue(new Error('port allocation failed')),
+      }),
+    );
+
+    await expect(customServer.start()).rejects.toThrowError(
+      'contextFactory failed during server startup: port allocation failed',
+    );
+  });
+
+  it('preserves original error as cause when contextFactory rejects', async () => {
+    await server.stop();
+
+    const cause = new Error('root cause');
+    const customServer = createServer(
+      buildConfig({
+        contextFactory: vi
+          .fn<ServerConfig['contextFactory']>()
+          .mockRejectedValue(cause),
+      }),
+    );
+
+    await expect(customServer.start()).rejects.toThrowError(
+      expect.objectContaining({ cause }),
+    );
+  });
+
+  it('fails startup when contextFactory resolves with null', async () => {
+    await server.stop();
+
+    const customServer = createServer(
+      buildConfig({
+        contextFactory: vi.fn().mockResolvedValue(null),
+      }),
+    );
+
+    await expect(customServer.start()).rejects.toThrowError(
+      'contextFactory must return an object with a valid config.environment field',
+    );
+  });
+
+  it('fails startup when contextFactory resolves without config', async () => {
+    await server.stop();
+
+    const customServer = createServer(
+      buildConfig({
+        contextFactory: vi.fn().mockResolvedValue({}),
+      }),
+    );
+
+    await expect(customServer.start()).rejects.toThrowError(
+      'contextFactory must return an object with a valid config.environment field',
+    );
+  });
+
+  it('fails startup when allocatedPorts contains non-number values', async () => {
+    await server.stop();
+
+    const customServer = createServer(
+      buildConfig({
+        contextFactory: vi.fn().mockResolvedValue({
+          config: { environment: 'prod', extensionName: 'Test' },
+          allocatedPorts: { bad: 'not-a-number' },
+        }),
+      }),
+    );
+
+    await expect(customServer.start()).rejects.toThrowError(
+      'allocatedPorts["bad"] must be a finite number',
+    );
+  });
+
+  it('does not call setWorkflowContext when contextFactory rejects', async () => {
+    await server.stop();
+
+    const mockSM = createMockSessionManager();
+    const customServer = createServer(
+      buildConfig({
+        sessionManager: mockSM as unknown as ServerConfig['sessionManager'],
+        contextFactory: vi
+          .fn<ServerConfig['contextFactory']>()
+          .mockRejectedValue(new Error('boom')),
+      }),
+    );
+
+    await customServer.start().catch(() => {});
+    expect(mockSM.setWorkflowContext).not.toHaveBeenCalled();
+  });
+
+  it('does not write .mm-server when contextFactory rejects', async () => {
+    await server.stop();
+
+    const customServer = createServer(
+      buildConfig({
+        contextFactory: vi
+          .fn<ServerConfig['contextFactory']>()
+          .mockRejectedValue(new Error('boom')),
+      }),
+    );
+
+    await customServer.start().catch(() => {});
+    const daemonState = await readDaemonState(tmpDir);
+    expect(daemonState).toBeNull();
+  });
+
+  it('cleans up session when startup fails after contextFactory succeeds', async () => {
+    await server.stop();
+
+    const mockSM = createMockSessionManager();
+    const customServer = createServer(
+      buildConfig({
+        sessionManager: mockSM as unknown as ServerConfig['sessionManager'],
+        contextFactory: vi.fn().mockResolvedValue({
+          config: { environment: 'prod', extensionName: 'Test' },
+        } satisfies WorkflowContext),
+      }),
+    );
+
+    await fs.chmod(tmpDir, 0o444);
+    try {
+      await expect(customServer.start()).rejects.toThrowError(/EACCES/u);
+      expect(mockSM.cleanup).toHaveBeenCalled();
+    } finally {
+      await fs.chmod(tmpDir, 0o755).catch(() => {});
+    }
+  });
+
+  it('accepts a synchronous contextFactory', async () => {
+    await server.stop();
+
+    const customServer = createServer(
+      buildConfig({
+        contextFactory: () => ({
+          config: { environment: 'prod' as const, extensionName: 'Sync' },
+        }),
+      }),
+    );
+
+    const customState = await customServer.start();
+    expect(customState.port).toBeGreaterThan(0);
+    await customServer.stop();
+  });
+
+  it('gET /status returns empty ports when allocatedPorts is undefined', async () => {
+    await server.stop();
+
+    const customServer = createServer(
+      buildConfig({
+        contextFactory: vi.fn().mockResolvedValue({
+          config: { environment: 'prod', extensionName: 'Test Extension' },
+        } satisfies WorkflowContext),
+      }),
+    );
+
+    const customState = await customServer.start();
+    const res = await httpRequest(
+      `http://127.0.0.1:${customState.port}/status`,
+    );
+    const body = (await res.json()) as { ports: PortMap };
+
+    expect(res.status).toBe(200);
+    expect(body.ports).toStrictEqual({});
+
+    await customServer.stop();
+  });
+
+  it('gET /status returns custom allocated ports', async () => {
+    await server.stop();
+
+    const allocatedPorts = { serviceA: 3001, serviceB: 3002 };
+    const customServer = createServer(
+      buildConfig({
+        contextFactory: vi.fn().mockResolvedValue({
+          config: { environment: 'prod', extensionName: 'Test Extension' },
+          allocatedPorts,
+        } satisfies WorkflowContext),
+      }),
+    );
+
+    const customState = await customServer.start();
+    const res = await httpRequest(
+      `http://127.0.0.1:${customState.port}/status`,
+    );
+    const body = (await res.json()) as { ports: PortMap };
+
+    expect(res.status).toBe(200);
+    expect(body.ports).toStrictEqual(allocatedPorts);
+
+    await customServer.stop();
+  });
+
   it('removes .mm-server on stop', async () => {
     await server.stop();
     const daemonState = await readDaemonState(tmpDir);
@@ -917,4 +1116,85 @@ describe('createServer with logging', () => {
       .catch(() => '');
     expect(logContent).toContain('/health');
   });
+
+  it('logs fatal errors to stderr and file', async () => {
+    const stderrSpy = vi
+      .spyOn(process.stderr, 'write')
+      .mockImplementation(() => true);
+
+    // Trigger a cleanup error by making sessionManager.cleanup() throw
+    const mockSM = createMockSessionManager();
+    mockSM.hasActiveSession.mockReturnValue(true);
+    mockSM.cleanup.mockRejectedValue(new Error('Cleanup failed'));
+
+    const testServer = createServer({
+      sessionManager: mockSM as unknown as ServerConfig['sessionManager'],
+      contextFactory: vi.fn().mockResolvedValue({
+        config: {
+          environment: 'e2e',
+          extensionName: 'Test',
+          defaultPassword: 'test',
+          artifactsDir: tmpDir,
+          defaultChainId: 1,
+          ports: { anvil: 8545, fixtureServer: 12345 },
+        },
+      } satisfies WorkflowContext),
+      logFilePath: path.join(tmpDir, 'error.log'),
+    });
+
+    await testServer.start();
+    await testServer.stop();
+
+    // Verify stderr was called with fatal error
+    expect(stderrSpy).toHaveBeenCalledWith(
+      expect.stringContaining('[ERROR] Cleanup failed'),
+    );
+
+    stderrSpy.mockRestore();
+  });
+
+  it('handles log file write errors gracefully', async () => {
+    const stderrSpy = vi
+      .spyOn(process.stderr, 'write')
+      .mockImplementation(() => true);
+
+    // Create a read-only directory to cause write errors
+    const readOnlyDir = path.join(tmpDir, 'readonly');
+    await fs.mkdir(readOnlyDir, { recursive: true });
+    const logPath = path.join(readOnlyDir, 'daemon.log');
+
+    // Make directory read-only
+    await fs.chmod(readOnlyDir, 0o444);
+
+    try {
+      const testServer = createServer(buildConfig({ logFilePath: logPath }));
+      const testState = await testServer.start();
+
+      // Make a request to trigger logging
+      await httpRequest(`http://127.0.0.1:${testState.port}/health`);
+      await new Promise((resolve) => setTimeout(resolve, 100));
+
+      await testServer.stop();
+
+      // Verify that stderr was called with the write error message
+      expect(stderrSpy).toHaveBeenCalledWith(
+        expect.stringContaining('Failed to write log'),
+      );
+    } finally {
+      stderrSpy.mockRestore();
+      // Restore write permissions for cleanup
+      await fs.chmod(readOnlyDir, 0o755).catch(() => {});
+    }
+  });
+
+  it('handles server close timeout with force close', async () => {
+    const testServer = createServer(buildConfig());
+    const testState = await testServer.start();
+
+    // Make a request to ensure server is active
+    await httpRequest(`http://127.0.0.1:${testState.port}/health`);
+
+    // Stop should complete even if server doesn't close gracefully
+    expect(await testServer.stop()).toBeUndefined();
+  });
 });
diff --git a/src/server/create-server.ts b/src/server/create-server.ts
index 201198a..3f269ac 100644
--- a/src/server/create-server.ts
+++ b/src/server/create-server.ts
@@ -5,10 +5,9 @@ import * as fs from 'node:fs/promises';
 import * as http from 'node:http';
 
 import { writeDaemonState, removeDaemonState } from './daemon-state.js';
-import { allocatePort } from './port-allocator.js';
 import { RequestQueue } from './request-queue.js';
 import pkg from '../../package.json';
-import type { WorkflowContext } from '../capabilities/context.js';
+import type { PortMap, WorkflowContext } from '../capabilities/context.js';
 import type { ExtensionState } from '../capabilities/types.js';
 import {
   KnowledgeStore,
@@ -246,11 +245,7 @@ export function createServer(config: ServerConfig): ServerInstance {
   let startedAt = '';
   let daemonPort = 0;
   let workflowContext: WorkflowContext | null = null;
-  let subPorts: { anvil: number; fixture: number; mock: number } = {
-    anvil: 0,
-    fixture: 0,
-    mock: 0,
-  };
+  let subPorts: PortMap = {};
   let shuttingDown = false;
   let shutdownHandler: (() => void) | null = null;
   let lastRequestTime = Date.now();
@@ -549,100 +544,135 @@ export function createServer(config: ServerConfig): ServerInstance {
         .toString()
         .trim();
 
-      // Allocate sub-ports for external services (anvil, fixture, mock).
-      // These use allocate-then-close because the external services bind
-      // their own sockets — a small TOCTOU window is acceptable here.
-      const [anvilAlloc, fixtureAlloc, mockAlloc] = await Promise.all([
-        allocatePort(),
-        allocatePort(),
-        allocatePort(),
-      ]);
-
-      subPorts = {
-        anvil: anvilAlloc.port,
-        fixture: fixtureAlloc.port,
-        mock: mockAlloc.port,
-      };
+      try {
+        workflowContext = await config.contextFactory();
+      } catch (error) {
+        throw new Error(
+          `contextFactory failed during server startup: ${error instanceof Error ? error.message : String(error)}`,
+          { cause: error },
+        );
+      }
 
-      await Promise.all([
-        new Promise<void>((resolve) =>
-          anvilAlloc.server.close(() => resolve()),
-        ),
-        new Promise<void>((resolve) =>
-          fixtureAlloc.server.close(() => resolve()),
-        ),
-        new Promise<void>((resolve) => mockAlloc.server.close(() => resolve())),
-      ]);
+      if (
+        !workflowContext ||
+        typeof workflowContext !== 'object' ||
+        !workflowContext.config ||
+        typeof workflowContext.config.environment !== 'string'
+      ) {
+        throw new Error(
+          'contextFactory must return an object with a valid config.environment field',
+        );
+      }
+
+      const rawPorts = workflowContext.allocatedPorts;
+      if (rawPorts !== undefined) {
+        if (typeof rawPorts !== 'object' || rawPorts === null) {
+          throw new Error('allocatedPorts must be a plain object');
+        }
+        for (const [key, val] of Object.entries(rawPorts)) {
+          if (typeof val !== 'number' || !Number.isFinite(val)) {
+            throw new Error(
+              `allocatedPorts["${key}"] must be a finite number, got ${String(val)}`,
+            );
+          }
+        }
+      }
 
-      workflowContext = config.contextFactory({ ports: subPorts });
+      subPorts = workflowContext.allocatedPorts ?? {};
       config.sessionManager.setWorkflowContext(workflowContext);
       startedAt = new Date().toISOString();
 
-      // Bind daemon directly to port 0 to eliminate TOCTOU race —
-      // the OS assigns the port atomically at listen time.
-      httpServer = await new Promise<http.Server>((resolve, reject) => {
-        const srv = http.createServer(app);
-        srv.listen(0, '127.0.0.1', () => {
-          const addr = srv.address();
-          if (addr && typeof addr !== 'string') {
-            daemonPort = addr.port;
-          }
-          resolve(srv);
+      // Everything after setWorkflowContext may have side-effects the
+      // consumer expects to be cleaned up.  Wrap in try/catch so a
+      // listen() or writeDaemonState() failure still runs cleanup.
+      try {
+        // Bind daemon directly to port 0 to eliminate TOCTOU race —
+        // the OS assigns the port atomically at listen time.
+        httpServer = await new Promise<http.Server>((resolve, reject) => {
+          const srv = http.createServer(app);
+          srv.listen(0, '127.0.0.1', () => {
+            const addr = srv.address();
+            if (addr && typeof addr !== 'string') {
+              daemonPort = addr.port;
+            }
+            resolve(srv);
+          });
+          srv.on('error', reject);
         });
-        srv.on('error', reject);
-      });
 
-      const state: DaemonState = {
-        port: daemonPort,
-        pid: process.pid,
-        startedAt,
-        nonce,
-        version: pkg.version,
-        subPorts,
-      };
+        const state: DaemonState = {
+          port: daemonPort,
+          pid: process.pid,
+          startedAt,
+          nonce,
+          version: pkg.version,
+          subPorts,
+        };
 
-      await writeDaemonState(worktreeRoot, state);
-      appendLog(
-        config.logFilePath,
-        `[INFO] Daemon started on port ${daemonPort} (pid ${process.pid})`,
-      );
-
-      shutdownHandler = (): void => {
-        instance
-          .stop()
-          .then(() => process.exit(0))
-          .catch((error: Error) => {
-            appendLog(
-              config.logFilePath,
-              `[ERROR] Daemon failed to shut down: ${error.message}`,
-            );
-            process.exit(1);
-          });
-      };
+        await writeDaemonState(worktreeRoot, state);
+        appendLog(
+          config.logFilePath,
+          `[INFO] Daemon started on port ${daemonPort} (pid ${process.pid})`,
+        );
 
-      process.on('SIGTERM', shutdownHandler);
-      process.on('SIGINT', shutdownHandler);
-
-      const { idleShutdownMs } = config;
-      if (idleShutdownMs && idleShutdownMs > 0) {
-        const checkMs = Math.min(idleShutdownMs / 10, 60_000);
-        idleCheckInterval = setInterval(() => {
-          if (Date.now() - lastRequestTime > idleShutdownMs) {
-            appendLog(
-              config.logFilePath,
-              '[INFO] Idle timeout reached, shutting down',
-            );
-            if (idleCheckInterval) {
-              clearInterval(idleCheckInterval);
-              idleCheckInterval = null;
+        shutdownHandler = (): void => {
+          instance
+            .stop()
+            .then(() => process.exit(0))
+            .catch((error: Error) => {
+              appendLog(
+                config.logFilePath,
+                `[ERROR] Daemon failed to shut down: ${error.message}`,
+              );
+              process.exit(1);
+            });
+        };
+
+        process.on('SIGTERM', shutdownHandler);
+        process.on('SIGINT', shutdownHandler);
+
+        const { idleShutdownMs } = config;
+        if (idleShutdownMs && idleShutdownMs > 0) {
+          const checkMs = Math.min(idleShutdownMs / 10, 60_000);
+          idleCheckInterval = setInterval(() => {
+            if (Date.now() - lastRequestTime > idleShutdownMs) {
+              appendLog(
+                config.logFilePath,
+                '[INFO] Idle timeout reached, shutting down',
+              );
+              if (idleCheckInterval) {
+                clearInterval(idleCheckInterval);
+                idleCheckInterval = null;
+              }
+              shutdownHandler?.();
             }
-            shutdownHandler?.();
-          }
-        }, checkMs);
-        idleCheckInterval.unref();
-      }
+          }, checkMs);
+          idleCheckInterval.unref();
+        }
 
-      return state;
+        return state;
+      } catch (startupError) {
+        // Best-effort rollback: close the HTTP server if it was created,
+        // then let the session manager clean up any resources the
+        // contextFactory may have started.
+        const serverToClose = httpServer;
+        if (serverToClose) {
+          await new Promise<void>((resolve) => {
+            serverToClose.close(() => {
+              httpServer = null;
+              resolve();
+            });
+          });
+        }
+        try {
+          await config.sessionManager.cleanup();
+        } catch {
+          // Swallow — we're already propagating startupError.
+        }
+        workflowContext = null; // eslint-disable-line require-atomic-updates
+        subPorts = {};
+        throw startupError;
+      }
     },
 
     async stop(): Promise<void> {
diff --git a/src/server/daemon-state.test.ts b/src/server/daemon-state.test.ts
index 847cac1..a865f18 100644
--- a/src/server/daemon-state.test.ts
+++ b/src/server/daemon-state.test.ts
@@ -25,7 +25,7 @@ const mockState: DaemonState = {
   startedAt: new Date().toISOString(),
   nonce: 'test-nonce-abc',
   version: pkg.version,
-  subPorts: { anvil: 8545, fixture: 8546, mock: 8547 },
+  subPorts: { serviceA: 3001, serviceB: 3002 },
 };
 
 describe('daemon-state', () => {
diff --git a/src/tools/batch.test.ts b/src/tools/batch.test.ts
index 3c5e061..122651c 100644
--- a/src/tools/batch.test.ts
+++ b/src/tools/batch.test.ts
@@ -455,4 +455,27 @@ describe('runStepsTool', () => {
       });
     }
   });
+
+  it('excludes observations when includeObservations is "none"', async () => {
+    const clickHandler = vi.fn().mockResolvedValue({
+      ok: true,
+      result: { clicked: true },
+    });
+    const context = createMockContext({
+      toolRegistry: new Map([['click', clickHandler]]),
+    });
+
+    const result = await runStepsTool(
+      {
+        steps: [{ tool: 'click', args: { testId: 'btn' } }],
+        includeObservations: 'none',
+      },
+      context,
+    );
+
+    expect(result.ok).toBe(true);
+    if (result.ok) {
+      expect(result.result.steps[0]).not.toHaveProperty('observation');
+    }
+  });
 });
diff --git a/src/tools/launch.test.ts b/src/tools/launch.test.ts
index 8cf0915..99deafa 100644
--- a/src/tools/launch.test.ts
+++ b/src/tools/launch.test.ts
@@ -183,6 +183,20 @@ describe('launchTool', () => {
       }
       expect(context.sessionManager.launch).not.toHaveBeenCalled();
     });
+
+    it('cleans up and relaunches when force is true', async () => {
+      const context = createMockContext({ hasActive: true });
+      vi.spyOn(context.sessionManager, 'cleanup').mockResolvedValue(true);
+
+      const result = await launchTool(
+        { stateMode: 'default', force: true },
+        context,
+      );
+
+      expect(result.ok).toBe(true);
+      expect(context.sessionManager.cleanup).toHaveBeenCalled();
+      expect(context.sessionManager.launch).toHaveBeenCalled();
+    });
   });
 
   describe('launch failures', () => {
diff --git a/src/tools/screenshot.test.ts b/src/tools/screenshot.test.ts
index 994c269..1b2ee2e 100644
--- a/src/tools/screenshot.test.ts
+++ b/src/tools/screenshot.test.ts
@@ -197,6 +197,25 @@ describe('screenshotTool', () => {
   });
 
   describe('error handling', () => {
+    it('generates default name when not provided', async () => {
+      const context = createMockContext();
+
+      vi.spyOn(context.sessionManager, 'screenshot').mockResolvedValue({
+        path: '/path/to/screenshot.png',
+        width: 1280,
+        height: 720,
+      });
+
+      const result = await screenshotTool({}, context);
+
+      expect(result.ok).toBe(true);
+      expect(context.sessionManager.screenshot).toHaveBeenCalledWith(
+        expect.objectContaining({
+          name: expect.stringMatching(/^screenshot-\d+$/u),
+        }),
+      );
+    });
+
     it('returns error when no active session', async () => {
       const context = createMockContext({ hasActive: false });
 
diff --git a/src/tools/seeding.test.ts b/src/tools/seeding.test.ts
index 28cae73..81738a1 100644
--- a/src/tools/seeding.test.ts
+++ b/src/tools/seeding.test.ts
@@ -131,6 +131,17 @@ describe('seeding tools', () => {
         expect(result.error.message).toContain('Contract not found');
       }
     });
+
+    it('returns capability unavailable when no seeding capability exists', async () => {
+      const context = createMockContext();
+
+      const result = await seedContractTool({ contractName: 'hst' }, context);
+
+      expect(result.ok).toBe(false);
+      if (!result.ok) {
+        expect(result.error.code).toBe(ErrorCodes.MM_CAPABILITY_NOT_AVAILABLE);
+      }
+    });
   });
 
   describe('seedContractsTool', () => {
@@ -197,6 +208,17 @@ describe('seeding tools', () => {
         expect(result.error.message).toContain('Anvil not running');
       }
     });
+
+    it('returns capability unavailable when no seeding capability exists', async () => {
+      const context = createMockContext();
+
+      const result = await seedContractsTool({ contracts: ['hst'] }, context);
+
+      expect(result.ok).toBe(false);
+      if (!result.ok) {
+        expect(result.error.code).toBe(ErrorCodes.MM_CAPABILITY_NOT_AVAILABLE);
+      }
+    });
   });
 
   describe('getContractAddressTool', () => {
@@ -259,6 +281,20 @@ describe('seeding tools', () => {
         expect(result.error.message).toContain('Connection lost');
       }
     });
+
+    it('returns capability unavailable when no seeding capability exists', async () => {
+      const context = createMockContext();
+
+      const result = await getContractAddressTool(
+        { contractName: 'hst' },
+        context,
+      );
+
+      expect(result.ok).toBe(false);
+      if (!result.ok) {
+        expect(result.error.code).toBe(ErrorCodes.MM_CAPABILITY_NOT_AVAILABLE);
+      }
+    });
   });
 
   describe('listContractsTool', () => {
diff --git a/src/tools/utils/discovery.test.ts b/src/tools/utils/discovery.test.ts
index 048b43a..6f5e93f 100644
--- a/src/tools/utils/discovery.test.ts
+++ b/src/tools/utils/discovery.test.ts
@@ -308,6 +308,17 @@ describe('collectTrimmedA11ySnapshot', () => {
     expect(result.refMap.size).toBe(0);
   });
 
+  it('handles empty parsed roots from valid yaml', async () => {
+    const page = createMockPage({
+      a11ySnapshot: '- text: just text\n- /url: https://example.com',
+    });
+
+    const result = await collectTrimmedA11ySnapshot(page);
+
+    expect(result.nodes).toHaveLength(0);
+    expect(result.refMap.size).toBe(0);
+  });
+
   it('uses root selector when provided', async () => {
     const a11yTree = `- dialog:\n  - button "Close"`;
 
diff --git a/src/types/http.ts b/src/types/http.ts
index 31c0640..9d3fa9a 100644
--- a/src/types/http.ts
+++ b/src/types/http.ts
@@ -6,7 +6,7 @@
 
 import type { Page } from '@playwright/test';
 
-import type { WorkflowContext } from '../capabilities/context.js';
+import type { PortMap, WorkflowContext } from '../capabilities/context.js';
 import type { KnowledgeStore } from '../knowledge-store/knowledge-store.js';
 import type { ISessionManager } from '../server/session-manager.js';
 
@@ -54,23 +54,6 @@ export type ToolFunction<TParams = unknown, TResult = unknown> = (
   context: ToolContext,
 ) => Promise<ToolResponse<TResult>>;
 
-/**
- * Port configuration passed to contextFactory at runtime.
- *
- * These ports are used to configure test infrastructure (Anvil, fixture server, mock server).
- */
-export type ContextFactoryOptions = {
-  /** Port configuration for test services */
-  ports: {
-    /** Anvil local chain port */
-    anvil: number;
-    /** Fixture server port */
-    fixture: number;
-    /** Mock server port */
-    mock: number;
-  };
-};
-
 /**
  * Configuration for createServer().
  *
@@ -80,8 +63,8 @@ export type ContextFactoryOptions = {
 export type ServerConfig = {
   /** Session manager instance */
   sessionManager: ISessionManager;
-  /** Factory function to create workflow context */
-  contextFactory: (options: ContextFactoryOptions) => WorkflowContext;
+  /** Factory function to create workflow context (may be sync or async) */
+  contextFactory: () => WorkflowContext | Promise<WorkflowContext>;
   /** Idle timeout for daemon auto-shutdown in milliseconds (default: 1_800_000 = 30 min) */
   idleShutdownMs?: number;
   /** Per-request execution timeout in milliseconds (default: 30_000) */
@@ -107,13 +90,6 @@ export type DaemonState = {
   nonce: string;
   /** Package version of the daemon process (absent in state files written before version tracking) */
   version?: string;
-  /** Port configuration for test services */
-  subPorts: {
-    /** Anvil local chain port */
-    anvil: number;
-    /** Fixture server port */
-    fixture: number;
-    /** Mock server port */
-    mock: number;
-  };
+  /** Port configuration for sub-services */
+  subPorts: PortMap;
 };
diff --git a/src/validation/schemas.test.ts b/src/validation/schemas.test.ts
index e2d915c..f38417a 100644
--- a/src/validation/schemas.test.ts
+++ b/src/validation/schemas.test.ts
@@ -13,6 +13,7 @@ import {
   switchToTabInputSchema,
   closeTabInputSchema,
   clipboardInputSchema,
+  navigateInputSchema,
 } from './schemas.js';
 
 describe('switchToTabInputSchema', () => {
@@ -278,3 +279,47 @@ describe('clipboardInputSchema', () => {
     });
   });
 });
+
+describe('navigateInputSchema', () => {
+  describe('refine validation: url required when screen is "url"', () => {
+    it('passes with screen "home"', () => {
+      const input = { screen: 'home' as const };
+      const result = navigateInputSchema.safeParse(input);
+
+      expect(result.success).toBe(true);
+    });
+
+    it('passes with screen "settings"', () => {
+      const input = { screen: 'settings' as const };
+      const result = navigateInputSchema.safeParse(input);
+
+      expect(result.success).toBe(true);
+    });
+
+    it('passes with screen "url" and url provided', () => {
+      const input = { screen: 'url' as const, url: 'https://example.com' };
+      const result = navigateInputSchema.safeParse(input);
+
+      expect(result.success).toBe(true);
+    });
+
+    it('fails with screen "url" and no url', () => {
+      const input = { screen: 'url' as const };
+      const result = navigateInputSchema.safeParse(input);
+
+      expect(result.success).toBe(false);
+      if (!result.success) {
+        expect(result.error.issues[0].message).toBe(
+          'url is required when screen is "url"',
+        );
+      }
+    });
+
+    it('fails with screen "url" and empty url', () => {
+      const input = { screen: 'url' as const, url: '' };
+      const result = navigateInputSchema.safeParse(input);
+
+      expect(result.success).toBe(false);
+    });
+  });
+});
diff --git a/vitest.config.mts b/vitest.config.mts
index a531075..216ccc7 100644
--- a/vitest.config.mts
+++ b/vitest.config.mts
@@ -35,10 +35,10 @@ export default defineConfig({
         // Auto-update the coverage thresholds when running locally.
         // Disabled in CI to prevent non-deterministic config changes.
         autoUpdate: !process.env.CI,
-        branches: 87.38,
-        functions: 92.98,
-        lines: 94.67,
-        statements: 94.46,
+        branches: 88.08,
+        functions: 92.03,
+        lines: 94.44,
+        statements: 94.14,
       },
     },
 

From 668a165dd70bf116d2cbd3853ebc526a782f83c8 Mon Sep 17 00:00:00 2001
From: cryptotavares <joao.tavares@consensys.net>
Date: Fri, 17 Apr 2026 00:29:56 +0100
Subject: [PATCH 06/36] feat: add structural roles, a11y enrichment, and node
 collapsing to discovery layer

---
 src/tools/types/discovery.ts      |  14 ++
 src/tools/utils/discovery.test.ts | 318 ++++++++++++++++++++++++++++++
 src/tools/utils/discovery.ts      | 223 +++++++++++++++++++--
 3 files changed, 542 insertions(+), 13 deletions(-)

diff --git a/src/tools/types/discovery.ts b/src/tools/types/discovery.ts
index 397196d..4d5f35f 100644
--- a/src/tools/types/discovery.ts
+++ b/src/tools/types/discovery.ts
@@ -9,6 +9,16 @@ export const ACTIONABLE_ROLES = [
   'menuitem',
 ] as const;
 
+export const STRUCTURAL_ROLES = [
+  'menu',
+  'listbox',
+  'option',
+  'tab',
+  'tabpanel',
+  'list',
+  'listitem',
+] as const;
+
 export const IMPORTANT_ROLES = [
   'dialog',
   'alert',
@@ -18,10 +28,12 @@ export const IMPORTANT_ROLES = [
 
 export const INCLUDED_ROLES = [
   ...ACTIONABLE_ROLES,
+  ...STRUCTURAL_ROLES,
   ...IMPORTANT_ROLES,
 ] as const;
 
 export type ActionableRole = (typeof ACTIONABLE_ROLES)[number];
+export type StructuralRole = (typeof STRUCTURAL_ROLES)[number];
 export type ImportantRole = (typeof IMPORTANT_ROLES)[number];
 export type IncludedRole = (typeof INCLUDED_ROLES)[number];
 
@@ -40,6 +52,8 @@ export type A11yNodeTrimmed = {
   checked?: boolean;
   expanded?: boolean;
   path: string[];
+  testId?: string;
+  textContent?: string;
 };
 
 export type RawA11yNode = {
diff --git a/src/tools/utils/discovery.test.ts b/src/tools/utils/discovery.test.ts
index 6f5e93f..d738973 100644
--- a/src/tools/utils/discovery.test.ts
+++ b/src/tools/utils/discovery.test.ts
@@ -197,6 +197,50 @@ describe('collectTestIds', () => {
     expect(result[0].text?.length).toBeLessThanOrEqual(200);
   });
 
+  it('handles isVisible rejection gracefully', async () => {
+    const mockLocators = [
+      {
+        getAttribute: vi.fn().mockResolvedValue('btn-1'),
+        isVisible: vi.fn().mockRejectedValue(new Error('detached')),
+        textContent: vi.fn().mockResolvedValue('OK'),
+      },
+    ];
+
+    const page = {
+      waitForLoadState: vi.fn().mockResolvedValue(undefined),
+      locator: vi.fn().mockReturnValue({
+        all: vi.fn().mockResolvedValue(mockLocators),
+      }),
+    } as unknown as Page;
+
+    const result = await collectTestIds(page);
+
+    expect(result).toHaveLength(1);
+    expect(result[0].visible).toBe(false);
+  });
+
+  it('handles textContent rejection gracefully', async () => {
+    const mockLocators = [
+      {
+        getAttribute: vi.fn().mockResolvedValue('btn-1'),
+        isVisible: vi.fn().mockResolvedValue(true),
+        textContent: vi.fn().mockRejectedValue(new Error('detached')),
+      },
+    ];
+
+    const page = {
+      waitForLoadState: vi.fn().mockResolvedValue(undefined),
+      locator: vi.fn().mockReturnValue({
+        all: vi.fn().mockResolvedValue(mockLocators),
+      }),
+    } as unknown as Page;
+
+    const result = await collectTestIds(page);
+
+    expect(result).toHaveLength(1);
+    expect(result[0].text).toBeUndefined();
+  });
+
   it('handles page load state failure', async () => {
     const page = createMockPage({
       testIds: [{ testId: 'test-1', visible: true }],
@@ -350,6 +394,245 @@ describe('collectTrimmedA11ySnapshot', () => {
     expect(result.nodes[1].name).toBe('Child');
     expect(result.nodes[2].name).toBe('Grandchild');
   });
+
+  it('collapses 3+ consecutive identical nodes into summary', async () => {
+    const a11yTree = [
+      '- main:',
+      '  - button "maskicon"',
+      '  - button "maskicon"',
+      '  - button "maskicon"',
+      '  - button "maskicon"',
+      '  - button "Submit"',
+    ].join('\n');
+
+    const page = createMockPage({ a11ySnapshot: a11yTree });
+    const result = await collectTrimmedA11ySnapshot(page);
+
+    expect(result.nodes).toHaveLength(3);
+    expect(result.nodes[0]).toMatchObject({
+      ref: 'e1',
+      role: 'button',
+      name: 'maskicon',
+    });
+    expect(result.nodes[1].name).toContain('3 more');
+    expect(result.nodes[1].name).toContain('maskicon');
+    expect(result.nodes[2]).toMatchObject({
+      ref: 'e5',
+      role: 'button',
+      name: 'Submit',
+    });
+    expect(result.refMap.has('e1')).toBe(true);
+    expect(result.refMap.has('e2')).toBe(true);
+    expect(result.refMap.has('e3')).toBe(true);
+    expect(result.refMap.has('e4')).toBe(true);
+  });
+
+  it('does not collapse nodes with same role and name but different paths', async () => {
+    const a11yTree = [
+      '- main:',
+      '  - dialog "A":',
+      '    - button "OK"',
+      '    - button "OK"',
+      '    - button "OK"',
+      '  - dialog "B":',
+      '    - button "OK"',
+      '    - button "OK"',
+      '    - button "OK"',
+    ].join('\n');
+
+    const page = createMockPage({ a11ySnapshot: a11yTree });
+    const result = await collectTrimmedA11ySnapshot(page);
+
+    const dialogAButtons = result.nodes.filter(
+      (n) => n.role === 'button' && n.path.some((p) => p.includes('dialog:A')),
+    );
+    const dialogBButtons = result.nodes.filter(
+      (n) => n.role === 'button' && n.path.some((p) => p.includes('dialog:B')),
+    );
+    expect(dialogAButtons.length).toBeGreaterThanOrEqual(1);
+    expect(dialogBButtons.length).toBeGreaterThanOrEqual(1);
+  });
+
+  it('does not collapse fewer than 3 identical nodes', async () => {
+    const a11yTree = [
+      '- main:',
+      '  - button "maskicon"',
+      '  - button "maskicon"',
+      '  - button "Submit"',
+    ].join('\n');
+
+    const page = createMockPage({ a11ySnapshot: a11yTree });
+    const result = await collectTrimmedA11ySnapshot(page);
+
+    expect(result.nodes).toHaveLength(3);
+    expect(result.nodes[0].name).toBe('maskicon');
+    expect(result.nodes[1].name).toBe('maskicon');
+    expect(result.nodes[2].name).toBe('Submit');
+  });
+
+  it('enriches nodes with short names using testId from DOM', async () => {
+    const a11yTree = `- main:\n  - button "x"`;
+    const mockGetAttribute = vi.fn().mockResolvedValue('action-button');
+    const mockTextContent = vi.fn().mockResolvedValue('Click me');
+    const mockBodyLocator = {
+      ariaSnapshot: vi.fn().mockResolvedValue(a11yTree),
+    };
+
+    const page = {
+      waitForLoadState: vi.fn().mockResolvedValue(undefined),
+      locator: vi.fn((selector: string) => {
+        if (selector === 'body') {
+          return { first: vi.fn().mockReturnValue(mockBodyLocator) };
+        }
+        return {
+          first: vi.fn().mockReturnValue({
+            getAttribute: mockGetAttribute,
+            textContent: mockTextContent,
+          }),
+        };
+      }),
+    } as unknown as Page;
+
+    const result = await collectTrimmedA11ySnapshot(page);
+
+    expect(result.nodes.length).toBeGreaterThan(0);
+    expect(result.nodes[0].testId).toBe('action-button');
+    expect(result.nodes[0].textContent).toBe('Click me');
+  });
+
+  it('skips textContent enrichment when text matches the node name', async () => {
+    const a11yTree = `- main:\n  - button "maskicon"`;
+    const mockBodyLocator = {
+      ariaSnapshot: vi.fn().mockResolvedValue(a11yTree),
+    };
+
+    const page = {
+      waitForLoadState: vi.fn().mockResolvedValue(undefined),
+      locator: vi.fn((selector: string) => {
+        if (selector === 'body') {
+          return { first: vi.fn().mockReturnValue(mockBodyLocator) };
+        }
+        return {
+          first: vi.fn().mockReturnValue({
+            getAttribute: vi.fn().mockResolvedValue(null),
+            textContent: vi.fn().mockResolvedValue('maskicon'),
+          }),
+        };
+      }),
+    } as unknown as Page;
+
+    const result = await collectTrimmedA11ySnapshot(page);
+
+    expect(result.nodes[0].textContent).toBeUndefined();
+    expect(result.nodes[0].testId).toBeUndefined();
+  });
+
+  it('skips enrichment when all node names exceed threshold', async () => {
+    const a11yTree = `- main:\n  - button "A very long button name that exceeds threshold"`;
+    const page = createMockPage({ a11ySnapshot: a11yTree });
+
+    const result = await collectTrimmedA11ySnapshot(page);
+
+    expect(result.nodes).toHaveLength(1);
+    expect(result.nodes[0].testId).toBeUndefined();
+    expect(result.nodes[0].textContent).toBeUndefined();
+  });
+
+  it('handles enrichment errors when getAttribute/textContent reject', async () => {
+    const a11yTree = `- main:\n  - button "x"`;
+    const mockBodyLocator = {
+      ariaSnapshot: vi.fn().mockResolvedValue(a11yTree),
+    };
+
+    const page = {
+      waitForLoadState: vi.fn().mockResolvedValue(undefined),
+      locator: vi.fn((selector: string) => {
+        if (selector === 'body') {
+          return { first: vi.fn().mockReturnValue(mockBodyLocator) };
+        }
+        return {
+          first: vi.fn().mockReturnValue({
+            getAttribute: vi.fn().mockRejectedValue(new Error('detached')),
+            textContent: vi.fn().mockRejectedValue(new Error('detached')),
+          }),
+        };
+      }),
+    } as unknown as Page;
+
+    const result = await collectTrimmedA11ySnapshot(page);
+
+    expect(result.nodes).toHaveLength(1);
+    expect(result.nodes[0].testId).toBeUndefined();
+    expect(result.nodes[0].textContent).toBeUndefined();
+  });
+
+  it('handles enrichment errors when locator.first() throws', async () => {
+    const a11yTree = `- main:\n  - button "y"`;
+    const mockBodyLocator = {
+      ariaSnapshot: vi.fn().mockResolvedValue(a11yTree),
+    };
+
+    const page = {
+      waitForLoadState: vi.fn().mockResolvedValue(undefined),
+      locator: vi.fn((selector: string) => {
+        if (selector === 'body') {
+          return { first: vi.fn().mockReturnValue(mockBodyLocator) };
+        }
+        return {
+          first: vi.fn().mockImplementation(() => {
+            throw new Error('locator disposed');
+          }),
+        };
+      }),
+    } as unknown as Page;
+
+    const result = await collectTrimmedA11ySnapshot(page);
+
+    expect(result.nodes).toHaveLength(1);
+    expect(result.nodes[0].testId).toBeUndefined();
+    expect(result.nodes[0].textContent).toBeUndefined();
+  });
+
+  it('does not collapse nodes with different textContent', async () => {
+    const a11yTree = [
+      '- main:',
+      '  - button "maskicon"',
+      '  - button "maskicon"',
+      '  - button "maskicon"',
+      '  - button "maskicon"',
+    ].join('\n');
+
+    const textValues = ['Rename', 'Account details', 'Hide', 'Remove'];
+    let callIdx = 0;
+    const mockBodyLocator = {
+      ariaSnapshot: vi.fn().mockResolvedValue(a11yTree),
+    };
+
+    const page = {
+      waitForLoadState: vi.fn().mockResolvedValue(undefined),
+      locator: vi.fn((selector: string) => {
+        if (selector === 'body') {
+          return { first: vi.fn().mockReturnValue(mockBodyLocator) };
+        }
+        const idx = callIdx;
+        callIdx += 1;
+        return {
+          first: vi.fn().mockReturnValue({
+            getAttribute: vi.fn().mockResolvedValue(null),
+            textContent: vi
+              .fn()
+              .mockResolvedValue(textValues[idx % textValues.length]),
+          }),
+        };
+      }),
+    } as unknown as Page;
+
+    const result = await collectTrimmedA11ySnapshot(page);
+
+    expect(result.nodes).toHaveLength(4);
+    expect(result.nodes[0].textContent).toBe('Rename');
+    expect(result.nodes[1].textContent).toBe('Account details');
+  });
 });
 
 describe('resolveTarget', () => {
@@ -453,6 +736,41 @@ describe('waitForTarget', () => {
 
     expect(page.locator).toHaveBeenCalledWith('.submit-button');
   });
+
+  it('scopes target within a parent when within is provided', async () => {
+    const childLocator = createMockLocator({ visible: true });
+    const firstParentLocator = {
+      waitFor: vi.fn().mockResolvedValue(undefined),
+      locator: vi.fn().mockReturnValue(childLocator),
+    };
+    const parentLocator = {
+      first: vi.fn().mockReturnValue(firstParentLocator),
+    };
+
+    const page = {
+      locator: vi.fn().mockReturnValue(parentLocator),
+    } as unknown as Page;
+
+    const result = await waitForTarget(
+      page,
+      'testId',
+      'end-accessory',
+      new Map(),
+      5000,
+      { type: 'testId', value: 'account-cell' },
+    );
+
+    expect(page.locator).toHaveBeenCalledWith('[data-testid="account-cell"]');
+    expect(parentLocator.first).toHaveBeenCalled();
+    expect(firstParentLocator.waitFor).toHaveBeenCalledWith({
+      state: 'visible',
+      timeout: 5000,
+    });
+    expect(firstParentLocator.locator).toHaveBeenCalledWith(
+      '[data-testid="end-accessory"]',
+    );
+    expect(result).toBe(childLocator);
+  });
 });
 
 describe('parseAriaSnapshotYaml', () => {
diff --git a/src/tools/utils/discovery.ts b/src/tools/utils/discovery.ts
index 3063255..b3d37e7 100644
--- a/src/tools/utils/discovery.ts
+++ b/src/tools/utils/discovery.ts
@@ -302,7 +302,154 @@ export async function collectTrimmedA11ySnapshot(
     traverseNode(root, []);
   }
 
-  return { nodes: trimmedNodes, refMap };
+  await enrichNodesWithDOMContext(page, trimmedNodes, refMap);
+
+  const collapsedNodes = collapseIdenticalRuns(trimmedNodes);
+
+  return { nodes: collapsedNodes, refMap };
+}
+
+const GENERIC_NAME_MAX_LENGTH = 20;
+const ENRICHMENT_BATCH_LIMIT = 100;
+const ENRICHMENT_ELEMENT_TIMEOUT_MS = 500;
+const TEXT_CONTENT_MAX_LENGTH = 60;
+
+type EnrichmentResult = {
+  ref: string;
+  testId: string | null;
+  textContent: string | null;
+};
+
+/**
+ * Enriches a11y nodes that have generic or empty names with data-testid
+ * values and visible text content from the corresponding DOM elements.
+ *
+ * @param page - The Playwright page to query.
+ * @param nodes - The trimmed a11y nodes to enrich (mutated in place).
+ * @param refMap - Map of a11y refs to selectors for element lookup.
+ */
+async function enrichNodesWithDOMContext(
+  page: Page,
+  nodes: A11yNodeTrimmed[],
+  refMap: Map<string, string>,
+): Promise<void> {
+  const candidates = nodes.filter(
+    (node) => !node.name || node.name.length <= GENERIC_NAME_MAX_LENGTH,
+  );
+
+  if (candidates.length === 0) {
+    return;
+  }
+
+  const enrichBatch = candidates.slice(0, ENRICHMENT_BATCH_LIMIT);
+
+  const results = await Promise.allSettled(
+    enrichBatch.map(async (node): Promise<EnrichmentResult> => {
+      const selector = refMap.get(node.ref);
+      if (!selector) {
+        return { ref: node.ref, testId: null, textContent: null };
+      }
+      try {
+        const locator = page.locator(selector).first();
+        const [testId, rawText] = await Promise.all([
+          locator
+            .getAttribute('data-testid', {
+              timeout: ENRICHMENT_ELEMENT_TIMEOUT_MS,
+            })
+            .catch(() => null),
+          locator
+            .textContent({ timeout: ENRICHMENT_ELEMENT_TIMEOUT_MS })
+            .catch(() => null),
+        ]);
+        const trimmedText = rawText?.trim().slice(0, TEXT_CONTENT_MAX_LENGTH);
+        const textContent =
+          trimmedText && trimmedText !== node.name ? trimmedText : null;
+        return { ref: node.ref, testId, textContent };
+      } catch {
+        return { ref: node.ref, testId: null, textContent: null };
+      }
+    }),
+  );
+
+  const enrichMap = new Map<string, EnrichmentResult>();
+  for (const result of results) {
+    if (result.status === 'fulfilled') {
+      enrichMap.set(result.value.ref, result.value);
+    }
+  }
+
+  for (const node of enrichBatch) {
+    const data = enrichMap.get(node.ref);
+    if (!data) {
+      continue;
+    }
+    if (data.testId) {
+      node.testId = data.testId;
+    }
+    if (data.textContent) {
+      node.textContent = data.textContent;
+    }
+  }
+}
+
+const COLLAPSE_THRESHOLD = 3;
+
+/**
+ * Checks whether two string arrays contain identical elements in order.
+ *
+ * @param left - First array to compare.
+ * @param right - Second array to compare.
+ * @returns True if both arrays are equal.
+ */
+function arraysEqual(left: string[], right: string[]): boolean {
+  return (
+    left.length === right.length && left.every((val, idx) => val === right[idx])
+  );
+}
+
+/**
+ * Collapses consecutive runs of identical a11y nodes into a summary entry.
+ * The refMap retains individual entries so refs still resolve — collapsing
+ * only affects the agent-facing representation to reduce token waste.
+ *
+ * @param nodes - The flat list of trimmed a11y nodes to collapse.
+ * @returns A new array with runs of 3+ identical nodes collapsed.
+ */
+function collapseIdenticalRuns(nodes: A11yNodeTrimmed[]): A11yNodeTrimmed[] {
+  const collapsed: A11yNodeTrimmed[] = [];
+  let cursor = 0;
+  while (cursor < nodes.length) {
+    const current = nodes[cursor];
+    let runEnd = cursor + 1;
+    while (
+      runEnd < nodes.length &&
+      nodes[runEnd].role === current.role &&
+      nodes[runEnd].name === current.name &&
+      nodes[runEnd].testId === current.testId &&
+      nodes[runEnd].textContent === current.textContent &&
+      arraysEqual(nodes[runEnd].path, current.path)
+    ) {
+      runEnd += 1;
+    }
+
+    const runLength = runEnd - cursor;
+    if (runLength >= COLLAPSE_THRESHOLD) {
+      collapsed.push(current);
+      const lastInRun = nodes[runEnd - 1];
+      collapsed.push({
+        ref: `${current.ref}\u2013${lastInRun.ref}`,
+        role: current.role,
+        name: `\u2026 ${runLength - 1} more "${current.name || current.role}" (refs ${current.ref}\u2013${lastInRun.ref})`,
+        path: current.path,
+      });
+    } else {
+      for (let idx = cursor; idx < runEnd; idx += 1) {
+        collapsed.push(nodes[idx]);
+      }
+    }
+    cursor = runEnd;
+  }
+  return collapsed;
 }
 
 /**
@@ -318,20 +465,33 @@ function buildA11ySelector(role: IncludedRole, name: string): string {
 }
 
 /**
- * Resolve a target element to a Playwright Locator.
+ * Target type for scoping selectors.
+ */
+export type TargetType = 'a11yRef' | 'testId' | 'selector';
+
+/**
+ * Optional parent scope for chained locator resolution.
+ */
+export type WithinScope = {
+  type: TargetType;
+  value: string;
+};
+
+/**
+ * Resolve a target element to a Playwright Locator, optionally scoped within a parent.
  *
- * @param page The Playwright page to search
+ * @param scope The Playwright Page or Locator to search within
  * @param targetType The type of target identifier (a11yRef, testId, or CSS selector)
  * @param targetValue The target value to resolve
  * @param refMap Map of a11y refs to selectors (used when targetType is 'a11yRef')
  * @returns Playwright Locator for the resolved element
  */
-export async function resolveTarget(
-  page: Page,
-  targetType: 'a11yRef' | 'testId' | 'selector',
+function resolveTargetScoped(
+  scope: Page | Locator,
+  targetType: TargetType,
   targetValue: string,
   refMap: Map<string, string>,
-): Promise<Locator> {
+): Locator {
   switch (targetType) {
     case 'a11yRef': {
       const selector = refMap.get(targetValue);
@@ -341,12 +501,12 @@ export async function resolveTarget(
             `Available refs: ${Array.from(refMap.keys()).join(', ')}`,
         );
       }
-      return page.locator(selector);
+      return scope.locator(selector);
     }
     case 'testId':
-      return page.locator(`[data-testid="${targetValue}"]`);
+      return scope.locator(`[data-testid="${targetValue}"]`);
     case 'selector':
-      return page.locator(targetValue);
+      return scope.locator(targetValue);
     default: {
       const exhaustiveCheck: never = targetType;
       throw new Error(`Unknown target type: ${exhaustiveCheck as string}`);
@@ -355,23 +515,60 @@ export async function resolveTarget(
 }
 
 /**
- * Wait for a target element to become visible.
+ * Resolve a target element to a Playwright Locator (page-level).
+ *
+ * @param page The Playwright page to search
+ * @param targetType The type of target identifier (a11yRef, testId, or CSS selector)
+ * @param targetValue The target value to resolve
+ * @param refMap Map of a11y refs to selectors (used when targetType is 'a11yRef')
+ * @returns Playwright Locator for the resolved element
+ */
+export async function resolveTarget(
+  page: Page,
+  targetType: TargetType,
+  targetValue: string,
+  refMap: Map<string, string>,
+): Promise<Locator> {
+  return resolveTargetScoped(page, targetType, targetValue, refMap);
+}
+
+/**
+ * Wait for a target element to become visible, optionally scoped within a parent.
  *
  * @param page The Playwright page to search
  * @param targetType The type of target identifier (a11yRef, testId, or CSS selector)
  * @param targetValue The target value to resolve
  * @param refMap Map of a11y refs to selectors (used when targetType is 'a11yRef')
  * @param timeoutMs Maximum time to wait in milliseconds
+ * @param within Optional parent scope — resolves the target within this element
  * @returns Playwright Locator for the visible element
  */
 export async function waitForTarget(
   page: Page,
-  targetType: 'a11yRef' | 'testId' | 'selector',
+  targetType: TargetType,
   targetValue: string,
   refMap: Map<string, string>,
   timeoutMs: number,
+  within?: WithinScope,
 ): Promise<Locator> {
-  const locator = await resolveTarget(page, targetType, targetValue, refMap);
+  let scope: Page | Locator = page;
+  if (within) {
+    const parentLocator = resolveTargetScoped(
+      page,
+      within.type,
+      within.value,
+      refMap,
+    );
+    await parentLocator
+      .first()
+      .waitFor({ state: 'visible', timeout: timeoutMs });
+    // Use .first() to guarantee the child search is scoped to exactly one
+    // parent element.  Without this, Playwright chains the child locator
+    // across ALL matching parents, producing phantom multi-matches
+    // (e.g. 63 "end-accessory" buttons across 63 account cells).
+    scope = parentLocator.first();
+  }
+  const locator = resolveTargetScoped(scope, targetType, targetValue, refMap);
   await locator.waitFor({ state: 'visible', timeout: timeoutMs });
   return locator;
 }

From 4b28791a0f2dae9c511bf6a9df9b9034bdd5f198 Mon Sep 17 00:00:00 2001
From: cryptotavares <joao.tavares@consensys.net>
Date: Fri, 17 Apr 2026 00:30:14 +0100
Subject: [PATCH 07/36] feat: add within scoping for click, type, and wait_for
 tools

---
 src/tools/interaction.test.ts  | 86 ++++++++++++++++++++++++++++++++++
 src/tools/interaction.ts       | 29 ++++++++++++
 src/tools/types/tool-inputs.ts | 10 ++++
 src/validation/schemas.ts      | 36 ++++++++++++++
 4 files changed, 161 insertions(+)

diff --git a/src/tools/interaction.test.ts b/src/tools/interaction.test.ts
index 911059a..7da273c 100644
--- a/src/tools/interaction.test.ts
+++ b/src/tools/interaction.test.ts
@@ -68,6 +68,7 @@ describe('interaction', () => {
         'my-button',
         context.refMap,
         15000,
+        undefined,
       );
       expect(locator.click).toHaveBeenCalled();
     });
@@ -89,6 +90,32 @@ describe('interaction', () => {
         'my-button',
         context.refMap,
         5000,
+        undefined,
+      );
+    });
+
+    it('passes within scope to waitForTarget', async () => {
+      const page = {};
+      const locator = createMockLocator();
+      const context = createMockContext({ page });
+
+      vi.spyOn(discoveryModule, 'waitForTarget').mockResolvedValue(
+        locator as any,
+      );
+
+      const result = await clickTool(
+        { testId: 'btn', within: { testId: 'parent' } },
+        context,
+      );
+
+      expect(result.ok).toBe(true);
+      expect(discoveryModule.waitForTarget).toHaveBeenCalledWith(
+        page,
+        'testId',
+        'btn',
+        context.refMap,
+        15000,
+        { type: 'testId', value: 'parent' },
       );
     });
 
@@ -114,6 +141,7 @@ describe('interaction', () => {
         'button.primary',
         context.refMap,
         15000,
+        undefined,
       );
     });
 
@@ -140,6 +168,7 @@ describe('interaction', () => {
         'e5',
         refMap,
         15000,
+        undefined,
       );
     });
 
@@ -290,6 +319,7 @@ describe('interaction', () => {
         'amount-input',
         context.refMap,
         15000,
+        undefined,
       );
       expect(locator.fill).toHaveBeenCalledWith('0.5');
     });
@@ -314,6 +344,32 @@ describe('interaction', () => {
         'input',
         context.refMap,
         3000,
+        undefined,
+      );
+    });
+
+    it('passes within scope to waitForTarget', async () => {
+      const page = {};
+      const locator = createMockLocator();
+      const context = createMockContext({ page });
+
+      vi.spyOn(discoveryModule, 'waitForTarget').mockResolvedValue(
+        locator as any,
+      );
+
+      const result = await typeTool(
+        { testId: 'input', text: 'hello', within: { selector: '.form' } },
+        context,
+      );
+
+      expect(result.ok).toBe(true);
+      expect(discoveryModule.waitForTarget).toHaveBeenCalledWith(
+        page,
+        'testId',
+        'input',
+        context.refMap,
+        15000,
+        { type: 'selector', value: '.form' },
       );
     });
 
@@ -363,6 +419,7 @@ describe('interaction', () => {
         'e3',
         refMap,
         15000,
+        undefined,
       );
     });
 
@@ -498,6 +555,7 @@ describe('interaction', () => {
         'loading-spinner',
         context.refMap,
         15000,
+        undefined,
       );
     });
 
@@ -518,6 +576,32 @@ describe('interaction', () => {
         'element',
         context.refMap,
         30000,
+        undefined,
+      );
+    });
+
+    it('passes within scope to waitForTarget', async () => {
+      const page = {};
+      const locator = createMockLocator();
+      const context = createMockContext({ page });
+
+      vi.spyOn(discoveryModule, 'waitForTarget').mockResolvedValue(
+        locator as any,
+      );
+
+      const result = await waitForTool(
+        { a11yRef: 'e5', within: { a11yRef: 'e1' } },
+        context,
+      );
+
+      expect(result.ok).toBe(true);
+      expect(discoveryModule.waitForTarget).toHaveBeenCalledWith(
+        page,
+        'a11yRef',
+        'e5',
+        context.refMap,
+        15000,
+        { type: 'a11yRef', value: 'e1' },
       );
     });
 
@@ -546,6 +630,7 @@ describe('interaction', () => {
         '.success-message',
         context.refMap,
         15000,
+        undefined,
       );
     });
 
@@ -572,6 +657,7 @@ describe('interaction', () => {
         'e10',
         refMap,
         15000,
+        undefined,
       );
     });
 
diff --git a/src/tools/interaction.ts b/src/tools/interaction.ts
index 6e7fabe..13ce247 100644
--- a/src/tools/interaction.ts
+++ b/src/tools/interaction.ts
@@ -11,10 +11,12 @@ import type {
   TypeResult,
   WaitForInput,
   WaitForResult,
+  WithinTarget,
 } from './types';
 import { ErrorCodes } from './types';
 import { DEFAULT_INTERACTION_TIMEOUT_MS } from './utils/constants.js';
 import { waitForTarget } from './utils/discovery.js';
+import type { WithinScope } from './utils/discovery.js';
 import { validateTargetSelection } from './utils/targets.js';
 import {
   isInvalidTargetSelection,
@@ -27,6 +29,30 @@ import {
 } from './utils.js';
 import type { ToolContext, ToolResponse } from '../types/http.js';
 
+/**
+ * Converts a WithinTarget input to the WithinScope format expected by waitForTarget.
+ *
+ * @param within - The optional within target from tool input.
+ * @returns The resolved scope, or undefined if no within target is provided.
+ */
+function resolveWithinScope(
+  within: WithinTarget | undefined,
+): WithinScope | undefined {
+  if (!within) {
+    return undefined;
+  }
+  if (within.a11yRef) {
+    return { type: 'a11yRef', value: within.a11yRef };
+  }
+  if (within.testId) {
+    return { type: 'testId', value: within.testId };
+  }
+  if (within.selector) {
+    return { type: 'selector', value: within.selector };
+  }
+  return undefined;
+}
+
 /**
  * Clicks an element identified by ref, test ID, or selector.
  *
@@ -66,6 +92,7 @@ export async function clickTool(
       targetValue,
       context.refMap,
       timeoutMs,
+      resolveWithinScope(input.within),
     );
 
     try {
@@ -130,6 +157,7 @@ export async function typeTool(
       targetValue,
       context.refMap,
       timeoutMs,
+      resolveWithinScope(input.within),
     );
 
     await locator.fill(input.text);
@@ -184,6 +212,7 @@ export async function waitForTool(
       targetValue,
       context.refMap,
       timeoutMs,
+      resolveWithinScope(input.within),
     );
 
     return createToolSuccess({
diff --git a/src/tools/types/tool-inputs.ts b/src/tools/types/tool-inputs.ts
index 211ee5c..d1fb42b 100644
--- a/src/tools/types/tool-inputs.ts
+++ b/src/tools/types/tool-inputs.ts
@@ -65,17 +65,26 @@ export type TargetSelection = {
   selector?: string;
 };
 
+export type WithinTarget = {
+  a11yRef?: string;
+  testId?: string;
+  selector?: string;
+};
+
 export type ClickInput = TargetSelection & {
   timeoutMs?: number;
+  within?: WithinTarget;
 };
 
 export type TypeInput = TargetSelection & {
   text: string;
   timeoutMs?: number;
+  within?: WithinTarget;
 };
 
 export type WaitForInput = TargetSelection & {
   timeoutMs?: number;
+  within?: WithinTarget;
 };
 
 export type KnowledgeScope =
@@ -122,6 +131,7 @@ export type RunStepsInput = {
   }[];
   stopOnError?: boolean;
   includeObservations?: 'none' | 'failures' | 'all';
+  batchTimeoutMs?: number;
 };
 
 export type SwitchToTabInput = {
diff --git a/src/validation/schemas.ts b/src/validation/schemas.ts
index 58585c1..43462ae 100644
--- a/src/validation/schemas.ts
+++ b/src/validation/schemas.ts
@@ -265,6 +265,29 @@ export const screenshotInputSchema = z.object({
     .describe('Include base64-encoded image in response'),
 });
 
+export const withinTargetSchema = z
+  .object({
+    a11yRef: a11yRefPattern.optional(),
+    testId: z.string().min(1).optional(),
+    selector: z.string().min(1).optional(),
+  })
+  .refine(
+    (data) => {
+      const provided = [data.a11yRef, data.testId, data.selector].filter(
+        Boolean,
+      );
+      return provided.length === 1;
+    },
+    {
+      message:
+        'Exactly one of a11yRef, testId, or selector must be provided in within',
+    },
+  )
+  .describe(
+    'Scope the target search within a parent element. ' +
+      'Accepts the same targeting options (a11yRef, testId, or selector).',
+  );
+
 export const clickInputSchema = targetSelectionSchema.and(
   z.object({
     timeoutMs: z
@@ -274,6 +297,7 @@ export const clickInputSchema = targetSelectionSchema.and(
       .max(60000)
       .default(15000)
       .describe('Timeout to wait for element to become visible'),
+    within: withinTargetSchema.optional(),
   }),
 );
 
@@ -287,6 +311,7 @@ export const typeInputSchema = targetSelectionSchema.and(
       .max(60000)
       .default(15000)
       .describe('Timeout to wait for element to become visible'),
+    within: withinTargetSchema.optional(),
   }),
 );
 
@@ -299,6 +324,7 @@ export const waitForInputSchema = targetSelectionSchema.and(
       .max(120000)
       .default(15000)
       .describe('Timeout to wait for element'),
+    within: withinTargetSchema.optional(),
   }),
 );
 
@@ -449,6 +475,16 @@ export const runStepsInputSchema = z.object({
       'When to include observations in results: ' +
         'none = never (fastest), failures = only for failed steps, all = always',
     ),
+  batchTimeoutMs: z
+    .number()
+    .int()
+    .min(1000)
+    .max(300_000)
+    .describe(
+      'Overall timeout for the batch in milliseconds. ' +
+        'When exceeded, remaining steps are marked as skipped and partial results are returned.',
+    )
+    .optional(),
 });
 
 export const setContextInputSchema = z.object({

From ad1a6537943314b7de45f5c32a9e890804f256b5 Mon Sep 17 00:00:00 2001
From: cryptotavares <joao.tavares@consensys.net>
Date: Fri, 17 Apr 2026 00:30:28 +0100
Subject: [PATCH 08/36] feat: add batch timeout, tool aliases, and ref
 shorthand to run_steps

---
 src/tools/batch.test.ts         | 146 ++++++++++++++++++++++++++++++++
 src/tools/batch.ts              |  98 ++++++++++++++++++++-
 src/tools/types/tool-outputs.ts |   2 +
 3 files changed, 244 insertions(+), 2 deletions(-)

diff --git a/src/tools/batch.test.ts b/src/tools/batch.test.ts
index 122651c..2cfffab 100644
--- a/src/tools/batch.test.ts
+++ b/src/tools/batch.test.ts
@@ -478,4 +478,150 @@ describe('runStepsTool', () => {
       expect(result.result.steps[0]).not.toHaveProperty('observation');
     }
   });
+
+  it('marks remaining steps as skipped when batchTimeoutMs is exceeded', async () => {
+    const clickHandler = vi.fn().mockImplementation(
+      async () =>
+        new Promise((resolve) => {
+          setTimeout(() => resolve({ ok: true, result: 'clicked' }), 50);
+        }),
+    );
+    const typeHandler = vi.fn().mockResolvedValue({
+      ok: true,
+      result: 'typed',
+    });
+    const context = createMockContext({
+      toolRegistry: new Map([
+        ['click', clickHandler],
+        ['type', typeHandler],
+      ]),
+    });
+
+    const result = await runStepsTool(
+      {
+        steps: [
+          { tool: 'click', args: { testId: 'btn' } },
+          { tool: 'type', args: { testId: 'input', text: 'hello' } },
+          { tool: 'click', args: { testId: 'submit' } },
+        ],
+        batchTimeoutMs: 1,
+      },
+      context,
+    );
+
+    expect(result.ok).toBe(true);
+    if (result.ok) {
+      expect(result.result.steps).toHaveLength(3);
+      // First step may succeed or be skipped depending on timing
+      // Steps after deadline should be skipped
+      const skippedSteps = result.result.steps.filter(
+        (step) => step.meta.skipped === true,
+      );
+      expect(skippedSteps.length).toBeGreaterThan(0);
+      skippedSteps.forEach((step) => {
+        expect(step.ok).toBe(false);
+        expect(step.error?.code).toBe('MM_BATCH_TIMEOUT');
+      });
+      expect(result.result.summary.skipped).toBeGreaterThan(0);
+    }
+  });
+
+  it('resolves navigate_home alias to navigate with screen: home', async () => {
+    const navigateHandler = vi.fn().mockResolvedValue({
+      ok: true,
+      result: { navigated: true },
+    });
+    const context = createMockContext({
+      toolRegistry: new Map([['navigate', navigateHandler]]),
+    });
+
+    const result = await runStepsTool(
+      { steps: [{ tool: 'navigate_home' }] },
+      context,
+    );
+
+    expect(result.ok).toBe(true);
+    if (result.ok) {
+      expect(result.result.steps[0].ok).toBe(true);
+    }
+    expect(navigateHandler).toHaveBeenCalledWith({ screen: 'home' }, context);
+  });
+
+  it('resolves navigate-home (hyphenated) alias to navigate with screen: home', async () => {
+    const navigateHandler = vi.fn().mockResolvedValue({
+      ok: true,
+      result: { navigated: true },
+    });
+    const context = createMockContext({
+      toolRegistry: new Map([['navigate', navigateHandler]]),
+    });
+
+    const result = await runStepsTool(
+      { steps: [{ tool: 'navigate-home' }] },
+      context,
+    );
+
+    expect(result.ok).toBe(true);
+    if (result.ok) {
+      expect(result.result.steps[0].ok).toBe(true);
+    }
+    expect(navigateHandler).toHaveBeenCalledWith({ screen: 'home' }, context);
+  });
+
+  it('resolves navigate_settings alias to navigate with screen: settings', async () => {
+    const navigateHandler = vi.fn().mockResolvedValue({
+      ok: true,
+      result: { navigated: true },
+    });
+    const context = createMockContext({
+      toolRegistry: new Map([['navigate', navigateHandler]]),
+    });
+
+    const result = await runStepsTool(
+      { steps: [{ tool: 'navigate_settings' }] },
+      context,
+    );
+
+    expect(result.ok).toBe(true);
+    if (result.ok) {
+      expect(result.result.steps[0].ok).toBe(true);
+    }
+    expect(navigateHandler).toHaveBeenCalledWith(
+      { screen: 'settings' },
+      context,
+    );
+  });
+
+  it('normalises within.ref to within.a11yRef in step args', async () => {
+    const clickHandler = vi.fn().mockResolvedValue({
+      ok: true,
+      result: 'clicked',
+    });
+    const context = createMockContext({
+      toolRegistry: new Map([['click', clickHandler]]),
+    });
+
+    const result = await runStepsTool(
+      {
+        steps: [
+          {
+            tool: 'click',
+            args: { testId: 'btn', within: { ref: 'e1' } },
+          },
+        ],
+      },
+      context,
+    );
+
+    expect(result.ok).toBe(true);
+    if (result.ok) {
+      expect(clickHandler).toHaveBeenCalledWith(
+        expect.objectContaining({
+          testId: 'btn',
+          within: { a11yRef: 'e1' },
+        }),
+        context,
+      );
+    }
+  });
 });
diff --git a/src/tools/batch.ts b/src/tools/batch.ts
index 40d5e02..dc88651 100644
--- a/src/tools/batch.ts
+++ b/src/tools/batch.ts
@@ -6,6 +6,73 @@ import { extractErrorMessage } from '../utils';
 import type { ToolName } from '../validation/schemas.js';
 import { toolSchemas } from '../validation/schemas.js';
 
+/** Tools whose args include a target selection (a11yRef/testId/selector). */
+const TARGET_TOOLS = new Set(['click', 'type', 'wait_for']);
+
+/**
+ * Maps CLI-style compound tool names to their registry name + injected args.
+ * The CLI handles these conversions for standalone commands, but agents using
+ * run-steps bypass CLI parsing and may send compound names directly.
+ */
+const TOOL_ALIASES: Record<
+  string,
+  { tool: string; inject: Record<string, unknown> }
+> = {
+  navigate_home: { tool: 'navigate', inject: { screen: 'home' } },
+  'navigate-home': { tool: 'navigate', inject: { screen: 'home' } },
+  navigate_settings: { tool: 'navigate', inject: { screen: 'settings' } },
+  'navigate-settings': { tool: 'navigate', inject: { screen: 'settings' } },
+  navigate_notification: {
+    tool: 'navigate',
+    inject: { screen: 'notification' },
+  },
+  'navigate-notification': {
+    tool: 'navigate',
+    inject: { screen: 'notification' },
+  },
+};
+
+type NormalisedStep = {
+  tool: string;
+  args: Record<string, unknown>;
+};
+
+/**
+ * Resolves tool aliases and normalises shorthand arg keys.
+ *
+ * @param tool - Raw tool name (may be an alias like `navigate_home`).
+ * @param args - Raw step arguments.
+ * @returns Resolved tool name and normalised arguments.
+ */
+function normaliseStep(
+  tool: string,
+  args: Record<string, unknown>,
+): NormalisedStep {
+  const alias = TOOL_ALIASES[tool];
+  const resolvedTool = alias ? alias.tool : tool;
+  let normalised = alias ? { ...alias.inject, ...args } : args;
+
+  if (TARGET_TOOLS.has(resolvedTool)) {
+    if ('ref' in normalised && !('a11yRef' in normalised)) {
+      const { ref, ...rest } = normalised;
+      normalised = { a11yRef: ref, ...rest };
+    }
+
+    if (typeof normalised.within === 'object' && normalised.within !== null) {
+      const withinObj = normalised.within as Record<string, unknown>;
+      if ('ref' in withinObj && !('a11yRef' in withinObj)) {
+        const { ref: withinRef, ...withinRest } = withinObj;
+        normalised = {
+          ...normalised,
+          within: { a11yRef: withinRef, ...withinRest },
+        };
+      }
+    }
+  }
+
+  return { tool: resolvedTool, args: normalised };
+}
+
 /**
  * Executes a batch of tool steps sequentially.
  *
@@ -31,15 +98,41 @@ export async function runStepsTool(
     );
   }
 
-  const { steps: stepInputs, stopOnError = false } = input;
+  const { steps: stepInputs, stopOnError = false, batchTimeoutMs } = input;
   const stepResults: StepResult[] = [];
   let succeeded = 0;
   let failed = 0;
+  let skipped = 0;
   const batchStartTime = Date.now();
+  const batchDeadline = batchTimeoutMs
+    ? batchStartTime + batchTimeoutMs
+    : undefined;
 
   for (const stepInput of stepInputs) {
+    if (batchDeadline && Date.now() > batchDeadline) {
+      const remainingIndex = stepInputs.indexOf(stepInput);
+      for (const remaining of stepInputs.slice(remainingIndex)) {
+        stepResults.push({
+          tool: remaining.tool,
+          ok: false,
+          error: {
+            code: 'MM_BATCH_TIMEOUT',
+            message: `Batch deadline exceeded after ${batchTimeoutMs}ms`,
+          },
+          meta: {
+            durationMs: 0,
+            timestamp: new Date().toISOString(),
+            skipped: true,
+          },
+        });
+        skipped += 1;
+        failed += 1;
+      }
+      break;
+    }
     const stepStartTime = Date.now();
-    const { tool, args = {} } = stepInput;
+    const { tool: rawTool, args: rawArgs = {} } = stepInput;
+    const { tool, args } = normaliseStep(rawTool, rawArgs);
     const handler = context.toolRegistry.get(tool) as
       | ToolFunction<Record<string, unknown>, unknown>
       | undefined;
@@ -151,6 +244,7 @@ export async function runStepsTool(
       total: stepResults.length,
       succeeded,
       failed,
+      skipped,
       durationMs: Date.now() - batchStartTime,
     },
   });
diff --git a/src/tools/types/tool-outputs.ts b/src/tools/types/tool-outputs.ts
index 969030a..e24ca39 100644
--- a/src/tools/types/tool-outputs.ts
+++ b/src/tools/types/tool-outputs.ts
@@ -105,6 +105,7 @@ export type StepResult = {
   meta: {
     durationMs: number;
     timestamp: string;
+    skipped?: boolean;
   };
 };
 
@@ -115,6 +116,7 @@ export type RunStepsResult = {
     total: number;
     succeeded: number;
     failed: number;
+    skipped: number;
     durationMs: number;
   };
 };

From 76a12f341f220692432a1cd4c7d3752d523cc4db Mon Sep 17 00:00:00 2001
From: cryptotavares <joao.tavares@consensys.net>
Date: Fri, 17 Apr 2026 00:30:40 +0100
Subject: [PATCH 09/36] feat: wait for animations to settle before
 post-mutation observations

---
 src/server/create-server.test.ts |  5 ++++-
 src/server/create-server.ts      | 20 ++++++++++++++++++++
 2 files changed, 24 insertions(+), 1 deletion(-)

diff --git a/src/server/create-server.test.ts b/src/server/create-server.test.ts
index daad92d..93e2e65 100644
--- a/src/server/create-server.test.ts
+++ b/src/server/create-server.test.ts
@@ -74,7 +74,10 @@ function createMockSessionManager() {
       state: {},
     })),
     cleanup: vi.fn(async () => true),
-    getPage: vi.fn(() => ({})),
+    getPage: vi.fn(() => ({
+      waitForLoadState: vi.fn(async () => undefined),
+      waitForFunction: vi.fn(async () => undefined),
+    })),
     setActivePage: vi.fn(),
     getTrackedPages: vi.fn(() => []),
     classifyPageRole: vi.fn(() => 'extension'),
diff --git a/src/server/create-server.ts b/src/server/create-server.ts
index 3f269ac..07a15f3 100644
--- a/src/server/create-server.ts
+++ b/src/server/create-server.ts
@@ -440,6 +440,26 @@ export function createServer(config: ServerConfig): ServerInstance {
         ) {
           try {
             const page = config.sessionManager.getPage();
+
+            if (category === 'mutating') {
+              await page
+                .waitForLoadState('domcontentloaded')
+                .catch(() => undefined);
+              await page
+                .waitForFunction(
+                  async () =>
+                    new Promise<boolean>((resolve) => {
+                      requestAnimationFrame(() => {
+                        const allSettled = document
+                          .getAnimations()
+                          .every((a: Animation) => a.playState !== 'running');
+                        resolve(allSettled);
+                      });
+                    }),
+                  { timeout: 3000 },
+                )
+                .catch(() => undefined);
+            }
             const state = await config.sessionManager.getExtensionState();
             const testIds = await collectTestIds(
               page,

From eab29eacf36b362b866e1a0b0d26a925f1667a9a Mon Sep 17 00:00:00 2001
From: cryptotavares <joao.tavares@consensys.net>
Date: Fri, 17 Apr 2026 00:30:52 +0100
Subject: [PATCH 10/36] feat: add --within flag and observations output to CLI

---
 src/cli/mm.test.ts | 55 ++++++++++++++++++++++++++++++++++
 src/cli/mm.ts      | 73 ++++++++++++++++++++++++++++++++++++----------
 2 files changed, 113 insertions(+), 15 deletions(-)

diff --git a/src/cli/mm.test.ts b/src/cli/mm.test.ts
index 61c2e8e..75c484a 100644
--- a/src/cli/mm.test.ts
+++ b/src/cli/mm.test.ts
@@ -12,6 +12,7 @@ import type { MockInstance } from 'vitest';
 import {
   extractProjectFlag,
   resolveTargetFromArgs,
+  resolveWithinFromArgs,
   getPositionalTarget,
   isTransientError,
   parseIntFlag,
@@ -217,6 +218,60 @@ describe('resolveTargetFromArgs', () => {
   });
 });
 
+describe('resolveWithinFromArgs', () => {
+  it('returns undefined when --within is not present', () => {
+    expect(resolveWithinFromArgs(['e1', '--timeout', '5000'])).toBeUndefined();
+    expect(resolveWithinFromArgs([])).toBeUndefined();
+  });
+
+  it('returns testId when value starts with "testid:"', () => {
+    expect(resolveWithinFromArgs(['--within', 'testid:parent'])).toStrictEqual({
+      testId: 'parent',
+    });
+  });
+
+  it('returns selector when value starts with "selector:"', () => {
+    expect(
+      resolveWithinFromArgs(['--within', 'selector:.container']),
+    ).toStrictEqual({
+      selector: '.container',
+    });
+  });
+
+  it('returns a11yRef when value matches /^e\\d+$/', () => {
+    expect(resolveWithinFromArgs(['--within', 'e1'])).toStrictEqual({
+      a11yRef: 'e1',
+    });
+    expect(resolveWithinFromArgs(['--within', 'e123'])).toStrictEqual({
+      a11yRef: 'e123',
+    });
+  });
+
+  it('returns testId for bare non-ref value', () => {
+    expect(resolveWithinFromArgs(['--within', 'parent-id'])).toStrictEqual({
+      testId: 'parent-id',
+    });
+    expect(resolveWithinFromArgs(['--within', 'eabc'])).toStrictEqual({
+      testId: 'eabc',
+    });
+  });
+
+  it('exits when --within has no value', () => {
+    expect(() => resolveWithinFromArgs(['--within'])).toThrowError(
+      'process.exit',
+    );
+    expect(stderrSpy).toHaveBeenCalledWith(
+      'Error: --within requires a value\n',
+    );
+  });
+
+  it('exits when --within value starts with --', () => {
+    expect(() => resolveWithinFromArgs(['--within', '--other'])).toThrowError(
+      'process.exit',
+    );
+  });
+});
+
 describe('getPositionalTarget', () => {
   it('returns first non-flag argument', () => {
     expect(getPositionalTarget(['e1', '--timeout', '5000'])).toBe('e1');
diff --git a/src/cli/mm.ts b/src/cli/mm.ts
index e9cc384..359b0ae 100644
--- a/src/cli/mm.ts
+++ b/src/cli/mm.ts
@@ -176,6 +176,35 @@ export async function main(): Promise<void> {
   await routeCommand(command, args.slice(1), daemonState.port);
 }
 
+/**
+ * Resolves `--within` scoping from CLI arguments.
+ *
+ * @param args - The CLI arguments to scan.
+ * @returns A within target object, or undefined if `--within` is absent.
+ */
+export function resolveWithinFromArgs(
+  args: string[],
+): { a11yRef: string } | { testId: string } | { selector: string } | undefined {
+  const withinIdx = args.indexOf('--within');
+  if (withinIdx < 0) {
+    return undefined;
+  }
+  const val = args[withinIdx + 1];
+  if (!val || val.startsWith('--')) {
+    process.stderr.write('Error: --within requires a value\n');
+    process.exit(1);
+  }
+
+  // "testid:value" → testId, "selector:value" → selector, otherwise auto-detect
+  if (val.startsWith('testid:')) {
+    return { testId: val.slice('testid:'.length) };
+  }
+  if (val.startsWith('selector:')) {
+    return { selector: val.slice('selector:'.length) };
+  }
+  return /^e[0-9]+$/u.test(val) ? { a11yRef: val } : { testId: val };
+}
+
 /**
  * Resolves element targeting from CLI arguments. Supports three targeting modes:
  * --selector <css>  → CSS selector (explicit)
@@ -258,16 +287,15 @@ export async function routeCommand(
         !args.includes('--testid')
       ) {
         process.stderr.write(
-          'Usage: mm click <ref> [--selector <css>] [--testid <id>]\n',
+          'Usage: mm click <ref> [--selector <css>] [--testid <id>] [--within <scope>]\n',
         );
         process.exit(1);
       }
-      await sendRequest(
-        port,
-        'POST',
-        '/tool/click',
-        resolveTargetFromArgs(args),
-      );
+      const clickWithin = resolveWithinFromArgs(args);
+      await sendRequest(port, 'POST', '/tool/click', {
+        ...resolveTargetFromArgs(args),
+        ...(clickWithin ? { within: clickWithin } : {}),
+      });
       break;
     }
     case 'type': {
@@ -278,7 +306,7 @@ export async function routeCommand(
         !args.includes('--testid')
       ) {
         process.stderr.write(
-          'Usage: mm type <ref> <text> [--selector <css>] [--testid <id>]\n',
+          'Usage: mm type <ref> <text> [--selector <css>] [--testid <id>] [--within <scope>]\n',
         );
         process.exit(1);
       }
@@ -293,9 +321,11 @@ export async function routeCommand(
         process.stderr.write('Usage: mm type <ref> <text>\n');
         process.exit(1);
       }
+      const typeWithin = resolveWithinFromArgs(args);
       await sendRequest(port, 'POST', '/tool/type', {
         ...resolveTargetFromArgs(args),
         text,
+        ...(typeWithin ? { within: typeWithin } : {}),
       });
       break;
     }
@@ -316,14 +346,16 @@ export async function routeCommand(
         !args.includes('--testid')
       ) {
         process.stderr.write(
-          'Usage: mm wait-for <ref> [--timeout <ms>] [--selector <css>] [--testid <id>]\n',
+          'Usage: mm wait-for <ref> [--timeout <ms>] [--selector <css>] [--testid <id>] [--within <scope>]\n',
         );
         process.exit(1);
       }
       const timeoutMs = parseIntFlag(args, '--timeout');
+      const waitWithin = resolveWithinFromArgs(args);
       await sendRequest(port, 'POST', '/tool/wait_for', {
         ...resolveTargetFromArgs(args),
         ...(timeoutMs === undefined ? {} : { timeoutMs }),
+        ...(waitWithin ? { within: waitWithin } : {}),
       });
       break;
     }
@@ -605,10 +637,21 @@ export async function sendRequest(
       }
 
       const result = data.result ?? data;
-      if (typeof result === 'string') {
-        process.stdout.write(`${result}\n`);
+      const observations = data.observations as
+        | Record<string, unknown>
+        | undefined;
+      let output: unknown = result;
+      if (observations) {
+        const base =
+          typeof result === 'object' && result !== null
+            ? (result as Record<string, unknown>)
+            : { result };
+        output = { ...base, observations };
+      }
+      if (typeof output === 'string') {
+        process.stdout.write(`${output}\n`);
       } else {
-        process.stdout.write(`${JSON.stringify(result, null, 2)}\n`);
+        process.stdout.write(`${JSON.stringify(output, null, 2)}\n`);
       }
       return;
     } catch (error) {
@@ -1005,11 +1048,11 @@ Lifecycle:
   mm build [--force]
 
 Interaction:
-  mm click <ref> [--selector <css>] [--testid <id>]
-  mm type <ref> <text> [--selector <css>] [--testid <id>]
+  mm click <ref> [--selector <css>] [--testid <id>] [--within <scope>]
+  mm type <ref> <text> [--selector <css>] [--testid <id>] [--within <scope>]
   mm describe-screen
   mm screenshot [--name <name>]
-  mm wait-for <ref> [--timeout <ms>] [--selector <css>] [--testid <id>]
+  mm wait-for <ref> [--timeout <ms>] [--selector <css>] [--testid <id>] [--within <scope>]
   mm wait-for-notification [--timeout <ms>]
   mm clipboard <read|write> [text]
 

From 0d53d1bf767c17c53d41fc01524a1836fd201897 Mon Sep 17 00:00:00 2001
From: cryptotavares <joao.tavares@consensys.net>
Date: Fri, 17 Apr 2026 00:31:04 +0100
Subject: [PATCH 11/36] fix: make CLI binary executable after build

---
 scripts/prepack.sh | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/scripts/prepack.sh b/scripts/prepack.sh
index ad99af5..e741983 100755
--- a/scripts/prepack.sh
+++ b/scripts/prepack.sh
@@ -9,3 +9,5 @@ if [[ -n $SKIP_PREPACK ]]; then
 fi
 
 yarn build
+
+chmod +x dist/cli/mm.cjs

From 36eaa12368f3d3dbb375a3a8bae9d07cff89f653 Mon Sep 17 00:00:00 2001
From: cryptotavares <joao.tavares@consensys.net>
Date: Fri, 17 Apr 2026 00:31:16 +0100
Subject: [PATCH 12/36] docs: update tool docs for within scoping, batch
 timeout, and enrichment

---
 README.md | 121 +++++++++++++++++++++++++++---------------------------
 SKILL.md  |  21 ++++++++--
 2 files changed, 78 insertions(+), 64 deletions(-)

diff --git a/README.md b/README.md
index 1d2b00f..9fef621 100644
--- a/README.md
+++ b/README.md
@@ -358,44 +358,44 @@ The daemon routes `POST /tool/:name` requests through the registry, applies Zod
 
 **Registered tools:**
 
-| Tool                     | Description                                                                                                                                                                                                                                                                 |
-| ------------------------ | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| **Lifecycle**            |                                                                                                                                                                                                                                                                             |
-| `build`                  | Triggers an extension build using the configured `BuildCapability`. Accepts build type and force options.                                                                                                                                                                   |
-| `launch`                 | Launches a new browser session with the configured extension. Supports state modes (`default`, `onboarding`, `custom`), fixture presets, goal/tag metadata, and optional contract seeding on start.                                                                         |
-| `cleanup`                | Tears down the active browser session and cleans up all resources (browser, services, fixtures).                                                                                                                                                                            |
-| **Interaction**          |                                                                                                                                                                                                                                                                             |
-| `click`                  | Clicks an element identified by a11y ref, test ID, or CSS selector. Waits for the element to be visible before clicking.                                                                                                                                                    |
-| `type`                   | Types text into an input element identified by a11y ref, test ID, or CSS selector. Uses Playwright's `fill()` for reliable input.                                                                                                                                           |
-| `wait_for`               | Waits for an element to become visible on the page within a configurable timeout.                                                                                                                                                                                           |
-| `clipboard`              | Reads from or writes to the system clipboard via Chrome DevTools Protocol. Useful for pasting seed phrases or copying addresses.                                                                                                                                            |
-| **Navigation**           |                                                                                                                                                                                                                                                                             |
-| `navigate`               | Navigates the browser to a named screen (`home`, `settings`, `notification`) or an arbitrary URL.                                                                                                                                                                           |
-| `switch_to_tab`          | Switches the active page to a tab matching a given role (e.g., `extension`, `dapp`) or URL prefix.                                                                                                                                                                          |
-| `close_tab`              | Closes a browser tab matching a given role or URL. Falls back to the extension tab if the active tab is closed.                                                                                                                                                             |
-| `wait_for_notification`  | Waits for the extension notification popup to appear within a timeout. Returns the notification page URL.                                                                                                                                                                   |
-| **Discovery**            |                                                                                                                                                                                                                                                                             |
-| `describe_screen`        | Captures a comprehensive screen snapshot: extension state, visible test IDs, trimmed a11y tree with refs, optional screenshot, and prior knowledge from historical sessions.                                                                                                |
-| `accessibility_snapshot` | Captures a trimmed accessibility tree of the current page with deterministic refs (`e1`, `e2`, ...). Supports scoping to a root CSS selector.                                                                                                                               |
-| `list_testids`           | Collects all visible `data-testid` attributes from the current page with text previews and visibility status.                                                                                                                                                               |
-| **State**                |                                                                                                                                                                                                                                                                             |
-| `get_state`              | Retrieves the current extension state (URL, screen, network, balance, account) and tracked tab information.                                                                                                                                                                 |
-| `get_context`            | Returns the current environment context (`e2e` or `prod`), session status, available capabilities, and whether context switching is allowed.                                                                                                                                |
-| `set_context`            | Switches the session environment between `e2e` and `prod` modes. Blocked while a session is active.                                                                                                                                                                         |
-| **Screenshots**          |                                                                                                                                                                                                                                                                             |
-| `screenshot`             | Captures a screenshot of the current page. Supports naming, full-page capture, scoping to a CSS selector, and optional base64 output.                                                                                                                                       |
-| **Knowledge**            |                                                                                                                                                                                                                                                                             |
-| `knowledge_last`         | Retrieves the N most recent step records from the knowledge store, with optional scope and filter parameters.                                                                                                                                                               |
-| `knowledge_search`       | Searches step records by query string with token-based matching and synonym expansion. Scores results by relevance to screen, URL, test IDs, and a11y nodes.                                                                                                                |
-| `knowledge_summarize`    | Generates a recipe-style summary of a session's tool invocations, showing the step sequence with targets and outcomes.                                                                                                                                                      |
-| `knowledge_sessions`     | Lists available knowledge sessions with metadata (goal, flow tags, timestamps), with optional filtering.                                                                                                                                                                    |
-| **Contracts**            |                                                                                                                                                                                                                                                                             |
-| `seed_contract`          | Deploys a single smart contract to the local Anvil chain by name. Requires `ContractSeedingCapability`.                                                                                                                                                                     |
-| `seed_contracts`         | Deploys multiple smart contracts in sequence. Returns both successful deployments and individual failures.                                                                                                                                                                  |
-| `get_contract_address`   | Looks up the deployed address of a contract by name from the session's deployment registry.                                                                                                                                                                                 |
-| `list_contracts`         | Lists all contracts deployed in the current session with addresses and deployment timestamps.                                                                                                                                                                               |
-| **Batching**             |                                                                                                                                                                                                                                                                             |
-| `run_steps`              | Executes a batch of tool invocations sequentially. Supports `stopOnError` to halt on first failure and `includeObservations` (`'all'`, `'none'`, `'failures'`) to control whether post-execution observations appear in the response. Returns per-step results with timing. |
+| Tool                     | Description                                                                                                                                                                                                                                                                                                                                                                       |
+| ------------------------ | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| **Lifecycle**            |                                                                                                                                                                                                                                                                                                                                                                                   |
+| `build`                  | Triggers an extension build using the configured `BuildCapability`. Accepts build type and force options.                                                                                                                                                                                                                                                                         |
+| `launch`                 | Launches a new browser session with the configured extension. Supports state modes (`default`, `onboarding`, `custom`), fixture presets, goal/tag metadata, and optional contract seeding on start.                                                                                                                                                                               |
+| `cleanup`                | Tears down the active browser session and cleans up all resources (browser, services, fixtures).                                                                                                                                                                                                                                                                                  |
+| **Interaction**          |                                                                                                                                                                                                                                                                                                                                                                                   |
+| `click`                  | Clicks an element identified by a11y ref, test ID, or CSS selector. Waits for the element to be visible before clicking. Supports `within` to scope the target inside a parent element.                                                                                                                                                                                           |
+| `type`                   | Types text into an input element identified by a11y ref, test ID, or CSS selector. Clears the field first, then sets the new value (uses Playwright's `fill()`). Supports `within` scoping.                                                                                                                                                                                       |
+| `wait_for`               | Waits for an element to become visible on the page within a configurable timeout. Supports `within` to scope the target inside a parent element.                                                                                                                                                                                                                                  |
+| `clipboard`              | Reads from or writes to the system clipboard via Chrome DevTools Protocol. Useful for pasting seed phrases or copying addresses.                                                                                                                                                                                                                                                  |
+| **Navigation**           |                                                                                                                                                                                                                                                                                                                                                                                   |
+| `navigate`               | Navigates the browser to a named screen (`home`, `settings`, `notification`) or an arbitrary URL.                                                                                                                                                                                                                                                                                 |
+| `switch_to_tab`          | Switches the active page to a tab matching a given role (e.g., `extension`, `dapp`) or URL prefix.                                                                                                                                                                                                                                                                                |
+| `close_tab`              | Closes a browser tab matching a given role or URL. Falls back to the extension tab if the active tab is closed.                                                                                                                                                                                                                                                                   |
+| `wait_for_notification`  | Waits for the extension notification popup to appear within a timeout. Returns the notification page URL.                                                                                                                                                                                                                                                                         |
+| **Discovery**            |                                                                                                                                                                                                                                                                                                                                                                                   |
+| `describe_screen`        | Captures a comprehensive screen snapshot: extension state, visible test IDs, trimmed a11y tree with refs, optional screenshot, and prior knowledge from historical sessions.                                                                                                                                                                                                      |
+| `accessibility_snapshot` | Captures a trimmed accessibility tree of the current page with deterministic refs (`e1`, `e2`, ...). Supports scoping to a root CSS selector.                                                                                                                                                                                                                                     |
+| `list_testids`           | Collects all visible `data-testid` attributes from the current page with text previews and visibility status.                                                                                                                                                                                                                                                                     |
+| **State**                |                                                                                                                                                                                                                                                                                                                                                                                   |
+| `get_state`              | Retrieves the current extension state (URL, screen, network, balance, account) and tracked tab information.                                                                                                                                                                                                                                                                       |
+| `get_context`            | Returns the current environment context (`e2e` or `prod`), session status, available capabilities, and whether context switching is allowed.                                                                                                                                                                                                                                      |
+| `set_context`            | Switches the session environment between `e2e` and `prod` modes. Blocked while a session is active.                                                                                                                                                                                                                                                                               |
+| **Screenshots**          |                                                                                                                                                                                                                                                                                                                                                                                   |
+| `screenshot`             | Captures a screenshot of the current page. Supports naming, full-page capture, scoping to a CSS selector, and optional base64 output.                                                                                                                                                                                                                                             |
+| **Knowledge**            |                                                                                                                                                                                                                                                                                                                                                                                   |
+| `knowledge_last`         | Retrieves the N most recent step records from the knowledge store, with optional scope and filter parameters.                                                                                                                                                                                                                                                                     |
+| `knowledge_search`       | Searches step records by query string with token-based matching and synonym expansion. Scores results by relevance to screen, URL, test IDs, and a11y nodes.                                                                                                                                                                                                                      |
+| `knowledge_summarize`    | Generates a recipe-style summary of a session's tool invocations, showing the step sequence with targets and outcomes.                                                                                                                                                                                                                                                            |
+| `knowledge_sessions`     | Lists available knowledge sessions with metadata (goal, flow tags, timestamps), with optional filtering.                                                                                                                                                                                                                                                                          |
+| **Contracts**            |                                                                                                                                                                                                                                                                                                                                                                                   |
+| `seed_contract`          | Deploys a single smart contract to the local Anvil chain by name. Requires `ContractSeedingCapability`.                                                                                                                                                                                                                                                                           |
+| `seed_contracts`         | Deploys multiple smart contracts in sequence. Returns both successful deployments and individual failures.                                                                                                                                                                                                                                                                        |
+| `get_contract_address`   | Looks up the deployed address of a contract by name from the session's deployment registry.                                                                                                                                                                                                                                                                                       |
+| `list_contracts`         | Lists all contracts deployed in the current session with addresses and deployment timestamps.                                                                                                                                                                                                                                                                                     |
+| **Batching**             |                                                                                                                                                                                                                                                                                                                                                                                   |
+| `run_steps`              | Executes a batch of tool invocations sequentially. Supports `stopOnError` to halt on first failure, `includeObservations` (`'all'`, `'none'`, `'failures'`) to control observations, and `batchTimeoutMs` to set an overall deadline (remaining steps are skipped on timeout). Accepts tool aliases like `navigate_home` / `navigate-home`. Returns per-step results with timing. |
 
 ### Accessibility References
 
@@ -548,13 +548,13 @@ mm describe-screen
 
 ### Interaction
 
-| Command                              | Description                                                                                                                                                                                                                                                                |
-| ------------------------------------ | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `mm click <ref>`                     | Clicks an element by its accessibility reference (e.g., `e3`). The ref comes from a prior `describe-screen` call. Waits for the element to be visible before clicking.                                                                                                     |
-| `mm type <ref> <text>`               | Types text into an input element identified by its accessibility reference. Replaces any existing content in the field.                                                                                                                                                    |
-| `mm describe-screen`                 | Captures the full screen state: extension info, visible test IDs, a trimmed accessibility tree with deterministic refs (`e1`, `e2`, ...), and prior knowledge from historical sessions. This is the primary command for understanding what's on screen before interacting. |
-| `mm screenshot [--name <name>]`      | Takes a full-page screenshot of the current page. Saves to the artifacts directory. Use `--name` to set a descriptive filename.                                                                                                                                            |
-| `mm wait-for <ref> [--timeout <ms>]` | Blocks until an element identified by its accessibility reference becomes visible, or the timeout expires. Default timeout is 5 seconds.                                                                                                                                   |
+| Command                                                 | Description                                                                                                                                                                                                                                                                              |
+| ------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `mm click <ref> [--within <scope>]`                     | Clicks an element by its accessibility reference (e.g., `e3`). The ref comes from a prior `describe-screen` call. Waits for the element to be visible before clicking. Use `--within` to scope the target inside a parent element (`testid:<id>`, `selector:<css>`, or a bare a11y ref). |
+| `mm type <ref> <text> [--within <scope>]`               | Types text into an input element identified by its accessibility reference. Clears the field first, then sets the new value (uses Playwright's `fill()`). Use `--within` to scope the target inside a parent element.                                                                    |
+| `mm describe-screen`                                    | Captures the full screen state: extension info, visible test IDs, a trimmed accessibility tree with deterministic refs (`e1`, `e2`, ...), and prior knowledge from historical sessions. This is the primary command for understanding what's on screen before interacting.               |
+| `mm screenshot [--name <name>]`                         | Takes a full-page screenshot of the current page. Saves to the artifacts directory. Use `--name` to set a descriptive filename.                                                                                                                                                          |
+| `mm wait-for <ref> [--timeout <ms>] [--within <scope>]` | Blocks until an element identified by its accessibility reference becomes visible, or the timeout expires. Default timeout is 5 seconds. Use `--within` to scope the target inside a parent element.                                                                                     |
 
 ### Navigation
 
@@ -580,21 +580,22 @@ For the full agent-facing reference and workflow guidelines, see [SKILL.md](./SK
 
 Tool errors are classified into specific error codes for structured handling:
 
-| Code                        | Meaning                                       |
-| --------------------------- | --------------------------------------------- |
-| `MM_TARGET_NOT_FOUND`       | Element not found by ref, testId, or selector |
-| `MM_WAIT_TIMEOUT`           | Timeout waiting for element or condition      |
-| `MM_CLICK_FAILED`           | Click operation failed                        |
-| `MM_TYPE_FAILED`            | Type operation failed                         |
-| `MM_NAVIGATION_FAILED`      | Navigation error or network failure           |
-| `MM_PAGE_CLOSED`            | Browser page was closed unexpectedly          |
-| `MM_NOTIFICATION_TIMEOUT`   | Notification popup did not appear             |
-| `MM_TAB_NOT_FOUND`          | Tab not found by role or URL                  |
-| `MM_DISCOVERY_FAILED`       | Discovery tool failure                        |
-| `MM_SCREENSHOT_FAILED`      | Screenshot capture failure                    |
-| `MM_CONTRACT_NOT_FOUND`     | Unknown contract name                         |
-| `MM_SEED_FAILED`            | Contract deployment failure                   |
-| `MM_CONTEXT_SWITCH_BLOCKED` | Context switch while session is active        |
+| Code                        | Meaning                                         |
+| --------------------------- | ----------------------------------------------- |
+| `MM_TARGET_NOT_FOUND`       | Element not found by ref, testId, or selector   |
+| `MM_WAIT_TIMEOUT`           | Timeout waiting for element or condition        |
+| `MM_CLICK_FAILED`           | Click operation failed                          |
+| `MM_TYPE_FAILED`            | Type operation failed                           |
+| `MM_NAVIGATION_FAILED`      | Navigation error or network failure             |
+| `MM_PAGE_CLOSED`            | Browser page was closed unexpectedly            |
+| `MM_NOTIFICATION_TIMEOUT`   | Notification popup did not appear               |
+| `MM_TAB_NOT_FOUND`          | Tab not found by role or URL                    |
+| `MM_DISCOVERY_FAILED`       | Discovery tool failure                          |
+| `MM_SCREENSHOT_FAILED`      | Screenshot capture failure                      |
+| `MM_BATCH_TIMEOUT`          | `batchTimeoutMs` deadline exceeded in run_steps |
+| `MM_CONTRACT_NOT_FOUND`     | Unknown contract name                           |
+| `MM_SEED_FAILED`            | Contract deployment failure                     |
+| `MM_CONTEXT_SWITCH_BLOCKED` | Context switch while session is active          |
 
 ## Development
 
diff --git a/SKILL.md b/SKILL.md
index a73bc48..7bd3ccf 100644
--- a/SKILL.md
+++ b/SKILL.md
@@ -154,7 +154,7 @@ mm serve [--background]
 mm describe-screen
 ```
 
-The a11y tree only includes actionable roles: `button`, `link`, `checkbox`, `radio`, `switch`, `textbox`, `combobox`, `menuitem`, and important roles: `dialog`, `alert`, `status`, `heading`.
+The a11y tree includes actionable roles: `button`, `link`, `checkbox`, `radio`, `switch`, `textbox`, `combobox`, `menuitem`; structural roles: `menu`, `listbox`, `option`, `tab`, `tabpanel`, `list`, `listitem`; and important roles: `dialog`, `alert`, `status`, `heading`.
 
 Each node looks like:
 
@@ -163,10 +163,16 @@ Each node looks like:
   "ref": "e3",
   "role": "button",
   "name": "Confirm",
-  "path": ["dialog:Transaction"]
+  "path": ["dialog:Transaction"],
+  "testId": "confirm-footer-button",
+  "textContent": "Confirm"
 }
 ```
 
+The `testId` and `textContent` fields appear only on nodes with short or generic names — they provide extra context from the DOM to help identify ambiguous elements. Nodes with clear names omit these fields.
+
+When 3+ consecutive identical nodes appear (same role, name, and path), they are collapsed into a summary like `… 3 more "maskicon" (refs e2–e4)` to reduce token waste. Individual refs still work for targeting.
+
 Use the `ref` value (`e3`) for click/type/wait-for commands.
 
 #### `mm get-state`
@@ -199,13 +205,16 @@ Clicks an element. Waits up to 15s for it to become visible.
 
 ```
 mm click e3
+mm click --testid end-accessory --within "testid:account-list-item/0"
 ```
 
+Use `--within` to scope the target inside a parent element. Values use the format `testid:<id>`, `selector:<css>`, or a bare a11y ref (`e5`).
+
 If the page closes after clicking (e.g., confirmation popup), the response includes `pageClosedAfterClick: true` — this is normal, not an error.
 
 #### `mm type <ref> <text>`
 
-Types text into an input field. Replaces existing content (uses `fill()`).
+Types text into an input field. **Clears the field first**, then sets the new value (uses Playwright's `fill()`). No `clearFirst` flag needed — clearing is always implicit.
 
 ```
 mm type e5 "0x1234abcd..."
@@ -217,6 +226,7 @@ Blocks until an element becomes visible. Default timeout: 15s.
 
 ```
 mm wait-for e7 [--timeout <ms>]
+mm wait-for --testid confirm-btn --within "testid:dialog-container"
 ```
 
 ### Navigation
@@ -283,7 +293,9 @@ Executes multiple tool invocations in sequence from a JSON array. Each step spec
 mm run-steps '{"steps":[{"tool":"click","args":{"a11yRef":"e3"}},{"tool":"wait_for","args":{"a11yRef":"e5"}}]}'
 ```
 
-Supports `stopOnError` (halt on first failure) and returns per-step results with timing. The `includeObservations` param controls whether final-state observations appear in the response: `'all'` (default), `'none'`, or `'failures'` (only on partial failure).
+Supports `stopOnError` (halt on first failure) and returns per-step results with timing. The `includeObservations` param controls whether final-state observations appear in the response: `'all'` (default), `'none'`, or `'failures'` (only on partial failure). Use `batchTimeoutMs` to set an overall deadline — if exceeded, remaining steps are marked as skipped and partial results are returned immediately. The summary includes a `skipped` count alongside `succeeded` and `failed`.
+
+Tool aliases are supported in steps: `navigate_home` / `navigate-home`, `navigate_settings` / `navigate-settings`, and `navigate_notification` / `navigate-notification` resolve to `navigate` with the appropriate `screen` argument. You can also use `ref` as shorthand for `a11yRef` in step args and within targets.
 
 ## Element Targeting
 
@@ -326,6 +338,7 @@ When a command fails, the response includes `error.code`. Use this to decide wha
 | `MM_CAPABILITY_NOT_AVAILABLE` | Feature requires a capability not configured | Check environment mode (e2e vs prod)                      |
 | `MM_CONTEXT_SWITCH_BLOCKED`   | Can't switch context with active session     | Run `mm cleanup` first                                    |
 | `MM_INVALID_INPUT`            | Bad parameters                               | Fix input and retry                                       |
+| `MM_BATCH_TIMEOUT`            | `batchTimeoutMs` deadline exceeded           | Remaining steps were skipped; check partial results       |
 | `MM_CONTRACT_NOT_FOUND`       | Unknown contract name for seeding            | See available contracts below                             |
 
 ## Available Contracts (E2E only)

From e8a4e0ad50e82d9690dc1838b4ffabbba0360ef8 Mon Sep 17 00:00:00 2001
From: cryptotavares <joao.tavares@consensys.net>
Date: Fri, 17 Apr 2026 00:31:27 +0100
Subject: [PATCH 13/36] chore: update coverage thresholds

---
 vitest.config.mts | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/vitest.config.mts b/vitest.config.mts
index 216ccc7..dc69768 100644
--- a/vitest.config.mts
+++ b/vitest.config.mts
@@ -35,10 +35,10 @@ export default defineConfig({
         // Auto-update the coverage thresholds when running locally.
         // Disabled in CI to prevent non-deterministic config changes.
         autoUpdate: !process.env.CI,
-        branches: 88.08,
-        functions: 92.03,
-        lines: 94.44,
-        statements: 94.14,
+        branches: 87.83,
+        functions: 91.17,
+        lines: 94.26,
+        statements: 94.03,
       },
     },
 

From dc3d2aeaa6d31cac6e946ab4e9aab47285f18a94 Mon Sep 17 00:00:00 2001
From: cryptotavares <joao.tavares@consensys.net>
Date: Fri, 17 Apr 2026 00:57:31 +0100
Subject: [PATCH 14/36] fix: handle cross-platform error codes in lock release
 test

unlink on a directory returns EISDIR on Linux but EPERM on macOS,
causing CI (Ubuntu) to fail. Accept both error codes in the assertion
and lower coverage thresholds to account for platform instrumentation
differences.
---
 src/server/daemon-state.test.ts | 3 ++-
 vitest.config.mts               | 8 ++++----
 2 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/src/server/daemon-state.test.ts b/src/server/daemon-state.test.ts
index a865f18..f8a26be 100644
--- a/src/server/daemon-state.test.ts
+++ b/src/server/daemon-state.test.ts
@@ -191,8 +191,9 @@ describe('daemon-state', () => {
     it('throws when lock release fails with a non-ENOENT error', async () => {
       await fs.mkdir(path.join(tmpDir, '.mm-server.lock'));
 
+      // Linux returns EISDIR, macOS returns EPERM for unlink on a directory
       await expect(releaseStartupLock(tmpDir)).rejects.toMatchObject({
-        code: 'EPERM',
+        code: expect.stringMatching(/^(EPERM|EISDIR)$/u),
       });
     });
   });
diff --git a/vitest.config.mts b/vitest.config.mts
index dc69768..8f98685 100644
--- a/vitest.config.mts
+++ b/vitest.config.mts
@@ -35,10 +35,10 @@ export default defineConfig({
         // Auto-update the coverage thresholds when running locally.
         // Disabled in CI to prevent non-deterministic config changes.
         autoUpdate: !process.env.CI,
-        branches: 87.83,
+        branches: 87.77,
         functions: 91.17,
-        lines: 94.26,
-        statements: 94.03,
+        lines: 94.18,
+        statements: 93.95,
       },
     },
 
@@ -49,4 +49,4 @@ export default defineConfig({
       tsconfig: './tsconfig.test.json',
     },
   },
-});
\ No newline at end of file
+});

From c256214f779f637a712809f837c2dbb2191e46af Mon Sep 17 00:00:00 2001
From: cryptotavares <joao.tavares@consensys.net>
Date: Fri, 17 Apr 2026 11:28:53 +0100
Subject: [PATCH 15/36] feat: add type definitions and schemas for get_text,
 launch context, and activeTab

---
 src/tools/types/tool-inputs.ts  |  6 ++++++
 src/tools/types/tool-outputs.ts |  7 +++++++
 src/validation/schemas.ts       | 23 +++++++++++++++++++++++
 3 files changed, 36 insertions(+)

diff --git a/src/tools/types/tool-inputs.ts b/src/tools/types/tool-inputs.ts
index d1fb42b..fdf5747 100644
--- a/src/tools/types/tool-inputs.ts
+++ b/src/tools/types/tool-inputs.ts
@@ -9,6 +9,7 @@ export type BuildInput = {
 
 export type LaunchInput = {
   autoBuild?: boolean;
+  context?: 'e2e' | 'prod';
   stateMode?: 'default' | 'onboarding' | 'custom';
   fixturePreset?: string;
   fixture?: Record<string, unknown>;
@@ -87,6 +88,11 @@ export type WaitForInput = TargetSelection & {
   within?: WithinTarget;
 };
 
+export type GetTextInput = TargetSelection & {
+  timeoutMs?: number;
+  within?: WithinTarget;
+};
+
 export type KnowledgeScope =
   | 'current'
   | 'all'
diff --git a/src/tools/types/tool-outputs.ts b/src/tools/types/tool-outputs.ts
index e24ca39..05e56c5 100644
--- a/src/tools/types/tool-outputs.ts
+++ b/src/tools/types/tool-outputs.ts
@@ -59,6 +59,7 @@ export type ScreenshotInfo = {
 
 export type DescribeScreenResult = {
   state: ExtensionState;
+  activeTab?: TabInfo;
   testIds: {
     items: TestIdItem[];
   };
@@ -93,6 +94,12 @@ export type WaitForResult = {
   target: string;
 };
 
+export type GetTextResult = {
+  text: string;
+  target: string;
+  length: number;
+};
+
 export type StepResult = {
   tool: string;
   ok: boolean;
diff --git a/src/validation/schemas.ts b/src/validation/schemas.ts
index 43462ae..5269cd4 100644
--- a/src/validation/schemas.ts
+++ b/src/validation/schemas.ts
@@ -90,6 +90,14 @@ export const launchInputSchema = z.object({
     .boolean()
     .default(true)
     .describe('Automatically run build if extension is not found'),
+  context: z
+    .enum(['e2e', 'prod'])
+    .describe(
+      'Environment context to use for this session. ' +
+        'Sets the context before launching so you can start in prod mode directly: ' +
+        'mm launch --context prod --state onboarding',
+    )
+    .optional(),
   stateMode: z
     .enum(['default', 'onboarding', 'custom'])
     .default('default')
@@ -328,6 +336,19 @@ export const waitForInputSchema = targetSelectionSchema.and(
   }),
 );
 
+export const getTextInputSchema = targetSelectionSchema.and(
+  z.object({
+    timeoutMs: z
+      .number()
+      .int()
+      .min(0)
+      .max(60000)
+      .default(15000)
+      .describe('Timeout to wait for element to become visible'),
+    within: withinTargetSchema.optional(),
+  }),
+);
+
 export const knowledgeLastInputSchema = z.object({
   n: z
     .number()
@@ -544,6 +565,7 @@ export const toolSchemas = {
   click: clickInputSchema,
   type: typeInputSchema,
   wait_for: waitForInputSchema,
+  get_text: getTextInputSchema,
   knowledge_last: knowledgeLastInputSchema,
   knowledge_search: knowledgeSearchInputSchema,
   knowledge_summarize: knowledgeSummarizeInputSchema,
@@ -577,6 +599,7 @@ export type ScreenshotInputZ = z.infer<typeof screenshotInputSchema>;
 export type ClickInputZ = z.infer<typeof clickInputSchema>;
 export type TypeInputZ = z.infer<typeof typeInputSchema>;
 export type WaitForInputZ = z.infer<typeof waitForInputSchema>;
+export type GetTextInputZ = z.infer<typeof getTextInputSchema>;
 export type KnowledgeLastInputZ = z.infer<typeof knowledgeLastInputSchema>;
 export type KnowledgeSearchInputZ = z.infer<typeof knowledgeSearchInputSchema>;
 export type KnowledgeSummarizeInputZ = z.infer<

From c26a8d30b7d3c6b57b5fb454ec8c6ade4ebd005d Mon Sep 17 00:00:00 2001
From: cryptotavares <joao.tavares@consensys.net>
Date: Fri, 17 Apr 2026 11:29:05 +0100
Subject: [PATCH 16/36] feat: implement get_text tool for reading element text
 content

---
 src/tools/interaction.test.ts | 150 +++++++++++++++++++++++++++++++++-
 src/tools/interaction.ts      |  57 +++++++++++++
 2 files changed, 206 insertions(+), 1 deletion(-)

diff --git a/src/tools/interaction.test.ts b/src/tools/interaction.test.ts
index 7da273c..4099384 100644
--- a/src/tools/interaction.test.ts
+++ b/src/tools/interaction.test.ts
@@ -7,7 +7,12 @@
 
 import { describe, it, expect, vi, afterEach } from 'vitest';
 
-import { clickTool, typeTool, waitForTool } from './interaction.js';
+import {
+  clickTool,
+  getTextTool,
+  typeTool,
+  waitForTool,
+} from './interaction.js';
 import { createMockSessionManager } from './test-utils/mock-factories.js';
 import { ErrorCodes } from './types/errors.js';
 import * as discoveryModule from './utils/discovery.js';
@@ -19,6 +24,7 @@ function createMockLocator() {
     click: vi.fn().mockResolvedValue(undefined),
     fill: vi.fn().mockResolvedValue(undefined),
     waitFor: vi.fn().mockResolvedValue(undefined),
+    textContent: vi.fn().mockResolvedValue('Hello World'),
   };
 }
 
@@ -743,4 +749,146 @@ describe('interaction', () => {
       }
     });
   });
+
+  describe('getTextTool', () => {
+    it('returns textContent by testId', async () => {
+      const locator = createMockLocator();
+      const context = createMockContext();
+
+      vi.spyOn(discoveryModule, 'waitForTarget').mockResolvedValue(
+        locator as any,
+      );
+
+      const result = await getTextTool({ testId: 'my-element' }, context);
+
+      expect(result.ok).toBe(true);
+      if (result.ok) {
+        expect(result.result.text).toBe('Hello World');
+        expect(result.result).toHaveLength(11);
+        expect(result.result.target).toBe('testId:my-element');
+      }
+    });
+
+    it('returns empty string when textContent is null', async () => {
+      const locator = createMockLocator();
+      locator.textContent.mockResolvedValue(null);
+      const context = createMockContext();
+
+      vi.spyOn(discoveryModule, 'waitForTarget').mockResolvedValue(
+        locator as any,
+      );
+
+      const result = await getTextTool({ testId: 'empty-node' }, context);
+
+      expect(result.ok).toBe(true);
+      if (result.ok) {
+        expect(result.result.text).toBe('');
+        expect(result.result).toHaveLength(0);
+      }
+    });
+
+    it('returns textContent by a11yRef', async () => {
+      const locator = createMockLocator();
+      locator.textContent.mockResolvedValue('Ref content');
+      const context = createMockContext({
+        refMap: new Map([['e1', 'button[name="Submit"]']]),
+      });
+
+      vi.spyOn(discoveryModule, 'waitForTarget').mockResolvedValue(
+        locator as any,
+      );
+
+      const result = await getTextTool({ a11yRef: 'e1' }, context);
+
+      expect(result.ok).toBe(true);
+      if (result.ok) {
+        expect(result.result.text).toBe('Ref content');
+      }
+    });
+
+    it('returns textContent by CSS selector', async () => {
+      const locator = createMockLocator();
+      locator.textContent.mockResolvedValue('Selector content');
+      const context = createMockContext();
+
+      vi.spyOn(discoveryModule, 'waitForTarget').mockResolvedValue(
+        locator as any,
+      );
+
+      const result = await getTextTool({ selector: '#result-text' }, context);
+
+      expect(result.ok).toBe(true);
+      if (result.ok) {
+        expect(result.result.text).toBe('Selector content');
+        expect(result.result.target).toBe('selector:#result-text');
+      }
+    });
+
+    it('returns error when element not found', async () => {
+      const context = createMockContext();
+
+      vi.spyOn(discoveryModule, 'waitForTarget').mockRejectedValue(
+        new Error('Timeout waiting for selector'),
+      );
+
+      const result = await getTextTool({ testId: 'missing' }, context);
+
+      expect(result.ok).toBe(false);
+    });
+
+    it('returns error with invalid target selection', async () => {
+      const context = createMockContext();
+
+      vi.spyOn(targetsModule, 'validateTargetSelection').mockReturnValue({
+        valid: false,
+        error: 'No target provided',
+      } as any);
+
+      const result = await getTextTool({} as any, context);
+
+      expect(result.ok).toBe(false);
+      if (!result.ok) {
+        expect(result.error.code).toBe(ErrorCodes.MM_INVALID_INPUT);
+      }
+    });
+
+    it('returns error when no session active', async () => {
+      const result = await getTextTool(
+        { testId: 'element' },
+        createMockContext({ hasActive: false }),
+      );
+
+      expect(result.ok).toBe(false);
+      if (!result.ok) {
+        expect(result.error.code).toBe(ErrorCodes.MM_NO_ACTIVE_SESSION);
+      }
+    });
+
+    it('supports --within scoping', async () => {
+      const locator = createMockLocator();
+      locator.textContent.mockResolvedValue('Scoped text');
+      const context = createMockContext();
+      const spy = vi
+        .spyOn(discoveryModule, 'waitForTarget')
+        .mockResolvedValue(locator as any);
+
+      const result = await getTextTool(
+        {
+          testId: 'child-element',
+          within: { testId: 'parent-container' },
+        },
+        context,
+      );
+
+      expect(result.ok).toBe(true);
+      expect(spy).toHaveBeenCalledWith(
+        expect.anything(),
+        'testId',
+        'child-element',
+        expect.any(Map),
+        expect.any(Number),
+        { type: 'testId', value: 'parent-container' },
+      );
+    });
+  });
 });
diff --git a/src/tools/interaction.ts b/src/tools/interaction.ts
index 13ce247..5e3c3af 100644
--- a/src/tools/interaction.ts
+++ b/src/tools/interaction.ts
@@ -7,6 +7,8 @@ import {
 import type {
   ClickInput,
   ClickResult,
+  GetTextInput,
+  GetTextResult,
   TypeInput,
   TypeResult,
   WaitForInput,
@@ -224,3 +226,58 @@ export async function waitForTool(
     return createToolError(errorInfo.code, errorInfo.message);
   }
 }
+
+/**
+ * Reads the text content of an element identified by ref, test ID, or selector.
+ *
+ * @param input - The target element and timeout options.
+ * @param context - The tool execution context.
+ * @returns The text content of the matched element.
+ */
+export async function getTextTool(
+  input: GetTextInput,
+  context: ToolContext,
+): Promise<ToolResponse<GetTextResult>> {
+  const missingSession = requireActiveSession<GetTextResult>(context);
+  if (missingSession) {
+    return missingSession;
+  }
+
+  const timeoutMs = input.timeoutMs ?? DEFAULT_INTERACTION_TIMEOUT_MS;
+  const validation = validateTargetSelection(input);
+
+  if (isInvalidTargetSelection(validation)) {
+    return createToolError(ErrorCodes.MM_INVALID_INPUT, validation.error);
+  }
+
+  if (!isValidTargetSelection(validation)) {
+    return createToolError(
+      ErrorCodes.MM_INVALID_INPUT,
+      'Invalid target selection',
+    );
+  }
+
+  const { type: targetType, value: targetValue } = validation;
+
+  try {
+    const locator = await waitForTarget(
+      context.page,
+      targetType,
+      targetValue,
+      context.refMap,
+      timeoutMs,
+      resolveWithinScope(input.within),
+    );
+
+    const text = (await locator.textContent()) ?? '';
+
+    return createToolSuccess({
+      text,
+      target: `${targetType}:${targetValue}`,
+      length: text.length,
+    });
+  } catch (error) {
+    const errorInfo = classifyWaitError(error);
+    return createToolError(errorInfo.code, errorInfo.message);
+  }
+}

From a1d3b80010105b866ff5d285b2865740a918d04c Mon Sep 17 00:00:00 2001
From: cryptotavares <joao.tavares@consensys.net>
Date: Fri, 17 Apr 2026 11:29:18 +0100
Subject: [PATCH 17/36] feat: add --context flag to launch for pre-session
 environment selection

---
 src/tools/launch.test.ts | 29 +++++++++++++++++++++++++++++
 src/tools/launch.ts      |  4 ++++
 2 files changed, 33 insertions(+)

diff --git a/src/tools/launch.test.ts b/src/tools/launch.test.ts
index 99deafa..b198fa0 100644
--- a/src/tools/launch.test.ts
+++ b/src/tools/launch.test.ts
@@ -166,6 +166,35 @@ describe('launchTool', () => {
       expect(result.ok).toBe(true);
       expect(context.sessionManager.launch).toHaveBeenCalledWith(input);
     });
+
+    it('calls setContext before launch when context is provided', async () => {
+      const context = createMockContext({ hasActive: false });
+      const input: LaunchInput = {
+        context: 'prod',
+        stateMode: 'onboarding',
+      };
+
+      await launchTool(input, context);
+
+      expect(context.sessionManager.setContext).toHaveBeenCalledWith('prod');
+      expect(context.sessionManager.launch).toHaveBeenCalledWith(input);
+      const setContextOrder = (
+        context.sessionManager.setContext as ReturnType<typeof vi.fn>
+      ).mock.invocationCallOrder[0];
+      const launchOrder = (
+        context.sessionManager.launch as ReturnType<typeof vi.fn>
+      ).mock.invocationCallOrder[0];
+      expect(setContextOrder).toBeLessThan(launchOrder);
+    });
+
+    it('does not call setContext when context is not provided', async () => {
+      const context = createMockContext({ hasActive: false });
+      const input: LaunchInput = { stateMode: 'default' };
+
+      await launchTool(input, context);
+
+      expect(context.sessionManager.setContext).not.toHaveBeenCalled();
+    });
   });
 
   describe('session already running', () => {
diff --git a/src/tools/launch.ts b/src/tools/launch.ts
index c4cfc51..16a7bc2 100644
--- a/src/tools/launch.ts
+++ b/src/tools/launch.ts
@@ -47,6 +47,10 @@ export async function launchTool(
       }
     }
 
+    if (input.context) {
+      sessionManager.setContext(input.context);
+    }
+
     const result = await sessionManager.launch(input);
     const isProdMode = sessionManager.getEnvironmentMode() === 'prod';
 

From e5a5de4c7c06c946cae7a3e5d3990563ef1becfb Mon Sep 17 00:00:00 2001
From: cryptotavares <joao.tavares@consensys.net>
Date: Fri, 17 Apr 2026 11:29:30 +0100
Subject: [PATCH 18/36] feat: include active tab info in describe_screen output

---
 src/tools/discovery-tools.ts | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/src/tools/discovery-tools.ts b/src/tools/discovery-tools.ts
index fba199a..60962fc 100644
--- a/src/tools/discovery-tools.ts
+++ b/src/tools/discovery-tools.ts
@@ -110,6 +110,13 @@ export async function describeScreenTool(
 
     context.sessionManager.setRefMap(refMap);
 
+    const trackedPages = context.sessionManager.getTrackedPages();
+    const activePage = context.sessionManager.getPage();
+    const activeTracked = trackedPages.find((tp) => tp.page === activePage);
+    const activeTab = activeTracked
+      ? { role: activeTracked.role, url: activePage.url() }
+      : undefined;
+
     let screenshot: DescribeScreenResult['screenshot'] = null;
 
     if (input.includeScreenshot) {
@@ -143,6 +150,7 @@ export async function describeScreenTool(
 
     return createToolSuccess({
       state,
+      activeTab,
       testIds: { items: testIds },
       a11y: { nodes },
       screenshot,

From f4e9cb9088cc81b0987894bbb5dbf7cf13c3ce1e Mon Sep 17 00:00:00 2001
From: cryptotavares <joao.tavares@consensys.net>
Date: Fri, 17 Apr 2026 11:29:45 +0100
Subject: [PATCH 19/36] feat: register get_text tool and add new CLI commands

---
 src/cli/mm.test.ts         | 91 ++++++++++++++++++++++++++++++++++++++
 src/cli/mm.ts              | 42 +++++++++++++++---
 src/tools/registry.test.ts |  2 +-
 src/tools/registry.ts      |  9 +++-
 4 files changed, 137 insertions(+), 7 deletions(-)

diff --git a/src/cli/mm.test.ts b/src/cli/mm.test.ts
index 75c484a..a73a2b4 100644
--- a/src/cli/mm.test.ts
+++ b/src/cli/mm.test.ts
@@ -432,6 +432,34 @@ describe('parseLaunchArgs', () => {
     );
   });
 
+  it('parses --context value', () => {
+    expect(parseLaunchArgs(['--context', 'prod'])).toStrictEqual({
+      context: 'prod',
+    });
+  });
+
+  it('parses --context with other flags', () => {
+    expect(
+      parseLaunchArgs(['--context', 'prod', '--state', 'onboarding']),
+    ).toStrictEqual({
+      context: 'prod',
+      stateMode: 'onboarding',
+    });
+  });
+
+  it('exits for --context without value', () => {
+    expect(() => parseLaunchArgs(['--context'])).toThrowError('process.exit');
+    expect(stderrSpy).toHaveBeenCalledWith(
+      'Error: --context requires a value (e2e|prod)\n',
+    );
+  });
+
+  it('exits for --context with flag as value', () => {
+    expect(() => parseLaunchArgs(['--context', '--force'])).toThrowError(
+      'process.exit',
+    );
+  });
+
   it('writes warning for unknown flags', () => {
     parseLaunchArgs(['--unknown']);
     expect(stderrSpy).toHaveBeenCalledWith(
@@ -1175,12 +1203,75 @@ describe('routeCommand', () => {
     );
   });
 
+  it('routes switch-to-tab with positional role', async () => {
+    await routeCommand('switch-to-tab', ['dapp'], 3000);
+    expect(globalThis.fetch).toHaveBeenCalledWith(
+      'http://127.0.0.1:3000/tool/switch_to_tab',
+      expect.objectContaining({
+        body: JSON.stringify({ role: 'dapp' }),
+      }),
+    );
+  });
+
   it('exits when switch-to-tab has no flags', async () => {
     await expect(routeCommand('switch-to-tab', [], 3000)).rejects.toThrowError(
       'process.exit',
     );
   });
 
+  it('routes get-text with positional ref', async () => {
+    await routeCommand('get-text', ['e1'], 3000);
+    expect(globalThis.fetch).toHaveBeenCalledWith(
+      'http://127.0.0.1:3000/tool/get_text',
+      expect.objectContaining({
+        body: JSON.stringify({ a11yRef: 'e1' }),
+      }),
+    );
+  });
+
+  it('routes get-text with --testid', async () => {
+    await routeCommand('get-text', ['--testid', 'result-box'], 3000);
+    expect(globalThis.fetch).toHaveBeenCalledWith(
+      'http://127.0.0.1:3000/tool/get_text',
+      expect.objectContaining({
+        body: JSON.stringify({ testId: 'result-box' }),
+      }),
+    );
+  });
+
+  it('routes get-text with --selector', async () => {
+    await routeCommand('get-text', ['--selector', '#output'], 3000);
+    expect(globalThis.fetch).toHaveBeenCalledWith(
+      'http://127.0.0.1:3000/tool/get_text',
+      expect.objectContaining({
+        body: JSON.stringify({ selector: '#output' }),
+      }),
+    );
+  });
+
+  it('routes get-text with --within scoping', async () => {
+    await routeCommand(
+      'get-text',
+      ['--testid', 'amount', '--within', 'testid:tx-row'],
+      3000,
+    );
+    expect(globalThis.fetch).toHaveBeenCalledWith(
+      'http://127.0.0.1:3000/tool/get_text',
+      expect.objectContaining({
+        body: JSON.stringify({
+          testId: 'amount',
+          within: { testId: 'tx-row' },
+        }),
+      }),
+    );
+  });
+
+  it('exits when get-text has no target', async () => {
+    await expect(routeCommand('get-text', [], 3000)).rejects.toThrowError(
+      'process.exit',
+    );
+  });
+
   it('routes close-tab with --role', async () => {
     await routeCommand('close-tab', ['--role', 'dapp'], 3000);
     expect(globalThis.fetch).toHaveBeenCalledWith(
diff --git a/src/cli/mm.ts b/src/cli/mm.ts
index 359b0ae..ed8a896 100644
--- a/src/cli/mm.ts
+++ b/src/cli/mm.ts
@@ -329,6 +329,25 @@ export async function routeCommand(
       });
       break;
     }
+    case 'get-text': {
+      const getTextTarget = getPositionalTarget(args);
+      if (
+        !getTextTarget &&
+        !args.includes('--selector') &&
+        !args.includes('--testid')
+      ) {
+        process.stderr.write(
+          'Usage: mm get-text <ref> [--selector <css>] [--testid <id>] [--within <scope>]\n',
+        );
+        process.exit(1);
+      }
+      const getTextWithin = resolveWithinFromArgs(args);
+      await sendRequest(port, 'POST', '/tool/get_text', {
+        ...resolveTargetFromArgs(args),
+        ...(getTextWithin ? { within: getTextWithin } : {}),
+      });
+      break;
+    }
     case 'describe-screen':
       await sendRequest(port, 'POST', '/tool/describe_screen', {});
       break;
@@ -409,14 +428,18 @@ export async function routeCommand(
     case 'switch-to-tab': {
       const tabRole = parseStringFlag(args, '--role');
       const tabUrl = parseStringFlag(args, '--url');
-      if (!tabRole && !tabUrl) {
+      // Support positional arg as role: mm switch-to-tab dapp
+      const positionalRole =
+        !tabRole && !tabUrl ? getPositionalTarget(args) : undefined;
+      const resolvedRole = tabRole ?? positionalRole;
+      if (!resolvedRole && !tabUrl) {
         process.stderr.write(
-          'Usage: mm switch-to-tab --role <role> | --url <url>\n',
+          'Usage: mm switch-to-tab <role> | --role <role> | --url <url>\n',
         );
         process.exit(1);
       }
       await sendRequest(port, 'POST', '/tool/switch_to_tab', {
-        ...(tabRole ? { role: tabRole } : {}),
+        ...(resolvedRole ? { role: resolvedRole } : {}),
         ...(tabUrl ? { url: tabUrl } : {}),
       });
       break;
@@ -974,6 +997,7 @@ export function parseStringFlag(
 export function parseLaunchArgs(args: string[]): Record<string, unknown> {
   const result: Record<string, unknown> = {};
   const knownFlags = new Set([
+    '--context',
     '--state',
     '--extension-path',
     '--goal',
@@ -985,6 +1009,13 @@ export function parseLaunchArgs(args: string[]): Record<string, unknown> {
     const arg = args[i];
     if (arg === '--force') {
       result.force = true;
+    } else if (arg === '--context') {
+      i += 1;
+      if (!args[i] || args[i].startsWith('--')) {
+        process.stderr.write('Error: --context requires a value (e2e|prod)\n');
+        process.exit(1);
+      }
+      result.context = args[i];
     } else if (arg === '--state') {
       i += 1;
       if (!args[i] || args[i].startsWith('--')) {
@@ -1041,7 +1072,7 @@ Environment Variables:
                       Falls back to the current git worktree root.
 
 Lifecycle:
-  mm launch [--state default|onboarding|custom] [--extension-path <path>] [--goal <text>] [--force] [--flow-tags <tags>]
+  mm launch [--context e2e|prod] [--state default|onboarding|custom] [--extension-path <path>] [--goal <text>] [--force] [--flow-tags <tags>]
   mm cleanup [--shutdown]
   mm status
   mm serve [--background]
@@ -1050,6 +1081,7 @@ Lifecycle:
 Interaction:
   mm click <ref> [--selector <css>] [--testid <id>] [--within <scope>]
   mm type <ref> <text> [--selector <css>] [--testid <id>] [--within <scope>]
+  mm get-text <ref> [--selector <css>] [--testid <id>] [--within <scope>]
   mm describe-screen
   mm screenshot [--name <name>]
   mm wait-for <ref> [--timeout <ms>] [--selector <css>] [--testid <id>] [--within <scope>]
@@ -1060,7 +1092,7 @@ Navigation:
   mm navigate <url>
   mm navigate-home
   mm navigate-settings
-  mm switch-to-tab --role <role> | --url <url>
+  mm switch-to-tab <role> | --role <role> | --url <url>
   mm close-tab --role <role> | --url <url>
 
 Discovery:
diff --git a/src/tools/registry.test.ts b/src/tools/registry.test.ts
index 7d0dc1d..98f54ff 100644
--- a/src/tools/registry.test.ts
+++ b/src/tools/registry.test.ts
@@ -31,7 +31,7 @@ describe('toolRegistry', () => {
   });
 
   it('has the expected number of entries', () => {
-    expect(toolRegistry.size).toBe(27);
+    expect(toolRegistry.size).toBe(28);
   });
 
   it('stores only functions as values', () => {
diff --git a/src/tools/registry.ts b/src/tools/registry.ts
index dc376cb..0df43dd 100644
--- a/src/tools/registry.ts
+++ b/src/tools/registry.ts
@@ -8,7 +8,12 @@ import {
   describeScreenTool,
   listTestIdsTool,
 } from './discovery-tools.js';
-import { clickTool, typeTool, waitForTool } from './interaction.js';
+import {
+  clickTool,
+  getTextTool,
+  typeTool,
+  waitForTool,
+} from './interaction.js';
 import {
   knowledgeLastTool,
   knowledgeSearchTool,
@@ -52,6 +57,7 @@ export const toolRegistry = new Map<string, ToolFunction<any, any>>([
   ['click', clickTool],
   ['type', typeTool],
   ['wait_for', waitForTool],
+  ['get_text', getTextTool],
   ['knowledge_last', knowledgeLastTool],
   ['knowledge_search', knowledgeSearchTool],
   ['knowledge_summarize', knowledgeSummarizeTool],
@@ -88,6 +94,7 @@ export const TOOL_CATEGORIES: Record<string, ToolCategory> = {
   knowledge_search: 'readonly',
   knowledge_summarize: 'readonly',
   knowledge_sessions: 'readonly',
+  get_text: 'readonly',
   get_state: 'readonly',
   get_context: 'readonly',
   // set_context is blocked while a session is active (MM_CONTEXT_SWITCH_BLOCKED),

From 33eb1410e81aa247f69c18f1bd193b73b92767e7 Mon Sep 17 00:00:00 2001
From: cryptotavares <joao.tavares@consensys.net>
Date: Fri, 17 Apr 2026 11:29:58 +0100
Subject: [PATCH 20/36] docs: update README and SKILL reference for new tools
 and commands

---
 README.md |  75 +++++++++++++++++++++++++---------------
 SKILL.md  | 100 ++++++++++++++++++++++++++++++++++++++++++++++--------
 2 files changed, 132 insertions(+), 43 deletions(-)

diff --git a/README.md b/README.md
index 9fef621..a36e491 100644
--- a/README.md
+++ b/README.md
@@ -368,6 +368,7 @@ The daemon routes `POST /tool/:name` requests through the registry, applies Zod
 | `click`                  | Clicks an element identified by a11y ref, test ID, or CSS selector. Waits for the element to be visible before clicking. Supports `within` to scope the target inside a parent element.                                                                                                                                                                                           |
 | `type`                   | Types text into an input element identified by a11y ref, test ID, or CSS selector. Clears the field first, then sets the new value (uses Playwright's `fill()`). Supports `within` scoping.                                                                                                                                                                                       |
 | `wait_for`               | Waits for an element to become visible on the page within a configurable timeout. Supports `within` to scope the target inside a parent element.                                                                                                                                                                                                                                  |
+| `get_text`               | Reads the text content of an element identified by a11y ref, test ID, or CSS selector. Returns the text, target descriptor, and character length. Supports `within` scoping. Categorized as read-only (no observations in response).                                                                                                                                              |
 | `clipboard`              | Reads from or writes to the system clipboard via Chrome DevTools Protocol. Useful for pasting seed phrases or copying addresses.                                                                                                                                                                                                                                                  |
 | **Navigation**           |                                                                                                                                                                                                                                                                                                                                                                                   |
 | `navigate`               | Navigates the browser to a named screen (`home`, `settings`, `notification`) or an arbitrary URL.                                                                                                                                                                                                                                                                                 |
@@ -539,40 +540,58 @@ mm describe-screen
 
 ### Lifecycle
 
-| Command                                                                               | Description                                                                                                                                                                                                                                                                                                           |
-| ------------------------------------------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `mm launch [--state default\|onboarding\|custom] [--extension-path <path>] [--force]` | Auto-starts the daemon if needed, then launches a headed Chrome session with the configured extension. Use `--state` to control wallet initialization (pre-configured, onboarding flow, or custom fixture). Use `--extension-path` to override the extension directory. Use `--force` to replace an existing session. |
-| `mm cleanup [--shutdown]`                                                             | Stops the browser, tears down test services (fixture server, Anvil, mock server), and releases session resources. Add `--shutdown` to also terminate the daemon process.                                                                                                                                              |
-| `mm status`                                                                           | Displays the daemon's current status: PID, port, uptime, allocated sub-ports, and whether a browser session is active.                                                                                                                                                                                                |
-| `mm serve [--background]`                                                             | Manually starts the HTTP daemon without launching a browser session. Use `--background` to detach the process. Fails if a daemon is already running for this worktree.                                                                                                                                                |
+| Command                                                                                                                                          | Description                                                                                                                                                                                                                                                                                                                                                                             |
+| ------------------------------------------------------------------------------------------------------------------------------------------------ | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `mm launch [--context e2e\|prod] [--state default\|onboarding\|custom] [--extension-path <path>] [--goal <text>] [--force] [--flow-tags <tags>]` | Auto-starts the daemon if needed, then launches a headed Chrome session with the configured extension. Use `--context` to set the environment context before launching. Use `--state` to control wallet initialization. Use `--extension-path` to override the extension directory. Use `--goal` and `--flow-tags` for knowledge tagging. Use `--force` to replace an existing session. |
+| `mm cleanup [--shutdown]`                                                                                                                        | Stops the browser, tears down test services (fixture server, Anvil, mock server), and releases session resources. Add `--shutdown` to also terminate the daemon process.                                                                                                                                                                                                                |
+| `mm status`                                                                                                                                      | Displays the daemon's current status: PID, port, uptime, allocated sub-ports, and whether a browser session is active.                                                                                                                                                                                                                                                                  |
+| `mm serve [--background]`                                                                                                                        | Manually starts the HTTP daemon without launching a browser session. Use `--background` to detach the process. Fails if a daemon is already running for this worktree.                                                                                                                                                                                                                  |
 
 ### Interaction
 
-| Command                                                 | Description                                                                                                                                                                                                                                                                              |
-| ------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `mm click <ref> [--within <scope>]`                     | Clicks an element by its accessibility reference (e.g., `e3`). The ref comes from a prior `describe-screen` call. Waits for the element to be visible before clicking. Use `--within` to scope the target inside a parent element (`testid:<id>`, `selector:<css>`, or a bare a11y ref). |
-| `mm type <ref> <text> [--within <scope>]`               | Types text into an input element identified by its accessibility reference. Clears the field first, then sets the new value (uses Playwright's `fill()`). Use `--within` to scope the target inside a parent element.                                                                    |
-| `mm describe-screen`                                    | Captures the full screen state: extension info, visible test IDs, a trimmed accessibility tree with deterministic refs (`e1`, `e2`, ...), and prior knowledge from historical sessions. This is the primary command for understanding what's on screen before interacting.               |
-| `mm screenshot [--name <name>]`                         | Takes a full-page screenshot of the current page. Saves to the artifacts directory. Use `--name` to set a descriptive filename.                                                                                                                                                          |
-| `mm wait-for <ref> [--timeout <ms>] [--within <scope>]` | Blocks until an element identified by its accessibility reference becomes visible, or the timeout expires. Default timeout is 5 seconds. Use `--within` to scope the target inside a parent element.                                                                                     |
+| Command                                                                                    | Description                                                                                                                                                                                                                                                                              |
+| ------------------------------------------------------------------------------------------ | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `mm click <ref> [--selector <css>] [--testid <id>] [--within <scope>]`                     | Clicks an element by its accessibility reference (e.g., `e3`). The ref comes from a prior `describe-screen` call. Waits for the element to be visible before clicking. Use `--within` to scope the target inside a parent element (`testid:<id>`, `selector:<css>`, or a bare a11y ref). |
+| `mm type <ref> <text> [--selector <css>] [--testid <id>] [--within <scope>]`               | Types text into an input element identified by its accessibility reference. Clears the field first, then sets the new value (uses Playwright's `fill()`). Use `--within` to scope the target inside a parent element.                                                                    |
+| `mm get-text <ref> [--selector <css>] [--testid <id>] [--within <scope>]`                  | Reads the text content of an element. Returns the inner text, target descriptor, and character length. Useful for asserting visible values without screenshots.                                                                                                                          |
+| `mm describe-screen`                                                                       | Captures the full screen state: extension info, visible test IDs, a trimmed accessibility tree with deterministic refs (`e1`, `e2`, ...), and prior knowledge from historical sessions. This is the primary command for understanding what's on screen before interacting.               |
+| `mm screenshot [--name <name>]`                                                            | Takes a full-page screenshot of the current page. Saves to the artifacts directory. Use `--name` to set a descriptive filename.                                                                                                                                                          |
+| `mm wait-for <ref> [--timeout <ms>] [--selector <css>] [--testid <id>] [--within <scope>]` | Blocks until an element identified by its accessibility reference becomes visible, or the timeout expires. Default timeout is 15 seconds. Use `--within` to scope the target inside a parent element.                                                                                    |
+| `mm wait-for-notification [--timeout <ms>]`                                                | Waits for the extension notification popup to appear within a timeout. Returns the notification page URL.                                                                                                                                                                                |
+| `mm clipboard <read\|write> [text]`                                                        | Reads from or writes to the system clipboard via Chrome DevTools Protocol. Useful for pasting seed phrases or copying addresses.                                                                                                                                                         |
 
 ### Navigation
 
-| Command                | Description                                                                                       |
-| ---------------------- | ------------------------------------------------------------------------------------------------- |
-| `mm navigate <url>`    | Opens a new tab and navigates to the given URL. Useful for navigating to dApps or external pages. |
-| `mm navigate-home`     | Navigates the extension tab to the wallet home screen.                                            |
-| `mm navigate-settings` | Navigates the extension tab to the settings page.                                                 |
-
-### State & Knowledge
-
-| Command                       | Description                                                                                                                                                         |
-| ----------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `mm get-state`                | Returns the current extension state: loaded status, current URL, screen name, network, chain ID, account address, and balance. Also lists all tracked browser tabs. |
-| `mm knowledge-search <query>` | Searches the knowledge store for past tool invocations matching the query. Results are scored by relevance to screen, URL, test IDs, and a11y nodes.                |
-| `mm knowledge-last`           | Retrieves the most recent step records from the current session's knowledge store.                                                                                  |
-| `mm knowledge-sessions`       | Lists recent knowledge sessions with metadata (goal, flow tags, timestamps).                                                                                        |
-| `mm run-steps <json>`         | Executes a batch of tool invocations sequentially from a JSON definition. Each step specifies a tool name and arguments.                                            |
+| Command                                                   | Description                                                                                                                                      |
+| --------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------ |
+| `mm navigate <url>`                                       | Opens a new tab and navigates to the given URL. Useful for navigating to dApps or external pages.                                                |
+| `mm navigate-home`                                        | Navigates the extension tab to the wallet home screen.                                                                                           |
+| `mm navigate-settings`                                    | Navigates the extension tab to the settings page.                                                                                                |
+| `mm switch-to-tab <role> \| --role <role> \| --url <url>` | Switches the active page to a tab matching a given role (e.g., `extension`, `dapp`) or URL prefix. Supports a positional role as first argument. |
+| `mm close-tab --role <role> \| --url <url>`               | Closes a browser tab matching a given role or URL. Falls back to the extension tab if the active tab is closed.                                  |
+
+### State & Context
+
+| Command                      | Description                                                                                                                                                         |
+| ---------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `mm get-state`               | Returns the current extension state: loaded status, current URL, screen name, network, chain ID, account address, and balance. Also lists all tracked browser tabs. |
+| `mm get-context`             | Returns the current environment context (`e2e` or `prod`), session status, available capabilities, and whether context switching is allowed.                        |
+| `mm set-context <e2e\|prod>` | Switches the session environment between `e2e` and `prod` modes. Blocked while a session is active — run `mm cleanup` first.                                        |
+
+### Knowledge
+
+| Command                                   | Description                                                                                                                                          |
+| ----------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `mm knowledge-search <query>`             | Searches the knowledge store for past tool invocations matching the query. Results are scored by relevance to screen, URL, test IDs, and a11y nodes. |
+| `mm knowledge-last`                       | Retrieves the most recent step records from the current session's knowledge store.                                                                   |
+| `mm knowledge-sessions`                   | Lists recent knowledge sessions with metadata (goal, flow tags, timestamps).                                                                         |
+| `mm knowledge-summarize [--session <id>]` | Generates a recipe-style summary of a session's tool invocations, showing the step sequence with targets and outcomes.                               |
+
+### Batching
+
+| Command               | Description                                                                                                              |
+| --------------------- | ------------------------------------------------------------------------------------------------------------------------ |
+| `mm run-steps <json>` | Executes a batch of tool invocations sequentially from a JSON definition. Each step specifies a tool name and arguments. |
 
 For the full agent-facing reference and workflow guidelines, see [SKILL.md](./SKILL.md).
 
diff --git a/SKILL.md b/SKILL.md
index 7bd3ccf..cd213f5 100644
--- a/SKILL.md
+++ b/SKILL.md
@@ -27,8 +27,8 @@ Tool responses include different data based on the tool's category:
 
 | Category      | Examples                                                          | Observations in response?                 |
 | ------------- | ----------------------------------------------------------------- | ----------------------------------------- |
-| **Mutating**  | click, type, navigate, launch, cleanup, build                     | Yes — `state` + `a11y` + `testIds`        |
-| **Read-only** | get*state, knowledge*\*, get_context, set_context                 | No — faster response                      |
+| **Mutating**  | click, type, navigate, launch, cleanup, build, clipboard          | Yes — `state` + `a11y` + `testIds`        |
+| **Read-only** | get_state, get_text, knowledge\_\*, get_context, set_context      | No — faster response                      |
 | **Discovery** | describe_screen, list_testids, accessibility_snapshot, screenshot | Data is already in `result`               |
 | **Batch**     | run_steps                                                         | Controlled by `includeObservations` param |
 
@@ -96,16 +96,19 @@ The `run_steps` tool collects observations once after all steps complete. Contro
 Starts the daemon (if not running) and launches a headed Chrome session with the extension.
 
 ```
-mm launch [--state default|onboarding|custom] [--extension-path <path>] [--force]
+mm launch [--context e2e|prod] [--state default|onboarding|custom] [--extension-path <path>] [--goal <text>] [--force] [--flow-tags <tags>]
 ```
 
 | Flag                      | Description                                                     |
 | ------------------------- | --------------------------------------------------------------- |
+| `--context e2e\|prod`     | Set the environment context before launching                    |
 | `--state default`         | Pre-onboarded wallet with 25 ETH on local Anvil chain (default) |
 | `--state onboarding`      | Fresh wallet requiring manual onboarding setup                  |
 | `--state custom`          | Use a custom fixture for wallet state                           |
 | `--extension-path <path>` | Override the extension build directory                          |
+| `--goal <text>`           | Tag the session with a goal for knowledge store                 |
 | `--force`                 | Replace an existing active session                              |
+| `--flow-tags <tags>`      | Comma-separated flow tags for cross-session knowledge           |
 
 Returns: `sessionId`, `extensionId`, `state` (current extension state).
 
@@ -146,6 +149,7 @@ mm serve [--background]
 **Your primary observation tool.** Returns the complete screen state:
 
 - **Extension state**: current URL, screen name, network, account, balance
+- **Active tab**: the currently focused tab's role and URL (if tracked)
 - **Test IDs**: visible `data-testid` attributes with text previews
 - **A11y tree**: interactive elements with deterministic refs (`e1`, `e2`, ...)
 - **Prior knowledge**: suggested actions from past sessions on this screen
@@ -173,17 +177,7 @@ The `testId` and `textContent` fields appear only on nodes with short or generic
 
 When 3+ consecutive identical nodes appear (same role, name, and path), they are collapsed into a summary like `… 3 more "maskicon" (refs e2–e4)` to reduce token waste. Individual refs still work for targeting.
 
-Use the `ref` value (`e3`) for click/type/wait-for commands.
-
-#### `mm get-state`
-
-Returns extension state and tracked tabs without the full a11y tree.
-
-```
-mm get-state
-```
-
-Returns: `state` (extension state) and `tabs` (active + tracked tabs with roles and URLs).
+Use the `ref` value (`e3`) for click/type/get-text/wait-for commands.
 
 #### `mm screenshot`
 
@@ -220,6 +214,18 @@ Types text into an input field. **Clears the field first**, then sets the new va
 mm type e5 "0x1234abcd..."
 ```
 
+#### `mm get-text <ref>`
+
+Reads the text content of an element. Returns the inner text, target descriptor, and character length. Useful for asserting visible values without screenshots. Categorized as read-only (no observations in response).
+
+```
+mm get-text e5
+mm get-text --testid balance-amount
+mm get-text --testid amount --within "testid:tx-row"
+```
+
+Returns: `text` (string content), `target` (descriptor like `testId:balance-amount`), `length` (character count).
+
 #### `mm wait-for <ref>`
 
 Blocks until an element becomes visible. Default timeout: 15s.
@@ -229,6 +235,23 @@ mm wait-for e7 [--timeout <ms>]
 mm wait-for --testid confirm-btn --within "testid:dialog-container"
 ```
 
+#### `mm wait-for-notification`
+
+Waits for the extension notification popup to appear within a timeout. Returns the notification page URL.
+
+```
+mm wait-for-notification [--timeout <ms>]
+```
+
+#### `mm clipboard`
+
+Reads from or writes to the system clipboard via Chrome DevTools Protocol. Useful for pasting seed phrases or copying addresses.
+
+```
+mm clipboard read
+mm clipboard write "0x1234abcd..."
+```
+
 ### Navigation
 
 #### `mm navigate <url>`
@@ -255,6 +278,53 @@ Navigates the extension tab to the settings page.
 mm navigate-settings
 ```
 
+#### `mm switch-to-tab`
+
+Switches the active page to a tab matching a given role or URL prefix. Supports a positional role as the first argument.
+
+```
+mm switch-to-tab dapp
+mm switch-to-tab --role extension
+mm switch-to-tab --url https://app.uniswap.org
+```
+
+#### `mm close-tab`
+
+Closes a browser tab matching a given role or URL. Falls back to the extension tab if the active tab is closed.
+
+```
+mm close-tab --role dapp
+mm close-tab --url https://app.uniswap.org
+```
+
+### State & Context
+
+#### `mm get-state`
+
+Returns extension state and tracked tabs without the full a11y tree.
+
+```
+mm get-state
+```
+
+Returns: `state` (extension state) and `tabs` (active + tracked tabs with roles and URLs).
+
+#### `mm get-context`
+
+Returns the current environment context (`e2e` or `prod`), session status, available capabilities, and whether context switching is allowed.
+
+```
+mm get-context
+```
+
+#### `mm set-context`
+
+Switches the session environment between `e2e` and `prod` modes. Blocked while a session is active — run `mm cleanup` first.
+
+```
+mm set-context <e2e|prod>
+```
+
 ### Knowledge Store
 
 The knowledge store records every tool invocation and uses past sessions to suggest actions.
@@ -299,7 +369,7 @@ Tool aliases are supported in steps: `navigate_home` / `navigate-home`, `navigat
 
 ## Element Targeting
 
-Every interaction command (`click`, `type`, `wait-for`) needs a target. You must provide exactly ONE of:
+Every interaction command (`click`, `type`, `get-text`, `wait-for`) needs a target. You must provide exactly ONE of:
 
 | Method           | Format              | Stability                       | When to use                                          |
 | ---------------- | ------------------- | ------------------------------- | ---------------------------------------------------- |

From 92505a42ed342ae53124ae1308e2f3b3ac3309e0 Mon Sep 17 00:00:00 2001
From: cryptotavares <joao.tavares@consensys.net>
Date: Fri, 17 Apr 2026 11:30:09 +0100
Subject: [PATCH 21/36] chore: update vitest coverage thresholds

---
 vitest.config.mts | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/vitest.config.mts b/vitest.config.mts
index 8f98685..9baea9d 100644
--- a/vitest.config.mts
+++ b/vitest.config.mts
@@ -35,10 +35,10 @@ export default defineConfig({
         // Auto-update the coverage thresholds when running locally.
         // Disabled in CI to prevent non-deterministic config changes.
         autoUpdate: !process.env.CI,
-        branches: 87.77,
-        functions: 91.17,
-        lines: 94.18,
-        statements: 93.95,
+        branches: 87.95,
+        functions: 90.87,
+        lines: 94.23,
+        statements: 93.97,
       },
     },
 
@@ -49,4 +49,4 @@ export default defineConfig({
       tsconfig: './tsconfig.test.json',
     },
   },
-});
+});
\ No newline at end of file

From 8cd7d6554a39843edda696bb83a81582c68d4a87 Mon Sep 17 00:00:00 2001
From: cryptotavares <joao.tavares@consensys.net>
Date: Fri, 17 Apr 2026 11:33:09 +0100
Subject: [PATCH 22/36] refactor: make KnowledgeStore injectable via
 ServerConfig

---
 src/server/create-server.ts | 2 +-
 src/types/http.ts           | 2 ++
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/server/create-server.ts b/src/server/create-server.ts
index 07a15f3..c317309 100644
--- a/src/server/create-server.ts
+++ b/src/server/create-server.ts
@@ -238,7 +238,7 @@ export function createServer(config: ServerConfig): ServerInstance {
   const app = express();
   const queue = new RequestQueue(config.requestTimeoutMs);
   const nonce = randomUUID();
-  const knowledgeStore = new KnowledgeStore();
+  const knowledgeStore = config.knowledgeStore ?? new KnowledgeStore();
 
   let httpServer: http.Server | null = null;
   let worktreeRoot = '';
diff --git a/src/types/http.ts b/src/types/http.ts
index 9d3fa9a..d1cac1c 100644
--- a/src/types/http.ts
+++ b/src/types/http.ts
@@ -65,6 +65,8 @@ export type ServerConfig = {
   sessionManager: ISessionManager;
   /** Factory function to create workflow context (may be sync or async) */
   contextFactory: () => WorkflowContext | Promise<WorkflowContext>;
+  /** Shared knowledge store instance (optional — a new instance is created if omitted) */
+  knowledgeStore?: KnowledgeStore;
   /** Idle timeout for daemon auto-shutdown in milliseconds (default: 1_800_000 = 30 min) */
   idleShutdownMs?: number;
   /** Per-request execution timeout in milliseconds (default: 30_000) */

From 2bfcfc9264148f619e12538e83566fbcc4a00ea0 Mon Sep 17 00:00:00 2001
From: cryptotavares <joao.tavares@consensys.net>
Date: Fri, 17 Apr 2026 15:55:26 +0100
Subject: [PATCH 23/36] feat(server): add observation compaction module with
 option collapsing

---
 src/server/observation-compaction.test.ts | 247 ++++++++++++++++++++++
 src/server/observation-compaction.ts      | 129 +++++++++++
 src/tools/utils/constants.ts              |   3 +
 3 files changed, 379 insertions(+)
 create mode 100644 src/server/observation-compaction.test.ts
 create mode 100644 src/server/observation-compaction.ts

diff --git a/src/server/observation-compaction.test.ts b/src/server/observation-compaction.test.ts
new file mode 100644
index 0000000..debe50c
--- /dev/null
+++ b/src/server/observation-compaction.test.ts
@@ -0,0 +1,247 @@
+import { describe, expect, it, vi } from 'vitest';
+
+import type { A11yNodeTrimmed } from '../tools/types/discovery.js';
+import type { StepRecordObservation } from '../tools/types/step-record.js';
+import {
+  collapseOptionSubtrees,
+  compactObservation,
+  observationCompactionDeps,
+} from './observation-compaction.js';
+
+function createNode(
+  ref: string,
+  role: string,
+  overrides: Partial<A11yNodeTrimmed> = {},
+): A11yNodeTrimmed {
+  return {
+    ref,
+    role,
+    name: overrides.name ?? `${role}-${ref}`,
+    path: overrides.path ?? ['root', ref],
+    ...overrides,
+  };
+}
+
+function createOptionRun(count: number, start = 1): A11yNodeTrimmed[] {
+  return Array.from({ length: count }, (_, index) => {
+    const refNumber = start + index;
+    return createNode(`e${refNumber}`, 'option', {
+      name: `Option ${refNumber}`,
+      path: ['root', 'combo', `option-${refNumber}`],
+    });
+  });
+}
+
+describe('collapseOptionSubtrees', () => {
+  it('collapses 55 options after a combobox into a summary node', () => {
+    const combobox = createNode('e1', 'combobox', {
+      name: 'Select network',
+      path: ['root', 'combobox'],
+    });
+    const nodes = [combobox, ...createOptionRun(55, 2)];
+
+    const result = collapseOptionSubtrees(nodes);
+
+    expect(result).toHaveLength(2);
+    expect(result[0]).toBe(combobox);
+    expect(result[1]).toStrictEqual({
+      ref: 'e2\u2013e56',
+      role: 'option',
+      name: '55 options (refs e2\u2013e56)',
+      path: ['root', 'combo', 'option-2'],
+    });
+  });
+
+  it('does not collapse runs below the threshold', () => {
+    const combobox = createNode('e1', 'combobox');
+    const optionOne = createNode('e2', 'option');
+    const optionTwo = createNode('e3', 'option');
+
+    const result = collapseOptionSubtrees([combobox, optionOne, optionTwo]);
+
+    expect(result).toHaveLength(3);
+    expect(result).toStrictEqual([combobox, optionOne, optionTwo]);
+  });
+
+  it('leaves bare options unchanged when no combobox or listbox precedes them', () => {
+    const options = createOptionRun(4);
+
+    const result = collapseOptionSubtrees(options);
+
+    expect(result).toStrictEqual(options);
+  });
+
+  it('handles multiple combobox and listbox groups independently', () => {
+    const firstCombobox = createNode('e1', 'combobox', {
+      path: ['root', 'first-combobox'],
+    });
+    const separator = createNode('e12', 'button', {
+      name: 'Continue',
+      path: ['root', 'separator'],
+    });
+    const secondListbox = createNode('e13', 'listbox', {
+      path: ['root', 'second-listbox'],
+    });
+    const nodes = [
+      firstCombobox,
+      ...createOptionRun(10, 2),
+      separator,
+      secondListbox,
+      ...createOptionRun(5, 14),
+    ];
+
+    const result = collapseOptionSubtrees(nodes);
+
+    expect(result).toStrictEqual([
+      firstCombobox,
+      {
+        ref: 'e2\u2013e11',
+        role: 'option',
+        name: '10 options (refs e2\u2013e11)',
+        path: ['root', 'combo', 'option-2'],
+      },
+      separator,
+      secondListbox,
+      {
+        ref: 'e14\u2013e18',
+        role: 'option',
+        name: '5 options (refs e14\u2013e18)',
+        path: ['root', 'combo', 'option-14'],
+      },
+    ]);
+  });
+
+  it('preserves already-collapsed option summaries as a single entry', () => {
+    const combobox = createNode('e1', 'combobox');
+    const collapsedSummary = createNode('e2\u2013e6', 'option', {
+      name: '5 options (refs e2\u2013e6)',
+      path: ['root', 'combo', 'option-2'],
+    });
+
+    const result = collapseOptionSubtrees([combobox, collapsedSummary]);
+
+    expect(result).toStrictEqual([combobox, collapsedSummary]);
+  });
+
+  it('does not collapse when a non-option node immediately follows the combobox', () => {
+    const combobox = createNode('e1', 'combobox');
+    const button = createNode('e2', 'button', { name: 'Apply' });
+    const options = createOptionRun(3, 3);
+
+    const result = collapseOptionSubtrees([combobox, button, ...options]);
+
+    expect(result).toStrictEqual([combobox, button, ...options]);
+  });
+
+  it('treats malformed option range refs as single options during compaction', () => {
+    const combobox = createNode('e1', 'combobox');
+    const malformedSummary = createNode(`e${'9'.repeat(400)}\u2013e2`, 'option', {
+      name: 'Malformed range',
+      path: ['root', 'combo', 'option-weird'],
+    });
+    const optionTwo = createNode('e3', 'option', {
+      name: 'Option 3',
+      path: ['root', 'combo', 'option-3'],
+    });
+    const optionThree = createNode('e4', 'option', {
+      name: 'Option 4',
+      path: ['root', 'combo', 'option-4'],
+    });
+
+    const result = collapseOptionSubtrees([
+      combobox,
+      malformedSummary,
+      optionTwo,
+      optionThree,
+    ]);
+
+    expect(result).toStrictEqual([
+      combobox,
+      {
+        ref: `${malformedSummary.ref}\u2013e4`,
+        role: 'option',
+        name: `3 options (refs ${malformedSummary.ref}\u2013e4)`,
+        path: ['root', 'combo', 'option-weird'],
+      },
+    ]);
+  });
+});
+
+describe('compactObservation', () => {
+  it('preserves non-a11y fields by reference while returning a new object', () => {
+    const state = { connected: true };
+    const testIds = [{ testId: 'submit', tag: 'button', visible: true }];
+    const priorKnowledge = { schemaVersion: 1, notes: ['cached'] };
+    const observation = {
+      state,
+      testIds,
+      a11y: {
+        nodes: [createNode('e1', 'combobox'), ...createOptionRun(4, 2)],
+      },
+      priorKnowledge,
+    } as unknown as StepRecordObservation;
+
+    const result = compactObservation(observation);
+
+    expect(result).not.toBe(observation);
+    expect(result.state).toBe(state);
+    expect(result.testIds).toBe(testIds);
+    expect(result.priorKnowledge).toBe(priorKnowledge);
+    expect(result.a11y).not.toBe(observation.a11y);
+    expect(result.a11y.nodes).toStrictEqual([
+      observation.a11y.nodes[0],
+      {
+        ref: 'e2\u2013e5',
+        role: 'option',
+        name: '4 options (refs e2\u2013e5)',
+        path: ['root', 'combo', 'option-2'],
+      },
+    ]);
+  });
+
+  it('is idempotent when called repeatedly on the same result', () => {
+    const observation = {
+      state: {},
+      testIds: [],
+      a11y: {
+        nodes: [createNode('e1', 'listbox'), ...createOptionRun(6, 2)],
+      },
+    } as unknown as StepRecordObservation;
+
+    const first = compactObservation(observation);
+    const second = compactObservation(first);
+
+    expect(second).toStrictEqual(first);
+  });
+
+  it('falls back to the original observation when compaction throws', () => {
+    const observation = {
+      state: {},
+      testIds: [],
+      a11y: { nodes: [createNode('e1', 'combobox')] },
+    } as unknown as StepRecordObservation;
+    const collapseSpy = vi
+      .spyOn(observationCompactionDeps, 'collapseOptionSubtrees')
+      .mockImplementation(() => {
+        throw new Error('boom');
+      });
+
+    const result = compactObservation(observation);
+
+    expect(result).toBe(observation);
+    collapseSpy.mockRestore();
+  });
+
+  it('handles empty node arrays gracefully', () => {
+    const observation = {
+      state: {},
+      testIds: [],
+      a11y: { nodes: [] },
+    } as unknown as StepRecordObservation;
+
+    const result = compactObservation(observation);
+
+    expect(result).not.toBe(observation);
+    expect(result.a11y.nodes).toStrictEqual([]);
+  });
+});
diff --git a/src/server/observation-compaction.ts b/src/server/observation-compaction.ts
new file mode 100644
index 0000000..787457e
--- /dev/null
+++ b/src/server/observation-compaction.ts
@@ -0,0 +1,129 @@
+import type { StepRecordObservation } from '../tools/types/step-record.js';
+import type { A11yNodeTrimmed } from '../tools/types/discovery.js';
+import { OPTION_COLLAPSE_MIN_COUNT } from '../tools/utils/constants.js';
+
+const OPTION_RANGE_PATTERN = /^(?<prefix>[^\d]+)(?<start>\d+)\u2013\k<prefix>(?<end>\d+)$/u;
+
+type RefRange = {
+  firstRef: string;
+  lastRef: string;
+  count: number;
+};
+
+function parseRefRange(ref: string): RefRange {
+  const match = OPTION_RANGE_PATTERN.exec(ref);
+  if (!match?.groups) {
+    return { firstRef: ref, lastRef: ref, count: 1 };
+  }
+
+  const { prefix, start, end } = match.groups;
+  const startIndex = Number(start);
+  const endIndex = Number(end);
+
+  if (!Number.isFinite(startIndex) || !Number.isFinite(endIndex)) {
+    return { firstRef: ref, lastRef: ref, count: 1 };
+  }
+
+  return {
+    firstRef: `${prefix}${start}`,
+    lastRef: `${prefix}${end}`,
+    count: Math.abs(endIndex - startIndex) + 1,
+  };
+}
+
+function buildOptionSummary(nodes: A11yNodeTrimmed[]): A11yNodeTrimmed {
+  const firstRange = parseRefRange(nodes[0].ref);
+  const lastRange = parseRefRange(nodes[nodes.length - 1].ref);
+  const optionCount = nodes.reduce(
+    (count, node) => count + parseRefRange(node.ref).count,
+    0,
+  );
+  const refRange = `${firstRange.firstRef}\u2013${lastRange.lastRef}`;
+
+  return {
+    ref: refRange,
+    role: 'option',
+    name: `${optionCount} options (refs ${refRange})`,
+    path: nodes[0].path,
+  };
+}
+
+export const observationCompactionDeps = {
+  collapseOptionSubtrees(nodes: A11yNodeTrimmed[]): A11yNodeTrimmed[] {
+    const collapsed: A11yNodeTrimmed[] = [];
+    let cursor = 0;
+
+    while (cursor < nodes.length) {
+      const current = nodes[cursor];
+
+      if (current.role !== 'combobox' && current.role !== 'listbox') {
+        collapsed.push(current);
+        cursor += 1;
+        continue;
+      }
+
+      collapsed.push(current);
+      cursor += 1;
+
+      const optionNodes: A11yNodeTrimmed[] = [];
+      while (cursor < nodes.length && nodes[cursor].role === 'option') {
+        optionNodes.push(nodes[cursor]);
+        cursor += 1;
+      }
+
+      if (optionNodes.length === 0) {
+        continue;
+      }
+
+      const optionCount = optionNodes.reduce(
+        (count, node) => count + parseRefRange(node.ref).count,
+        0,
+      );
+
+      if (optionCount >= OPTION_COLLAPSE_MIN_COUNT) {
+        collapsed.push(buildOptionSummary(optionNodes));
+        continue;
+      }
+
+      collapsed.push(...optionNodes);
+    }
+
+    return collapsed;
+  },
+};
+
+/**
+ * Collapses consecutive option nodes immediately beneath combobox/listbox nodes.
+ *
+ * @param nodes - Flat accessibility nodes to compact.
+ * @returns A new node array with large option runs summarized.
+ */
+export function collapseOptionSubtrees(
+  nodes: A11yNodeTrimmed[],
+): A11yNodeTrimmed[] {
+  return observationCompactionDeps.collapseOptionSubtrees(nodes);
+}
+
+/**
+ * Creates a compacted copy of an observation while preserving non-a11y fields.
+ *
+ * @param observation - Observation to compact.
+ * @returns A new compacted observation, or the original observation on failure.
+ */
+export function compactObservation(
+  observation: StepRecordObservation,
+): StepRecordObservation {
+  try {
+    return {
+      ...observation,
+      a11y: {
+        ...observation.a11y,
+        nodes: observationCompactionDeps.collapseOptionSubtrees(
+          observation.a11y.nodes,
+        ),
+      },
+    };
+  } catch {
+    return observation;
+  }
+}
diff --git a/src/tools/utils/constants.ts b/src/tools/utils/constants.ts
index 8adf3cd..8c65086 100644
--- a/src/tools/utils/constants.ts
+++ b/src/tools/utils/constants.ts
@@ -22,3 +22,6 @@ export const OBSERVATION_TESTID_LIMIT = 50;
 
 /** Maximum length for text content preview in discovery */
 export const TEXT_PREVIEW_MAX_LENGTH = 100;
+
+/** Minimum number of option nodes under a combobox/listbox to trigger collapsing */
+export const OPTION_COLLAPSE_MIN_COUNT = 3;

From 4764feb78ca9dce2749519de442cae24e6fb83f0 Mon Sep 17 00:00:00 2001
From: cryptotavares <joao.tavares@consensys.net>
Date: Fri, 17 Apr 2026 16:00:39 +0100
Subject: [PATCH 24/36] feat(types): add optional diff metadata to
 StepRecordObservation

---
 src/tools/types/step-record.ts | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/src/tools/types/step-record.ts b/src/tools/types/step-record.ts
index 23d220b..175ad89 100644
--- a/src/tools/types/step-record.ts
+++ b/src/tools/types/step-record.ts
@@ -65,6 +65,12 @@ export type StepRecordObservation = {
   testIds: TestIdItem[];
   a11y: {
     nodes: A11yNodeTrimmed[];
+    /** Present only in diff-mode compact observations (Phase 2). */
+    diff?: {
+      added: string[];
+      removed: string[];
+      unchanged: number;
+    };
   };
   priorKnowledge?: PriorKnowledgeV1;
 };

From 35c41cd58715a58f4c9afd241c2529cee2ff4a49 Mon Sep 17 00:00:00 2001
From: cryptotavares <joao.tavares@consensys.net>
Date: Fri, 17 Apr 2026 16:07:18 +0100
Subject: [PATCH 25/36] feat(server): wire compact observations into HTTP
 response path

---
 src/server/create-server.test.ts | 160 +++++++++++++++++++++++++++++++
 src/server/create-server.ts      |  12 +--
 2 files changed, 166 insertions(+), 6 deletions(-)

diff --git a/src/server/create-server.test.ts b/src/server/create-server.test.ts
index 93e2e65..3627b34 100644
--- a/src/server/create-server.test.ts
+++ b/src/server/create-server.test.ts
@@ -1201,3 +1201,163 @@ describe('createServer with logging', () => {
     expect(await testServer.stop()).toBeUndefined();
   });
 });
+
+describe('observation compaction in HTTP responses', () => {
+  let server: ServerInstance;
+  let state: DaemonState;
+  let mockSM: ReturnType<typeof createMockSessionManager>;
+
+  const comboboxAndOptions = [
+    { ref: 'e1', role: 'combobox', name: 'Language', path: ['root'] },
+    ...Array.from({ length: 10 }, (_, i) => ({
+      ref: `e${i + 2}`,
+      role: 'option',
+      name: `Lang ${i + 1}`,
+      path: ['root', 'combobox'],
+    })),
+    { ref: 'e12', role: 'button', name: 'Submit', path: ['root'] },
+  ];
+
+  beforeEach(async () => {
+    await fs.mkdir(tmpDir, { recursive: true });
+    exitSpy = vi.spyOn(process, 'exit').mockImplementation((() => {}) as never);
+
+    mockSM = createMockSessionManager();
+    mockSM.hasActiveSession.mockReturnValue(true);
+    mockSM.getExtensionState.mockResolvedValue({
+      isLoaded: true,
+      currentUrl: 'chrome-extension://test/home.html',
+    });
+
+    const { collectTrimmedA11ySnapshot } = await import(
+      '../tools/utils/discovery.js'
+    );
+    vi.mocked(collectTrimmedA11ySnapshot).mockResolvedValue({
+      nodes: comboboxAndOptions as never,
+      refMap: new Map(),
+    });
+
+    server = createServer(
+      buildConfig({
+        sessionManager: mockSM as unknown as ServerConfig['sessionManager'],
+      }),
+    );
+    state = await server.start();
+  });
+
+  afterEach(async () => {
+    await server.stop();
+    exitSpy.mockRestore();
+
+    const { collectTrimmedA11ySnapshot } = await import(
+      '../tools/utils/discovery.js'
+    );
+    vi.mocked(collectTrimmedA11ySnapshot).mockResolvedValue({
+      nodes: [],
+      refMap: new Map(),
+    });
+
+    await fs.rm(tmpDir, { recursive: true, force: true }).catch(() => {});
+  });
+
+  it('mutating tool returns compact observations in HTTP response', async () => {
+    const res = await httpRequest(`http://127.0.0.1:${state.port}/cleanup`, {
+      method: 'POST',
+      headers: { 'Content-Type': 'application/json' },
+      body: JSON.stringify({}),
+    });
+    const body = (await res.json()) as {
+      ok: boolean;
+      observations?: { a11y: { nodes: unknown[] } };
+    };
+
+    expect(res.status).toBe(200);
+    expect(body.observations).toBeDefined();
+    // 12 original nodes → compacted: combobox + summary + button = 3
+    expect(body.observations!.a11y.nodes).toHaveLength(3);
+  });
+
+  it('knowledge store receives full uncompacted observations', async () => {
+    const { KnowledgeStore } = await import(
+      '../knowledge-store/knowledge-store.js'
+    );
+    const mockStore = vi.mocked(KnowledgeStore).mock.results.at(-1)
+      ?.value as {
+      recordStep: ReturnType<typeof vi.fn>;
+    };
+    mockStore.recordStep.mockClear();
+
+    await httpRequest(`http://127.0.0.1:${state.port}/cleanup`, {
+      method: 'POST',
+      headers: { 'Content-Type': 'application/json' },
+      body: JSON.stringify({}),
+    });
+
+    expect(mockStore.recordStep).toHaveBeenCalled();
+    const recorded = mockStore.recordStep.mock.calls[0][0] as {
+      observation: { a11y: { nodes: unknown[] } };
+    };
+    expect(recorded.observation.a11y.nodes).toHaveLength(12);
+  });
+
+  it('batch with includeObservations=all returns compact observations', async () => {
+    const res = await httpRequest(
+      `http://127.0.0.1:${state.port}/tool/run_steps`,
+      {
+        method: 'POST',
+        headers: { 'Content-Type': 'application/json' },
+        body: JSON.stringify({
+          steps: [{ tool: 'get_state' }],
+          includeObservations: 'all',
+        }),
+      },
+    );
+    const body = (await res.json()) as {
+      ok: boolean;
+      observations?: { a11y: { nodes: unknown[] } };
+    };
+
+    expect(res.status).toBe(200);
+    expect(body.observations).toBeDefined();
+    expect(body.observations!.a11y.nodes).toHaveLength(3);
+  });
+
+  it('batch with includeObservations=none omits observations', async () => {
+    const res = await httpRequest(
+      `http://127.0.0.1:${state.port}/tool/run_steps`,
+      {
+        method: 'POST',
+        headers: { 'Content-Type': 'application/json' },
+        body: JSON.stringify({
+          steps: [{ tool: 'get_state' }],
+          includeObservations: 'none',
+        }),
+      },
+    );
+    const body = (await res.json()) as {
+      ok: boolean;
+      observations?: unknown;
+    };
+
+    expect(res.status).toBe(200);
+    expect(body.observations).toBeUndefined();
+  });
+
+  it('describe_screen response omits observations', async () => {
+    const res = await httpRequest(
+      `http://127.0.0.1:${state.port}/tool/describe_screen`,
+      {
+        method: 'POST',
+        headers: { 'Content-Type': 'application/json' },
+        body: JSON.stringify({}),
+      },
+    );
+    const body = (await res.json()) as {
+      ok: boolean;
+      observations?: unknown;
+    };
+
+    // Discovery tools never include observations in the HTTP response
+    expect(body.observations).toBeUndefined();
+  });
+});
diff --git a/src/server/create-server.ts b/src/server/create-server.ts
index c317309..920a208 100644
--- a/src/server/create-server.ts
+++ b/src/server/create-server.ts
@@ -5,6 +5,7 @@ import * as fs from 'node:fs/promises';
 import * as http from 'node:http';
 
 import { writeDaemonState, removeDaemonState } from './daemon-state.js';
+import { compactObservation } from './observation-compaction.js';
 import { RequestQueue } from './request-queue.js';
 import pkg from '../../package.json';
 import type { PortMap, WorkflowContext } from '../capabilities/context.js';
@@ -491,12 +492,11 @@ export function createServer(config: ServerConfig): ServerInstance {
         toolResult,
         validatedInput as Record<string, unknown>,
       );
-      res.json(
-        buildResponseBody(
-          toolResult,
-          includeInResponse ? observations : undefined,
-        ),
-      );
+      const responseObservations =
+        includeInResponse && observations
+          ? compactObservation(observations)
+          : undefined;
+      res.json(buildResponseBody(toolResult, responseObservations));
     } catch (error) {
       await recordToolStep(
         toolName,

From 87b7c9ea6a2db6e2e0509388e35ff861c4095c7a Mon Sep 17 00:00:00 2001
From: cryptotavares <joao.tavares@consensys.net>
Date: Fri, 17 Apr 2026 16:10:28 +0100
Subject: [PATCH 26/36] docs: document compact observation behavior in SKILL.md

---
 SKILL.md | 23 ++++++++++++++++-------
 1 file changed, 16 insertions(+), 7 deletions(-)

diff --git a/SKILL.md b/SKILL.md
index cd213f5..6c5c43e 100644
--- a/SKILL.md
+++ b/SKILL.md
@@ -25,12 +25,14 @@ mm cleanup --shutdown      # 5. Clean up when done
 
 Tool responses include different data based on the tool's category:
 
-| Category      | Examples                                                          | Observations in response?                 |
-| ------------- | ----------------------------------------------------------------- | ----------------------------------------- |
-| **Mutating**  | click, type, navigate, launch, cleanup, build, clipboard          | Yes — `state` + `a11y` + `testIds`        |
-| **Read-only** | get_state, get_text, knowledge\_\*, get_context, set_context      | No — faster response                      |
-| **Discovery** | describe_screen, list_testids, accessibility_snapshot, screenshot | Data is already in `result`               |
-| **Batch**     | run_steps                                                         | Controlled by `includeObservations` param |
+| Category      | Examples                                                          | Observations in response?                          |
+| ------------- | ----------------------------------------------------------------- | -------------------------------------------------- |
+| **Mutating**  | click, type, navigate, launch, cleanup, build, clipboard          | Yes — `state` + `a11y` (compacted) + `testIds`    |
+| **Read-only** | get_state, get_text, knowledge\_\*, get_context, set_context      | No — faster response                               |
+| **Discovery** | describe_screen, list_testids, accessibility_snapshot, screenshot | Data is already in `result`                        |
+| **Batch**     | run_steps                                                         | Controlled by `includeObservations` param          |
+
+**Observation Compaction:** Mutating tool observations are **compacted** before returning: option runs of 3 or more under a combobox or listbox are replaced with a single summary node (e.g., `"55 options (refs e2–e56)"`). The `describe-screen` tool always returns the **full, unfiltered** a11y tree — use it when you need the complete option list or `priorKnowledge`.
 
 ### Using inline observations (mutating tools)
 
@@ -53,7 +55,13 @@ After a mutating action, the response includes fresh screen state:
 }
 ```
 
-You can use the `ref` values from `observations.a11y.nodes` for the next interaction — no `describe-screen` needed.
+You can use the `ref` values from `observations.a11y.nodes` for the next interaction — no `describe-screen` needed. Note that refs in compacted observations may be summary nodes (e.g., `"55 options (refs e2–e56)"`) when there are 3+ options under a combobox or listbox.
+
+**Quick reference:**
+
+- Use `observations.state` for quick checks (screen name, loading status, balance, etc.)
+- Use `observations.a11y.nodes` with the compact refs for the next interaction
+- Call `describe-screen` only when you need the full tree or `priorKnowledge`
 
 ```bash
 mm click e3                 # mutating: response includes fresh observations
@@ -66,6 +74,7 @@ Call `describe-screen` explicitly when you need:
 - `priorKnowledge` (historical actions for this screen)
 - A screenshot via `includeScreenshot`
 - Full context after unexpected navigation
+- The complete, unfiltered a11y tree (e.g., all options in a dropdown)
 
 ### `run_steps` and `includeObservations`
 

From 21bc5883f56b95bf7101be1219ce451c6c919b52 Mon Sep 17 00:00:00 2001
From: cryptotavares <joao.tavares@consensys.net>
Date: Fri, 17 Apr 2026 16:14:00 +0100
Subject: [PATCH 27/36] feat(server): implement diff-based observation
 compaction

---
 src/server/observation-compaction.test.ts | 221 ++++++++++++++++++++++
 src/server/observation-compaction.ts      |  72 +++++++
 2 files changed, 293 insertions(+)

diff --git a/src/server/observation-compaction.test.ts b/src/server/observation-compaction.test.ts
index debe50c..e7de1a3 100644
--- a/src/server/observation-compaction.test.ts
+++ b/src/server/observation-compaction.test.ts
@@ -5,6 +5,8 @@ import type { StepRecordObservation } from '../tools/types/step-record.js';
 import {
   collapseOptionSubtrees,
   compactObservation,
+  diffObservation,
+  nodeChanged,
   observationCompactionDeps,
 } from './observation-compaction.js';
 
@@ -32,6 +34,23 @@ function createOptionRun(count: number, start = 1): A11yNodeTrimmed[] {
   });
 }
 
+function createObservation(
+  nodes: A11yNodeTrimmed[],
+  overrides: Partial<StepRecordObservation> = {},
+): StepRecordObservation {
+  return {
+    state: overrides.state ?? ({} as StepRecordObservation['state']),
+    testIds: overrides.testIds ?? [],
+    a11y: {
+      nodes,
+      ...(overrides.a11y?.diff ? { diff: overrides.a11y.diff } : {}),
+    },
+    ...(overrides.priorKnowledge
+      ? { priorKnowledge: overrides.priorKnowledge }
+      : {}),
+  } as StepRecordObservation;
+}
+
 describe('collapseOptionSubtrees', () => {
   it('collapses 55 options after a combobox into a summary node', () => {
     const combobox = createNode('e1', 'combobox', {
@@ -245,3 +264,205 @@ describe('compactObservation', () => {
     expect(result.a11y.nodes).toStrictEqual([]);
   });
 });
+
+describe('nodeChanged', () => {
+  it('returns true when the name changes', () => {
+    const previous = createNode('e1', 'button', { name: 'Continue' });
+    const current = createNode('e1', 'button', { name: 'Confirm' });
+
+    expect(nodeChanged(current, previous)).toBe(true);
+  });
+
+  it('returns true when the role changes', () => {
+    const previous = createNode('e1', 'button');
+    const current = createNode('e1', 'link');
+
+    expect(nodeChanged(current, previous)).toBe(true);
+  });
+
+  it('returns true when the path changes', () => {
+    const previous = createNode('e1', 'button', { path: ['root', 'page'] });
+    const current = createNode('e1', 'button', {
+      path: ['root', 'dialog', 'page'],
+    });
+
+    expect(nodeChanged(current, previous)).toBe(true);
+  });
+
+  it('returns true when the disabled state changes', () => {
+    const previous = createNode('e1', 'button', { disabled: false });
+    const current = createNode('e1', 'button', { disabled: true });
+
+    expect(nodeChanged(current, previous)).toBe(true);
+  });
+
+  it('returns false for identical nodes', () => {
+    const previous = createNode('e1', 'checkbox', {
+      checked: true,
+      expanded: false,
+      testId: 'accept',
+      textContent: 'Accept terms',
+      path: ['root', 'form', 'accept'],
+    });
+    const current = createNode('e1', 'checkbox', {
+      checked: true,
+      expanded: false,
+      testId: 'accept',
+      textContent: 'Accept terms',
+      path: ['root', 'form', 'accept'],
+    });
+
+    expect(nodeChanged(current, previous)).toBe(false);
+  });
+
+  it('does not compare refs', () => {
+    const previous = createNode('e1', 'button', {
+      name: 'Continue',
+      path: ['root', 'actions'],
+    });
+    const current = createNode('e999', 'button', {
+      name: 'Continue',
+      path: ['root', 'actions'],
+    });
+
+    expect(nodeChanged(current, previous)).toBe(false);
+  });
+});
+
+describe('diffObservation', () => {
+  it('tracks added nodes and omits unchanged nodes from the diff payload', () => {
+    const stable = createNode('e1', 'button', { name: 'Continue' });
+    const added = createNode('e2', 'button', { name: 'Cancel' });
+    const previous = createObservation([stable]);
+    const current = createObservation([stable, added]);
+
+    const result = diffObservation(current, previous);
+
+    expect(result.a11y.nodes).toStrictEqual([added]);
+    expect(result.a11y.diff).toStrictEqual({
+      added: ['e2'],
+      removed: [],
+      unchanged: 1,
+    });
+  });
+
+  it('tracks removed nodes without including them in nodes', () => {
+    const stable = createNode('e1', 'button', { name: 'Continue' });
+    const removed = createNode('e2', 'button', { name: 'Cancel' });
+    const previous = createObservation([stable, removed]);
+    const current = createObservation([stable]);
+
+    const result = diffObservation(current, previous);
+
+    expect(result.a11y.nodes).toStrictEqual([]);
+    expect(result.a11y.diff).toStrictEqual({
+      added: [],
+      removed: ['e2'],
+      unchanged: 1,
+    });
+  });
+
+  it('includes changed nodes without marking them as added or removed', () => {
+    const previous = createObservation([
+      createNode('e1', 'button', { disabled: false, name: 'Continue' }),
+    ]);
+    const changed = createNode('e1', 'button', {
+      disabled: true,
+      name: 'Continue',
+    });
+    const current = createObservation([changed]);
+
+    const result = diffObservation(current, previous);
+
+    expect(result.a11y.nodes).toStrictEqual([changed]);
+    expect(result.a11y.diff).toStrictEqual({
+      added: [],
+      removed: [],
+      unchanged: 0,
+    });
+  });
+
+  it('returns an empty diff payload when nothing changed', () => {
+    const previous = createObservation([
+      createNode('e1', 'button'),
+      createNode('e2', 'checkbox', { checked: true }),
+    ]);
+    const current = createObservation([
+      createNode('e1', 'button'),
+      createNode('e2', 'checkbox', { checked: true }),
+    ]);
+
+    const result = diffObservation(current, previous);
+
+    expect(result.a11y.nodes).toStrictEqual([]);
+    expect(result.a11y.diff).toStrictEqual({
+      added: [],
+      removed: [],
+      unchanged: 2,
+    });
+  });
+
+  it('supports mixed added, removed, changed, and unchanged nodes', () => {
+    const unchangedNodes = Array.from({ length: 5 }, (_, index) =>
+      createNode(`u${index + 1}`, 'button', { name: `Stable ${index + 1}` }),
+    );
+    const previous = createObservation([
+      ...unchangedNodes,
+      createNode('c1', 'button', { disabled: false, name: 'Changed' }),
+      createNode('r1', 'button', { name: 'Removed' }),
+    ]);
+    const changed = createNode('c1', 'button', {
+      disabled: true,
+      name: 'Changed',
+    });
+    const addedOne = createNode('a1', 'button', { name: 'Added 1' });
+    const addedTwo = createNode('a2', 'button', { name: 'Added 2' });
+    const current = createObservation([
+      ...unchangedNodes,
+      changed,
+      addedOne,
+      addedTwo,
+    ]);
+
+    const result = diffObservation(current, previous);
+
+    expect(result.a11y.nodes).toStrictEqual([changed, addedOne, addedTwo]);
+    expect(result.a11y.diff).toStrictEqual({
+      added: ['a1', 'a2'],
+      removed: ['r1'],
+      unchanged: 5,
+    });
+  });
+
+  it('preserves the current state and testIds', () => {
+    const state = {
+      mode: 'current',
+    } as unknown as StepRecordObservation['state'];
+    const testIds = [{ testId: 'submit', tag: 'button', visible: true }];
+    const previous = createObservation([]);
+    const current = createObservation([createNode('e1', 'button')], {
+      state,
+      testIds,
+    });
+
+    const result = diffObservation(current, previous);
+
+    expect(result.state).toBe(state);
+    expect(result.testIds).toBe(testIds);
+  });
+
+  it('preserves the current priorKnowledge', () => {
+    const priorKnowledge = {
+      schemaVersion: 1,
+      notes: ['cached'],
+    } as unknown as StepRecordObservation['priorKnowledge'];
+    const previous = createObservation([]);
+    const current = createObservation([createNode('e1', 'button')], {
+      priorKnowledge,
+    });
+
+    const result = diffObservation(current, previous);
+
+    expect(result.priorKnowledge).toBe(priorKnowledge);
+  });
+});
diff --git a/src/server/observation-compaction.ts b/src/server/observation-compaction.ts
index 787457e..6640b90 100644
--- a/src/server/observation-compaction.ts
+++ b/src/server/observation-compaction.ts
@@ -127,3 +127,75 @@ export function compactObservation(
     return observation;
   }
 }
+
+function arraysEqual(left: string[], right: string[]): boolean {
+  return (
+    left.length === right.length && left.every((val, idx) => val === right[idx])
+  );
+}
+
+export function nodeChanged(a: A11yNodeTrimmed, b: A11yNodeTrimmed): boolean {
+  return (
+    a.name !== b.name ||
+    a.role !== b.role ||
+    a.disabled !== b.disabled ||
+    a.checked !== b.checked ||
+    a.expanded !== b.expanded ||
+    a.testId !== b.testId ||
+    a.textContent !== b.textContent ||
+    !arraysEqual(a.path, b.path)
+  );
+}
+
+export function diffObservation(
+  current: StepRecordObservation,
+  previous: StepRecordObservation,
+): StepRecordObservation {
+  const prevMap = new Map(
+    previous.a11y.nodes.map((node) => [node.ref, node] as const),
+  );
+  const currMap = new Map(
+    current.a11y.nodes.map((node) => [node.ref, node] as const),
+  );
+  const changedOrNewNodes: A11yNodeTrimmed[] = [];
+  const addedRefs: string[] = [];
+  const removedRefs: string[] = [];
+  let unchangedCount = 0;
+
+  for (const [ref, currNode] of currMap) {
+    const prevNode = prevMap.get(ref);
+
+    if (!prevNode) {
+      addedRefs.push(ref);
+      changedOrNewNodes.push(currNode);
+      continue;
+    }
+
+    if (nodeChanged(currNode, prevNode)) {
+      changedOrNewNodes.push(currNode);
+      continue;
+    }
+
+    unchangedCount += 1;
+  }
+
+  for (const ref of prevMap.keys()) {
+    if (!currMap.has(ref)) {
+      removedRefs.push(ref);
+    }
+  }
+
+  return {
+    state: current.state,
+    testIds: current.testIds,
+    a11y: {
+      nodes: changedOrNewNodes,
+      diff: {
+        added: addedRefs,
+        removed: removedRefs,
+        unchanged: unchangedCount,
+      },
+    },
+    priorKnowledge: current.priorKnowledge,
+  };
+}

From 5f28b5ee55d07dc6e481c4dd1c522aa337518c30 Mon Sep 17 00:00:00 2001
From: cryptotavares <joao.tavares@consensys.net>
Date: Fri, 17 Apr 2026 16:23:39 +0100
Subject: [PATCH 28/36] feat(server): wire diff baseline and size guard into
 executeTool

---
 src/server/create-server.test.ts          | 226 ++++++++++++++++++++++
 src/server/create-server.ts               |  13 +-
 src/server/observation-compaction.test.ts |  36 ++++
 src/server/observation-compaction.ts      |  25 ++-
 4 files changed, 298 insertions(+), 2 deletions(-)

diff --git a/src/server/create-server.test.ts b/src/server/create-server.test.ts
index 3627b34..ae57e80 100644
--- a/src/server/create-server.test.ts
+++ b/src/server/create-server.test.ts
@@ -32,6 +32,11 @@ vi.mock('../tools/utils/discovery.js', () => ({
     nodes: [],
     refMap: new Map(),
   }),
+  waitForTarget: vi.fn().mockResolvedValue({
+    click: vi.fn().mockResolvedValue(undefined),
+    fill: vi.fn().mockResolvedValue(undefined),
+    textContent: vi.fn().mockResolvedValue(''),
+  }),
 }));
 
 vi.mock('../knowledge-store/knowledge-store.js', () => {
@@ -1218,6 +1223,23 @@ describe('observation compaction in HTTP responses', () => {
     { ref: 'e12', role: 'button', name: 'Submit', path: ['root'] },
   ];
 
+  const initialButtons = [
+    { ref: 'e1', role: 'button', name: 'Continue', path: ['root'] },
+    { ref: 'e2', role: 'button', name: 'Cancel', path: ['root'] },
+  ];
+
+  const changedButtons = [
+    { ref: 'e1', role: 'button', name: 'Continue', path: ['root'] },
+    { ref: 'e3', role: 'button', name: 'Confirm', path: ['root'] },
+  ];
+
+  const manyNewButtons = Array.from({ length: 10 }, (_, index) => ({
+    ref: `e${index + 10}`,
+    role: 'button',
+    name: `Action ${index + 1}`,
+    path: ['root'],
+  }));
+
   beforeEach(async () => {
     await fs.mkdir(tmpDir, { recursive: true });
     exitSpy = vi.spyOn(process, 'exit').mockImplementation((() => {}) as never);
@@ -1277,6 +1299,210 @@ describe('observation compaction in HTTP responses', () => {
     expect(body.observations!.a11y.nodes).toHaveLength(3);
   });
 
+  it('first mutation returns a full compact observation when no baseline exists', async () => {
+    const res = await httpRequest(`http://127.0.0.1:${state.port}/tool/click`, {
+      method: 'POST',
+      headers: { 'Content-Type': 'application/json' },
+      body: JSON.stringify({ a11yRef: 'e1' }),
+    });
+    const body = (await res.json()) as {
+      ok: boolean;
+      observations?: {
+        a11y: {
+          nodes: unknown[];
+          diff?: unknown;
+        };
+      };
+    };
+
+    expect(res.status).toBe(200);
+    expect(body.observations).toBeDefined();
+    expect(body.observations!.a11y.diff).toBeUndefined();
+    expect(body.observations!.a11y.nodes).toHaveLength(3);
+  });
+
+  it('second mutation returns a diff-based observation', async () => {
+    const { collectTrimmedA11ySnapshot } = await import(
+      '../tools/utils/discovery.js'
+    );
+    vi.mocked(collectTrimmedA11ySnapshot)
+      .mockResolvedValueOnce({
+        nodes: initialButtons as never,
+        refMap: new Map(),
+      })
+      .mockResolvedValueOnce({
+        nodes: changedButtons as never,
+        refMap: new Map(),
+      });
+
+    await httpRequest(`http://127.0.0.1:${state.port}/tool/click`, {
+      method: 'POST',
+      headers: { 'Content-Type': 'application/json' },
+      body: JSON.stringify({ a11yRef: 'e1' }),
+    });
+
+    const res = await httpRequest(`http://127.0.0.1:${state.port}/tool/click`, {
+      method: 'POST',
+      headers: { 'Content-Type': 'application/json' },
+      body: JSON.stringify({ a11yRef: 'e1' }),
+    });
+    const body = (await res.json()) as {
+      ok: boolean;
+      observations?: {
+        a11y: {
+          nodes: unknown[];
+          diff?: { added: string[]; removed: string[]; unchanged: number };
+        };
+      };
+    };
+
+    expect(res.status).toBe(200);
+    expect(body.observations?.a11y.diff).toStrictEqual({
+      added: ['e3'],
+      removed: ['e2'],
+      unchanged: 1,
+    });
+    expect(body.observations?.a11y.nodes).toHaveLength(1);
+  });
+
+  it('describe_screen resets the diff baseline', async () => {
+    const { collectTrimmedA11ySnapshot } = await import(
+      '../tools/utils/discovery.js'
+    );
+    vi.mocked(collectTrimmedA11ySnapshot)
+      .mockResolvedValueOnce({
+        nodes: initialButtons as never,
+        refMap: new Map(),
+      })
+      .mockResolvedValueOnce({
+        nodes: initialButtons as never,
+        refMap: new Map(),
+      })
+      .mockResolvedValueOnce({
+        nodes: changedButtons as never,
+        refMap: new Map(),
+      });
+
+    await httpRequest(`http://127.0.0.1:${state.port}/tool/click`, {
+      method: 'POST',
+      headers: { 'Content-Type': 'application/json' },
+      body: JSON.stringify({ a11yRef: 'e1' }),
+    });
+
+    await httpRequest(
+      `http://127.0.0.1:${state.port}/tool/describe_screen`,
+      {
+        method: 'POST',
+        headers: { 'Content-Type': 'application/json' },
+        body: JSON.stringify({}),
+      },
+    );
+
+    const res = await httpRequest(`http://127.0.0.1:${state.port}/tool/click`, {
+      method: 'POST',
+      headers: { 'Content-Type': 'application/json' },
+      body: JSON.stringify({ a11yRef: 'e1' }),
+    });
+    const body = (await res.json()) as {
+      ok: boolean;
+      observations?: {
+        a11y: {
+          nodes: unknown[];
+          diff?: unknown;
+        };
+      };
+    };
+
+    expect(res.status).toBe(200);
+    expect(body.observations).toBeDefined();
+    expect(body.observations!.a11y.diff).toBeUndefined();
+    expect(body.observations!.a11y.nodes.length).toBeGreaterThan(1);
+  });
+
+  it('falls back to the full observation when the diff is not smaller', async () => {
+    const { collectTrimmedA11ySnapshot } = await import(
+      '../tools/utils/discovery.js'
+    );
+    vi.mocked(collectTrimmedA11ySnapshot)
+      .mockResolvedValueOnce({
+        nodes: [initialButtons[0]] as never,
+        refMap: new Map(),
+      })
+      .mockResolvedValueOnce({
+        nodes: manyNewButtons as never,
+        refMap: new Map(),
+      });
+
+    await httpRequest(`http://127.0.0.1:${state.port}/tool/click`, {
+      method: 'POST',
+      headers: { 'Content-Type': 'application/json' },
+      body: JSON.stringify({ a11yRef: 'e1' }),
+    });
+
+    const res = await httpRequest(`http://127.0.0.1:${state.port}/tool/click`, {
+      method: 'POST',
+      headers: { 'Content-Type': 'application/json' },
+      body: JSON.stringify({ a11yRef: 'e1' }),
+    });
+    const body = (await res.json()) as {
+      ok: boolean;
+      observations?: {
+        a11y: {
+          nodes: unknown[];
+          diff?: unknown;
+        };
+      };
+    };
+
+    expect(res.status).toBe(200);
+    expect(body.observations).toBeDefined();
+    expect(body.observations!.a11y.diff).toBeUndefined();
+    expect(body.observations!.a11y.nodes).toHaveLength(10);
+  });
+
+  it('knowledge store always receives the full observation instead of the diff', async () => {
+    const { collectTrimmedA11ySnapshot } = await import(
+      '../tools/utils/discovery.js'
+    );
+    vi.mocked(collectTrimmedA11ySnapshot)
+      .mockResolvedValueOnce({
+        nodes: initialButtons as never,
+        refMap: new Map(),
+      })
+      .mockResolvedValueOnce({
+        nodes: changedButtons as never,
+        refMap: new Map(),
+      });
+
+    const { KnowledgeStore } = await import(
+      '../knowledge-store/knowledge-store.js'
+    );
+    const mockStore = vi.mocked(KnowledgeStore).mock.results.at(-1)?.value as {
+      recordStep: ReturnType<typeof vi.fn>;
+    };
+    mockStore.recordStep.mockClear();
+
+    await httpRequest(`http://127.0.0.1:${state.port}/tool/click`, {
+      method: 'POST',
+      headers: { 'Content-Type': 'application/json' },
+      body: JSON.stringify({ a11yRef: 'e1' }),
+    });
+
+    await httpRequest(`http://127.0.0.1:${state.port}/tool/click`, {
+      method: 'POST',
+      headers: { 'Content-Type': 'application/json' },
+      body: JSON.stringify({ a11yRef: 'e1' }),
+    });
+
+    expect(mockStore.recordStep).toHaveBeenCalledTimes(2);
+    const recorded = mockStore.recordStep.mock.calls[1][0] as {
+      observation: { a11y: { nodes: unknown[]; diff?: unknown } };
+    };
+
+    expect(recorded.observation.a11y.diff).toBeUndefined();
+    expect(recorded.observation.a11y.nodes).toStrictEqual(changedButtons);
+  });
+
   it('knowledge store receives full uncompacted observations', async () => {
     const { KnowledgeStore } = await import(
       '../knowledge-store/knowledge-store.js'
diff --git a/src/server/create-server.ts b/src/server/create-server.ts
index 920a208..54f2df9 100644
--- a/src/server/create-server.ts
+++ b/src/server/create-server.ts
@@ -251,6 +251,7 @@ export function createServer(config: ServerConfig): ServerInstance {
   let shutdownHandler: (() => void) | null = null;
   let lastRequestTime = Date.now();
   let idleCheckInterval: ReturnType<typeof setInterval> | null = null;
+  let lastObservation: StepRecordObservation | null = null;
 
   // eslint-disable-next-line import-x/no-named-as-default-member
   app.use(express.json({ limit: '10mb' }));
@@ -494,9 +495,19 @@ export function createServer(config: ServerConfig): ServerInstance {
       );
       const responseObservations =
         includeInResponse && observations
-          ? compactObservation(observations)
+          ? compactObservation(observations, lastObservation)
           : undefined;
       res.json(buildResponseBody(toolResult, responseObservations));
+
+      if (
+        toolName === 'describe_screen' ||
+        toolName === 'launch' ||
+        toolName === 'cleanup'
+      ) {
+        lastObservation = null;
+      } else if (observations) {
+        lastObservation = observations;
+      }
     } catch (error) {
       await recordToolStep(
         toolName,
diff --git a/src/server/observation-compaction.test.ts b/src/server/observation-compaction.test.ts
index e7de1a3..59bfe35 100644
--- a/src/server/observation-compaction.test.ts
+++ b/src/server/observation-compaction.test.ts
@@ -233,6 +233,42 @@ describe('compactObservation', () => {
     expect(second).toStrictEqual(first);
   });
 
+  it('returns a diff when the previous observation produces a smaller payload', () => {
+    const previous = createObservation([
+      createNode('e1', 'button', { name: 'Continue' }),
+      createNode('e2', 'button', { name: 'Cancel' }),
+    ]);
+    const current = createObservation([
+      createNode('e1', 'button', { name: 'Continue' }),
+      createNode('e3', 'button', { name: 'Confirm' }),
+    ]);
+
+    const result = compactObservation(current, previous);
+
+    expect(result.a11y.nodes).toStrictEqual([
+      createNode('e3', 'button', { name: 'Confirm' }),
+    ]);
+    expect(result.a11y.diff).toStrictEqual({
+      added: ['e3'],
+      removed: ['e2'],
+      unchanged: 1,
+    });
+  });
+
+  it('returns the full compacted observation when the diff is not smaller', () => {
+    const previous = createObservation([createNode('e1', 'button')]);
+    const current = createObservation(
+      Array.from({ length: 10 }, (_, index) =>
+        createNode(`e${index + 10}`, 'button', { name: `Action ${index + 1}` }),
+      ),
+    );
+
+    const result = compactObservation(current, previous);
+
+    expect(result.a11y.diff).toBeUndefined();
+    expect(result.a11y.nodes).toStrictEqual(current.a11y.nodes);
+  });
+
   it('falls back to the original observation when compaction throws', () => {
     const observation = {
       state: {},
diff --git a/src/server/observation-compaction.ts b/src/server/observation-compaction.ts
index 6640b90..9ed5507 100644
--- a/src/server/observation-compaction.ts
+++ b/src/server/observation-compaction.ts
@@ -112,9 +112,10 @@ export function collapseOptionSubtrees(
  */
 export function compactObservation(
   observation: StepRecordObservation,
+  previousObservation?: StepRecordObservation | null,
 ): StepRecordObservation {
   try {
-    return {
+    const optionFiltered: StepRecordObservation = {
       ...observation,
       a11y: {
         ...observation.a11y,
@@ -123,6 +124,28 @@ export function compactObservation(
         ),
       },
     };
+
+    if (!previousObservation) {
+      return optionFiltered;
+    }
+
+    const previousFiltered: StepRecordObservation = {
+      ...previousObservation,
+      a11y: {
+        ...previousObservation.a11y,
+        nodes: observationCompactionDeps.collapseOptionSubtrees(
+          previousObservation.a11y.nodes,
+        ),
+      },
+    };
+
+    const diffResult = diffObservation(optionFiltered, previousFiltered);
+
+    if (diffResult.a11y.nodes.length >= optionFiltered.a11y.nodes.length) {
+      return optionFiltered;
+    }
+
+    return diffResult;
   } catch {
     return observation;
   }

From 93c50b24d92d037dc5dbf3a9d5a10ce9ca37bb77 Mon Sep 17 00:00:00 2001
From: cryptotavares <joao.tavares@consensys.net>
Date: Fri, 17 Apr 2026 16:25:51 +0100
Subject: [PATCH 29/36] docs: document diff-based observation behavior

---
 SKILL.md | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/SKILL.md b/SKILL.md
index 6c5c43e..69ad41f 100644
--- a/SKILL.md
+++ b/SKILL.md
@@ -34,6 +34,16 @@ Tool responses include different data based on the tool's category:
 
 **Observation Compaction:** Mutating tool observations are **compacted** before returning: option runs of 3 or more under a combobox or listbox are replaced with a single summary node (e.g., `"55 options (refs e2–e56)"`). The `describe-screen` tool always returns the **full, unfiltered** a11y tree — use it when you need the complete option list or `priorKnowledge`.
 
+**Diff-Based Observations:** After the first mutating tool call sets a baseline, subsequent mutations return **diff-based** observations. The `observations.a11y.diff` field (when present) shows what changed:
+```json
+{
+  "added": ["e4", "e5"],      // new node refs
+  "removed": ["e2"],          // disappeared node refs
+  "unchanged": 3              // count of unchanged nodes
+}
+```
+The `observations.a11y.nodes` field contains **only the changed and new nodes** (not all nodes). The baseline resets after `describe-screen`, `launch`, or `cleanup` — the next mutation returns a full compact observation (no `diff` field). When the diff would be larger than the full observation, the full option-filtered observation is returned instead (no `diff` field).
+
 ### Using inline observations (mutating tools)
 
 After a mutating action, the response includes fresh screen state:

From 2a8b6f22d5ba0ff598e4ba422773426997af513f Mon Sep 17 00:00:00 2001
From: cryptotavares <joao.tavares@consensys.net>
Date: Fri, 17 Apr 2026 16:33:24 +0100
Subject: [PATCH 30/36] fix(server): add required JSDoc to observation
 compaction functions

---
 src/server/observation-compaction.ts | 41 ++++++++++++++++++++++++++--
 1 file changed, 39 insertions(+), 2 deletions(-)

diff --git a/src/server/observation-compaction.ts b/src/server/observation-compaction.ts
index 9ed5507..e525c8d 100644
--- a/src/server/observation-compaction.ts
+++ b/src/server/observation-compaction.ts
@@ -1,8 +1,9 @@
-import type { StepRecordObservation } from '../tools/types/step-record.js';
 import type { A11yNodeTrimmed } from '../tools/types/discovery.js';
+import type { StepRecordObservation } from '../tools/types/step-record.js';
 import { OPTION_COLLAPSE_MIN_COUNT } from '../tools/utils/constants.js';
 
-const OPTION_RANGE_PATTERN = /^(?<prefix>[^\d]+)(?<start>\d+)\u2013\k<prefix>(?<end>\d+)$/u;
+const OPTION_RANGE_PATTERN =
+  /^(?<prefix>[^\d]+)(?<start>\d+)\u2013\k<prefix>(?<end>\d+)$/u;
 
 type RefRange = {
   firstRef: string;
@@ -10,6 +11,14 @@ type RefRange = {
   count: number;
 };
 
+/**
+ * Parses a ref string into its first/last ref and total node count.
+ * Handles range refs like "e2–e6" from collapseIdenticalRuns, returning
+ * the spanning range and the count of individual nodes it represents.
+ *
+ * @param ref - A node ref string, either a simple ref (e.g. "e3") or a range (e.g. "e2–e6").
+ * @returns The first ref, last ref, and total count of nodes the ref represents.
+ */
 function parseRefRange(ref: string): RefRange {
   const match = OPTION_RANGE_PATTERN.exec(ref);
   if (!match?.groups) {
@@ -31,6 +40,12 @@ function parseRefRange(ref: string): RefRange {
   };
 }
 
+/**
+ * Builds a summary node representing a collapsed group of option nodes.
+ *
+ * @param nodes - Array of option nodes to summarize.
+ * @returns A single summary node representing the collapsed options.
+ */
 function buildOptionSummary(nodes: A11yNodeTrimmed[]): A11yNodeTrimmed {
   const firstRange = parseRefRange(nodes[0].ref);
   const lastRange = parseRefRange(nodes[nodes.length - 1].ref);
@@ -108,6 +123,7 @@ export function collapseOptionSubtrees(
  * Creates a compacted copy of an observation while preserving non-a11y fields.
  *
  * @param observation - Observation to compact.
+ * @param previousObservation - Optional previous observation to compute diff against.
  * @returns A new compacted observation, or the original observation on failure.
  */
 export function compactObservation(
@@ -151,12 +167,26 @@ export function compactObservation(
   }
 }
 
+/**
+ * Checks if two string arrays are equal.
+ *
+ * @param left - First array to compare.
+ * @param right - Second array to compare.
+ * @returns True if arrays have equal length and identical elements.
+ */
 function arraysEqual(left: string[], right: string[]): boolean {
   return (
     left.length === right.length && left.every((val, idx) => val === right[idx])
   );
 }
 
+/**
+ * Checks if two accessibility nodes have changed.
+ *
+ * @param a - First node to compare.
+ * @param b - Second node to compare.
+ * @returns True if any property differs between the nodes.
+ */
 export function nodeChanged(a: A11yNodeTrimmed, b: A11yNodeTrimmed): boolean {
   return (
     a.name !== b.name ||
@@ -170,6 +200,13 @@ export function nodeChanged(a: A11yNodeTrimmed, b: A11yNodeTrimmed): boolean {
   );
 }
 
+/**
+ * Computes the diff between two observations, returning only changed or new nodes.
+ *
+ * @param current - The current observation to compare.
+ * @param previous - The previous observation to compare against.
+ * @returns A new observation containing only changed/new nodes with diff metadata.
+ */
 export function diffObservation(
   current: StepRecordObservation,
   previous: StepRecordObservation,

From 9417bbbf8b92763c44c613745f8b68d4a5977338 Mon Sep 17 00:00:00 2001
From: cryptotavares <joao.tavares@consensys.net>
Date: Fri, 17 Apr 2026 16:37:52 +0100
Subject: [PATCH 31/36] fix(server): remove non-null assertions from
 integration tests

---
 src/server/create-server.test.ts          | 72 ++++++++++-------------
 src/server/observation-compaction.test.ts | 16 +++--
 vitest.config.mts                         |  8 +--
 3 files changed, 44 insertions(+), 52 deletions(-)

diff --git a/src/server/create-server.test.ts b/src/server/create-server.test.ts
index ae57e80..70a00a5 100644
--- a/src/server/create-server.test.ts
+++ b/src/server/create-server.test.ts
@@ -1251,9 +1251,8 @@ describe('observation compaction in HTTP responses', () => {
       currentUrl: 'chrome-extension://test/home.html',
     });
 
-    const { collectTrimmedA11ySnapshot } = await import(
-      '../tools/utils/discovery.js'
-    );
+    const { collectTrimmedA11ySnapshot } =
+      await import('../tools/utils/discovery.js');
     vi.mocked(collectTrimmedA11ySnapshot).mockResolvedValue({
       nodes: comboboxAndOptions as never,
       refMap: new Map(),
@@ -1271,9 +1270,8 @@ describe('observation compaction in HTTP responses', () => {
     await server.stop();
     exitSpy.mockRestore();
 
-    const { collectTrimmedA11ySnapshot } = await import(
-      '../tools/utils/discovery.js'
-    );
+    const { collectTrimmedA11ySnapshot } =
+      await import('../tools/utils/discovery.js');
     vi.mocked(collectTrimmedA11ySnapshot).mockResolvedValue({
       nodes: [],
       refMap: new Map(),
@@ -1296,7 +1294,7 @@ describe('observation compaction in HTTP responses', () => {
     expect(res.status).toBe(200);
     expect(body.observations).toBeDefined();
     // 12 original nodes → compacted: combobox + summary + button = 3
-    expect(body.observations!.a11y.nodes).toHaveLength(3);
+    expect(body.observations?.a11y.nodes).toHaveLength(3);
   });
 
   it('first mutation returns a full compact observation when no baseline exists', async () => {
@@ -1317,14 +1315,13 @@ describe('observation compaction in HTTP responses', () => {
 
     expect(res.status).toBe(200);
     expect(body.observations).toBeDefined();
-    expect(body.observations!.a11y.diff).toBeUndefined();
-    expect(body.observations!.a11y.nodes).toHaveLength(3);
+    expect(body.observations?.a11y.diff).toBeUndefined();
+    expect(body.observations?.a11y.nodes).toHaveLength(3);
   });
 
   it('second mutation returns a diff-based observation', async () => {
-    const { collectTrimmedA11ySnapshot } = await import(
-      '../tools/utils/discovery.js'
-    );
+    const { collectTrimmedA11ySnapshot } =
+      await import('../tools/utils/discovery.js');
     vi.mocked(collectTrimmedA11ySnapshot)
       .mockResolvedValueOnce({
         nodes: initialButtons as never,
@@ -1366,9 +1363,8 @@ describe('observation compaction in HTTP responses', () => {
   });
 
   it('describe_screen resets the diff baseline', async () => {
-    const { collectTrimmedA11ySnapshot } = await import(
-      '../tools/utils/discovery.js'
-    );
+    const { collectTrimmedA11ySnapshot } =
+      await import('../tools/utils/discovery.js');
     vi.mocked(collectTrimmedA11ySnapshot)
       .mockResolvedValueOnce({
         nodes: initialButtons as never,
@@ -1389,14 +1385,11 @@ describe('observation compaction in HTTP responses', () => {
       body: JSON.stringify({ a11yRef: 'e1' }),
     });
 
-    await httpRequest(
-      `http://127.0.0.1:${state.port}/tool/describe_screen`,
-      {
-        method: 'POST',
-        headers: { 'Content-Type': 'application/json' },
-        body: JSON.stringify({}),
-      },
-    );
+    await httpRequest(`http://127.0.0.1:${state.port}/tool/describe_screen`, {
+      method: 'POST',
+      headers: { 'Content-Type': 'application/json' },
+      body: JSON.stringify({}),
+    });
 
     const res = await httpRequest(`http://127.0.0.1:${state.port}/tool/click`, {
       method: 'POST',
@@ -1415,14 +1408,13 @@ describe('observation compaction in HTTP responses', () => {
 
     expect(res.status).toBe(200);
     expect(body.observations).toBeDefined();
-    expect(body.observations!.a11y.diff).toBeUndefined();
-    expect(body.observations!.a11y.nodes.length).toBeGreaterThan(1);
+    expect(body.observations?.a11y.diff).toBeUndefined();
+    expect(body.observations?.a11y.nodes.length).toBeGreaterThan(1);
   });
 
   it('falls back to the full observation when the diff is not smaller', async () => {
-    const { collectTrimmedA11ySnapshot } = await import(
-      '../tools/utils/discovery.js'
-    );
+    const { collectTrimmedA11ySnapshot } =
+      await import('../tools/utils/discovery.js');
     vi.mocked(collectTrimmedA11ySnapshot)
       .mockResolvedValueOnce({
         nodes: [initialButtons[0]] as never,
@@ -1456,14 +1448,13 @@ describe('observation compaction in HTTP responses', () => {
 
     expect(res.status).toBe(200);
     expect(body.observations).toBeDefined();
-    expect(body.observations!.a11y.diff).toBeUndefined();
-    expect(body.observations!.a11y.nodes).toHaveLength(10);
+    expect(body.observations?.a11y.diff).toBeUndefined();
+    expect(body.observations?.a11y.nodes).toHaveLength(10);
   });
 
   it('knowledge store always receives the full observation instead of the diff', async () => {
-    const { collectTrimmedA11ySnapshot } = await import(
-      '../tools/utils/discovery.js'
-    );
+    const { collectTrimmedA11ySnapshot } =
+      await import('../tools/utils/discovery.js');
     vi.mocked(collectTrimmedA11ySnapshot)
       .mockResolvedValueOnce({
         nodes: initialButtons as never,
@@ -1474,9 +1465,8 @@ describe('observation compaction in HTTP responses', () => {
         refMap: new Map(),
       });
 
-    const { KnowledgeStore } = await import(
-      '../knowledge-store/knowledge-store.js'
-    );
+    const { KnowledgeStore } =
+      await import('../knowledge-store/knowledge-store.js');
     const mockStore = vi.mocked(KnowledgeStore).mock.results.at(-1)?.value as {
       recordStep: ReturnType<typeof vi.fn>;
     };
@@ -1504,11 +1494,9 @@ describe('observation compaction in HTTP responses', () => {
   });
 
   it('knowledge store receives full uncompacted observations', async () => {
-    const { KnowledgeStore } = await import(
-      '../knowledge-store/knowledge-store.js'
-    );
-    const mockStore = vi.mocked(KnowledgeStore).mock.results.at(-1)
-      ?.value as {
+    const { KnowledgeStore } =
+      await import('../knowledge-store/knowledge-store.js');
+    const mockStore = vi.mocked(KnowledgeStore).mock.results.at(-1)?.value as {
       recordStep: ReturnType<typeof vi.fn>;
     };
     mockStore.recordStep.mockClear();
@@ -1545,7 +1533,7 @@ describe('observation compaction in HTTP responses', () => {
 
     expect(res.status).toBe(200);
     expect(body.observations).toBeDefined();
-    expect(body.observations!.a11y.nodes).toHaveLength(3);
+    expect(body.observations?.a11y.nodes).toHaveLength(3);
   });
 
   it('batch with includeObservations=none omits observations', async () => {
diff --git a/src/server/observation-compaction.test.ts b/src/server/observation-compaction.test.ts
index 59bfe35..906ffd1 100644
--- a/src/server/observation-compaction.test.ts
+++ b/src/server/observation-compaction.test.ts
@@ -1,7 +1,5 @@
 import { describe, expect, it, vi } from 'vitest';
 
-import type { A11yNodeTrimmed } from '../tools/types/discovery.js';
-import type { StepRecordObservation } from '../tools/types/step-record.js';
 import {
   collapseOptionSubtrees,
   compactObservation,
@@ -9,6 +7,8 @@ import {
   nodeChanged,
   observationCompactionDeps,
 } from './observation-compaction.js';
+import type { A11yNodeTrimmed } from '../tools/types/discovery.js';
+import type { StepRecordObservation } from '../tools/types/step-record.js';
 
 function createNode(
   ref: string,
@@ -154,10 +154,14 @@ describe('collapseOptionSubtrees', () => {
 
   it('treats malformed option range refs as single options during compaction', () => {
     const combobox = createNode('e1', 'combobox');
-    const malformedSummary = createNode(`e${'9'.repeat(400)}\u2013e2`, 'option', {
-      name: 'Malformed range',
-      path: ['root', 'combo', 'option-weird'],
-    });
+    const malformedSummary = createNode(
+      `e${'9'.repeat(400)}\u2013e2`,
+      'option',
+      {
+        name: 'Malformed range',
+        path: ['root', 'combo', 'option-weird'],
+      },
+    );
     const optionTwo = createNode('e3', 'option', {
       name: 'Option 3',
       path: ['root', 'combo', 'option-3'],
diff --git a/vitest.config.mts b/vitest.config.mts
index 9baea9d..43f6708 100644
--- a/vitest.config.mts
+++ b/vitest.config.mts
@@ -35,10 +35,10 @@ export default defineConfig({
         // Auto-update the coverage thresholds when running locally.
         // Disabled in CI to prevent non-deterministic config changes.
         autoUpdate: !process.env.CI,
-        branches: 87.95,
-        functions: 90.87,
-        lines: 94.23,
-        statements: 93.97,
+        branches: 88.41,
+        functions: 91.63,
+        lines: 94.49,
+        statements: 94.23,
       },
     },
 

From 2645ada6a3f63dbf1a423085437ab71c992d9c3a Mon Sep 17 00:00:00 2001
From: cryptotavares <joao.tavares@consensys.net>
Date: Mon, 20 Apr 2026 13:25:54 +0100
Subject: [PATCH 32/36] feat(types): add activeTab field to ExtensionState

---
 src/capabilities/types.ts | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/src/capabilities/types.ts b/src/capabilities/types.ts
index 6ac77c5..8a15177 100644
--- a/src/capabilities/types.ts
+++ b/src/capabilities/types.ts
@@ -55,6 +55,11 @@ export type ExtensionState = {
   networkName: string | null;
   chainId: number | null;
   balance: string | null;
+  activeTab?: {
+    role: string;
+    url: string;
+    title?: string;
+  };
 };
 
 export type LaunchOptions = {

From 767a8c3ddb55aff97d75fa534a5a8c166622a50e Mon Sep 17 00:00:00 2001
From: cryptotavares <joao.tavares@consensys.net>
Date: Mon, 20 Apr 2026 13:26:06 +0100
Subject: [PATCH 33/36] feat(server): add post-mutation state recheck for
 unknown screens

---
 src/server/create-server.test.ts | 153 +++++++++++++++++++++++++++++++
 src/server/create-server.ts      |  22 ++++-
 2 files changed, 174 insertions(+), 1 deletion(-)

diff --git a/src/server/create-server.test.ts b/src/server/create-server.test.ts
index 70a00a5..58d3884 100644
--- a/src/server/create-server.test.ts
+++ b/src/server/create-server.test.ts
@@ -1094,6 +1094,159 @@ describe('createServer with active session', () => {
 
     expect(mockStore.recordStep).toHaveBeenCalled();
   });
+
+  describe('post-mutation state recheck', () => {
+    it('resolves immediately when getExtensionState returns a known screen', async () => {
+      mockSM.getExtensionState.mockReset();
+      mockSM.getExtensionState.mockResolvedValue({
+        isLoaded: true,
+        currentScreen: 'home',
+        currentUrl: 'chrome-extension://test/home.html',
+      });
+
+      const res = await httpRequest(`http://127.0.0.1:${state.port}/cleanup`, {
+        method: 'POST',
+        headers: { 'Content-Type': 'application/json' },
+        body: JSON.stringify({}),
+      });
+      const body = (await res.json()) as {
+        ok: boolean;
+        observations?: { state: { currentScreen?: string } };
+      };
+
+      expect(res.status).toBe(200);
+      expect(mockSM.getExtensionState).toHaveBeenCalledTimes(1);
+      expect(body.observations?.state.currentScreen).toBe('home');
+    });
+
+    it("retries when first call returns 'unknown', resolves on second call", async () => {
+      mockSM.getExtensionState.mockReset();
+      mockSM.getExtensionState
+        .mockResolvedValueOnce({
+          isLoaded: true,
+          currentScreen: 'unknown',
+          currentUrl: 'chrome-extension://test/unknown.html',
+        })
+        .mockResolvedValueOnce({
+          isLoaded: true,
+          currentScreen: 'home',
+          currentUrl: 'chrome-extension://test/home.html',
+        });
+
+      const res = await httpRequest(`http://127.0.0.1:${state.port}/cleanup`, {
+        method: 'POST',
+        headers: { 'Content-Type': 'application/json' },
+        body: JSON.stringify({}),
+      });
+      const body = (await res.json()) as {
+        ok: boolean;
+        observations?: { state: { currentScreen?: string } };
+      };
+
+      expect(res.status).toBe(200);
+      expect(mockSM.getExtensionState).toHaveBeenCalledTimes(2);
+      expect(body.observations?.state.currentScreen).toBe('home');
+    });
+
+    it("retries up to deadline and returns 'unknown' if all calls return 'unknown'", async () => {
+      vi.useFakeTimers();
+      mockSM.getExtensionState.mockReset();
+      mockSM.getExtensionState.mockResolvedValue({
+        isLoaded: true,
+        currentScreen: 'unknown',
+        currentUrl: 'chrome-extension://test/unknown.html',
+      });
+
+      const start = Date.now();
+      const responsePromise = httpRequest(
+        `http://127.0.0.1:${state.port}/cleanup`,
+        {
+          method: 'POST',
+          headers: { 'Content-Type': 'application/json' },
+          body: JSON.stringify({}),
+        },
+      );
+
+      await vi.advanceTimersByTimeAsync(500);
+      vi.useRealTimers();
+
+      const res = await responsePromise;
+      const body = (await res.json()) as {
+        ok: boolean;
+        observations?: { state: { currentScreen?: string } };
+      };
+
+      expect(res.status).toBe(200);
+      expect(Date.now() - start).toBeLessThanOrEqual(600);
+      expect(mockSM.getExtensionState).toHaveBeenCalledTimes(6);
+      expect(body.observations?.state.currentScreen).toBe('unknown');
+    });
+
+    it('does not recheck for readonly tool category', async () => {
+      mockSM.getExtensionState.mockReset();
+      mockSM.getExtensionState.mockResolvedValue({
+        isLoaded: true,
+        currentScreen: 'unknown',
+        currentUrl: 'chrome-extension://test/unknown.html',
+      });
+
+      const res = await httpRequest(
+        `http://127.0.0.1:${state.port}/tool/knowledge_last`,
+        {
+          method: 'POST',
+          headers: { 'Content-Type': 'application/json' },
+          body: JSON.stringify({}),
+        },
+      );
+
+      expect(res.status).toBe(200);
+      expect(mockSM.getExtensionState).toHaveBeenCalledTimes(1);
+    });
+
+    it('does not recheck for discovery tool category', async () => {
+      mockSM.getExtensionState.mockReset();
+      mockSM.getExtensionState.mockResolvedValue({
+        isLoaded: true,
+        currentScreen: 'unknown',
+        currentUrl: 'chrome-extension://test/unknown.html',
+      });
+
+      const res = await httpRequest(
+        `http://127.0.0.1:${state.port}/tool/list_testids`,
+        {
+          method: 'POST',
+          headers: { 'Content-Type': 'application/json' },
+          body: JSON.stringify({}),
+        },
+      );
+
+      expect(res.status).toBe(200);
+      expect(mockSM.getExtensionState).toHaveBeenCalledTimes(1);
+    });
+
+    it('does not recheck for batch tool category', async () => {
+      mockSM.getExtensionState.mockReset();
+      mockSM.getExtensionState.mockResolvedValue({
+        isLoaded: true,
+        currentScreen: 'unknown',
+        currentUrl: 'chrome-extension://test/unknown.html',
+      });
+
+      const res = await httpRequest(
+        `http://127.0.0.1:${state.port}/tool/run_steps`,
+        {
+          method: 'POST',
+          headers: { 'Content-Type': 'application/json' },
+          body: JSON.stringify({
+            steps: [{ tool: 'knowledge_last', args: {} }],
+          }),
+        },
+      );
+
+      expect(res.status).toBe(200);
+      expect(mockSM.getExtensionState).toHaveBeenCalledTimes(1);
+    });
+  });
 });
 
 describe('createServer with logging', () => {
diff --git a/src/server/create-server.ts b/src/server/create-server.ts
index 54f2df9..3c1e893 100644
--- a/src/server/create-server.ts
+++ b/src/server/create-server.ts
@@ -462,7 +462,27 @@ export function createServer(config: ServerConfig): ServerInstance {
                 )
                 .catch(() => undefined);
             }
-            const state = await config.sessionManager.getExtensionState();
+            let state = await config.sessionManager.getExtensionState();
+
+            // Post-mutation recheck: if currentScreen is 'unknown' after a mutation,
+            // the extension's internal router may not have updated yet. Poll briefly.
+            if (category === 'mutating' && state.currentScreen === 'unknown') {
+              const RECHECK_DEADLINE_MS = 500;
+              const RECHECK_INTERVAL_MS = 100;
+              const deadline = Date.now() + RECHECK_DEADLINE_MS;
+
+              while (Date.now() < deadline) {
+                await new Promise<void>((resolve) =>
+                  setTimeout(resolve, RECHECK_INTERVAL_MS),
+                );
+                const rechecked =
+                  await config.sessionManager.getExtensionState();
+                if (rechecked.currentScreen !== 'unknown') {
+                  state = rechecked;
+                  break;
+                }
+              }
+            }
             const testIds = await collectTestIds(
               page,
               OBSERVATION_TESTID_LIMIT,

From fcafd1f77830f7979aebeaaa26af8920fbf889e7 Mon Sep 17 00:00:00 2001
From: cryptotavares <joao.tavares@consensys.net>
Date: Mon, 20 Apr 2026 13:26:17 +0100
Subject: [PATCH 34/36] test(server): verify activeTab passthrough in
 observation compaction

---
 src/server/observation-compaction.test.ts | 53 +++++++++++++++++++++++
 1 file changed, 53 insertions(+)

diff --git a/src/server/observation-compaction.test.ts b/src/server/observation-compaction.test.ts
index 906ffd1..033c41f 100644
--- a/src/server/observation-compaction.test.ts
+++ b/src/server/observation-compaction.test.ts
@@ -303,6 +303,59 @@ describe('compactObservation', () => {
     expect(result).not.toBe(observation);
     expect(result.a11y.nodes).toStrictEqual([]);
   });
+
+  describe('activeTab passthrough', () => {
+    it('preserves state.activeTab when present', () => {
+      const state = {
+        isLoaded: true,
+        currentUrl: 'chrome-extension://extension/home.html',
+        extensionId: 'extension-id',
+        isUnlocked: true,
+        currentScreen: 'home',
+        accountAddress: '0x123',
+        networkName: 'Ethereum Mainnet',
+        chainId: 1,
+        balance: '1 ETH',
+        activeTab: {
+          role: 'dapp',
+          url: 'https://app.uniswap.org/',
+          title: 'Uniswap',
+        },
+      } satisfies StepRecordObservation['state'];
+      const observation = createObservation(
+        [createNode('e1', 'listbox'), ...createOptionRun(6, 2)],
+        { state },
+      );
+
+      const result = compactObservation(observation);
+
+      expect(result.state).toBe(state);
+      expect(result.state.activeTab).toStrictEqual(state.activeTab);
+    });
+
+    it('works when state.activeTab is undefined (backward compat)', () => {
+      const state = {
+        isLoaded: true,
+        currentUrl: 'chrome-extension://extension/home.html',
+        extensionId: 'extension-id',
+        isUnlocked: false,
+        currentScreen: 'unlock',
+        accountAddress: null,
+        networkName: null,
+        chainId: null,
+        balance: null,
+      } satisfies StepRecordObservation['state'];
+      const observation = createObservation(
+        [createNode('e1', 'listbox'), ...createOptionRun(6, 2)],
+        { state },
+      );
+
+      const result = compactObservation(observation);
+
+      expect(result.state).toBe(state);
+      expect(result.state.activeTab).toBeUndefined();
+    });
+  });
 });
 
 describe('nodeChanged', () => {

From 1287f390b342b6e7355f9be16badfaafae4a2d3f Mon Sep 17 00:00:00 2001
From: cryptotavares <joao.tavares@consensys.net>
Date: Mon, 20 Apr 2026 13:26:28 +0100
Subject: [PATCH 35/36] docs: format SKILL.md table alignment and code blocks

---
 SKILL.md | 20 +++++++++++---------
 1 file changed, 11 insertions(+), 9 deletions(-)

diff --git a/SKILL.md b/SKILL.md
index 69ad41f..c0855a9 100644
--- a/SKILL.md
+++ b/SKILL.md
@@ -25,23 +25,25 @@ mm cleanup --shutdown      # 5. Clean up when done
 
 Tool responses include different data based on the tool's category:
 
-| Category      | Examples                                                          | Observations in response?                          |
-| ------------- | ----------------------------------------------------------------- | -------------------------------------------------- |
-| **Mutating**  | click, type, navigate, launch, cleanup, build, clipboard          | Yes — `state` + `a11y` (compacted) + `testIds`    |
-| **Read-only** | get_state, get_text, knowledge\_\*, get_context, set_context      | No — faster response                               |
-| **Discovery** | describe_screen, list_testids, accessibility_snapshot, screenshot | Data is already in `result`                        |
-| **Batch**     | run_steps                                                         | Controlled by `includeObservations` param          |
+| Category      | Examples                                                          | Observations in response?                      |
+| ------------- | ----------------------------------------------------------------- | ---------------------------------------------- |
+| **Mutating**  | click, type, navigate, launch, cleanup, build, clipboard          | Yes — `state` + `a11y` (compacted) + `testIds` |
+| **Read-only** | get_state, get_text, knowledge\_\*, get_context, set_context      | No — faster response                           |
+| **Discovery** | describe_screen, list_testids, accessibility_snapshot, screenshot | Data is already in `result`                    |
+| **Batch**     | run_steps                                                         | Controlled by `includeObservations` param      |
 
 **Observation Compaction:** Mutating tool observations are **compacted** before returning: option runs of 3 or more under a combobox or listbox are replaced with a single summary node (e.g., `"55 options (refs e2–e56)"`). The `describe-screen` tool always returns the **full, unfiltered** a11y tree — use it when you need the complete option list or `priorKnowledge`.
 
 **Diff-Based Observations:** After the first mutating tool call sets a baseline, subsequent mutations return **diff-based** observations. The `observations.a11y.diff` field (when present) shows what changed:
+
 ```json
 {
-  "added": ["e4", "e5"],      // new node refs
-  "removed": ["e2"],          // disappeared node refs
-  "unchanged": 3              // count of unchanged nodes
+  "added": ["e4", "e5"], // new node refs
+  "removed": ["e2"], // disappeared node refs
+  "unchanged": 3 // count of unchanged nodes
 }
 ```
+
 The `observations.a11y.nodes` field contains **only the changed and new nodes** (not all nodes). The baseline resets after `describe-screen`, `launch`, or `cleanup` — the next mutation returns a full compact observation (no `diff` field). When the diff would be larger than the full observation, the full option-filtered observation is returned instead (no `diff` field).
 
 ### Using inline observations (mutating tools)

From 635aa5dcb6b31ef877806cd30a6e815de3e6eef1 Mon Sep 17 00:00:00 2001
From: cryptotavares <joao.tavares@consensys.net>
Date: Mon, 20 Apr 2026 13:29:27 +0100
Subject: [PATCH 36/36] chore: update testing coverage

---
 vitest.config.mts | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/vitest.config.mts b/vitest.config.mts
index 43f6708..004999a 100644
--- a/vitest.config.mts
+++ b/vitest.config.mts
@@ -35,10 +35,10 @@ export default defineConfig({
         // Auto-update the coverage thresholds when running locally.
         // Disabled in CI to prevent non-deterministic config changes.
         autoUpdate: !process.env.CI,
-        branches: 88.41,
-        functions: 91.63,
-        lines: 94.49,
-        statements: 94.23,
+        branches: 88.46,
+        functions: 91.66,
+        lines: 94.51,
+        statements: 94.26,
       },
     },