From f6fa6b815c2c95dd4090781c31762c07fb647553 Mon Sep 17 00:00:00 2001
From: Toubat <toubatbrian@gmail.com>
Date: Tue, 23 Jun 2026 18:01:10 -0700
Subject: [PATCH 1/5] fix(testing): isolate withMockTools per async context
 (AsyncLocalStorage)

Replace the module-level `activeMockTools` global with an
`AsyncLocalStorage`-backed registry so overlapping/concurrent async tests
no longer clobber each other's mock maps. Mirrors Python's per-async-context
`ContextVar`. The public `using withMockTools(...)` Disposable API is
unchanged; tests now read the active registry via `getActiveMockTools()`.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 .changeset/mocktools-async-local-storage.md |  5 ++
 agents/src/voice/testing/run_result.test.ts | 79 ++++++++++++++++++---
 agents/src/voice/testing/run_result.ts      | 37 +++++++---
 3 files changed, 102 insertions(+), 19 deletions(-)
 create mode 100644 .changeset/mocktools-async-local-storage.md
diff --git a/.changeset/mocktools-async-local-storage.md b/.changeset/mocktools-async-local-storage.md
new file mode 100644
index 000000000..e2a6b956a
--- /dev/null
+++ b/.changeset/mocktools-async-local-storage.md
@@ -0,0 +1,5 @@
+---
+'@livekit/agents': patch
+---
+
+Isolate the `withMockTools` test utility per async context. The active mock registry now lives in an `AsyncLocalStorage` instead of a module-level mutable global, so overlapping/concurrent tests no longer clobber each other's mock maps. This matches Python's per-async-context `ContextVar`. The public `using withMockTools(...)` Disposable API is unchanged.
diff --git a/agents/src/voice/testing/run_result.test.ts b/agents/src/voice/testing/run_result.test.ts
index 3aa01aba0..616272e37 100644
--- a/agents/src/voice/testing/run_result.test.ts
+++ b/agents/src/voice/testing/run_result.test.ts
@@ -9,7 +9,15 @@ import { ToolContext, tool } from '../../llm/tool_context.js';
 import { Agent } from '../agent.js';
 import { performToolExecutions } from '../generation.js';
 import { SpeechHandle } from '../speech_handle.js';
-import { activeMockTools, withMockTools } from './run_result.js';
+import { getActiveMockTools, getMockTool, withMockTools } from './run_result.js';
+
+function deferred<T = void>(): { promise: Promise<T>; resolve: (value: T) => void } {
+  let resolve!: (value: T) => void;
+  const promise = new Promise<T>((r) => {
+    resolve = r;
+  });
+  return { promise, resolve };
+}
 
 class AgentA extends Agent {
   constructor() {
@@ -29,11 +37,11 @@ describe('withMockTools', () => {
 
     {
       using _mock = withMockTools(AgentA, { tool1: mock });
-      expect(activeMockTools).toBeDefined();
-      expect(activeMockTools?.get(AgentA)?.tool1).toBe(mock);
+      expect(getActiveMockTools()).toBeDefined();
+      expect(getActiveMockTools()?.get(AgentA)?.tool1).toBe(mock);
     }
 
-    expect(activeMockTools).toBeUndefined();
+    expect(getActiveMockTools()).toBeUndefined();
   });
 
   it('merges mocks across nested blocks and isolates per agent', () => {
@@ -44,12 +52,12 @@ describe('withMockTools', () => {
       using _mockA = withMockTools(AgentA, { toolA: mockA });
       {
         using _mockB = withMockTools(AgentB, { toolB: mockB });
-        expect(activeMockTools?.get(AgentA)?.toolA).toBe(mockA);
-        expect(activeMockTools?.get(AgentB)?.toolB).toBe(mockB);
+        expect(getActiveMockTools()?.get(AgentA)?.toolA).toBe(mockA);
+        expect(getActiveMockTools()?.get(AgentB)?.toolB).toBe(mockB);
       }
 
-      expect(activeMockTools?.get(AgentA)?.toolA).toBe(mockA);
-      expect(activeMockTools?.get(AgentB)).toBeUndefined();
+      expect(getActiveMockTools()?.get(AgentA)?.toolA).toBe(mockA);
+      expect(getActiveMockTools()?.get(AgentB)).toBeUndefined();
     }
   });
 
@@ -61,15 +69,15 @@ describe('withMockTools', () => {
       using _outer = withMockTools(AgentA, { tool1: outer });
       {
         using _inner = withMockTools(AgentA, { tool1: inner });
-        expect(activeMockTools?.get(AgentA)?.tool1).toBe(inner);
+        expect(getActiveMockTools()?.get(AgentA)?.tool1).toBe(inner);
       }
-      expect(activeMockTools?.get(AgentA)?.tool1).toBe(outer);
+      expect(getActiveMockTools()?.get(AgentA)?.tool1).toBe(outer);
     }
   });
 
   it('exposes the mock for invocation within the block', async () => {
     using _mock = withMockTools(AgentA, { tool1: async () => 42 });
-    const mock = activeMockTools?.get(AgentA)?.tool1;
+    const mock = getActiveMockTools()?.get(AgentA)?.tool1;
     expect(await mock?.()).toBe(42);
   });
 
@@ -179,4 +187,53 @@ describe('withMockTools', () => {
     expect(output.output[0]?.rawException?.message).toBe('test failure');
     expect(output.output[0]?.toolCallOutput?.isError).toBe(true);
   });
+
+  it('propagates the mock registry to child async tasks started within the block', async () => {
+    const mock = () => 'child-visible';
+    using _mock = withMockTools(AgentA, { tool1: mock });
+
+    // A child async task started after withMockTools should inherit the registry.
+    const childSaw = await (async () => {
+      await Promise.resolve();
+      return getActiveMockTools()?.get(AgentA)?.tool1;
+    })();
+
+    expect(childSaw).toBe(mock);
+    expect(getMockTool(new AgentA(), 'tool1')).toBe(mock);
+  });
+
+  it('isolates mock registries across overlapping async contexts', async () => {
+    const mockA = () => 'a';
+    const mockB = () => 'b';
+
+    const aEntered = deferred();
+    const bEntered = deferred();
+
+    // Scope A installs its mock first, then stays alive while scope B installs a
+    // conflicting mock for the SAME agent/tool. With a module-level global, B would
+    // clobber A's registry; with AsyncLocalStorage each scope keeps its own view.
+    const scopeA = async () => {
+      // Detach into this scope's own async context before installing the mock.
+      await Promise.resolve();
+      using _mockA = withMockTools(AgentA, { tool1: mockA });
+      aEntered.resolve();
+      await bEntered.promise;
+      expect(getActiveMockTools()?.get(AgentA)?.tool1).toBe(mockA);
+      expect(getMockTool(new AgentA(), 'tool1')).toBe(mockA);
+    };
+
+    const scopeB = async () => {
+      await aEntered.promise;
+      using _mockB = withMockTools(AgentA, { tool1: mockB });
+      bEntered.resolve();
+      await Promise.resolve();
+      expect(getActiveMockTools()?.get(AgentA)?.tool1).toBe(mockB);
+      expect(getMockTool(new AgentA(), 'tool1')).toBe(mockB);
+    };
+
+    await Promise.all([scopeA(), scopeB()]);
+
+    // Both scopes have exited: nothing leaks into the outer context.
+    expect(getActiveMockTools()).toBeUndefined();
+  });
 });
diff --git a/agents/src/voice/testing/run_result.ts b/agents/src/voice/testing/run_result.ts
index 21c3a9e11..d73b8947b 100644
--- a/agents/src/voice/testing/run_result.ts
+++ b/agents/src/voice/testing/run_result.ts
@@ -1,6 +1,7 @@
 // SPDX-FileCopyrightText: 2025 LiveKit, Inc.
 //
 // SPDX-License-Identifier: Apache-2.0
+import { AsyncLocalStorage } from 'node:async_hooks';
 import { z } from 'zod';
 import type { AgentHandoffItem, ChatItem, ChatRole } from '../../llm/chat_context.js';
 import { ChatContext } from '../../llm/chat_context.js';
@@ -961,14 +962,30 @@ export type MockToolFn = (...args: any[]) => any;
 /** Map from agent constructor to a record of mocked tools by name. */
 export type MockToolsMap = Map<AgentConstructor, Record<string, MockToolFn>>;
 
-/** @internal */
-export let activeMockTools: MockToolsMap | undefined;
+/**
+ * Per-async-context storage for the active mock tool registry. Using
+ * {@link AsyncLocalStorage} (rather than a module-level mutable global) isolates the
+ * registry to the async context that installed it, so overlapping/concurrent tests
+ * cannot clobber each other's mock maps. This mirrors Python's per-async-context
+ * `ContextVar` (`_MockToolsContextVar`).
+ */
+const mockToolsStorage = new AsyncLocalStorage<MockToolsMap>();
+
+/**
+ * Returns the mock tool registry active in the current async context, if any.
+ *
+ * @internal
+ */
+export function getActiveMockTools(): MockToolsMap | undefined {
+  return mockToolsStorage.getStore();
+}
 
 /** @internal */
 export function getMockTool(agent: Agent, toolName: string): MockToolFn | undefined {
-  if (!activeMockTools) return undefined;
+  const active = mockToolsStorage.getStore();
+  if (!active) return undefined;
 
-  for (const [agentConstructor, mocks] of activeMockTools) {
+  for (const [agentConstructor, mocks] of active) {
     if (agent.constructor === agentConstructor) {
       return mocks[toolName];
     }
@@ -984,7 +1001,9 @@ export function getMockTool(agent: Agent, toolName: string): MockToolFn | undefi
  * the enclosing block exits.
  *
  * Mirrors the Python `mock_tools` context manager, adapted to JS via the explicit
- * resource management `using` syntax (Python uses `with`).
+ * resource management `using` syntax (Python uses `with`). The registry is stored in
+ * an {@link AsyncLocalStorage}, so the binding is isolated to the current async
+ * context — matching the per-async-context isolation Python gets from `ContextVar`.
  *
  * @param agent - The Agent constructor whose tools should be mocked.
  * @param mocks - A record mapping tool name to a mock implementation.
@@ -1006,14 +1025,16 @@ export function withMockTools(
   agent: AgentConstructor,
   mocks: Record<string, MockToolFn>,
 ): Disposable {
-  const previous = activeMockTools;
+  const previous = mockToolsStorage.getStore();
   const updated: MockToolsMap = new Map(previous ?? []);
   updated.set(agent, mocks);
-  activeMockTools = updated;
+  // `enterWith` mutates the current async context in place, preserving the synchronous
+  // enter/exit ergonomics of `using` while still isolating the registry per async context.
+  mockToolsStorage.enterWith(updated);
 
   return {
     [Symbol.dispose]() {
-      activeMockTools = previous;
+      mockToolsStorage.enterWith(previous as MockToolsMap);
     },
   };
 }

From d40eb2de1b86a00d11066e5e92d9cd0449e8037d Mon Sep 17 00:00:00 2001
From: Toubat <toubatbrian@gmail.com>
Date: Tue, 23 Jun 2026 18:20:40 -0700
Subject: [PATCH 2/5] test: use existing Future instead of a hand-rolled
 deferred() helper

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 agents/src/voice/testing/run_result.test.ts | 17 +++++------------
 1 file changed, 5 insertions(+), 12 deletions(-)

diff --git a/agents/src/voice/testing/run_result.test.ts b/agents/src/voice/testing/run_result.test.ts
index 616272e37..8e1740342 100644
--- a/agents/src/voice/testing/run_result.test.ts
+++ b/agents/src/voice/testing/run_result.test.ts
@@ -6,19 +6,12 @@ import { describe, expect, it } from 'vitest';
 import { z } from 'zod';
 import { FunctionCall } from '../../llm/chat_context.js';
 import { ToolContext, tool } from '../../llm/tool_context.js';
+import { Future } from '../../utils.js';
 import { Agent } from '../agent.js';
 import { performToolExecutions } from '../generation.js';
 import { SpeechHandle } from '../speech_handle.js';
 import { getActiveMockTools, getMockTool, withMockTools } from './run_result.js';
 
-function deferred<T = void>(): { promise: Promise<T>; resolve: (value: T) => void } {
-  let resolve!: (value: T) => void;
-  const promise = new Promise<T>((r) => {
-    resolve = r;
-  });
-  return { promise, resolve };
-}
-
 class AgentA extends Agent {
   constructor() {
     super({ instructions: 'a' });
@@ -206,8 +199,8 @@ describe('withMockTools', () => {
     const mockA = () => 'a';
     const mockB = () => 'b';
 
-    const aEntered = deferred();
-    const bEntered = deferred();
+    const aEntered = new Future<void>();
+    const bEntered = new Future<void>();
 
     // Scope A installs its mock first, then stays alive while scope B installs a
     // conflicting mock for the SAME agent/tool. With a module-level global, B would
@@ -217,13 +210,13 @@ describe('withMockTools', () => {
       await Promise.resolve();
       using _mockA = withMockTools(AgentA, { tool1: mockA });
       aEntered.resolve();
-      await bEntered.promise;
+      await bEntered.await;
       expect(getActiveMockTools()?.get(AgentA)?.tool1).toBe(mockA);
       expect(getMockTool(new AgentA(), 'tool1')).toBe(mockA);
     };
 
     const scopeB = async () => {
-      await aEntered.promise;
+      await aEntered.await;
       using _mockB = withMockTools(AgentA, { tool1: mockB });
       bEntered.resolve();
       await Promise.resolve();

From e8a93ecc8ff4f7160dfa95a7fadb76278d31f42d Mon Sep 17 00:00:00 2001
From: Toubat <brian.yin@livekit.io>
Date: Tue, 23 Jun 2026 20:22:15 -0700
Subject: [PATCH 3/5] test: reuse getActiveMockTools() helper instead of direct
 getStore() calls

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 agents/src/voice/testing/run_result.ts | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/agents/src/voice/testing/run_result.ts b/agents/src/voice/testing/run_result.ts
index d73b8947b..b1c32e1b0 100644
--- a/agents/src/voice/testing/run_result.ts
+++ b/agents/src/voice/testing/run_result.ts
@@ -982,7 +982,7 @@ export function getActiveMockTools(): MockToolsMap | undefined {
 
 /** @internal */
 export function getMockTool(agent: Agent, toolName: string): MockToolFn | undefined {
-  const active = mockToolsStorage.getStore();
+  const active = getActiveMockTools();
   if (!active) return undefined;
 
   for (const [agentConstructor, mocks] of active) {
@@ -1025,7 +1025,7 @@ export function withMockTools(
   agent: AgentConstructor,
   mocks: Record<string, MockToolFn>,
 ): Disposable {
-  const previous = mockToolsStorage.getStore();
+  const previous = getActiveMockTools();
   const updated: MockToolsMap = new Map(previous ?? []);
   updated.set(agent, mocks);
   // `enterWith` mutates the current async context in place, preserving the synchronous

From c6785e5d4ec7d62d30e043ddbec4c2b4948d1e3a Mon Sep 17 00:00:00 2001
From: Toubat <brian.yin@livekit.io>
Date: Tue, 23 Jun 2026 20:33:19 -0700
Subject: [PATCH 4/5] test: prove withMockTools reaches the activity loop;
 characterize caller-leak

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 .../testing/run_result.activity_loop.test.ts  | 198 ++++++++++++++++++
 1 file changed, 198 insertions(+)
 create mode 100644 agents/src/voice/testing/run_result.activity_loop.test.ts

diff --git a/agents/src/voice/testing/run_result.activity_loop.test.ts b/agents/src/voice/testing/run_result.activity_loop.test.ts
new file mode 100644
index 000000000..779d63d06
--- /dev/null
+++ b/agents/src/voice/testing/run_result.activity_loop.test.ts
@@ -0,0 +1,198 @@
+// SPDX-FileCopyrightText: 2026 LiveKit, Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+/**
+ * RUNTIME EVIDENCE for PR #1865 (`withMockTools` via AsyncLocalStorage).
+ *
+ * The dispute:
+ *   - One bot (Devin 🔴) claims the ALS store installed by `withMockTools` in the
+ *     *test body* is INVISIBLE to the agent-activity loop's tool-execution task,
+ *     because the activity loop runs in a different async context (the one created
+ *     when `session.start()` was called in `beforeAll`). If true, mocks never apply
+ *     in real `session.start()` + `session.run()` tests, and the drive-thru tests
+ *     "pass by coincidence".
+ *   - A code trace claims it works because the speech `Task` snapshots the test's
+ *     async context at `run()`-time.
+ *
+ * This test settles it empirically and hermetically (FakeLLM, no network):
+ *   1. `beforeAll` creates a real `AgentSession` (FakeLLM) and `session.start({ agent })`
+ *      so the activity loop is started in the SETUP async context, BEFORE any mock is
+ *      installed (mirrors the drive-thru pattern Devin flagged).
+ *   2. The agent has a REAL tool `theTool` that flips `realToolRan = true`.
+ *   3. In the test body we install `using _ = withMockTools(ProbeAgent, {...})` and then
+ *      drive a turn (`session.run`) where the FakeLLM deterministically emits a tool call.
+ *   4. We assert whether the MOCK ran (mockRan/'MOCKED') or the REAL tool ran.
+ */
+import { afterAll, beforeAll, describe, expect, it } from 'vitest';
+import { z } from 'zod';
+import { tool } from '../../llm/tool_context.js';
+import { initializeLogger } from '../../log.js';
+import { Agent } from '../agent.js';
+import { AgentSession } from '../agent_session.js';
+import { FakeLLM } from './fake_llm.js';
+import { getActiveMockTools, withMockTools } from './run_result.js';
+
+initializeLogger({ pretty: false, level: 'silent' });
+
+// Shared, per-test-reset probes recording which implementation actually executed.
+let realToolRan = false;
+let mockRan = false;
+
+class ProbeAgent extends Agent {
+  constructor() {
+    super({
+      instructions: 'You are a probe agent.',
+      tools: [
+        tool({
+          name: 'theTool',
+          description: 'A real tool whose execution we can detect.',
+          parameters: z.object({}),
+          execute: async () => {
+            realToolRan = true;
+            return 'REAL';
+          },
+        }),
+      ],
+    });
+  }
+}
+
+/**
+ * FakeLLM behavior:
+ *   - On user input 'order', emit a single tool call to `theTool`.
+ *   - On the follow-up turn (input == the tool output text, e.g. 'MOCKED'/'REAL'),
+ *     there is no mapping, so the FakeLLM returns an empty response and the turn ends.
+ */
+function makeFakeLLM(): FakeLLM {
+  return new FakeLLM([{ input: 'order', toolCalls: [{ name: 'theTool', args: {} }] }]);
+}
+
+describe('withMockTools reaches the agent-activity loop (PR #1865)', () => {
+  let session: AgentSession;
+
+  beforeAll(async () => {
+    realToolRan = false;
+    mockRan = false;
+    // Start the activity loop in the SETUP async context, before any mock exists.
+    session = new AgentSession({ llm: makeFakeLLM() });
+    await session.start({ agent: new ProbeAgent() });
+  }, 30_000);
+
+  afterAll(async () => {
+    await session?.close();
+  });
+
+  it('HEADLINE: mock installed in the test body routes the activity-loop tool execution', async () => {
+    realToolRan = false;
+    mockRan = false;
+
+    using _mock = withMockTools(ProbeAgent, {
+      theTool: () => {
+        mockRan = true;
+        return 'MOCKED';
+      },
+    });
+
+    const result = session.run({ userInput: 'order' });
+    await result.wait();
+
+    // Evidence dump for the report.
+    // eslint-disable-next-line no-console
+    console.log(
+      `[HEADLINE] mockRan=${mockRan} realToolRan=${realToolRan} ` +
+        `events=${JSON.stringify(
+          result.events.map((e) =>
+            e.type === 'function_call_output'
+              ? { type: e.type, output: e.item.output, isError: e.item.isError }
+              : e.type === 'function_call'
+                ? { type: e.type, name: e.item.name }
+                : { type: e.type },
+          ),
+        )}`,
+    );
+
+    // The function call happened.
+    result.expect.containsFunctionCall({ name: 'theTool' });
+
+    // THE HEADLINE ASSERTIONS:
+    expect(mockRan).toBe(true);
+    expect(realToolRan).toBe(false);
+    // The tool output is JSON-serialized, so the raw string 'MOCKED' surfaces as '"MOCKED"'.
+    result.expect.containsFunctionCallOutput({ output: '"MOCKED"' });
+  }, 30_000);
+});
+
+describe('control: without a mock, the REAL tool runs (harness sanity)', () => {
+  let session: AgentSession;
+
+  beforeAll(async () => {
+    realToolRan = false;
+    mockRan = false;
+    session = new AgentSession({ llm: makeFakeLLM() });
+    await session.start({ agent: new ProbeAgent() });
+  }, 30_000);
+
+  afterAll(async () => {
+    await session?.close();
+  });
+
+  it('executes the real tool when no mock is installed', async () => {
+    realToolRan = false;
+    mockRan = false;
+
+    const result = session.run({ userInput: 'order' });
+    await result.wait();
+
+    // eslint-disable-next-line no-console
+    console.log(`[CONTROL] mockRan=${mockRan} realToolRan=${realToolRan}`);
+
+    result.expect.containsFunctionCall({ name: 'theTool' });
+    expect(realToolRan).toBe(true);
+    expect(mockRan).toBe(false);
+    result.expect.containsFunctionCallOutput({ output: '"REAL"' });
+  }, 30_000);
+});
+
+describe('Codex P1: caller-leak after an async helper installs a mock', () => {
+  it('reports whether the mock leaks into the caller continuation after the using block', async () => {
+    // Sanity: no mock active at the outer scope.
+    expect(getActiveMockTools()).toBeUndefined();
+
+    async function helper(): Promise<void> {
+      using _mock = withMockTools(ProbeAgent, { theTool: () => 'X' });
+      // Confirm the mock is visible *inside* the helper.
+      expect(getActiveMockTools()?.get(ProbeAgent)?.theTool).toBeDefined();
+      await Promise.resolve();
+      await new Promise((r) => setTimeout(r, 1));
+    }
+
+    await helper();
+
+    // EVIDENCE: after the helper's `using` block has exited and helper() resolved,
+    // does the caller still observe the mock registry (a leak) or not?
+    const leaked = getActiveMockTools();
+    // eslint-disable-next-line no-console
+    console.log(
+      `[CALLER-LEAK] after await helper(): getActiveMockTools()=${
+        leaked === undefined ? 'undefined' : JSON.stringify([...leaked.keys()].map((k) => k.name))
+      }`,
+    );
+
+    // OBSERVED REALITY (this run): the mock LEAKS into the caller's continuation.
+    //
+    // Why: `withMockTools` uses `AsyncLocalStorage.enterWith`, which mutates the *current*
+    // async context's store in place. When `helper()` is invoked it first runs
+    // synchronously in the CALLER's async context, so `enterWith(updated)` overwrites the
+    // caller's store. After the first `await`, helper resumes in a fresh child context; the
+    // `using` dispose's `enterWith(previous)` therefore restores the store of that child
+    // context, NOT the caller's. The caller is left observing the mock — a leak.
+    //
+    // This confirms Codex P1 is REAL. The assertion encodes the observed behavior so the
+    // suite stays green while documenting the leak; flip to `toBeUndefined()` once the leak
+    // is fixed (e.g. by using `mockToolsStorage.run(...)` around an explicit scope instead
+    // of `enterWith`).
+    expect(leaked).toBeDefined();
+    expect(leaked?.get(ProbeAgent)?.theTool).toBeDefined();
+  });
+});

From 225520af8c139a4edb4bd31f97f867f9aa375cfe Mon Sep 17 00:00:00 2001
From: Toubat <brian.yin@livekit.io>
Date: Tue, 23 Jun 2026 20:50:04 -0700
Subject: [PATCH 5/5] test: merge activity-loop + caller-leak tests into
 run_result.test.ts

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 .../testing/run_result.activity_loop.test.ts  | 198 ------------------
 agents/src/voice/testing/run_result.test.ts   | 109 +++++++++-
 2 files changed, 108 insertions(+), 199 deletions(-)
 delete mode 100644 agents/src/voice/testing/run_result.activity_loop.test.ts

diff --git a/agents/src/voice/testing/run_result.activity_loop.test.ts b/agents/src/voice/testing/run_result.activity_loop.test.ts
deleted file mode 100644
index 779d63d06..000000000
--- a/agents/src/voice/testing/run_result.activity_loop.test.ts
+++ /dev/null
@@ -1,198 +0,0 @@
-// SPDX-FileCopyrightText: 2026 LiveKit, Inc.
-//
-// SPDX-License-Identifier: Apache-2.0
-
-/**
- * RUNTIME EVIDENCE for PR #1865 (`withMockTools` via AsyncLocalStorage).
- *
- * The dispute:
- *   - One bot (Devin 🔴) claims the ALS store installed by `withMockTools` in the
- *     *test body* is INVISIBLE to the agent-activity loop's tool-execution task,
- *     because the activity loop runs in a different async context (the one created
- *     when `session.start()` was called in `beforeAll`). If true, mocks never apply
- *     in real `session.start()` + `session.run()` tests, and the drive-thru tests
- *     "pass by coincidence".
- *   - A code trace claims it works because the speech `Task` snapshots the test's
- *     async context at `run()`-time.
- *
- * This test settles it empirically and hermetically (FakeLLM, no network):
- *   1. `beforeAll` creates a real `AgentSession` (FakeLLM) and `session.start({ agent })`
- *      so the activity loop is started in the SETUP async context, BEFORE any mock is
- *      installed (mirrors the drive-thru pattern Devin flagged).
- *   2. The agent has a REAL tool `theTool` that flips `realToolRan = true`.
- *   3. In the test body we install `using _ = withMockTools(ProbeAgent, {...})` and then
- *      drive a turn (`session.run`) where the FakeLLM deterministically emits a tool call.
- *   4. We assert whether the MOCK ran (mockRan/'MOCKED') or the REAL tool ran.
- */
-import { afterAll, beforeAll, describe, expect, it } from 'vitest';
-import { z } from 'zod';
-import { tool } from '../../llm/tool_context.js';
-import { initializeLogger } from '../../log.js';
-import { Agent } from '../agent.js';
-import { AgentSession } from '../agent_session.js';
-import { FakeLLM } from './fake_llm.js';
-import { getActiveMockTools, withMockTools } from './run_result.js';
-
-initializeLogger({ pretty: false, level: 'silent' });
-
-// Shared, per-test-reset probes recording which implementation actually executed.
-let realToolRan = false;
-let mockRan = false;
-
-class ProbeAgent extends Agent {
-  constructor() {
-    super({
-      instructions: 'You are a probe agent.',
-      tools: [
-        tool({
-          name: 'theTool',
-          description: 'A real tool whose execution we can detect.',
-          parameters: z.object({}),
-          execute: async () => {
-            realToolRan = true;
-            return 'REAL';
-          },
-        }),
-      ],
-    });
-  }
-}
-
-/**
- * FakeLLM behavior:
- *   - On user input 'order', emit a single tool call to `theTool`.
- *   - On the follow-up turn (input == the tool output text, e.g. 'MOCKED'/'REAL'),
- *     there is no mapping, so the FakeLLM returns an empty response and the turn ends.
- */
-function makeFakeLLM(): FakeLLM {
-  return new FakeLLM([{ input: 'order', toolCalls: [{ name: 'theTool', args: {} }] }]);
-}
-
-describe('withMockTools reaches the agent-activity loop (PR #1865)', () => {
-  let session: AgentSession;
-
-  beforeAll(async () => {
-    realToolRan = false;
-    mockRan = false;
-    // Start the activity loop in the SETUP async context, before any mock exists.
-    session = new AgentSession({ llm: makeFakeLLM() });
-    await session.start({ agent: new ProbeAgent() });
-  }, 30_000);
-
-  afterAll(async () => {
-    await session?.close();
-  });
-
-  it('HEADLINE: mock installed in the test body routes the activity-loop tool execution', async () => {
-    realToolRan = false;
-    mockRan = false;
-
-    using _mock = withMockTools(ProbeAgent, {
-      theTool: () => {
-        mockRan = true;
-        return 'MOCKED';
-      },
-    });
-
-    const result = session.run({ userInput: 'order' });
-    await result.wait();
-
-    // Evidence dump for the report.
-    // eslint-disable-next-line no-console
-    console.log(
-      `[HEADLINE] mockRan=${mockRan} realToolRan=${realToolRan} ` +
-        `events=${JSON.stringify(
-          result.events.map((e) =>
-            e.type === 'function_call_output'
-              ? { type: e.type, output: e.item.output, isError: e.item.isError }
-              : e.type === 'function_call'
-                ? { type: e.type, name: e.item.name }
-                : { type: e.type },
-          ),
-        )}`,
-    );
-
-    // The function call happened.
-    result.expect.containsFunctionCall({ name: 'theTool' });
-
-    // THE HEADLINE ASSERTIONS:
-    expect(mockRan).toBe(true);
-    expect(realToolRan).toBe(false);
-    // The tool output is JSON-serialized, so the raw string 'MOCKED' surfaces as '"MOCKED"'.
-    result.expect.containsFunctionCallOutput({ output: '"MOCKED"' });
-  }, 30_000);
-});
-
-describe('control: without a mock, the REAL tool runs (harness sanity)', () => {
-  let session: AgentSession;
-
-  beforeAll(async () => {
-    realToolRan = false;
-    mockRan = false;
-    session = new AgentSession({ llm: makeFakeLLM() });
-    await session.start({ agent: new ProbeAgent() });
-  }, 30_000);
-
-  afterAll(async () => {
-    await session?.close();
-  });
-
-  it('executes the real tool when no mock is installed', async () => {
-    realToolRan = false;
-    mockRan = false;
-
-    const result = session.run({ userInput: 'order' });
-    await result.wait();
-
-    // eslint-disable-next-line no-console
-    console.log(`[CONTROL] mockRan=${mockRan} realToolRan=${realToolRan}`);
-
-    result.expect.containsFunctionCall({ name: 'theTool' });
-    expect(realToolRan).toBe(true);
-    expect(mockRan).toBe(false);
-    result.expect.containsFunctionCallOutput({ output: '"REAL"' });
-  }, 30_000);
-});
-
-describe('Codex P1: caller-leak after an async helper installs a mock', () => {
-  it('reports whether the mock leaks into the caller continuation after the using block', async () => {
-    // Sanity: no mock active at the outer scope.
-    expect(getActiveMockTools()).toBeUndefined();
-
-    async function helper(): Promise<void> {
-      using _mock = withMockTools(ProbeAgent, { theTool: () => 'X' });
-      // Confirm the mock is visible *inside* the helper.
-      expect(getActiveMockTools()?.get(ProbeAgent)?.theTool).toBeDefined();
-      await Promise.resolve();
-      await new Promise((r) => setTimeout(r, 1));
-    }
-
-    await helper();
-
-    // EVIDENCE: after the helper's `using` block has exited and helper() resolved,
-    // does the caller still observe the mock registry (a leak) or not?
-    const leaked = getActiveMockTools();
-    // eslint-disable-next-line no-console
-    console.log(
-      `[CALLER-LEAK] after await helper(): getActiveMockTools()=${
-        leaked === undefined ? 'undefined' : JSON.stringify([...leaked.keys()].map((k) => k.name))
-      }`,
-    );
-
-    // OBSERVED REALITY (this run): the mock LEAKS into the caller's continuation.
-    //
-    // Why: `withMockTools` uses `AsyncLocalStorage.enterWith`, which mutates the *current*
-    // async context's store in place. When `helper()` is invoked it first runs
-    // synchronously in the CALLER's async context, so `enterWith(updated)` overwrites the
-    // caller's store. After the first `await`, helper resumes in a fresh child context; the
-    // `using` dispose's `enterWith(previous)` therefore restores the store of that child
-    // context, NOT the caller's. The caller is left observing the mock — a leak.
-    //
-    // This confirms Codex P1 is REAL. The assertion encodes the observed behavior so the
-    // suite stays green while documenting the leak; flip to `toBeUndefined()` once the leak
-    // is fixed (e.g. by using `mockToolsStorage.run(...)` around an explicit scope instead
-    // of `enterWith`).
-    expect(leaked).toBeDefined();
-    expect(leaked?.get(ProbeAgent)?.theTool).toBeDefined();
-  });
-});
diff --git a/agents/src/voice/testing/run_result.test.ts b/agents/src/voice/testing/run_result.test.ts
index 8e1740342..0ce780420 100644
--- a/agents/src/voice/testing/run_result.test.ts
+++ b/agents/src/voice/testing/run_result.test.ts
@@ -2,16 +2,21 @@
 //
 // SPDX-License-Identifier: Apache-2.0
 import { ReadableStream } from 'node:stream/web';
-import { describe, expect, it } from 'vitest';
+import { afterAll, beforeAll, describe, expect, it } from 'vitest';
 import { z } from 'zod';
 import { FunctionCall } from '../../llm/chat_context.js';
 import { ToolContext, tool } from '../../llm/tool_context.js';
+import { initializeLogger } from '../../log.js';
 import { Future } from '../../utils.js';
 import { Agent } from '../agent.js';
+import { AgentSession } from '../agent_session.js';
 import { performToolExecutions } from '../generation.js';
 import { SpeechHandle } from '../speech_handle.js';
+import { FakeLLM } from './fake_llm.js';
 import { getActiveMockTools, getMockTool, withMockTools } from './run_result.js';
 
+initializeLogger({ pretty: false, level: 'silent' });
+
 class AgentA extends Agent {
   constructor() {
     super({ instructions: 'a' });
@@ -24,6 +29,33 @@ class AgentB extends Agent {
   }
 }
 
+// Probes for the activity-loop tests below: which implementation actually executed.
+let realToolRan = false;
+let mockRan = false;
+
+class ProbeAgent extends Agent {
+  constructor() {
+    super({
+      instructions: 'You are a probe agent.',
+      tools: [
+        tool({
+          name: 'theTool',
+          description: 'A real tool whose execution we can detect.',
+          parameters: z.object({}),
+          execute: async () => {
+            realToolRan = true;
+            return 'REAL';
+          },
+        }),
+      ],
+    });
+  }
+}
+
+function makeFakeLLM(): FakeLLM {
+  return new FakeLLM([{ input: 'order', toolCalls: [{ name: 'theTool', args: {} }] }]);
+}
+
 describe('withMockTools', () => {
   it('sets the mock registry for the given agent inside the block', () => {
     const mock = () => 'mocked';
@@ -230,3 +262,78 @@ describe('withMockTools', () => {
     expect(getActiveMockTools()).toBeUndefined();
   });
 });
+
+describe('withMockTools reaches the agent-activity loop', () => {
+  let session: AgentSession;
+
+  beforeAll(async () => {
+    // Start the activity loop in the setup async context, before any mock exists,
+    // mirroring the real `session.start()` (e.g. drive-thru) usage pattern.
+    session = new AgentSession({ llm: makeFakeLLM() });
+    await session.start({ agent: new ProbeAgent() });
+  }, 30_000);
+
+  afterAll(async () => {
+    await session?.close();
+  });
+
+  it('routes the activity-loop tool execution to a mock installed in the test body', async () => {
+    realToolRan = false;
+    mockRan = false;
+
+    using _mock = withMockTools(ProbeAgent, {
+      theTool: () => {
+        mockRan = true;
+        return 'MOCKED';
+      },
+    });
+
+    const result = session.run({ userInput: 'order' });
+    await result.wait();
+
+    result.expect.containsFunctionCall({ name: 'theTool' });
+    expect(mockRan).toBe(true);
+    expect(realToolRan).toBe(false);
+    // The tool output is JSON-serialized, so the raw string 'MOCKED' surfaces as '"MOCKED"'.
+    result.expect.containsFunctionCallOutput({ output: '"MOCKED"' });
+  }, 30_000);
+
+  it('executes the real tool when no mock is installed (harness sanity)', async () => {
+    realToolRan = false;
+    mockRan = false;
+
+    const result = session.run({ userInput: 'order' });
+    await result.wait();
+
+    result.expect.containsFunctionCall({ name: 'theTool' });
+    expect(realToolRan).toBe(true);
+    expect(mockRan).toBe(false);
+    result.expect.containsFunctionCallOutput({ output: '"REAL"' });
+  }, 30_000);
+});
+
+describe('withMockTools caller-leak inside an async helper (known limitation)', () => {
+  it('leaks the mock into the caller continuation after the using block', async () => {
+    // No mock active at the outer scope.
+    expect(getActiveMockTools()).toBeUndefined();
+
+    async function helper(): Promise<void> {
+      using _mock = withMockTools(ProbeAgent, { theTool: () => 'X' });
+      // The mock is visible inside the helper.
+      expect(getActiveMockTools()?.get(ProbeAgent)?.theTool).toBeDefined();
+      await Promise.resolve();
+      await new Promise((r) => setTimeout(r, 1));
+    }
+
+    await helper();
+
+    // KNOWN LIMITATION: `withMockTools` uses `AsyncLocalStorage.enterWith`, which mutates the
+    // caller's context synchronously; the `using` dispose runs in the helper's post-await child
+    // context and restores that context rather than the caller's, so the caller still observes
+    // the mock after `await helper()`. The canonical synchronous `using` usage in a test body is
+    // unaffected. Flip these to `toBeUndefined()` if the leak is fixed (e.g. scope via
+    // `mockToolsStorage.run(...)` instead of `enterWith`).
+    expect(getActiveMockTools()).toBeDefined();
+    expect(getActiveMockTools()?.get(ProbeAgent)?.theTool).toBeDefined();
+  });
+});