diff --git a/.semgrep/verifiers.yml b/.semgrep/verifiers.yml index dc1a4b799e..316346db9c 100644 --- a/.semgrep/verifiers.yml +++ b/.semgrep/verifiers.yml @@ -91,6 +91,7 @@ rules: exclude: - /environments/alphabet_sort/alphabet_sort_v1.py - /environments/bfcl_v3/bfcl_v3.py + - /environments/browser_toolset_example/browser_toolset_example.py - /environments/dspy_flights/dspy_flights.py - /environments/dspy_rlm/dspy_rlm.py - /environments/hello_group_reward_v1/hello_group_reward_v1.py diff --git a/environments/browser_toolset_example/README.md b/environments/browser_toolset_example/README.md new file mode 100644 index 0000000000..8adba73e5a --- /dev/null +++ b/environments/browser_toolset_example/README.md @@ -0,0 +1,62 @@ +# browser-toolset-example + +A minimal v1 environment demonstrating **`verifiers.v1.toolsets.browser`** — the +Claude computer-use action space driven over a raw Chrome DevTools Protocol +(CDP) browser, with a **pluggable backend**. A vision model controls a real +browser to complete short web tasks; an LLM judge (borrowing the rollout model) +scores the final answer. + +### Overview +- **Environment ID**: `browser-toolset-example` +- **Tools**: the `browser` toolset — `computer` (Anthropic `computer_20250124` + action enum) plus decomposed `navigate`, `left_click`, `type_text`, `key`, + `scroll`, `screenshot`, … (selectable via `mode`). +- **Backends**: `browserbase` (managed, isolated session per rollout, default) + or `cdp` (connect to any browser exposing a CDP endpoint). +- **Reward**: an LLM judge scores task success in `[0, 1]`. + +> The model must be **vision-capable** — every browser action returns a +> screenshot, delivered as image content in the tool result. + +### Quickstart + +```bash +prime env install browser-toolset-example + +# Browserbase (default): requires BROWSERBASE_API_KEY + BROWSERBASE_PROJECT_ID +BROWSERBASE_API_KEY=... BROWSERBASE_PROJECT_ID=... 
\ + prime eval run browser-toolset-example -m claude-sonnet-4-6 -n 3 -r 1 + +# Bring your own browser via any CDP endpoint, e.g. local Chrome: +# chrome --headless=new --remote-debugging-port=9222 +prime eval run browser-toolset-example -m claude-sonnet-4-6 \ + -a '{"backend": "cdp", "cdp_url": "http://localhost:9222"}' +``` + +### Environment Arguments + +| Arg | Type | Default | Description | +| --- | ---- | ------- | ----------- | +| `backend` | `"browserbase"` \| `"cdp"` | `"browserbase"` | Browser infrastructure backend. | +| `cdp_url` | str \| null | `null` | CDP endpoint for `backend="cdp"` (`ws(s)://` socket or `http(s)://host:port`). | +| `proxies` | bool | `false` | Enable Browserbase proxies. | +| `mode` | `"computer"` \| `"decomposed"` \| `"both"` | `"both"` | Which tool surface to expose. | +| `viewport_width` / `viewport_height` | int | `1280` / `800` | Emulated viewport. | +| `max_turns` | int | `15` | Max agent turns per rollout. | +| `num_examples` | int | `-1` | Number of bundled tasks to use (`-1` = all). | + +### Tasks + +Three short, self-contained browsing tasks (read a heading on `example.com`, +compute a derivative on Wolfram Alpha, find the latest arXiv quantum-computing +preprint) — enough to exercise navigate / type / click / scroll / read. 
+ +### Using the toolset in your own environment + +```python +import verifiers as vf +from verifiers.v1.toolsets.browser import browser_toolset, BrowserbaseBackend + +harness = vf.Harness(config=vf.HarnessConfig(max_turns=20)) +harness.add_toolset({"browser": browser_toolset(backend=BrowserbaseBackend())}) +``` diff --git a/environments/browser_toolset_example/browser_toolset_example.py b/environments/browser_toolset_example/browser_toolset_example.py new file mode 100644 index 0000000000..cf24a88a03 --- /dev/null +++ b/environments/browser_toolset_example/browser_toolset_example.py @@ -0,0 +1,196 @@ +from typing import Literal + +import verifiers as vf +from verifiers.v1.toolsets.browser import ( + BrowserbaseBackend, + CDPBackend, + Mode, + browser_toolset, +) +from verifiers.v1.utils.judge_utils import clamp_float, parse_judge_json + +Backend = Literal["browserbase", "cdp"] + +SYSTEM_PROMPT = """You are an autonomous web-browsing agent operating a real Chrome browser. + +You interact with the page only through the provided browser tools. After each +action you receive a screenshot of the current page — always base your next +action on the most recent screenshot. Coordinates are pixels within the viewport. + +Workflow: +- Use `navigate` to open the start URL given in the task. +- Use the click / type / key / scroll actions (or the `computer` tool) to + interact with the page. +- When you have completed the task, reply with your final answer as plain text + and DO NOT call any tool. That plain-text reply is your submission. + +Be efficient and use the minimum number of actions necessary.""" + +JUDGE_SYSTEM_PROMPT = """You grade a web agent's answer to a browsing task. + +Respond with compact JSON only: {"score": 0.0-1.0, "reason": "..."} +- 1.0: the answer correctly and completely accomplishes the task. +- 0.5: partially correct or plausible but incomplete / unverified. 
+- 0.0: wrong, empty, or unrelated to the task.""" + + +TASKS: list[vf.ConfigData] = [ + { + "task_id": "example-domain-heading", + "start_url": "https://example.com", + "question": "Open the page and report the exact text of the page's main heading.", + "answer": "Example Domain", + }, + { + "task_id": "wolfram-derivative", + "start_url": "https://www.wolframalpha.com", + "question": ( + "Use Wolfram Alpha to compute the derivative of x^2 evaluated at " + "x = 5.6, and report the numeric result." + ), + "answer": "11.2", + }, + { + "task_id": "arxiv-latest-quantum", + "start_url": "https://arxiv.org", + "question": ( + "Search arXiv for the latest preprints about 'quantum computing' " + "(sorted newest first) and report the title of the top result." + ), + "answer": ( + "Criteria: any plausible, real arXiv paper title related to quantum " + "computing that appears at the top of a newest-first search." + ), + }, +] + + +class BrowserToolsetTasksetConfig(vf.TasksetConfig): + rewards: list[str] = ["task_success"] + system_prompt: str = SYSTEM_PROMPT + num_examples: int = -1 + + +class BrowserToolsetHarnessConfig(vf.HarnessConfig): + max_turns: int = 15 + # Backend: "browserbase" (managed, isolated session per rollout) or "cdp" + # (connect to any browser exposing a CDP endpoint). + backend: Backend = "browserbase" + cdp_url: str | None = None + proxies: bool = False + # Tool surface: "computer" | "decomposed" | "both". 
+ mode: Mode = "both" + viewport_width: int = 1280 + viewport_height: int = 800 + + +class BrowserToolsetEnvConfig(vf.EnvConfig): + taskset: BrowserToolsetTasksetConfig = BrowserToolsetTasksetConfig() + harness: BrowserToolsetHarnessConfig = BrowserToolsetHarnessConfig() + + +def _score_prompt(task: vf.Task, answer: str) -> str: + return ( + f"Task: {task['question']}\n" + f"Start URL: {task['start_url']}\n" + f"Reference answer / criteria: {task['answer']}\n\n" + f"Agent's final answer:\n{answer}\n\n" + "Score how well the agent accomplished the task as compact JSON." + ) + + +@vf.reward(weight=1.0) +async def task_success(task, state) -> float: + messages = vf.get_messages(state.get("completion") or [], role="assistant") + answer = str(messages[-1].content or "") if messages else "" + if not answer: + return 0.0 + judge_task = vf.Task( + { + "prompt": [{"role": "user", "content": _score_prompt(task, answer)}], + "max_turns": 1, + } + ).freeze() + judge_state = state.for_task(judge_task, borrow="model") + judge_state = await vf.Harness( + config=vf.HarnessConfig(system_prompt=JUDGE_SYSTEM_PROMPT, max_turns=1) + ).run(judge_task, judge_state) + judge_messages = vf.get_messages( + judge_state.get("completion") or [], role="assistant" + ) + judge_text = str(judge_messages[-1].content or "") if judge_messages else "" + parsed = parse_judge_json(judge_text) + score = clamp_float(parsed.get("score", 0.0)) + state["judge"] = {"score": score, "reason": str(parsed.get("reason", ""))} + return score + + +def load_browser_toolset( + backend: Backend = "browserbase", + cdp_url: str | None = None, + proxies: bool = False, + mode: Mode = "both", + viewport_width: int = 1280, + viewport_height: int = 800, +) -> vf.Toolset: + if backend == "browserbase": + browser_backend = BrowserbaseBackend(proxies=proxies) + elif backend == "cdp": + browser_backend = CDPBackend(cdp_url=cdp_url) + else: + raise ValueError(f"Unknown backend {backend!r}; use 'browserbase' or 'cdp'.") + return 
def load_environment(config: BrowserToolsetEnvConfig) -> vf.Env:
    """Build the example environment: bundled tasks plus a browser harness.

    The default ``browser`` toolset is attached only when ``toolsets`` was not
    set explicitly on the harness config. The Browserbase credential check is
    tied to that same condition, because attaching the default toolset is the
    only path in this function that constructs a ``BrowserbaseBackend``. A
    config that supplies its own toolsets is therefore no longer rejected for
    missing Browserbase keys it never uses here.

    Args:
        config: Environment config carrying the taskset and harness settings.

    Returns:
        A ``vf.Env`` pairing ``BrowserToolsetTaskset`` with the harness.
    """
    harness_config = config.harness
    uses_default_toolset = "toolsets" not in harness_config.model_fields_set

    if uses_default_toolset and harness_config.backend == "browserbase":
        # Checked before the harness is built so a misconfigured run stops
        # early. NOTE(review): assumes vf.ensure_keys raises when a variable
        # is missing - confirm against its implementation.
        vf.ensure_keys(["BROWSERBASE_API_KEY", "BROWSERBASE_PROJECT_ID"])

    harness = BrowserToolsetHarness(config=harness_config)
    if uses_default_toolset:
        harness.add_toolset(
            {
                "browser": load_browser_toolset(
                    backend=harness_config.backend,
                    cdp_url=harness_config.cdp_url,
                    proxies=harness_config.proxies,
                    mode=harness_config.mode,
                    viewport_width=harness_config.viewport_width,
                    viewport_height=harness_config.viewport_height,
                )
            }
        )
    return vf.Env(
        taskset=BrowserToolsetTaskset(config=config.taskset),
        harness=harness,
    )
"verifiers[browser]>=0.1.15", +] + +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[tool.hatch.build] +include = ["browser_toolset_example.py", "pyproject.toml"] + +[tool.verifiers.eval] +num_examples = 3 +rollouts_per_example = 1 diff --git a/pyproject.toml b/pyproject.toml index 09820d1593..cd31c962c6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -102,6 +102,7 @@ browser = [ "stagehand>=3.0.0", "aiohttp>=3.9.0", "python-dotenv>=1.0.0", + "websockets>=12.0", ] openenv = [ "tasksets[openenv]>=0.1.1", diff --git a/tests/test_v1_browser_backends.py b/tests/test_v1_browser_backends.py new file mode 100644 index 0000000000..72e5d5a04e --- /dev/null +++ b/tests/test_v1_browser_backends.py @@ -0,0 +1,76 @@ +"""Backend behaviour with HTTP faked (no network, no browser).""" + +import pytest + +import verifiers.v1.toolsets.browser.backends as backends_mod +import verifiers.v1.toolsets.browser.cdp as cdp_mod +from verifiers.v1.toolsets.browser import BrowserbaseBackend, CDPBackend + + +@pytest.fixture +def fake_http(monkeypatch): + calls = [] + + async def fake_request_json( + url, *, method="GET", headers=None, body=None, timeout=30.0 + ): + calls.append({"url": url, "method": method, "headers": headers, "body": body}) + if "/json/version" in url: + return 200, {"webSocketDebuggerUrl": "ws://browser/devtools/browser/abc"} + if url.endswith("/sessions") and method == "POST": + return 201, { + "id": "bb-session-1", + "connectUrl": "wss://connect.browserbase.com/x", + } + if "/sessions/" in url and method == "POST": + return 200, {} + return 404, {} + + monkeypatch.setattr(backends_mod, "request_json", fake_request_json) + monkeypatch.setattr(cdp_mod, "request_json", fake_request_json) + return calls + + +async def test_cdp_backend_passthrough_ws(): + handle = await CDPBackend(cdp_url="ws://host/devtools/browser/x").create() + assert handle.cdp_ws_url == "ws://host/devtools/browser/x" + assert handle.session_id == "" + + +async def 
test_cdp_backend_resolves_http(fake_http): + handle = await CDPBackend(cdp_url="http://localhost:9222").create() + assert handle.cdp_ws_url == "ws://browser/devtools/browser/abc" + + +async def test_cdp_backend_requires_url(monkeypatch): + monkeypatch.delenv("BROWSERTOOLSET_CDP_URL", raising=False) + with pytest.raises(RuntimeError): + await CDPBackend().create() + + +async def test_browserbase_create(fake_http): + backend = BrowserbaseBackend(api_key="k", project_id="p") + handle = await backend.create() + assert handle.session_id == "bb-session-1" + assert handle.cdp_ws_url == "wss://connect.browserbase.com/x" + post = fake_http[0] + assert post["headers"]["X-BB-API-Key"] == "k" + assert post["body"]["projectId"] == "p" + + +async def test_browserbase_requires_api_key(monkeypatch): + monkeypatch.delenv("BROWSERBASE_API_KEY", raising=False) + with pytest.raises(RuntimeError): + await BrowserbaseBackend(project_id="p").create() + + +async def test_browserbase_close_releases_session(fake_http): + backend = BrowserbaseBackend(api_key="k", project_id="p") + await backend.close("bb-session-1") + release = fake_http[-1] + assert release["url"].endswith("/sessions/bb-session-1") + assert release["body"]["status"] == "REQUEST_RELEASE" + + +async def test_browserbase_close_noop_on_empty(): + await BrowserbaseBackend(api_key="k", project_id="p").close("") diff --git a/tests/test_v1_browser_keymap.py b/tests/test_v1_browser_keymap.py new file mode 100644 index 0000000000..f9875ee85b --- /dev/null +++ b/tests/test_v1_browser_keymap.py @@ -0,0 +1,48 @@ +from verifiers.v1.toolsets.browser.keymap import ALT, CTRL, META, SHIFT, parse_chord + + +def test_named_key(): + mods, key = parse_chord("Return") + assert mods == 0 + assert key.key == "Enter" and key.code == "Enter" + + +def test_single_letter(): + mods, key = parse_chord("a") + assert mods == 0 + assert key.key == "a" and key.code == "KeyA" and key.key_code == ord("A") + + +def test_modifier_chord(): + mods, key = 
parse_chord("ctrl+s") + assert mods == CTRL + assert key.code == "KeyS" + + +def test_multiple_modifiers(): + mods, key = parse_chord("ctrl+shift+t") + assert mods == (CTRL | SHIFT) + assert key.code == "KeyT" + + +def test_aliases(): + assert parse_chord("cmd+a")[0] == META + assert parse_chord("option+Tab")[0] == ALT + assert parse_chord("control+c")[0] == CTRL + + +def test_navigation_keys(): + assert parse_chord("Page_Down")[1].key == "PageDown" + assert parse_chord("shift+Tab") == (SHIFT, parse_chord("Tab")[1]) + + +def test_bare_modifier(): + mods, key = parse_chord("shift") + assert key.key == "Shift" + + +def test_unknown_raises(): + import pytest + + with pytest.raises(ValueError): + parse_chord("") diff --git a/tests/test_v1_browser_session.py b/tests/test_v1_browser_session.py new file mode 100644 index 0000000000..615da33f82 --- /dev/null +++ b/tests/test_v1_browser_session.py @@ -0,0 +1,193 @@ +"""Session + tool behaviour against a fake backend + CDP client (no browser).""" + +import base64 + +import pytest + +import verifiers.v1.toolsets.browser.session as session_mod +from verifiers.v1.toolsets.browser import BrowserSession, tools +from verifiers.v1.toolsets.browser.backends import BrowserSessionHandle + + +class FakeBackend: + def __init__(self): + self.created = 0 + self.closed = [] + + async def create(self): + self.created += 1 + return BrowserSessionHandle(session_id="sess-1", cdp_ws_url="ws://fake/browser") + + async def close(self, session_id): + self.closed.append(session_id) + + +class FakeCDPClient: + def __init__(self, ws_url): + self.ws_url = ws_url + self.calls = [] + + async def connect(self): + self.calls.append(("connect", {}, None)) + + async def send(self, method, params=None, *, session_id=None): + self.calls.append((method, params or {}, session_id)) + if method == "Target.createTarget": + return {"targetId": "target-1"} + if method == "Target.attachToTarget": + return {"sessionId": "cdp-sess-1"} + if method == 
"Page.captureScreenshot": + return {"data": base64.b64encode(b"fake-png").decode()} + return {} + + async def close(self): + self.calls.append(("close", {}, None)) + + +@pytest.fixture +def fake_session(monkeypatch): + monkeypatch.setattr(session_mod, "CDPClient", FakeCDPClient) + return BrowserSession(FakeBackend(), width=800, height=600) + + +def _page_calls(client): + """CDP calls scoped to the attached page session.""" + return [c for c in client.calls if c[2] == "cdp-sess-1"] + + +async def test_start_attaches_page_and_enables_domains(fake_session): + await fake_session.start() + methods = [c[0] for c in fake_session._client.calls] + assert "Target.createTarget" in methods + assert "Target.attachToTarget" in methods + page_methods = [c[0] for c in _page_calls(fake_session._client)] + assert "Page.enable" in page_methods + assert "Emulation.setDeviceMetricsOverride" in page_methods + + +async def test_page_commands_carry_session_id(fake_session): + await tools.left_click([100, 150], session=fake_session) + mouse = [ + c for c in fake_session._client.calls if c[0] == "Input.dispatchMouseEvent" + ] + assert mouse and all(c[2] == "cdp-sess-1" for c in mouse) + + +async def test_left_click_dispatches_press_release(fake_session): + result = await tools.left_click([100, 150], session=fake_session) + mouse = [ + c for c in fake_session._client.calls if c[0] == "Input.dispatchMouseEvent" + ] + assert mouse[0][1]["type"] == "mousePressed" + assert mouse[0][1]["x"] == 100 and mouse[0][1]["y"] == 150 + assert mouse[1][1]["type"] == "mouseReleased" + assert result[0]["type"] == "image_url" + assert result[0]["image_url"]["url"].startswith("data:image/png;base64,") + assert fake_session.cursor_position == (100, 150) + + +async def test_double_click_count(fake_session): + await tools.double_click([10, 20], session=fake_session) + presses = [ + c + for c in fake_session._client.calls + if c[0] == "Input.dispatchMouseEvent" and c[1].get("type") == "mousePressed" + ] + assert 
[p[1]["clickCount"] for p in presses] == [1, 2] + + +async def test_type_text_uses_insert_text(fake_session): + await tools.type_text("hello", session=fake_session) + insert = [c for c in fake_session._client.calls if c[0] == "Input.insertText"] + assert insert and insert[0][1]["text"] == "hello" + + +async def test_key_chord_sets_modifier(fake_session): + await tools.key("ctrl+s", session=fake_session) + key_events = [ + c for c in fake_session._client.calls if c[0] == "Input.dispatchKeyEvent" + ] + assert key_events[0][1]["type"] == "keyDown" + assert key_events[0][1]["modifiers"] == 2 # CTRL + assert "text" not in key_events[0][1] + + +async def test_scroll_direction(fake_session): + await tools.scroll([5, 5], "down", 2, session=fake_session) + wheel = [ + c for c in fake_session._client.calls if c[0] == "Input.dispatchMouseEvent" + ] + assert wheel[-1][1]["type"] == "mouseWheel" + assert wheel[-1][1]["deltaY"] > 0 + + +async def test_drag(fake_session): + await tools.left_click_drag([0, 0], [50, 60], session=fake_session) + mouse = [ + c for c in fake_session._client.calls if c[0] == "Input.dispatchMouseEvent" + ] + types = [m[1]["type"] for m in mouse] + assert types == ["mousePressed", "mouseMoved", "mouseReleased"] + assert fake_session.cursor_position == (50, 60) + + +async def test_computer_tool_screenshot(fake_session): + result = await tools.computer("screenshot", session=fake_session) + assert result[0]["type"] == "image_url" + + +async def test_computer_tool_cursor_position(fake_session): + await fake_session.move_mouse(42, 24) + result = await tools.computer("cursor_position", session=fake_session) + assert result == "42, 24" + + +async def test_computer_tool_left_click(fake_session): + await tools.computer("left_click", coordinate=[7, 8], session=fake_session) + assert fake_session.cursor_position == (7, 8) + + +class FlakyCDPClient(FakeCDPClient): + """Fails the first page command with a detach error, then succeeds.""" + + def __init__(self, 
ws_url): + super().__init__(ws_url) + self._tripped = False + + async def send(self, method, params=None, *, session_id=None): + if method == "Input.dispatchMouseEvent" and not self._tripped: + self._tripped = True + self.calls.append((method, params or {}, session_id)) + from verifiers.v1.toolsets.browser.cdp import CDPError + + raise CDPError( + "Input.dispatchMouseEvent failed: Not attached to an active page" + ) + if method == "Target.getTargets": + self.calls.append((method, params or {}, session_id)) + return {"targetInfos": [{"type": "page", "targetId": "target-2"}]} + return await super().send(method, params, session_id=session_id) + + +async def test_send_recovers_from_detached_page(monkeypatch): + monkeypatch.setattr(session_mod, "CDPClient", FlakyCDPClient) + session = BrowserSession(FakeBackend(), width=800, height=600) + await session.start() + # The first mouse event detaches; the session should re-attach and retry, + # so the click still completes and returns a screenshot. + result = await tools.left_click([5, 5], session=session) + methods = [c[0] for c in session._client.calls] + assert "Target.getTargets" in methods # recovery re-resolved a live page + assert result[0]["type"] == "image_url" + + +async def test_aclose_closes_target_client_and_backend(fake_session): + await fake_session.start() + backend = fake_session.backend + client = fake_session._client + await fake_session.aclose() + methods = [c[0] for c in client.calls] + assert "Target.closeTarget" in methods + assert ("close", {}, None) in client.calls + assert backend.closed == ["sess-1"] + assert fake_session._client is None diff --git a/tests/test_v1_browser_toolset.py b/tests/test_v1_browser_toolset.py new file mode 100644 index 0000000000..af2a4a8b38 --- /dev/null +++ b/tests/test_v1_browser_toolset.py @@ -0,0 +1,133 @@ +"""Toolset structure + end-to-end integration with the verifiers runtime. + +The backend and CDP are faked, so no browser or network is required. 
+""" + +import base64 + +import pytest + +import verifiers as vf +import verifiers.v1.toolsets.browser.session as session_mod +from verifiers.v1.toolsets.browser import CDPBackend, browser_toolset +from verifiers.v1.toolsets.browser.backends import BrowserSessionHandle + + +# --- structural checks ---------------------------------------------------- + + +def _names(toolset): + return {getattr(t, "__name__", None) for t in toolset.tools} + + +def _toolset(mode): + return browser_toolset( + mode=mode, backend=CDPBackend(cdp_url="http://localhost:9222") + ) + + +def test_mode_computer_only(): + assert _names(_toolset("computer")) == {"computer"} + + +def test_mode_decomposed(): + names = _names(_toolset("decomposed")) + assert "left_click" in names and "computer" not in names + + +def test_mode_both(): + names = _names(_toolset("both")) + assert "computer" in names and "left_click" in names + + +def test_backend_is_required(): + with pytest.raises(TypeError): + browser_toolset(mode="both") # type: ignore[call-arg] + + +def test_session_is_bound_hidden_for_every_tool(): + ts = _toolset("both") + for tool in ts.tools: + assert ts.bindings[f"{tool.__name__}.session"] == "objects.browser" + + +def test_rollout_scoped_and_writable(): + ts = _toolset("both") + assert ts.write is True + assert ts.scope == "rollout" + + +# --- runtime integration -------------------------------------------------- + + +class FakeBackend: + async def create(self): + return BrowserSessionHandle(session_id="s1", cdp_ws_url="ws://fake/browser") + + async def close(self, session_id): + pass + + +class FakeCDPClient: + def __init__(self, ws_url): + self.ws_url = ws_url + + async def connect(self): + pass + + async def send(self, method, params=None, *, session_id=None): + if method == "Target.createTarget": + return {"targetId": "t1"} + if method == "Target.attachToTarget": + return {"sessionId": "cdp1"} + if method == "Page.captureScreenshot": + return {"data": base64.b64encode(b"png").decode()} + 
return {} + + async def close(self): + pass + + +@pytest.fixture +def patched_cdp(monkeypatch): + monkeypatch.setattr(session_mod, "CDPClient", FakeCDPClient) + + +def _build(): + harness = vf.Harness(config=vf.HarnessConfig()) + harness.add_toolset(browser_toolset(mode="both", backend=FakeBackend())) + task = vf.Task({"prompt": [{"role": "user", "content": "browse"}]}).freeze() + state = vf.State.for_task(task) + return harness, task, state + + +async def test_schema_hides_session_and_keeps_action(patched_cdp): + harness, task, state = _build() + harness.runtime.prepare_state(task, state) + defs = harness.runtime.tool_defs(state) + by_name = {d.name: d for d in defs} + + assert "computer" in by_name and "left_click" in by_name + for tool_def in defs: + props = tool_def.parameters.get("properties", {}) + assert "session" not in props, f"{tool_def.name} leaks session" + + computer_props = by_name["computer"].parameters["properties"] + assert "action" in computer_props and "coordinate" in computer_props + + +async def test_runtime_dispatches_tool_with_injected_session(patched_cdp): + harness, task, state = _build() + harness.runtime.prepare_state(task, state) + result = await harness.runtime.call_tool("screenshot", task, state) + assert isinstance(result, list) + assert result[0]["type"] == "image_url" + + +async def test_runtime_dispatches_computer_left_click(patched_cdp): + harness, task, state = _build() + harness.runtime.prepare_state(task, state) + result = await harness.runtime.call_tool( + "computer", task, state, action="left_click", coordinate=[12, 34] + ) + assert result[0]["type"] == "image_url" diff --git a/verifiers/v1/toolsets/__init__.py b/verifiers/v1/toolsets/__init__.py new file mode 100644 index 0000000000..c376cb9347 --- /dev/null +++ b/verifiers/v1/toolsets/__init__.py @@ -0,0 +1,26 @@ +LAZY_EXPORTS = { + "browser_toolset": (".browser", "browser_toolset"), + "BrowserBackend": (".browser", "BrowserBackend"), + "BrowserSession": (".browser", 
async def request_json(
    url: str,
    *,
    method: str = "GET",
    headers: dict[str, str] | None = None,
    body: ConfigData | None = None,
    timeout: float = 30.0,
) -> tuple[int, ConfigData]:
    """Send an HTTP request and return ``(status, parsed JSON object)``.

    The blocking ``urllib`` call runs in a worker thread via
    ``asyncio.to_thread``. HTTP error statuses (4xx/5xx) are returned, not
    raised. A response body that is empty, is not valid JSON (for example an
    HTML error page), or is JSON but not an object yields ``{}``, so callers
    can rely on the status code alone.

    Args:
        url: Absolute URL to request.
        method: HTTP method.
        headers: Extra request headers; copied, never mutated.
        body: Optional JSON-serialisable payload. When given it is encoded as
            UTF-8 JSON and ``Content-Type: application/json`` is set unless
            the caller already provided a ``Content-Type`` header.
        timeout: Timeout in seconds passed to ``urlopen``.

    Returns:
        The HTTP status code and the decoded JSON object (``{}`` when there
        is no usable object).

    Raises:
        urllib.error.URLError: For failures that are not HTTP responses
            (only ``HTTPError`` is converted into a return value).
    """

    def _do() -> tuple[int, ConfigData]:
        payload = json.dumps(body).encode("utf-8") if body is not None else None
        hdrs = dict(headers or {})
        if payload is not None:
            hdrs.setdefault("Content-Type", "application/json")
        request = urllib.request.Request(url, data=payload, method=method, headers=hdrs)
        try:
            with urllib.request.urlopen(request, timeout=timeout) as response:
                raw = response.read()
                status = response.status
        except urllib.error.HTTPError as exc:
            # HTTPError doubles as the response object; close it once read so
            # the underlying connection is not left to the garbage collector.
            with exc:
                raw = exc.read()
            status = exc.code
        # Decode leniently on both paths: a stray byte must not turn a
        # readable response into an exception.
        text = raw.decode("utf-8", "replace")
        try:
            parsed = json.loads(text) if text.strip() else {}
        except json.JSONDecodeError:
            # Proxies and gateways often answer errors with HTML or plain text.
            parsed = {}
        return status, parsed if isinstance(parsed, dict) else {}

    return await asyncio.to_thread(_do)
class CDPBackend:
    """Connect to an existing browser via a ws(s):// socket or http(s)://host:port."""

    def __init__(self, cdp_url: str | None = None):
        # Resolution is deferred to ``_url`` so $BROWSERTOOLSET_CDP_URL is read
        # when a session is created, not when the backend is constructed.
        self._cdp_url = cdp_url

    def _url(self) -> str:
        """Return the configured CDP address, preferring the explicit argument.

        Surrounding whitespace is stripped from both sources. A blank explicit
        value falls back to ``$BROWSERTOOLSET_CDP_URL`` instead of hiding it.

        Raises:
            RuntimeError: If neither source provides a non-blank address.
        """
        url = (self._cdp_url or "").strip() or os.environ.get(
            "BROWSERTOOLSET_CDP_URL", ""
        ).strip()
        if not url:
            raise RuntimeError(
                "CDPBackend requires a cdp_url (or $BROWSERTOOLSET_CDP_URL): a "
                "ws(s):// browser socket or an http(s)://host:port address."
            )
        return url

    async def create(self) -> BrowserSessionHandle:
        """Return a handle for the configured browser.

        A ``ws``/``wss`` address is used as-is; any other scheme is resolved
        to a browser socket through ``browser_ws_from_http``. The handle's
        ``session_id`` is empty.
        """
        url = self._url()
        scheme = url.split(":", 1)[0].lower()
        ws_url = url if scheme in ("ws", "wss") else await browser_ws_from_http(url)
        return BrowserSessionHandle(session_id="", cdp_ws_url=ws_url)

    async def close(self, session_id: str) -> None:
        """Do nothing; ``create`` provisions no session of its own."""
        return None
+ ) + return value + + def _headers(self) -> dict[str, str]: + return { + "X-BB-API-Key": self._require_api_key(), + "Content-Type": "application/json", + } + + async def create(self) -> BrowserSessionHandle: + payload: ConfigData = { + "projectId": self._require_project_id(), + "proxies": self._proxies, + "keepAlive": self._keep_alive, + **self._session_create_kwargs, + } + status, data = await request_json( + f"{BROWSERBASE_API_URL}/sessions", + method="POST", + headers=self._headers(), + body=payload, + timeout=_SESSION_CREATE_TIMEOUT_S, + ) + if status not in _SUCCESS_STATUSES: + raise RuntimeError(f"Failed to create Browserbase session: {status} {data}") + session_id = data.get("id") + connect_url = data.get("connectUrl") + if not isinstance(session_id, str) or not isinstance(connect_url, str): + raise RuntimeError(f"Browserbase response missing id/connectUrl: {data}") + return BrowserSessionHandle(session_id=session_id, cdp_ws_url=connect_url) + + async def close(self, session_id: str) -> None: + if not session_id: + return + try: + await request_json( + f"{BROWSERBASE_API_URL}/sessions/{session_id}", + method="POST", + headers=self._headers(), + body={ + "projectId": self._require_project_id(), + "status": "REQUEST_RELEASE", + }, + ) + except Exception: # noqa: BLE001 - close is best effort + return diff --git a/verifiers/v1/toolsets/browser/cdp.py b/verifiers/v1/toolsets/browser/cdp.py new file mode 100644 index 0000000000..66f44cfc5b --- /dev/null +++ b/verifiers/v1/toolsets/browser/cdp.py @@ -0,0 +1,102 @@ +import asyncio +import json + +import websockets + +from verifiers.v1.types import ConfigData + +from ._http import request_json + + +class CDPError(RuntimeError): + """Raised when the browser returns a CDP error or discovery fails.""" + + +async def browser_ws_from_http(http_base: str) -> str: + """Resolve an ``http(s)://host:port`` debugging address to a browser socket.""" + base = http_base.rstrip("/") + try: + status, data = await 
class CDPClient:
    """A CDP WebSocket connection; ``send`` correlates responses by message id.

    A background reader task resolves the future registered by each ``send``
    call. Whenever the reader stops -- on a transport error, on a clean socket
    close, or through ``close()`` -- every outstanding future is failed, so no
    caller is left awaiting a response that can never arrive.
    """

    def __init__(self, ws_url: str):
        self._ws_url = ws_url
        self._ws: websockets.ClientConnection | None = None
        self._next_id = 0
        # In-flight requests, keyed by the CDP message id they were sent with.
        self._pending: dict[int, asyncio.Future[ConfigData]] = {}
        self._reader: asyncio.Task[None] | None = None

    async def connect(self) -> None:
        """Open the WebSocket and start the reader task; no-op if connected."""
        if self._ws is not None:
            return
        # Disable the inbound size cap: screenshots can be large.
        self._ws = await websockets.connect(self._ws_url, max_size=None)
        self._reader = asyncio.create_task(self._read_loop())

    def _fail_pending(self, exc: BaseException) -> None:
        """Fail every in-flight request with ``exc`` and forget them."""
        pending, self._pending = self._pending, {}
        for future in pending.values():
            if not future.done():
                future.set_exception(exc)

    async def _read_loop(self) -> None:
        """Dispatch incoming messages to their waiting futures until the socket ends."""
        assert self._ws is not None
        try:
            async for raw in self._ws:
                message = json.loads(raw)
                msg_id = message.get("id")
                if msg_id is None:
                    continue  # An event; we don't subscribe to any yet.
                future = self._pending.pop(msg_id, None)
                if future is not None and not future.done():
                    future.set_result(message)
        except asyncio.CancelledError:
            raise
        except Exception as exc:  # noqa: BLE001 - propagate to waiters
            self._fail_pending(exc)
        else:
            # The iterator finished without raising, i.e. the socket closed
            # cleanly. Requests still in flight will never be answered, so fail
            # them instead of leaving their senders blocked forever.
            self._fail_pending(
                CDPError("CDP connection closed before a response arrived.")
            )

    async def send(
        self,
        method: str,
        params: ConfigData | None = None,
        *,
        session_id: str | None = None,
    ) -> ConfigData:
        """Send one CDP command and return its ``result`` object.

        Args:
            method: CDP method name, e.g. ``"Page.navigate"``.
            params: Command parameters; an empty object is sent when omitted.
            session_id: Optional CDP session id to scope the command to.

        Returns:
            The response's ``result`` if it is a dict, otherwise ``{}``.

        Raises:
            CDPError: if the client is not connected, the connection has
                ended, or the browser answers with an ``error`` member.
        """
        ws = self._ws
        if ws is None:
            raise CDPError("CDP client is not connected.")
        if self._reader is not None and self._reader.done():
            # Nothing is reading responses any more; a new request would hang.
            raise CDPError("CDP connection is closed.")
        self._next_id += 1
        msg_id = self._next_id
        future: asyncio.Future[ConfigData] = asyncio.get_running_loop().create_future()
        self._pending[msg_id] = future
        message: ConfigData = {"id": msg_id, "method": method, "params": params or {}}
        if session_id is not None:
            message["sessionId"] = session_id
        try:
            await ws.send(json.dumps(message))
            result = await future
        finally:
            # Covers a failed/cancelled send: never leave a stale entry behind.
            self._pending.pop(msg_id, None)
        if "error" in result:
            raise CDPError(f"{method} failed: {result['error']}")
        payload = result.get("result")
        return payload if isinstance(payload, dict) else {}

    async def close(self) -> None:
        """Stop the reader, fail in-flight requests, and close the socket."""
        reader, self._reader = self._reader, None
        if reader is not None:
            reader.cancel()
            try:
                await reader
            except asyncio.CancelledError:
                pass
        self._fail_pending(CDPError("CDP client closed."))
        # Clear the reference first so the client reads as disconnected even
        # if closing the socket raises.
        ws, self._ws = self._ws, None
        if ws is not None:
            await ws.close()
+ALT = 1 +CTRL = 2 +META = 4 +SHIFT = 8 + +_MODIFIER_ALIASES: dict[str, int] = { + "alt": ALT, + "option": ALT, + "ctrl": CTRL, + "control": CTRL, + "meta": META, + "super": META, + "cmd": META, + "command": META, + "win": META, + "shift": SHIFT, +} + + +@dataclass(frozen=True) +class KeyDef: + key: str + code: str + key_code: int + + +# Standalone modifier keys, used when a chord is just a modifier name. +_MODIFIER_KEYDEFS: dict[int, KeyDef] = { + ALT: KeyDef("Alt", "AltLeft", 18), + CTRL: KeyDef("Control", "ControlLeft", 17), + META: KeyDef("Meta", "MetaLeft", 91), + SHIFT: KeyDef("Shift", "ShiftLeft", 16), +} + + +# Named (non-printable) keys, keyed by lowercased xdotool/X11 name. +_NAMED_KEYS: dict[str, KeyDef] = { + "return": KeyDef("Enter", "Enter", 13), + "enter": KeyDef("Enter", "Enter", 13), + "kp_enter": KeyDef("Enter", "NumpadEnter", 13), + "tab": KeyDef("Tab", "Tab", 9), + "space": KeyDef(" ", "Space", 32), + "backspace": KeyDef("Backspace", "Backspace", 8), + "delete": KeyDef("Delete", "Delete", 46), + "escape": KeyDef("Escape", "Escape", 27), + "esc": KeyDef("Escape", "Escape", 27), + "home": KeyDef("Home", "Home", 36), + "end": KeyDef("End", "End", 35), + "page_up": KeyDef("PageUp", "PageUp", 33), + "prior": KeyDef("PageUp", "PageUp", 33), + "page_down": KeyDef("PageDown", "PageDown", 34), + "next": KeyDef("PageDown", "PageDown", 34), + "left": KeyDef("ArrowLeft", "ArrowLeft", 37), + "up": KeyDef("ArrowUp", "ArrowUp", 38), + "right": KeyDef("ArrowRight", "ArrowRight", 39), + "down": KeyDef("ArrowDown", "ArrowDown", 40), + "insert": KeyDef("Insert", "Insert", 45), + "minus": KeyDef("-", "Minus", 189), + "plus": KeyDef("+", "Equal", 187), + "equal": KeyDef("=", "Equal", 187), + "period": KeyDef(".", "Period", 190), + "comma": KeyDef(",", "Comma", 188), + "slash": KeyDef("/", "Slash", 191), + "backslash": KeyDef("\\", "Backslash", 220), + "semicolon": KeyDef(";", "Semicolon", 186), +} +for _n in range(1, 13): # Function keys F1-F12. 
+ _NAMED_KEYS[f"f{_n}"] = KeyDef(f"F{_n}", f"F{_n}", 111 + _n) + + +def _printable_keydef(char: str) -> KeyDef | None: + if len(char) != 1: + return None + if char.isalpha(): + return KeyDef(char, f"Key{char.upper()}", ord(char.upper())) + if char.isdigit(): + return KeyDef(char, f"Digit{char}", ord(char)) + return KeyDef(char, "", ord(char)) + + +def parse_chord(chord: str) -> tuple[int, KeyDef]: + """Parse an xdotool-style chord into (modifier_mask, KeyDef).""" + parts = [part for part in chord.split("+") if part] + if not parts: + raise ValueError(f"Empty key chord: {chord!r}") + # Leading tokens are modifiers; the final token is the primary key. + modifiers = 0 + for part in parts[:-1]: + alias = _MODIFIER_ALIASES.get(part.lower()) + if alias is None: + raise ValueError(f"Unrecognized modifier {part!r} in chord {chord!r}") + modifiers |= alias + last = parts[-1] + token = last.lower() + if token in _NAMED_KEYS: + primary = _NAMED_KEYS[token] + elif token in _MODIFIER_ALIASES: + # The chord is (or ends in) a bare modifier, e.g. ``shift``. + primary = _MODIFIER_KEYDEFS[_MODIFIER_ALIASES[token]] + else: + primary = _printable_keydef(last) + if primary is None: + raise ValueError(f"Unrecognized key chord: {chord!r}") + return modifiers, primary diff --git a/verifiers/v1/toolsets/browser/session.py b/verifiers/v1/toolsets/browser/session.py new file mode 100644 index 0000000000..2e5c329d94 --- /dev/null +++ b/verifiers/v1/toolsets/browser/session.py @@ -0,0 +1,272 @@ +import asyncio + +from verifiers.types import ContentPart +from verifiers.v1.types import ConfigData + +from .backends import BrowserBackend +from .cdp import CDPClient, CDPError +from .keymap import parse_chord + +# A tool returns message content; a screenshot is a single image content part. 
+ScreenshotContent = list[ContentPart] + + +def _screenshot_content(b64_png: str) -> ScreenshotContent: + return [ + { + "type": "image_url", + "image_url": {"url": f"data:image/png;base64,{b64_png}"}, + } + ] + + +class BrowserSession: + """A backend-provisioned CDP page plus the input/scroll/screenshot primitives.""" + + def __init__( + self, + backend: BrowserBackend, + *, + width: int = 1280, + height: int = 800, + start_url: str | None = None, + ): + self.backend = backend + self.width = width + self.height = height + self.start_url = start_url + self._client: CDPClient | None = None + self._provider_session_id: str | None = None + self._target_id: str | None = None + self._session_id: str | None = None + self._cursor = (width // 2, height // 2) + + async def start(self) -> "BrowserSession": + """Provision, connect, and attach to a fresh page target. Idempotent.""" + if self._client is not None: + return self + handle = await self.backend.create() + self._provider_session_id = handle.session_id + self._client = CDPClient(handle.cdp_ws_url) + await self._client.connect() + await self._attach_page(create=True) + if self.start_url: + await self.navigate(self.start_url) + return self + + async def _attach_page(self, *, create: bool) -> None: + # create=True opens a fresh page; otherwise re-attach to the newest live + # page (recovering when the current one was replaced or closed). 
+ assert self._client is not None + target_id: str | None = None + if not create: + targets = await self._client.send("Target.getTargets") + infos = targets.get("targetInfos") + pages = ( + [t for t in infos if isinstance(t, dict) and t.get("type") == "page"] + if isinstance(infos, list) + else [] + ) + if pages: + target_id = str(pages[-1].get("targetId")) + if target_id is None: + created = await self._client.send( + "Target.createTarget", {"url": "about:blank"} + ) + target_id = str(created["targetId"]) + attached = await self._client.send( + "Target.attachToTarget", {"targetId": target_id, "flatten": True} + ) + self._target_id = target_id + self._session_id = str(attached["sessionId"]) + # Prepare the freshly attached page. These three commands are + # independent, so pipeline them rather than paying three round-trips. + await asyncio.gather( + self._client.send("Page.enable", session_id=self._session_id), + self._client.send("Runtime.enable", session_id=self._session_id), + self._client.send( + "Emulation.setDeviceMetricsOverride", + { + "width": self.width, + "height": self.height, + "deviceScaleFactor": 1, + "mobile": False, + }, + session_id=self._session_id, + ), + ) + + async def aclose(self) -> None: + if self._client is not None: + if self._target_id is not None: + try: + await self._client.send( + "Target.closeTarget", {"targetId": self._target_id} + ) + except Exception: # noqa: BLE001 - best effort + pass + await self._client.close() + self._client = None + if self._provider_session_id is not None: + await self.backend.close(self._provider_session_id) + self._provider_session_id = None + self._target_id = None + self._session_id = None + + async def _ensure(self) -> CDPClient: + if self._client is None: + await self.start() + assert self._client is not None + return self._client + + @staticmethod + def _is_detached_error(error: CDPError) -> bool: + text = str(error).lower() + return any( + marker in text + for marker in ( + "not attached", + "no 
target with given id", + "session with given id", + "target closed", + "navigated or closed", + "cannot find context", + ) + ) + + async def _send(self, method: str, params: ConfigData | None = None) -> ConfigData: + """Send a page-scoped CDP command, recovering once if the page detached.""" + client = await self._ensure() + try: + return await client.send(method, params, session_id=self._session_id) + except CDPError as error: + if not self._is_detached_error(error): + raise + # The page went away; re-attach to a live one and retry exactly once. + await self._attach_page(create=False) + return await client.send(method, params, session_id=self._session_id) + + @property + def cursor_position(self) -> tuple[int, int]: + return self._cursor + + async def move_mouse(self, x: int, y: int) -> None: + self._cursor = (int(x), int(y)) + await self._send( + "Input.dispatchMouseEvent", {"type": "mouseMoved", "x": int(x), "y": int(y)} + ) + + async def click( + self, + x: int | None = None, + y: int | None = None, + *, + button: str = "left", + count: int = 1, + ) -> None: + if x is not None and y is not None: + self._cursor = (int(x), int(y)) + cx, cy = self._cursor + for press in range(1, count + 1): + base = {"x": cx, "y": cy, "button": button, "clickCount": press} + await self._send( + "Input.dispatchMouseEvent", {"type": "mousePressed", **base} + ) + await self._send( + "Input.dispatchMouseEvent", {"type": "mouseReleased", **base} + ) + + async def drag( + self, start: tuple[int, int], end: tuple[int, int], *, button: str = "left" + ) -> None: + sx, sy = int(start[0]), int(start[1]) + ex, ey = int(end[0]), int(end[1]) + await self._send( + "Input.dispatchMouseEvent", + { + "type": "mousePressed", + "x": sx, + "y": sy, + "button": button, + "clickCount": 1, + }, + ) + await self._send( + "Input.dispatchMouseEvent", + {"type": "mouseMoved", "x": ex, "y": ey, "button": button}, + ) + await self._send( + "Input.dispatchMouseEvent", + { + "type": "mouseReleased", + "x": ex, + 
"y": ey, + "button": button, + "clickCount": 1, + }, + ) + self._cursor = (ex, ey) + + async def scroll( + self, + direction: str, + amount: int = 3, + x: int | None = None, + y: int | None = None, + ) -> None: + if x is not None and y is not None: + self._cursor = (int(x), int(y)) + cx, cy = self._cursor + step = 100 * int(amount) # ~one notch per unit, like a wheel tick. + delta_x = {"left": -step, "right": step}.get(direction, 0) + delta_y = {"up": -step, "down": step}.get(direction, 0) + await self._send( + "Input.dispatchMouseEvent", + { + "type": "mouseWheel", + "x": cx, + "y": cy, + "deltaX": delta_x, + "deltaY": delta_y, + }, + ) + + async def type_text(self, text: str) -> None: + await self._send("Input.insertText", {"text": text}) + + async def press_key(self, chord: str) -> None: + modifiers, key = parse_chord(chord) + event: ConfigData = { + "modifiers": modifiers, + "key": key.key, + "code": key.code, + "windowsVirtualKeyCode": key.key_code, + "nativeVirtualKeyCode": key.key_code, + } + down = {"type": "keyDown", **event} + # Printable keys with no command modifier should also produce text. 
async def computer(
    action: Annotated[
        ComputerAction,
        Field(description="The computer-use action to perform."),
    ],
    coordinate: Annotated[
        list[int] | None,
        Field(description="[x, y] target for click/move/scroll actions."),
    ] = None,
    text: Annotated[
        str | None,
        Field(description="Text to type, or key chord (e.g. 'ctrl+s') to press."),
    ] = None,
    start_coordinate: Annotated[
        list[int] | None,
        Field(description="[x, y] start point for left_click_drag."),
    ] = None,
    scroll_direction: Annotated[
        Literal["up", "down", "left", "right"] | None,
        Field(description="Scroll direction."),
    ] = None,
    scroll_amount: Annotated[
        int | None,
        Field(description="Number of scroll steps."),
    ] = None,
    duration: Annotated[
        float | None,
        Field(description="Seconds to wait for the 'wait' action."),
    ] = None,
    *,
    session: BrowserSession,
) -> ScreenshotContent | str:
    """Control the browser via mouse, keyboard, scrolling and screenshots (viewport pixels)."""
    # NOTE(review): the docstring presumably doubles as the tool description
    # shown to the model, so it is left byte-identical -- confirm.
    #
    # Dispatches on ``action``. Every action returns a fresh screenshot except
    # ``cursor_position``, which returns the pointer as the string "x, y".
    # Raises ValueError for a missing required argument or an unknown action.
    if action == "screenshot":
        return await session.screenshot()
    if action == "cursor_position":
        x, y = session.cursor_position
        return f"{x}, {y}"
    if action == "wait":
        # ``is None`` rather than ``or``: an explicit 0 means "do not wait"
        # and must not fall back to the 1-second default.
        seconds = 1.0 if duration is None else float(duration)
        await asyncio.sleep(min(seconds, _MAX_WAIT_SECONDS))
        return await session.screenshot()
    if action == "mouse_move":
        await session.move_mouse(*_xy(coordinate))
        return await session.screenshot()
    if action == "key":
        if not text:
            raise ValueError("key action requires 'text' (the key chord).")
        await session.press_key(text)
        return await session.screenshot()
    if action == "type":
        if text is None:
            raise ValueError("type action requires 'text'.")
        await session.type_text(text)
        return await session.screenshot()
    if action == "left_click_drag":
        await session.drag(_xy(start_coordinate, "start_coordinate"), _xy(coordinate))
        return await session.screenshot()
    if action == "scroll":
        x, y = _xy(coordinate) if coordinate else session.cursor_position
        # Same reasoning as ``duration``: only a missing amount gets the default.
        steps = 3 if scroll_amount is None else scroll_amount
        await session.scroll(scroll_direction or "down", steps, x, y)
        return await session.screenshot()
    # Click family: action name -> (mouse button, number of clicks).
    clicks = {
        "left_click": ("left", 1),
        "right_click": ("right", 1),
        "middle_click": ("middle", 1),
        "double_click": ("left", 2),
        "triple_click": ("left", 3),
    }
    if action in clicks:
        button, count = clicks[action]
        cx, cy = _xy(coordinate) if coordinate else session.cursor_position
        await session.click(cx, cy, button=button, count=count)
        return await session.screenshot()
    raise ValueError(f"Unknown action: {action!r}")
async def left_click_drag(
    start_coordinate: Annotated[list[int], Field(description="[x, y] drag start.")],
    coordinate: Coordinate,
    *,
    session: BrowserSession,
) -> ScreenshotContent:
    """Press at the start coordinate, drag to the end coordinate and release."""
    # Validate both endpoints (start first) before any input is dispatched.
    drag_from = _xy(start_coordinate, "start_coordinate")
    drag_to = _xy(coordinate)
    await session.drag(drag_from, drag_to)
    shot = await session.screenshot()
    return shot
async def scroll(
    coordinate: Coordinate,
    scroll_direction: Annotated[
        Literal["up", "down", "left", "right"], Field(description="Scroll direction.")
    ],
    scroll_amount: Annotated[int, Field(description="Number of scroll steps.")] = 3,
    *,
    session: BrowserSession,
) -> ScreenshotContent:
    """Scroll the page at a coordinate in the given direction."""
    # ``_xy`` validates the pair and raises ValueError before anything is sent.
    await session.scroll(scroll_direction, scroll_amount, *_xy(coordinate))
    shot = await session.screenshot()
    return shot
v1 requires object entries to be +# import-ref strings; per-rollout config (the backend, viewport, start url) is +# supplied through bindings instead of a closure. +_BROWSER_OBJECT_REF = "verifiers.v1.toolsets.browser.session:open_browser_session" + + +def _const(value: object): + """A binding source that always resolves to ``value`` (a literal config).""" + + def source() -> object: + return value + + return source + + +def browser_toolset( + *, + backend: BrowserBackend, + mode: Mode = "both", + width: int = 1280, + height: int = 800, + start_url: str | None = None, +) -> Toolset: + """Build a Toolset for browser/computer control over raw CDP via ``backend``.""" + if backend is None: + raise ValueError( + "browser_toolset requires a backend (e.g. BrowserbaseBackend())." + ) + if mode == "computer": + tools = [computer] + elif mode == "decomposed": + tools = list(DECOMPOSED_TOOLS) + elif mode == "both": + tools = [computer, *DECOMPOSED_TOOLS] + else: # pragma: no cover - guarded by the Literal type + raise ValueError(f"Unknown mode: {mode!r}") + + bindings = { + "browser.backend": _const(backend), + "browser.width": _const(width), + "browser.height": _const(height), + "browser.start_url": _const(start_url), + **{f"{tool.__name__}.session": "objects.browser" for tool in tools}, + } + + return Toolset( + tools=tools, + objects=ObjectsConfig.model_validate({"browser": _BROWSER_OBJECT_REF}), + bindings=bindings, + write=True, + scope="rollout", + )