Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .semgrep/verifiers.yml
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,7 @@ rules:
exclude:
- /environments/alphabet_sort/alphabet_sort_v1.py
- /environments/bfcl_v3/bfcl_v3.py
- /environments/browser_toolset_example/browser_toolset_example.py
- /environments/dspy_flights/dspy_flights.py
- /environments/dspy_rlm/dspy_rlm.py
- /environments/hello_group_reward_v1/hello_group_reward_v1.py
Expand Down
62 changes: 62 additions & 0 deletions environments/browser_toolset_example/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
# browser-toolset-example

A minimal v1 environment demonstrating **`verifiers.v1.toolsets.browser`** — the
Claude computer-use action space driven over a raw Chrome DevTools Protocol
(CDP) browser, with a **pluggable backend**. A vision model controls a real
browser to complete short web tasks; an LLM judge (borrowing the rollout model)
scores the final answer.

### Overview
- **Environment ID**: `browser-toolset-example`
- **Tools**: the `browser` toolset — `computer` (Anthropic `computer_20250124`
action enum) plus decomposed `navigate`, `left_click`, `type_text`, `key`,
`scroll`, `screenshot`, … (selectable via `mode`).
- **Backends**: `browserbase` (managed, isolated session per rollout, default)
or `cdp` (connect to any browser exposing a CDP endpoint).
- **Reward**: an LLM judge scores task success in `[0, 1]`.

> The model must be **vision-capable** — every browser action returns a
> screenshot, delivered as image content in the tool result.

### Quickstart

```bash
prime env install browser-toolset-example

# Browserbase (default): requires BROWSERBASE_API_KEY + BROWSERBASE_PROJECT_ID
BROWSERBASE_API_KEY=... BROWSERBASE_PROJECT_ID=... \
prime eval run browser-toolset-example -m claude-sonnet-4-6 -n 3 -r 1

# Bring your own browser via any CDP endpoint, e.g. local Chrome:
# chrome --headless=new --remote-debugging-port=9222
prime eval run browser-toolset-example -m claude-sonnet-4-6 \
-a '{"backend": "cdp", "cdp_url": "http://localhost:9222"}'
```

### Environment Arguments

| Arg | Type | Default | Description |
| --- | ---- | ------- | ----------- |
| `backend` | `"browserbase"` \| `"cdp"` | `"browserbase"` | Browser infrastructure backend. |
| `cdp_url` | str \| null | `null` | CDP endpoint for `backend="cdp"` (`ws(s)://` socket or `http(s)://host:port`). |
| `proxies` | bool | `false` | Enable Browserbase proxies (only used when `backend` is `"browserbase"`; ignored for `"cdp"`). |
| `mode` | `"computer"` \| `"decomposed"` \| `"both"` | `"both"` | Which tool surface to expose. |
| `viewport_width` / `viewport_height` | int | `1280` / `800` | Emulated viewport width / height, in pixels. |
| `max_turns` | int | `15` | Max agent turns per rollout. |
| `num_examples` | int | `-1` | Number of bundled tasks to use (`-1` = all). |

### Tasks

Three short, self-contained browsing tasks (read a heading on `example.com`,
compute a derivative on Wolfram Alpha, find the latest arXiv quantum-computing
preprint) — enough to exercise navigate / type / click / scroll / read.

### Using the toolset in your own environment

```python
import verifiers as vf
from verifiers.v1.toolsets.browser import browser_toolset, BrowserbaseBackend

harness = vf.Harness(config=vf.HarnessConfig(max_turns=20))
harness.add_toolset({"browser": browser_toolset(backend=BrowserbaseBackend())})
```
196 changes: 196 additions & 0 deletions environments/browser_toolset_example/browser_toolset_example.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,196 @@
from typing import Literal

import verifiers as vf
from verifiers.v1.toolsets.browser import (
BrowserbaseBackend,
CDPBackend,
Mode,
browser_toolset,
)
from verifiers.v1.utils.judge_utils import clamp_float, parse_judge_json

# Names of the browser backends this environment can construct; each value
# maps to one branch in load_browser_toolset().
Backend = Literal["browserbase", "cdp"]

# System prompt for the browsing agent. It is the default value of
# BrowserToolsetTasksetConfig.system_prompt. The text tells the model that a
# plain-text reply with no tool call is its submission; task_success grades
# the last assistant message accordingly.
SYSTEM_PROMPT = """You are an autonomous web-browsing agent operating a real Chrome browser.

You interact with the page only through the provided browser tools. After each
action you receive a screenshot of the current page — always base your next
action on the most recent screenshot. Coordinates are pixels within the viewport.

Workflow:
- Use `navigate` to open the start URL given in the task.
- Use the click / type / key / scroll actions (or the `computer` tool) to
interact with the page.
- When you have completed the task, reply with your final answer as plain text
and DO NOT call any tool. That plain-text reply is your submission.

Be efficient and use the minimum number of actions necessary."""

# System prompt for the single-turn judge harness built in task_success. It
# asks for a JSON object with "score" and "reason", which are the two keys
# task_success reads from the parsed judge reply.
JUDGE_SYSTEM_PROMPT = """You grade a web agent's answer to a browsing task.

Respond with compact JSON only: {"score": 0.0-1.0, "reason": "..."}
- 1.0: the answer correctly and completely accomplishes the task.
- 0.5: partially correct or plausible but incomplete / unverified.
- 0.0: wrong, empty, or unrelated to the task."""


# Bundled browsing tasks. Every entry carries the same four keys:
#   task_id   - short identifier for the task
#   start_url - page the agent is told to navigate to first (see load_tasks)
#   question  - instruction shown to the agent and to the judge
#   answer    - reference answer or grading criteria shown to the judge only
#               (see _score_prompt)
TASKS: list[vf.ConfigData] = [
    {
        "task_id": "example-domain-heading",
        "start_url": "https://example.com",
        "question": "Open the page and report the exact text of the page's main heading.",
        "answer": "Example Domain",
    },
    {
        "task_id": "wolfram-derivative",
        "start_url": "https://www.wolframalpha.com",
        "question": (
            "Use Wolfram Alpha to compute the derivative of x^2 evaluated at "
            "x = 5.6, and report the numeric result."
        ),
        "answer": "11.2",
    },
    {
        "task_id": "arxiv-latest-quantum",
        "start_url": "https://arxiv.org",
        "question": (
            "Search arXiv for the latest preprints about 'quantum computing' "
            "(sorted newest first) and report the title of the top result."
        ),
        # No fixed expected title here: the "answer" is a grading criterion
        # for the judge rather than a literal string to match.
        "answer": (
            "Criteria: any plausible, real arXiv paper title related to quantum "
            "computing that appears at the top of a newest-first search."
        ),
    },
]


class BrowserToolsetTasksetConfig(vf.TasksetConfig):
    """Taskset settings for the browser toolset example."""

    # Reward names to apply; "task_success" matches the reward function
    # defined in this module (presumably resolved by name by the framework -
    # TODO confirm).
    rewards: list[str] = ["task_success"]
    # System prompt given to the browsing agent.
    system_prompt: str = SYSTEM_PROMPT
    # How many bundled tasks to use; a negative value selects all of them
    # (see load_tasks).
    num_examples: int = -1


class BrowserToolsetHarnessConfig(vf.HarnessConfig):
    """Harness settings: turn budget plus the browser backend and tool surface."""

    # Maximum number of agent turns per rollout.
    max_turns: int = 15
    # Backend: "browserbase" (managed, isolated session per rollout) or "cdp"
    # (connect to any browser exposing a CDP endpoint).
    backend: Backend = "browserbase"
    # CDP endpoint; only passed on when backend == "cdp".
    cdp_url: str | None = None
    # Browserbase proxies flag; only passed on when backend == "browserbase".
    proxies: bool = False
    # Tool surface: "computer" | "decomposed" | "both".
    mode: Mode = "both"
    # Viewport size forwarded to browser_toolset as width / height.
    viewport_width: int = 1280
    viewport_height: int = 800


class BrowserToolsetEnvConfig(vf.EnvConfig):
    """Top-level environment config consumed by load_environment."""

    # Task selection, system prompt and rewards.
    taskset: BrowserToolsetTasksetConfig = BrowserToolsetTasksetConfig()
    # Turn budget, browser backend and tool-surface options.
    harness: BrowserToolsetHarnessConfig = BrowserToolsetHarnessConfig()


def _score_prompt(task: vf.Task, answer: str) -> str:
    """Build the user message shown to the judge for one graded answer.

    Args:
        task: Task providing the ``question``, ``start_url`` and ``answer``
            (reference answer or grading criteria) entries.
        answer: The agent's final plain-text answer.

    Returns:
        A newline-separated prompt: the task description, a blank line, the
        agent's answer, a blank line, and the scoring instruction.
    """
    sections = [
        f"Task: {task['question']}",
        f"Start URL: {task['start_url']}",
        f"Reference answer / criteria: {task['answer']}",
        "",
        f"Agent's final answer:\n{answer}",
        "",
        "Score how well the agent accomplished the task as compact JSON.",
    ]
    return "\n".join(sections)


@vf.reward(weight=1.0)
async def task_success(task, state) -> float:
    """Score the rollout's final answer with an LLM judge.

    The last assistant message in ``state["completion"]`` is treated as the
    agent's submission. It is graded by a one-turn judge harness that runs on
    a state derived via ``state.for_task(..., borrow="model")``.

    Args:
        task: The browsing task; its ``question``, ``start_url`` and
            ``answer`` entries are embedded in the judge prompt.
        state: The rollout state holding the ``completion`` messages.

    Returns:
        The judge's ``score`` passed through ``clamp_float``. Returns ``0.0``
        without running the judge when there is no assistant message or the
        final answer is empty or whitespace-only.

    Side effects:
        When the judge runs, stores ``{"score": ..., "reason": ...}`` under
        ``state["judge"]``.
    """
    messages = vf.get_messages(state.get("completion") or [], role="assistant")
    answer = str(messages[-1].content or "") if messages else ""
    # A blank submission (including whitespace-only text) cannot accomplish
    # the task, so skip the judge call and score it as a failure directly.
    if not answer.strip():
        return 0.0
    judge_task = vf.Task(
        {
            "prompt": [{"role": "user", "content": _score_prompt(task, answer)}],
            "max_turns": 1,
        }
    ).freeze()
    # Derive the judge's state from the rollout state so the judge reuses
    # the rollout's model.
    judge_state = state.for_task(judge_task, borrow="model")
    judge_state = await vf.Harness(
        config=vf.HarnessConfig(system_prompt=JUDGE_SYSTEM_PROMPT, max_turns=1)
    ).run(judge_task, judge_state)
    judge_messages = vf.get_messages(
        judge_state.get("completion") or [], role="assistant"
    )
    judge_text = str(judge_messages[-1].content or "") if judge_messages else ""
    parsed = parse_judge_json(judge_text)
    # A reply without a "score" key counts as a failed grade.
    score = clamp_float(parsed.get("score", 0.0))
    state["judge"] = {"score": score, "reason": str(parsed.get("reason", ""))}
    return score


def load_browser_toolset(
backend: Backend = "browserbase",
cdp_url: str | None = None,
proxies: bool = False,
mode: Mode = "both",
viewport_width: int = 1280,
viewport_height: int = 800,
) -> vf.Toolset:
if backend == "browserbase":
browser_backend = BrowserbaseBackend(proxies=proxies)
elif backend == "cdp":
browser_backend = CDPBackend(cdp_url=cdp_url)
else:
raise ValueError(f"Unknown backend {backend!r}; use 'browserbase' or 'cdp'.")
return browser_toolset(
backend=browser_backend,
mode=mode,
width=viewport_width,
height=viewport_height,
)


def load_tasks(num_examples: int = -1):
    """Yield the bundled tasks as rows ready for a rollout.

    Each yielded dict contains every key of the source ``TASKS`` entry plus
    ``example_id`` (the entry's position) and ``prompt`` (a single user
    message made of the question followed by the start-URL instruction).

    Args:
        num_examples: Number of leading tasks to yield; any negative value
            yields all of them.
    """
    selected = TASKS[:num_examples] if num_examples >= 0 else TASKS
    for position, spec in enumerate(selected):
        opening_message = {
            "role": "user",
            "content": (
                f"{spec['question']}\n\n"
                f"Start by navigating to: {spec['start_url']}"
            ),
        }
        # Copy first so the module-level TASKS entries are never mutated.
        task_row = dict(spec)
        task_row["example_id"] = position
        task_row["prompt"] = [opening_message]
        yield task_row


class BrowserToolsetTaskset(vf.Taskset[BrowserToolsetTasksetConfig]):
    """Taskset that serves the tasks bundled in this module."""

    def load_tasks(self, split: vf.TaskSplit = "train") -> vf.Tasks:
        """Return the bundled tasks, limited by ``config.num_examples``.

        ``split`` is accepted but not used: the same tasks are returned for
        every split.
        """
        # Inside the method body the bare name resolves to the module-level
        # load_tasks() function, not to this method.
        limit = self.config.num_examples
        return load_tasks(num_examples=limit)


class BrowserToolsetHarness(vf.Harness[BrowserToolsetHarnessConfig]):
    """Harness specialised to ``BrowserToolsetHarnessConfig``; adds no behaviour."""


def load_environment(config: BrowserToolsetEnvConfig) -> vf.Env:
    """Assemble the browser toolset example environment.

    Args:
        config: Environment config holding the taskset and harness settings.

    Returns:
        A ``vf.Env`` pairing ``BrowserToolsetTaskset`` with a
        ``BrowserToolsetHarness``.

    The Browserbase credentials are checked via ``vf.ensure_keys`` whenever
    the configured backend is ``"browserbase"``. The default ``"browser"``
    toolset is attached only if ``toolsets`` was not explicitly set on the
    harness config.
    """
    harness_cfg = config.harness
    if harness_cfg.backend == "browserbase":
        vf.ensure_keys(["BROWSERBASE_API_KEY", "BROWSERBASE_PROJECT_ID"])

    harness = BrowserToolsetHarness(config=harness_cfg)

    # Respect an explicit `toolsets` setting: only add the default browser
    # toolset when the caller did not provide one.
    toolsets_overridden = "toolsets" in harness_cfg.model_fields_set
    if not toolsets_overridden:
        default_toolset = load_browser_toolset(
            backend=harness_cfg.backend,
            cdp_url=harness_cfg.cdp_url,
            proxies=harness_cfg.proxies,
            mode=harness_cfg.mode,
            viewport_width=harness_cfg.viewport_width,
            viewport_height=harness_cfg.viewport_height,
        )
        harness.add_toolset({"browser": default_toolset})

    taskset = BrowserToolsetTaskset(config=config.taskset)
    return vf.Env(taskset=taskset, harness=harness)
20 changes: 20 additions & 0 deletions environments/browser_toolset_example/pyproject.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
[project]
name = "browser-toolset-example"
version = "0.1.0"
description = "Example v1 environment demonstrating the browser computer-use toolset over raw CDP (Browserbase by default)"
tags = ["browser", "browserbase", "cdp", "computer-use", "vision", "example"]
requires-python = ">=3.10"
dependencies = [
"verifiers[browser]>=0.1.15",
]

[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"

[tool.hatch.build]
include = ["browser_toolset_example.py", "pyproject.toml"]

[tool.verifiers.eval]
num_examples = 3
rollouts_per_example = 1
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,7 @@ browser = [
"stagehand>=3.0.0",
"aiohttp>=3.9.0",
"python-dotenv>=1.0.0",
"websockets>=12.0",
]
openenv = [
"tasksets[openenv]>=0.1.1",
Expand Down
76 changes: 76 additions & 0 deletions tests/test_v1_browser_backends.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
"""Backend behaviour with HTTP faked (no network, no browser)."""

import pytest

import verifiers.v1.toolsets.browser.backends as backends_mod
import verifiers.v1.toolsets.browser.cdp as cdp_mod
from verifiers.v1.toolsets.browser import BrowserbaseBackend, CDPBackend


@pytest.fixture
def fake_http(monkeypatch):
    """Swap ``request_json`` in the backend and CDP modules for a canned fake.

    Returns the list in which the fake records every request, in call order,
    as a dict with ``url``, ``method``, ``headers`` and ``body`` keys.
    """
    recorded = []

    async def fake_request_json(
        url, *, method="GET", headers=None, body=None, timeout=30.0
    ):
        recorded.append(
            {"url": url, "method": method, "headers": headers, "body": body}
        )
        # CDP discovery endpoint: answered regardless of HTTP method.
        if "/json/version" in url:
            return 200, {"webSocketDebuggerUrl": "ws://browser/devtools/browser/abc"}
        # Every other faked route is a POST.
        if method != "POST":
            return 404, {}
        if url.endswith("/sessions"):
            # Session creation.
            return 201, {
                "id": "bb-session-1",
                "connectUrl": "wss://connect.browserbase.com/x",
            }
        if "/sessions/" in url:
            # POST against one specific session.
            return 200, {}
        return 404, {}

    for patched_module in (backends_mod, cdp_mod):
        monkeypatch.setattr(patched_module, "request_json", fake_request_json)
    return recorded


async def test_cdp_backend_passthrough_ws():
    """A ``ws://`` URL is returned unchanged and carries no session id."""
    backend = CDPBackend(cdp_url="ws://host/devtools/browser/x")
    handle = await backend.create()
    observed = (handle.cdp_ws_url, handle.session_id)
    assert observed == ("ws://host/devtools/browser/x", "")


async def test_cdp_backend_resolves_http(fake_http):
    """An ``http://`` endpoint resolves to the faked ``webSocketDebuggerUrl``."""
    backend = CDPBackend(cdp_url="http://localhost:9222")
    handle = await backend.create()
    expected_ws_url = "ws://browser/devtools/browser/abc"
    assert handle.cdp_ws_url == expected_ws_url


async def test_cdp_backend_requires_url(monkeypatch):
    """Without a ``cdp_url`` or the env-var setting, creation raises."""
    # Make sure a developer's local environment cannot supply the URL.
    monkeypatch.delenv("BROWSERTOOLSET_CDP_URL", raising=False)
    with pytest.raises(RuntimeError):
        backend = CDPBackend()
        await backend.create()


async def test_browserbase_create(fake_http):
    """Session creation returns the faked ids and sends the credentials."""
    handle = await BrowserbaseBackend(api_key="k", project_id="p").create()
    assert handle.session_id == "bb-session-1"
    assert handle.cdp_ws_url == "wss://connect.browserbase.com/x"

    # The first recorded request is the session-creation call.
    create_request = fake_http[0]
    sent_key = create_request["headers"]["X-BB-API-Key"]
    sent_project = create_request["body"]["projectId"]
    assert sent_key == "k"
    assert sent_project == "p"


async def test_browserbase_requires_api_key(monkeypatch):
    """With no API key argument and no env var, creation raises."""
    # Make sure a developer's local environment cannot supply the key.
    monkeypatch.delenv("BROWSERBASE_API_KEY", raising=False)
    with pytest.raises(RuntimeError):
        backend = BrowserbaseBackend(project_id="p")
        await backend.create()


async def test_browserbase_close_releases_session(fake_http):
    """Closing a session issues a release request for that session."""
    session_id = "bb-session-1"
    await BrowserbaseBackend(api_key="k", project_id="p").close(session_id)

    # The most recent recorded request is the release call.
    release_request = fake_http[-1]
    assert release_request["url"].endswith("/sessions/bb-session-1")
    assert release_request["body"]["status"] == "REQUEST_RELEASE"


async def test_browserbase_close_noop_on_empty(fake_http):
    """Closing with an empty session id must not issue any HTTP request.

    The ``fake_http`` fixture is installed so that a regression is caught as
    a recorded call instead of turning into a real network request.
    """
    await BrowserbaseBackend(api_key="k", project_id="p").close("")
    assert fake_http == []
Loading
Loading