Skip to content
Open
Show file tree
Hide file tree
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 0 additions & 3 deletions environments/dspy_flights/dspy_flights.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,6 @@
PROGRAM_SANDBOX = {
"image": "python:3.11-slim",
"network_access": True,
"timeout_minutes": 60,
"command_timeout": 900,
"install_timeout": 900,
}


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -125,8 +125,6 @@
"image": "python:3.11-slim",
"scope": "rollout",
"network_access": True,
"timeout_minutes": 20,
"command_timeout": 120,
}


Expand Down Expand Up @@ -154,7 +152,7 @@ class ParallelSandboxHarnessConfig(vf.HarnessConfig):

async def bash(command: str, sandbox, state) -> str:
"""Run a bash command in the active program sandbox."""
result = await sandbox.execute(command, timeout=120, working_dir="/tmp")
result = await sandbox.execute(command, working_dir="/tmp")
output = {
"exit_code": int(getattr(result, "exit_code", 0)),
"stdout": truncate_text(str(getattr(result, "stdout", "") or "")),
Expand Down
4 changes: 1 addition & 3 deletions environments/hello_self_judge_v1/hello_self_judge_v1.py
Original file line number Diff line number Diff line change
Expand Up @@ -164,7 +164,7 @@ class SelfJudgeHarnessConfig(vf.HarnessConfig):

async def bash(command: str, sandbox, state) -> str:
"""Run a bash command in the rollout sandbox and return stdout/stderr."""
result = await sandbox.execute(command, timeout=120, working_dir="/tmp")
result = await sandbox.execute(command, working_dir="/tmp")
output = {
"exit_code": int(getattr(result, "exit_code", 0)),
"stdout": truncate_text(str(getattr(result, "stdout", "") or "")),
Expand Down Expand Up @@ -337,8 +337,6 @@ def load_bash_toolset() -> vf.Toolset:
image="python:3.11-slim",
scope="rollout",
network_access=True,
timeout_minutes=30,
command_timeout=120,
),
cleanups=[collect_bash_commands],
)
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
import asyncio
import json
from collections.abc import Awaitable, Callable, Iterator, Mapping, Sequence
from typing import Protocol, cast
Expand Down Expand Up @@ -51,6 +50,7 @@ def system_prompt(allow_go_back: bool = True) -> str:


SYSTEM_PROMPT = system_prompt()
WIKISPEEDIA_TASK_TIMEOUT_SECONDS = 1200.0


class WikispeediaTasksetConfig(vf.TasksetConfig):
Expand All @@ -73,7 +73,6 @@ class WikispeediaHarnessConfig(vf.HarnessConfig):
fn="run_langchain_deep_agents_wikispeedia_program"
)
max_turns: int = 50
timeout_seconds: float = 1200.0
Comment thread
cursor[bot] marked this conversation as resolved.
Comment thread
cursor[bot] marked this conversation as resolved.


class WikispeediaTaskset(vf.Taskset[WikispeediaTasksetConfig]):
Expand Down Expand Up @@ -444,7 +443,6 @@ async def go_back() -> str:

def make_langchain_deep_agents_program(
max_turns: int,
timeout_seconds: float,
) -> Callable[[vf.Task, vf.State], Awaitable[vf.State]]:
async def run_langchain_deep_agents_wikispeedia_program(
task: vf.Task, state: vf.State
Expand Down Expand Up @@ -488,19 +486,14 @@ async def run_langchain_deep_agents_wikispeedia_program(
invoke_config = (
{"recursion_limit": recursion_limit} if recursion_limit > 0 else None
)
invoke = agent.ainvoke(
{"messages": [{"role": "user", "content": prompt}]},
config=invoke_config,
)
try:
result = await asyncio.wait_for(invoke, timeout=timeout_seconds)
except (TimeoutError, GraphRecursionError) as exc:
state["agent_timeout"] = True
state.stop(
"agent_timeout"
if isinstance(exc, TimeoutError)
else "agent_recursion_limit"
result = await agent.ainvoke(

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 Badge Preserve Wikispeedia timeout metric

When the Wikispeedia agent exceeds the new `env.task_timeout_seconds`, the timeout is raised and handled by `Harness.local_callable_program` rather than by this `try` block, so the state keeps its initial `agent_timeout = False`. The environment's `agent_timeout` metric reads that field, so timed-out Wikispeedia rollouts now report `agent_timeout` as 0 even though the agent hit the timeout. Either set this state field on framework timeouts or update the metric to use `task_timed_out`.

Useful? React with 👍 / 👎.

{"messages": [{"role": "user", "content": prompt}]},
config=invoke_config,
)
except GraphRecursionError:
state["agent_timeout"] = True
state.stop("agent_recursion_limit")
state.setdefault("agent_completion", [])
return state

Expand All @@ -520,7 +513,6 @@ async def run_langchain_deep_agents_wikispeedia_program(
) -> vf.State:
return await make_langchain_deep_agents_program(
max_turns=harness.config.max_turns,
timeout_seconds=harness.config.timeout_seconds,
)(task, state)


Expand Down Expand Up @@ -585,7 +577,9 @@ class WikispeediaEnvConfig(vf.EnvConfig):


def load_environment(config: WikispeediaEnvConfig) -> vf.Env:
return vf.Env(
env = vf.Env(
taskset=vf.load_taskset(config=config.taskset),
harness=vf.load_harness(config=config.harness),
)
env.task_timeout_seconds = WIKISPEEDIA_TASK_TIMEOUT_SECONDS
return env
6 changes: 4 additions & 2 deletions environments/math_python/math_python.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,10 @@ def load_environment(
unsupported.append("max_startup_wait_seconds")
if sandbox_client_max_workers is not None:
unsupported.append("sandbox_client_max_workers")
if sandbox_timeout_minutes != 60:
unsupported.append("sandbox_timeout_minutes")
if sandbox_timeout_per_command_seconds != 60:
unsupported.append("sandbox_timeout_per_command_seconds")
if unsupported:
unexpected = ", ".join(sorted(unsupported))
raise TypeError(f"Unsupported v1 load_environment kwargs: {unexpected}")
Expand Down Expand Up @@ -52,8 +56,6 @@ def load_environment(
sandbox_memory_gb=sandbox_memory_gb,
sandbox_disk_size_gb=sandbox_disk_size_gb,
sandbox_gpu_count=sandbox_gpu_count,
sandbox_timeout_minutes=sandbox_timeout_minutes,
sandbox_timeout_per_command_seconds=sandbox_timeout_per_command_seconds,
),
)
)
Expand Down
10 changes: 0 additions & 10 deletions environments/math_python/math_python_v1.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,8 +104,6 @@ class MathPythonHarnessConfig(vf.HarnessConfig):
sandbox_memory_gb: int = 2
sandbox_disk_size_gb: int = 5
sandbox_gpu_count: int = 0
sandbox_timeout_minutes: int = 60
sandbox_timeout_per_command_seconds: int = 60


class MathPythonEnvConfig(vf.EnvConfig):
Expand Down Expand Up @@ -162,8 +160,6 @@ def load_toolset(
sandbox_memory_gb: int = 2,
sandbox_disk_size_gb: int = 5,
sandbox_gpu_count: int = 0,
sandbox_timeout_minutes: int = 60,
sandbox_timeout_per_command_seconds: int = 60,
):
packages = pip_install_packages.split() if pip_install_packages.strip() else []
return vf.Toolset(
Expand All @@ -176,8 +172,6 @@ def load_toolset(
memory_gb=sandbox_memory_gb,
disk_size_gb=sandbox_disk_size_gb,
gpu_count=sandbox_gpu_count,
timeout_minutes=sandbox_timeout_minutes,
command_timeout=sandbox_timeout_per_command_seconds,
packages=packages,
),
cleanups=[collect_python_commands],
Expand All @@ -195,10 +189,6 @@ def load_environment(config: MathPythonEnvConfig) -> vf.Env:
sandbox_memory_gb=config.harness.sandbox_memory_gb,
sandbox_disk_size_gb=config.harness.sandbox_disk_size_gb,
sandbox_gpu_count=config.harness.sandbox_gpu_count,
sandbox_timeout_minutes=config.harness.sandbox_timeout_minutes,
sandbox_timeout_per_command_seconds=(
config.harness.sandbox_timeout_per_command_seconds
),
)
}
)
Expand Down
2 changes: 1 addition & 1 deletion environments/rlm_swe_v1/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ env = load_environment(
harness=RLMConfig(
program=RLMProgramConfig(
local_checkout="/path/to/checkout",
tools=["bash", "edit"],
rlm_tools=["bash", "edit"],
)
),
)
Expand Down
34 changes: 6 additions & 28 deletions environments/rlm_swe_v1/rlm_swe_v1.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,6 @@ class RlmSweTasksetConfig(vf.TasksetConfig):
filter_repos: list[str] | None = None
ds_num_proc: int | None = None
ds_keep_in_memory: bool = True
timeout_minutes: int | None = None
hide_tests_from_agent: bool = True
env: vf.ConfigData | None = None

Expand Down Expand Up @@ -59,7 +58,7 @@ async def upload_file(
async def upload_bytes(self, remote_path: str, data: bytes, name: str) -> None: ...

async def run_background_job(
self, command: str, timeout: int, working_dir: str
self, command: str, working_dir: str
) -> SandboxCommandResult: ...


Expand All @@ -69,7 +68,6 @@ def load_tasks(
filter_repos: list[str] | None = None,
ds_num_proc: int | None = None,
ds_keep_in_memory: bool = True,
timeout_minutes: int | None = None,
env: vf.ConfigData | None = None,
) -> list[vf.JsonData]:
dataset_kwargs = dict(
Expand Down Expand Up @@ -116,18 +114,15 @@ def load_tasks(
"sandbox": sandbox_config(
info=info,
repo_path=repo_path,
timeout_minutes=timeout_minutes,
),
"program": {"env": program_env},
}
rows.append(task_row)
return rows


def sandbox_config(
*, info: vf.JsonData, repo_path: str, timeout_minutes: int | None
) -> vf.JsonData:
config: vf.JsonData = {
def sandbox_config(*, info: vf.JsonData, repo_path: str) -> vf.JsonData:
return {
"image": f"{REGISTRY_PREFIX}/{info['docker_image']}",
"cpu_cores": 4,
"memory_gb": 4,
Expand All @@ -136,9 +131,6 @@ def sandbox_config(
"workdir": repo_path,
"scope": "rollout",
}
if timeout_minutes is not None:
config["timeout_minutes"] = timeout_minutes
return config


def env_vars(*, repo_path: str, env: vf.ConfigData) -> dict[str, str]:
Expand All @@ -162,7 +154,6 @@ def sandbox_config(self, info: vf.JsonData) -> vf.JsonData:
return sandbox_config(
info=info,
repo_path=self.config.repo_path,
timeout_minutes=self.config.timeout_minutes,
)

def get_env_vars(self) -> dict[str, str]:
Expand All @@ -177,7 +168,6 @@ def load_tasks(self, split: vf.TaskSplit = "train") -> vf.Tasks:
filter_repos=self.config.filter_repos,
ds_num_proc=self.config.ds_num_proc,
ds_keep_in_memory=self.config.ds_keep_in_memory,
timeout_minutes=self.config.timeout_minutes,
env=dict(self.config.env or {}),
)

Expand All @@ -187,10 +177,6 @@ async def setup_r2e_sandbox(self, task, state, sandbox=None) -> None:
raise RuntimeError("R2E SWE setup requires the active program sandbox.")
state["_rlm_swe_sandbox"] = sandbox
state["sandbox_id"] = getattr(sandbox, "id", state.get("sandbox_id"))
sandbox_config = task.get("sandbox")
if isinstance(sandbox_config, Mapping):
timeout_minutes = int(sandbox_config.get("timeout_minutes") or 60)
state.setdefault("test_timeout", timeout_minutes * 60)
await self.setup_sandbox(sandbox, state)

async def setup_sandbox(self, sandbox: R2ESandbox, state: vf.State) -> None:
Expand Down Expand Up @@ -261,11 +247,7 @@ async def solved(self, task, state) -> float:
if sandbox is None:
return 0.0
try:
test_output = await self.run_tests(
sandbox,
state,
int(state.get("test_timeout", 900)),
)
test_output = await self.run_tests(sandbox, state)
state["test_output"] = test_output
except Exception as exc:
logger.warning("Test execution failed: %r", exc)
Expand All @@ -277,7 +259,6 @@ async def run_tests(
self,
sandbox: R2ESandbox,
state: vf.State,
test_timeout: int,
) -> str:
local_archive_path = state.get("r2e_tests_archive_local_path")
if local_archive_path and Path(str(local_archive_path)).exists():
Expand Down Expand Up @@ -306,7 +287,7 @@ async def run_tests(
)
command = f"export {env_str}; /bin/bash run_tests.sh > test_output.txt 2>&1"
result = await sandbox.run_background_job(
command, timeout=test_timeout, working_dir=self.config.repo_path
command, working_dir=self.config.repo_path
Comment thread
cursor[bot] marked this conversation as resolved.
)
Comment thread
xeophon marked this conversation as resolved.
if result.exit_code > 1:
raise RuntimeError(f"Error running tests: exit_code={result.exit_code}")
Expand Down Expand Up @@ -362,12 +343,9 @@ async def apply_gold_patch(self, sandbox: R2ESandbox, state: vf.State) -> None:
async def validate_instance(self, state: vf.State) -> bool:
sandbox = cast(R2ESandbox, state["_rlm_swe_sandbox"])
await self.apply_gold_patch(sandbox, state)
test_timeout = state.get("test_timeout", 900)
assert isinstance(test_timeout, int)
test_output = await self.run_tests(
sandbox,
state,
test_timeout,
)
state["test_output"] = test_output
info = cast(vf.JsonData, state["info"])
Expand Down Expand Up @@ -493,7 +471,7 @@ def load_taskset(

class RlmSweProgramConfig(RLMProgramConfig):
workdir: str = DEFAULT_REPO_PATH
tools: list[str] = list(DEFAULT_RLM_TOOLS)
rlm_tools: list[str] = list(DEFAULT_RLM_TOOLS)


class RlmSweHarnessConfig(RLMConfig):
Expand Down
6 changes: 1 addition & 5 deletions packages/harnesses/harnesses/mini_swe_agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@
MINI_SWE_AGENT_DEFAULT_PACKAGE = MINI_SWE_AGENT_DEFAULT_VERSION
MINI_SWE_AGENT_DEFAULT_CONFIG_SPEC = "mini"
MINI_SWE_AGENT_DEFAULT_MODEL_CLASS = "litellm"
MINI_SWE_AGENT_DEFAULT_ENVIRONMENT_TIMEOUT = 120


def build_mini_swe_agent_install_script(
Expand Down Expand Up @@ -62,7 +61,6 @@ class MiniSWEAgentProgramConfig(vf.ProgramConfig):
trajectory_path: str = MINI_SWE_AGENT_DEFAULT_TRAJECTORY_PATH
config_spec: str = MINI_SWE_AGENT_DEFAULT_CONFIG_SPEC
model_class: str = MINI_SWE_AGENT_DEFAULT_MODEL_CLASS
environment_timeout: int = MINI_SWE_AGENT_DEFAULT_ENVIRONMENT_TIMEOUT
parallel_tool_calls: bool = True
extra_config_specs: list[str] | None = None
sandbox: vf.SandboxConfig | None = vf.SandboxConfig()
Expand Down Expand Up @@ -103,8 +101,6 @@ def resolve(
"-c",
"agent.cost_limit=0",
"-c",
f"environment.timeout={self.environment_timeout}",
"-c",
f"model.model_class={shlex.quote(self.model_class)}",
"-c",
"model.cost_tracking=ignore_errors",
Expand Down Expand Up @@ -141,7 +137,7 @@ def resolve(
CONFIG_ARGS+=(-c "agent.system_template=$(cat {system_prompt_file})")
fi
cd "$MINI_SWE_AGENT_WORKDIR"
timeout --kill-after=30s "${{AGENT_TIMEOUT_SECONDS:-3600}}" {shlex.quote(DEFAULT_MINI_BINARY)} \\
Comment thread
cursor[bot] marked this conversation as resolved.
{shlex.quote(DEFAULT_MINI_BINARY)} \\

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P1 Badge Keep MiniSWE agent runs bounded

With the default v1 environment settings, `task_timeout_seconds` is `None`, so this sandbox command is now sent to `run_background_job` without any timeout, and the shell-level `timeout --kill-after ... ${AGENT_TIMEOUT_SECONDS:-3600}` wrapper was removed here. If mini-swe-agent hangs or never exits despite `--exit-immediately`, evaluations using `MiniSWEAgentProgramConfig` can now block indefinitely instead of being capped at the previous 3600s default. Keep a finite default task timeout or restore a command-level wrapper for this harness.

Useful? React with 👍 / 👎.

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

no

--model "$OPENAI_MODEL" \\
--task "$MINI_SWE_AGENT_TASK" \\
--output {shlex.quote(self.trajectory_path)} \\
Expand Down
Loading
Loading