diff --git a/docs/byo-harness.md b/docs/byo-harness.md index 1f40a86403..1a11f08380 100644 --- a/docs/byo-harness.md +++ b/docs/byo-harness.md @@ -162,7 +162,7 @@ Examples: - Wikispeedia link tools belong to the Wikispeedia taskset. - TextArena game state and user responses belong to the TextArena taskset. - Harbor task directories, uploads, and tests belong to `HarborTaskset`. -- OpenCode, Pi, mini-swe-agent, Terminus, and RLM execution belong to harness +- CodexCLI, OpenCode, Pi, mini-swe-agent, Terminus, and RLM execution belong to harness classes. - Endpoint routing and interception belong to the harness/runtime, not task rows. @@ -552,7 +552,7 @@ def load_environment(config: vf.EnvConfig) -> vf.Env: ``` Tasksets include Harbor, OpenEnv, OpenReward, ReplayTaskset, TextArena, and -NeMoGym. Harnesses include OpenCode, Pi, mini-swe-agent, Terminus, RLM, +NeMoGym. Harnesses include CodexCLI, OpenCode, Pi, mini-swe-agent, Terminus, RLM, ReplayHarness, and NeMoGymHarness. ## TOML And CLI @@ -595,7 +595,7 @@ id = "tasksets.harbor" tasks_dir = "tasks" [eval.harness] -id = "harnesses.opencode" +id = "harnesses.codex_cli" max_turns = 8 ``` diff --git a/packages/harnesses/README.md b/packages/harnesses/README.md index 191bb71318..cfd383495f 100644 --- a/packages/harnesses/README.md +++ b/packages/harnesses/README.md @@ -49,6 +49,7 @@ own a reusable execution mechanism. | Harness | Purpose | | --- | --- | +| `CodexCLI` | Codex CLI agent. | | `OpenCode` | OpenCode CLI agent. | | `Pi` | Pi Coding Agent. | | `MiniSWEAgent` | mini-swe-agent. | @@ -99,6 +100,12 @@ responsibility of the trainer or renderer that consumes the final transcript. Command agents use `name@version` specs where their installer supports a versioned package or release. 
Use `@latest` for a moving latest install: +```toml +[eval.harness] +id = "harnesses.codex_cli" +version = "codex@latest" +``` + ```toml [eval.harness] id = "harnesses.opencode" @@ -110,3 +117,19 @@ version = "PrimeIntellect-ai/opencode@latest" id = "harnesses.mini_swe_agent" version = "mini-swe-agent@2.2.8" ``` + +`CodexCLI` defaults to OpenAI API-key auth, using the Verifiers runtime +endpoint. To run against a personal ChatGPT subscription, pass the logged-in +Codex `auth.json` content as a secret environment variable: + +```bash +export CODEX_AUTH_JSON="$(tr -d '\n' < ~/.codex/auth.json)" +``` + +```toml +[eval.harness] +id = "harnesses.codex_cli" + +[eval.harness.program] +auth_mode = "chatgpt" +``` diff --git a/packages/harnesses/harnesses/__init__.py b/packages/harnesses/harnesses/__init__.py index f0f233ccf4..e2de0fde7e 100644 --- a/packages/harnesses/harnesses/__init__.py +++ b/packages/harnesses/harnesses/__init__.py @@ -1,5 +1,6 @@ __version__ = "0.1.2" +from .codex_cli import CodexCLI, CodexCLIConfig, CodexCLIProgramConfig from .mini_swe_agent import MiniSWEAgent, MiniSWEAgentConfig, MiniSWEAgentProgramConfig from .opencode import OpenCode, OpenCodeConfig, OpenCodeProgramConfig from .pi import Pi, PiConfig, PiProgramConfig @@ -13,6 +14,9 @@ } __all__ = [ + "CodexCLI", + "CodexCLIConfig", + "CodexCLIProgramConfig", "MiniSWEAgent", "MiniSWEAgentConfig", "MiniSWEAgentProgramConfig", diff --git a/packages/harnesses/harnesses/codex_cli.py b/packages/harnesses/harnesses/codex_cli.py new file mode 100644 index 0000000000..3a3ae6df53 --- /dev/null +++ b/packages/harnesses/harnesses/codex_cli.py @@ -0,0 +1,172 @@ +import os +import shlex +from pathlib import PurePosixPath +from typing import Literal + +import verifiers as vf + +from .utils import split_versioned_agent_spec + +CODEX_CLI_DEFAULT_VERSION = "codex@latest" +CODEX_CLI_DEFAULT_INSTALL_DIR = "/opt/codex-cli" +CODEX_CLI_DEFAULT_AGENT_WORKDIR = "/app" +CODEX_CLI_DEFAULT_CODEX_HOME_PATH = "/codex-cli/home" 
+CODEX_CLI_DEFAULT_INSTRUCTION_PATH = "/codex-cli/instruction.txt"
+CODEX_CLI_DEFAULT_SYSTEM_PROMPT_PATH = "/codex-cli/system.txt"
+CODEX_CLI_DEFAULT_LOG_PATH = "/logs/agent/codex-cli.jsonl"
+CODEX_CLI_DEFAULT_LAST_MESSAGE_PATH = "/logs/agent/codex-cli-last-message.txt"
+CODEX_CLI_DEFAULT_SYSTEM_PROMPT = "Complete the user's task using the available tools."
+
+CodexCLIAuthMode = Literal["api_key", "chatgpt"]
+
+
+def codex_chatgpt_auth_json(auth_json_var: str) -> str:
+    """Return the ``auth.json`` content held in the named environment variable.
+
+    ``CodexCLIProgramConfig.resolve`` registers this function (by dotted path) as
+    the value provider for the auth variable when ``auth_mode`` is ``"chatgpt"``.
+
+    Args:
+        auth_json_var: Name of the environment variable to read.
+
+    Returns:
+        The variable's value, unchanged.
+
+    Raises:
+        RuntimeError: If the variable is unset or empty.
+    """
+    value = os.environ.get(auth_json_var)
+    if not value:
+        raise RuntimeError(
+            f"{auth_json_var} must contain Codex ChatGPT auth.json content."
+        )
+    return value
+
+
+class CodexCLIProgramConfig(vf.ProgramConfig):
+    """Program settings for running ``codex exec`` as a command harness."""
+
+    # Working directory used when AGENT_WORKDIR is unset or empty at run time.
+    agent_workdir: str = CODEX_CLI_DEFAULT_AGENT_WORKDIR
+    # Exported as CODEX_HOME by the run script.
+    codex_home_path: str = CODEX_CLI_DEFAULT_CODEX_HOME_PATH
+    # Environment variable carrying auth.json content (auth_mode="chatgpt" only).
+    auth_json_var: str = "CODEX_AUTH_JSON"
+    # File mapped to `prompt_utils:task_text`; fed to `codex exec` on stdin.
+    instruction_path: str = CODEX_CLI_DEFAULT_INSTRUCTION_PATH
+    # File mapped to `prompt_utils:state_system_prompt_text`; when non-empty it is
+    # passed to Codex as `developer_instructions`.
+    system_prompt_path: str = CODEX_CLI_DEFAULT_SYSTEM_PROMPT_PATH
+    # Tee target for combined stdout/stderr; exposed as artifact `codex_cli_log`.
+    log_path: str = CODEX_CLI_DEFAULT_LOG_PATH
+    # Target of --output-last-message; exposed as `codex_cli_last_message`.
+    last_message_path: str = CODEX_CLI_DEFAULT_LAST_MESSAGE_PATH
+    # "api_key": `codex login --with-api-key`; "chatgpt": write auth.json from env.
+    auth_mode: CodexCLIAuthMode = "api_key"
+    # Passed to resolve_command as the default sandbox.
+    sandbox: vf.SandboxConfig | None = vf.SandboxConfig()
+
+    def resolve(self, version: str = CODEX_CLI_DEFAULT_VERSION) -> vf.ProgramConfig:
+        """Build the program config that installs and runs the Codex CLI.
+
+        Args:
+            version: Agent spec parsed by ``split_versioned_agent_spec``. The
+                version part (or the bare name when there is no version part)
+                becomes ``CODEX_RELEASE`` for the installer; an empty value,
+                ``"codex"`` and ``"openai/codex"`` are mapped to ``"latest"``.
+
+        Returns:
+            The result of ``resolve_command`` for a ``bash -lc`` run script, with
+            the setup script, prompt files, environment and artifacts attached.
+        """
+        files: dict[str, vf.ProgramValue] = {
+            self.instruction_path: {"fn": "verifiers.v1.utils.prompt_utils:task_text"},
+            self.system_prompt_path: {
+                "fn": "verifiers.v1.utils.prompt_utils:state_system_prompt_text"
+            },
+        }
+        artifacts = vf.ArtifactsConfig.model_validate(
+            {
+                "codex_cli_log": {
+                    "path": self.log_path,
+                    "format": "text",
+                    "optional": True,
+                },
+                "codex_cli_last_message": {
+                    "path": self.last_message_path,
+                    "format": "text",
+                    "optional": True,
+                },
+            }
+        )
+        name, parsed_version = split_versioned_agent_spec(version)
+        release = parsed_version or name
+        if release in {"", "codex", "openai/codex"}:
+            release = "latest"
+
+        install_home = f"{CODEX_CLI_DEFAULT_INSTALL_DIR}/home"
+        bin_dir = f"{CODEX_CLI_DEFAULT_INSTALL_DIR}/bin"
+        # Fetch the installer in its own assignment so `set -e` aborts setup when
+        # curl fails. Inside `sh -c "$(curl ...)"` a failed download yields an
+        # empty script and `sh -c ""` exits 0, which hides the error until the
+        # run script cannot find `codex`.
+        setup = f"""\
+set -e
+apt-get -o Acquire::Retries=3 update -qq && apt-get -o Acquire::Retries=3 install -y -qq ca-certificates curl git python3 tar > /dev/null 2>&1
+ln -sf "$(command -v python3)" /usr/local/bin/python
+CODEX_INSTALLER="$(curl -fsSL https://chatgpt.com/codex/install.sh)"
+CODEX_RELEASE={shlex.quote(release)} \\
+CODEX_NON_INTERACTIVE=1 \\
+CODEX_INSTALL_DIR={shlex.quote(bin_dir)} \\
+CODEX_HOME={shlex.quote(install_home)} \\
+sh -c "$CODEX_INSTALLER"
+"""
+        if self.auth_mode == "api_key":
+            # Plain (non-f) string: the shell `${...}` expansions stay literal.
+            auth_setup = """\
+if [ -z "${OPENAI_API_KEY:-}" ]; then
+  export OPENAI_API_KEY=intercepted
+fi
+printf '%s' "$OPENAI_API_KEY" | codex login --with-api-key >/dev/null
+CODEX_CONFIG_ARGS+=(-c "openai_base_url=\\"${OPENAI_BASE_URL:-https://api.openai.com/v1}\\"")
+"""
+        else:
+            # The only `{...}` placeholder is the shell-quoted variable name.
+            auth_setup = """\
+CODEX_AUTH_JSON_VAR={auth_json_var}
+CODEX_AUTH_JSON="$(printenv "$CODEX_AUTH_JSON_VAR" || true)"
+if [ -z "$CODEX_AUTH_JSON" ]; then
+  echo "Codex ChatGPT auth requires $CODEX_AUTH_JSON_VAR to contain auth.json." >&2
+  exit 1
+fi
+printf '%s' "$CODEX_AUTH_JSON" > "$CODEX_HOME/auth.json"
+chmod 600 "$CODEX_HOME/auth.json"
+""".format(auth_json_var=shlex.quote(self.auth_json_var))
+        log_dir = str(PurePosixPath(self.log_path).parent)
+        last_message_dir = str(PurePosixPath(self.last_message_path).parent)
+        # f-string: doubled braces are literal shell braces; single braces are
+        # Python interpolations (all paths go through shlex.quote).
+        run_script = f"""\
+set -eo pipefail
+export PATH={shlex.quote(bin_dir)}:"$PATH"
+export CODEX_HOME={shlex.quote(self.codex_home_path)}
+
+CODEX_WORKDIR="${{AGENT_WORKDIR:-}}"
+if [ -z "$CODEX_WORKDIR" ]; then
+  CODEX_WORKDIR={shlex.quote(self.agent_workdir)}
+fi
+
+CODEX_MODEL="${{OPENAI_MODEL:-gpt-5}}"
+case "$CODEX_MODEL" in
+  openai/*) CODEX_MODEL="${{CODEX_MODEL#openai/}}" ;;
+esac
+
+mkdir -p "$CODEX_HOME" "$CODEX_WORKDIR" {shlex.quote(log_dir)} {shlex.quote(last_message_dir)}
+CODEX_CONFIG_ARGS=(-c 'model_provider="openai"')
+{auth_setup}
+if [ -s {shlex.quote(self.system_prompt_path)} ]; then
+  CODEX_DEVELOPER_INSTRUCTIONS="$(sed 's/\\\\/\\\\\\\\/g; s/"/\\\\"/g' {shlex.quote(self.system_prompt_path)} | awk '{{printf "%s\\\\n", $0}}')"
+  CODEX_CONFIG_ARGS+=(-c "developer_instructions=\\"$CODEX_DEVELOPER_INSTRUCTIONS\\"")
+fi
+
+cd "$CODEX_WORKDIR"
+timeout --kill-after=30s "${{AGENT_TIMEOUT_SECONDS:-3600}}" codex exec \\
+  --ignore-user-config \\
+  --ephemeral \\
+  --skip-git-repo-check \\
+  --dangerously-bypass-approvals-and-sandbox \\
+  --json \\
+  --model "$CODEX_MODEL" \\
+  --cd "$CODEX_WORKDIR" \\
+  --output-last-message {shlex.quote(self.last_message_path)} \\
+  "${{CODEX_CONFIG_ARGS[@]}}" \\
+  - < {shlex.quote(self.instruction_path)} 2>&1 | tee {shlex.quote(self.log_path)}
+"""
+        env: dict[str, vf.ProgramValue] = {"OPENAI_MODEL": "runtime.model"}
+        if self.auth_mode == "chatgpt":
+            # Forward the auth.json secret under the configured variable name.
+            env[self.auth_json_var] = {
+                "fn": "harnesses.codex_cli:codex_chatgpt_auth_json",
+                "auth_json_var": self.auth_json_var,
+            }
+
+        return self.resolve_command(
+            command=["bash", "-lc", run_script],
+            default_sandbox=self.sandbox,
+            files=files,
+            setup=setup,
+            env=env,
+            artifacts=artifacts,
+        )
+
+
+class CodexCLIConfig(vf.HarnessConfig):
+    """Harness configuration: system prompt, version spec, program, turn limit."""
+
+    system_prompt: vf.PromptInput | vf.SystemPromptConfig | None = (
+        CODEX_CLI_DEFAULT_SYSTEM_PROMPT
+    )
+    version: str = CODEX_CLI_DEFAULT_VERSION
+    program: CodexCLIProgramConfig = CodexCLIProgramConfig()
+    max_turns: int = 4
+
+
+class CodexCLI(vf.Harness[CodexCLIConfig]):
+    """Harness whose program config is resolved from ``CodexCLIConfig``."""
+
+    config: CodexCLIConfig
+
+    def load_program_config(self, config: CodexCLIConfig) -> vf.ProgramConfig:
+        """Resolve ``config.program`` for the configured ``config.version``."""
+        return config.program.resolve(version=config.version)
+
+
+def load_harness(config: CodexCLIConfig) -> CodexCLI:
+    """Construct a ``CodexCLI`` harness from ``config``."""
+    return CodexCLI(config=config)
diff --git a/tests/test_imports.py b/tests/test_imports.py
index c9441668e2..1c071d705c 100644
--- a/tests/test_imports.py
+++ b/tests/test_imports.py
@@ -5,6 +5,8 @@
 
 
 PACKAGE_SYMBOLS = {
+    "CodexCLI",
+    "CodexCLIConfig",
     "HarborTaskset",
     "HarborTasksetConfig",
     "MiniSWEAgent",
diff --git a/tests/test_v1_codex_cli.py b/tests/test_v1_codex_cli.py
new file mode 100644
index 0000000000..c8a1513739
--- /dev/null
+++ b/tests/test_v1_codex_cli.py
@@ -0,0 +1,95 @@
+from typing import Any, cast
+
+import pytest
+import verifiers as vf
+from harnesses import CodexCLI, CodexCLIConfig, CodexCLIProgramConfig
+from harnesses.codex_cli import codex_chatgpt_auth_json
+
+
+def test_codex_cli_builds_openai_api_sandbox_program() -> None:
+    harness = CodexCLI(
+        config=CodexCLIConfig(
+            system_prompt="Use tests.",
+            program=CodexCLIProgramConfig(agent_workdir="/workspace"),
+        )
+    )
+    program = cast(dict[str, Any], harness.program_config.data())
+    command = cast(list[str], program["command"])
+    setup = cast(str, program["setup"])
+    files = cast(dict[str, object], program["files"])
+    artifacts = cast(dict[str, object], program["artifacts"])
+    env = cast(dict[str, object], program["env"])
+    run_script = command[-1]
+
+    assert isinstance(harness, vf.Harness)
+    assert program["sandbox"] is not False
+    assert "CODEX_RELEASE=latest" in setup
+    assert "https://chatgpt.com/codex/install.sh" in setup
+    assert "apt-get -o Acquire::Retries=3 update" in setup
+    assert "apt-get -o Acquire::Retries=3 install" in setup
+    assert "python3" in setup
+    assert "/usr/local/bin/python" in setup
+    assert "/codex-cli/instruction.txt" in files
+    assert "/codex-cli/system.txt" in files
+    assert "codex_cli_log" in artifacts
+    assert "codex_cli_last_message" in artifacts
+    assert env["OPENAI_MODEL"] == "runtime.model"
+    assert "printf '%s' \"$OPENAI_API_KEY\" | codex login --with-api-key" in run_script
+    assert 'model_provider="openai"' in run_script
+    assert "openai_base_url=" in run_script
+    assert "developer_instructions=" in run_script
+    assert "--dangerously-bypass-approvals-and-sandbox" in run_script
+    assert "/workspace" in run_script
+
+
+def test_codex_cli_version_spec_sets_installer_release() -> None:
+    harness = CodexCLI(config=CodexCLIConfig(version="codex@0.137.0"))
+    program = cast(dict[str, Any], harness.program_config.data())
+    setup = cast(str, program["setup"])
+
+    assert "CODEX_RELEASE=0.137.0" in setup
+
+
+def test_codex_cli_chatgpt_auth_uses_forwarded_auth_json_secret() -> None:
+    harness = CodexCLI(
+        config=CodexCLIConfig(
+            program=CodexCLIProgramConfig(
+                auth_mode="chatgpt",
+            )
+        )
+    )
+    program = cast(dict[str, Any], harness.program_config.data())
+    env = cast(dict[str, object], program["env"])
+    run_script = cast(list[str], program["command"])[-1]
+
+    assert "dirs" not in program
+    assert cast(dict[str, object], env["CODEX_AUTH_JSON"]) == {
+        "fn": "harnesses.codex_cli:codex_chatgpt_auth_json",
+        "auth_json_var": "CODEX_AUTH_JSON",
+    }
+    assert 'printf \'%s\' "$CODEX_AUTH_JSON" > "$CODEX_HOME/auth.json"' in run_script
+    assert "codex login --with-api-key" not in run_script
+    assert "openai_base_url=" not in run_script
+
+
+def test_codex_chatgpt_auth_json_reads_configured_env_var(
+    monkeypatch: pytest.MonkeyPatch,
+) -> None:
+    monkeypatch.setenv("CUSTOM_CODEX_AUTH_JSON", '{"auth_mode":"chatgpt"}')
+
+    assert codex_chatgpt_auth_json("CUSTOM_CODEX_AUTH_JSON") == (
+        '{"auth_mode":"chatgpt"}'
+    )
+
+
+def test_codex_chatgpt_auth_json_requires_configured_env_var(
+    monkeypatch: pytest.MonkeyPatch,
+) -> None:
+    monkeypatch.delenv("CODEX_AUTH_JSON", raising=False)
+
+    with pytest.raises(RuntimeError, match="CODEX_AUTH_JSON must contain"):
+        codex_chatgpt_auth_json("CODEX_AUTH_JSON")
+
+
+def test_codex_cli_imports_from_package() -> None:
+    assert CodexCLI
diff --git a/tests/test_v1_harbor_cli.py b/tests/test_v1_harbor_cli.py
index 348c193c94..a88887decc 100644
--- a/tests/test_v1_harbor_cli.py
+++ b/tests/test_v1_harbor_cli.py
@@ -10,6 +10,9 @@
 
 import verifiers as vf
 from harnesses import (
+    CodexCLI,
+    CodexCLIConfig,
+    CodexCLIProgramConfig,
     MiniSWEAgent,
     MiniSWEAgentConfig,
     MiniSWEAgentProgramConfig,
@@ -250,6 +253,8 @@ async def test_harbor_reward_uses_background_job_for_tests(
 
 
 def test_packaged_harbor_and_opencode_imports_are_available_from_packages() -> None:
+    assert CodexCLI
+    assert CodexCLIConfig
     assert OpenCode
     assert OpenCodeConfig
     assert Pi
@@ -324,6 +329,7 @@ def test_opencode_custom_version_uses_versioned_release() -> None:
 @pytest.mark.parametrize(
     ("harness_cls", "config_cls", "program_cls"),
     [
+        (CodexCLI, CodexCLIConfig, CodexCLIProgramConfig),
         (OpenCode, OpenCodeConfig, OpenCodeProgramConfig),
         (MiniSWEAgent, MiniSWEAgentConfig, MiniSWEAgentProgramConfig),
         (Pi, PiConfig, PiProgramConfig),