Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions docs/byo-harness.md
Original file line number Diff line number Diff line change
Expand Up @@ -162,7 +162,7 @@ Examples:
- Wikispeedia link tools belong to the Wikispeedia taskset.
- TextArena game state and user responses belong to the TextArena taskset.
- Harbor task directories, uploads, and tests belong to `HarborTaskset`.
- OpenCode, Pi, mini-swe-agent, Terminus, and RLM execution belong to harness
- CodexCLI, OpenCode, Pi, mini-swe-agent, Terminus, and RLM execution belong to harness
classes.
- Endpoint routing and interception belong to the harness/runtime, not task
rows.
Expand Down Expand Up @@ -552,7 +552,7 @@ def load_environment(config: vf.EnvConfig) -> vf.Env:
```

Tasksets include Harbor, OpenEnv, OpenReward, ReplayTaskset, TextArena, and
NeMoGym. Harnesses include OpenCode, Pi, mini-swe-agent, Terminus, RLM,
NeMoGym. Harnesses include CodexCLI, OpenCode, Pi, mini-swe-agent, Terminus, RLM,
ReplayHarness, and NeMoGymHarness.

## TOML And CLI
Expand Down Expand Up @@ -595,7 +595,7 @@ id = "tasksets.harbor"
tasks_dir = "tasks"

[eval.harness]
id = "harnesses.opencode"
id = "harnesses.codex_cli"
max_turns = 8
```

Expand Down
23 changes: 23 additions & 0 deletions packages/harnesses/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@ own a reusable execution mechanism.

| Harness | Purpose |
| --- | --- |
| `CodexCLI` | Codex CLI agent. |
| `OpenCode` | OpenCode CLI agent. |
| `Pi` | Pi Coding Agent. |
| `MiniSWEAgent` | mini-swe-agent. |
Expand Down Expand Up @@ -99,6 +100,12 @@ responsibility of the trainer or renderer that consumes the final transcript.
Command agents use `name@version` specs where their installer supports a
versioned package or release. Use `@latest` for a moving latest install:

```toml
[eval.harness]
id = "harnesses.codex_cli"
version = "codex@latest"
```

```toml
[eval.harness]
id = "harnesses.opencode"
Expand All @@ -110,3 +117,19 @@ version = "PrimeIntellect-ai/opencode@latest"
id = "harnesses.mini_swe_agent"
version = "mini-swe-agent@2.2.8"
```

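`CodexCLI` defaults to OpenAI API-key auth, using the Verifiers runtime
endpoint. To run against a personal ChatGPT subscription instead, set
`auth_mode = "chatgpt"` and pass the content of a logged-in Codex `auth.json`
as a secret environment variable (`CODEX_AUTH_JSON` by default; the variable
name is configurable via `auth_json_var`):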

```bash
export CODEX_AUTH_JSON="$(tr -d '\n' < ~/.codex/auth.json)"
```

```toml
[eval.harness]
id = "harnesses.codex_cli"

[eval.harness.program]
auth_mode = "chatgpt"
```
4 changes: 4 additions & 0 deletions packages/harnesses/harnesses/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
__version__ = "0.1.2"

from .codex_cli import CodexCLI, CodexCLIConfig, CodexCLIProgramConfig
from .mini_swe_agent import MiniSWEAgent, MiniSWEAgentConfig, MiniSWEAgentProgramConfig
from .opencode import OpenCode, OpenCodeConfig, OpenCodeProgramConfig
from .pi import Pi, PiConfig, PiProgramConfig
Expand All @@ -13,6 +14,9 @@
}

__all__ = [
"CodexCLI",
"CodexCLIConfig",
"CodexCLIProgramConfig",

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Harness package version not bumped

Medium Severity

This PR adds the public CodexCLI harness and exports it from harnesses, but leaves __version__ at 0.1.2. That leaves package version metadata out of step with the new user-facing behavior, so a post-merge publish may not ship CodexCLI under a new release tag.

Fix in Cursor Fix in Web

Triggered by project rule: BugBot Instructions

Reviewed by Cursor Bugbot for commit e7b4378. Configure here.

"MiniSWEAgent",
"MiniSWEAgentConfig",
"MiniSWEAgentProgramConfig",
Expand Down
172 changes: 172 additions & 0 deletions packages/harnesses/harnesses/codex_cli.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,172 @@
import os
import shlex
from pathlib import PurePosixPath
from typing import Literal

import verifiers as vf

from .utils import split_versioned_agent_spec

# Default agent spec; resolve() maps an empty release or a bare "codex" /
# "openai/codex" name to the installer release "latest".
CODEX_CLI_DEFAULT_VERSION = "codex@latest"
# Root of the install: the setup script uses "<dir>/bin" as CODEX_INSTALL_DIR
# and "<dir>/home" as the install-time CODEX_HOME.
CODEX_CLI_DEFAULT_INSTALL_DIR = "/opt/codex-cli"
# Working directory used by the run script when $AGENT_WORKDIR is unset or empty.
CODEX_CLI_DEFAULT_AGENT_WORKDIR = "/app"
# Exported as CODEX_HOME by the run script.
CODEX_CLI_DEFAULT_CODEX_HOME_PATH = "/codex-cli/home"
# File whose content is fed to `codex exec` on stdin.
CODEX_CLI_DEFAULT_INSTRUCTION_PATH = "/codex-cli/instruction.txt"
# File that, when non-empty, is passed to Codex as `developer_instructions`.
CODEX_CLI_DEFAULT_SYSTEM_PROMPT_PATH = "/codex-cli/system.txt"
# Destination of the combined stdout/stderr stream (`tee` target).
CODEX_CLI_DEFAULT_LOG_PATH = "/logs/agent/codex-cli.jsonl"
# Destination passed to `--output-last-message`.
CODEX_CLI_DEFAULT_LAST_MESSAGE_PATH = "/logs/agent/codex-cli-last-message.txt"
# Default value of CodexCLIConfig.system_prompt.
CODEX_CLI_DEFAULT_SYSTEM_PROMPT = "Complete the user's task using the available tools."

# "api_key": log in with OPENAI_API_KEY via `codex login --with-api-key`.
# "chatgpt": write forwarded auth.json content into $CODEX_HOME/auth.json.
CodexCLIAuthMode = Literal["api_key", "chatgpt"]


def codex_chatgpt_auth_json(auth_json_var: str) -> str:
    """Return the Codex ChatGPT ``auth.json`` content held in an environment variable.

    Args:
        auth_json_var: Name of the environment variable to read.

    Returns:
        The variable's value, unmodified.

    Raises:
        RuntimeError: If the variable is unset, empty, or contains only
            whitespace.
    """
    value = os.environ.get(auth_json_var)
    # A whitespace-only value (for example a lone newline) can never be usable
    # auth.json content, so reject it with the same error as the unset case
    # instead of forwarding it.
    if value is None or not value.strip():
        raise RuntimeError(
            f"{auth_json_var} must contain Codex ChatGPT auth.json content."
        )
    return value


# Program-level settings for the Codex CLI harness. resolve() turns them into a
# concrete command, setup script, file map, environment, and artifact list.
class CodexCLIProgramConfig(vf.ProgramConfig):
# Fallback working directory when $AGENT_WORKDIR is unset or empty.
agent_workdir: str = CODEX_CLI_DEFAULT_AGENT_WORKDIR
# Exported as CODEX_HOME by the run script.
codex_home_path: str = CODEX_CLI_DEFAULT_CODEX_HOME_PATH
# Name of the environment variable carrying auth.json content (chatgpt mode only).
auth_json_var: str = "CODEX_AUTH_JSON"
# Path of the file fed to `codex exec` on stdin.
instruction_path: str = CODEX_CLI_DEFAULT_INSTRUCTION_PATH
# Path of the system prompt file; used as developer_instructions when non-empty.
system_prompt_path: str = CODEX_CLI_DEFAULT_SYSTEM_PROMPT_PATH
# `tee` target for the combined stdout/stderr of `codex exec`.
log_path: str = CODEX_CLI_DEFAULT_LOG_PATH
# Path passed to `--output-last-message`.
last_message_path: str = CODEX_CLI_DEFAULT_LAST_MESSAGE_PATH
# Selects which auth snippet is spliced into the run script.
auth_mode: CodexCLIAuthMode = "api_key"
# Passed to resolve_command() as default_sandbox.
sandbox: vf.SandboxConfig | None = vf.SandboxConfig()

# Build the resolved program config for a "name@version" agent spec.
def resolve(self, version: str = CODEX_CLI_DEFAULT_VERSION) -> vf.ProgramConfig:
# Each path is mapped to a prompt_utils function reference that supplies its
# content (presumably evaluated by the runtime - confirm).
files: dict[str, vf.ProgramValue] = {
self.instruction_path: {"fn": "verifiers.v1.utils.prompt_utils:task_text"},
self.system_prompt_path: {
"fn": "verifiers.v1.utils.prompt_utils:state_system_prompt_text"
},
}
# Both artifacts are declared as optional text files.
artifacts = vf.ArtifactsConfig.model_validate(
{
"codex_cli_log": {
"path": self.log_path,
"format": "text",
"optional": True,
},
"codex_cli_last_message": {
"path": self.last_message_path,
"format": "text",
"optional": True,
},
}
)
# Use the parsed version if there is one, otherwise the name itself; an empty
# value or a bare "codex" / "openai/codex" name falls back to "latest".
name, parsed_version = split_versioned_agent_spec(version)
release = parsed_version or name
if release in {"", "codex", "openai/codex"}:
release = "latest"

# Setup script: installs OS prerequisites with apt-get, symlinks python3 to
# /usr/local/bin/python, then runs the upstream install script with the chosen
# release and install locations passed as environment variables.
install_home = f"{CODEX_CLI_DEFAULT_INSTALL_DIR}/home"
bin_dir = f"{CODEX_CLI_DEFAULT_INSTALL_DIR}/bin"
setup = f"""\
set -e
apt-get -o Acquire::Retries=3 update -qq && apt-get -o Acquire::Retries=3 install -y -qq ca-certificates curl git python3 tar > /dev/null 2>&1
ln -sf "$(command -v python3)" /usr/local/bin/python
CODEX_RELEASE={shlex.quote(release)} \\
CODEX_NON_INTERACTIVE=1 \\
CODEX_INSTALL_DIR={shlex.quote(bin_dir)} \\
CODEX_HOME={shlex.quote(install_home)} \\
sh -c "$(curl -fsSL https://chatgpt.com/codex/install.sh)"
"""
# Auth snippet spliced into the run script.
# api_key: exports the placeholder OPENAI_API_KEY=intercepted when the variable
# is unset or empty, logs in with it, and sets openai_base_url from
# $OPENAI_BASE_URL (default https://api.openai.com/v1).
# chatgpt: reads the variable named by auth_json_var, exits 1 if it is empty,
# otherwise writes it to $CODEX_HOME/auth.json with mode 600.
if self.auth_mode == "api_key":
auth_setup = """\
if [ -z "${OPENAI_API_KEY:-}" ]; then
export OPENAI_API_KEY=intercepted
fi
printf '%s' "$OPENAI_API_KEY" | codex login --with-api-key >/dev/null
CODEX_CONFIG_ARGS+=(-c "openai_base_url=\\"${OPENAI_BASE_URL:-https://api.openai.com/v1}\\"")
"""
else:
auth_setup = """\
CODEX_AUTH_JSON_VAR={auth_json_var}
CODEX_AUTH_JSON="$(printenv "$CODEX_AUTH_JSON_VAR" || true)"
if [ -z "$CODEX_AUTH_JSON" ]; then
echo "Codex ChatGPT auth requires $CODEX_AUTH_JSON_VAR to contain auth.json." >&2
exit 1
fi
printf '%s' "$CODEX_AUTH_JSON" > "$CODEX_HOME/auth.json"
chmod 600 "$CODEX_HOME/auth.json"
""".format(auth_json_var=shlex.quote(self.auth_json_var))
# Parent directories of the log and last-message files; the run script creates
# them with mkdir -p.
log_dir = str(PurePosixPath(self.log_path).parent)
last_message_dir = str(PurePosixPath(self.last_message_path).parent)
# Run script: puts the install bin dir on PATH, resolves the working directory
# and model (stripping a leading "openai/"), applies the auth snippet, and runs
# `codex exec` under `timeout` ($AGENT_TIMEOUT_SECONDS, default 3600) with the
# instruction file on stdin and stdout/stderr teed to log_path.
# The sed/awk pipeline escapes backslashes and double quotes and joins lines
# with a literal "\n" so the system prompt fits in a double-quoted -c value.
# NOTE(review): the captured source has review-page text inside this string
# literal; it is kept verbatim below. That note says --json makes stdout an
# event stream while the final answer goes to last_message_path - confirm how
# consumers obtain the completion.
run_script = f"""\
set -eo pipefail
export PATH={shlex.quote(bin_dir)}:"$PATH"
export CODEX_HOME={shlex.quote(self.codex_home_path)}

CODEX_WORKDIR="${{AGENT_WORKDIR:-}}"
if [ -z "$CODEX_WORKDIR" ]; then
CODEX_WORKDIR={shlex.quote(self.agent_workdir)}
fi

CODEX_MODEL="${{OPENAI_MODEL:-gpt-5}}"
case "$CODEX_MODEL" in
openai/*) CODEX_MODEL="${{CODEX_MODEL#openai/}}" ;;
esac

mkdir -p "$CODEX_HOME" "$CODEX_WORKDIR" {shlex.quote(log_dir)} {shlex.quote(last_message_dir)}
CODEX_CONFIG_ARGS=(-c 'model_provider="openai"')
{auth_setup}
if [ -s {shlex.quote(self.system_prompt_path)} ]; then
CODEX_DEVELOPER_INSTRUCTIONS="$(sed 's/\\\\/\\\\\\\\/g; s/"/\\\\"/g' {shlex.quote(self.system_prompt_path)} | awk '{{printf "%s\\\\n", $0}}')"
CODEX_CONFIG_ARGS+=(-c "developer_instructions=\\"$CODEX_DEVELOPER_INSTRUCTIONS\\"")
fi

cd "$CODEX_WORKDIR"
timeout --kill-after=30s "${{AGENT_TIMEOUT_SECONDS:-3600}}" codex exec \\
--ignore-user-config \\
--ephemeral \\
--skip-git-repo-check \\
--dangerously-bypass-approvals-and-sandbox \\
--json \\

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 Badge Return Codex's final message as the completion

When this harness is used on tasks that score or display the assistant completion, --json makes Codex print newline-delimited event JSON to stdout rather than the answer text (the Codex CLI docs describe --json this way and --output-last-message as the final-message path: https://developers.openai.com/codex/cli/reference#codex-exec). The v1 sandbox runner records stdout directly into state["completion"] (verifiers/v1/utils/sandbox_utils.py), so these rollouts will expose a JSON event log as the model completion while the actual final message is only an artifact.

Useful? React with 👍 / 👎.

--model "$CODEX_MODEL" \\
--cd "$CODEX_WORKDIR" \\
--output-last-message {shlex.quote(self.last_message_path)} \\
"${{CODEX_CONFIG_ARGS[@]}}" \\
- < {shlex.quote(self.instruction_path)} 2>&1 | tee {shlex.quote(self.log_path)}
"""
# OPENAI_MODEL is bound to the "runtime.model" value. In chatgpt mode the auth
# variable is additionally populated through codex_chatgpt_auth_json.
env: dict[str, vf.ProgramValue] = {"OPENAI_MODEL": "runtime.model"}
if self.auth_mode == "chatgpt":
env[self.auth_json_var] = {
"fn": "harnesses.codex_cli:codex_chatgpt_auth_json",
"auth_json_var": self.auth_json_var,
}

# The run script is executed through a login bash shell (`bash -lc`).
return self.resolve_command(
command=["bash", "-lc", run_script],
default_sandbox=self.sandbox,
files=files,
setup=setup,
env=env,
artifacts=artifacts,
)


# Harness-level configuration for the CodexCLI harness.
class CodexCLIConfig(vf.HarnessConfig):
# System prompt; defaults to CODEX_CLI_DEFAULT_SYSTEM_PROMPT.
system_prompt: vf.PromptInput | vf.SystemPromptConfig | None = (
CODEX_CLI_DEFAULT_SYSTEM_PROMPT
)
# Agent spec in "name@version" form; CodexCLI passes it to
# CodexCLIProgramConfig.resolve().
version: str = CODEX_CLI_DEFAULT_VERSION
# Program-level settings (paths, auth mode, sandbox).
program: CodexCLIProgramConfig = CodexCLIProgramConfig()
# Default turn limit (presumably enforced by the base harness - confirm).
max_turns: int = 4


class CodexCLI(vf.Harness[CodexCLIConfig]):
    # Harness whose program is derived from a CodexCLIConfig.
    config: CodexCLIConfig

    def load_program_config(self, config: CodexCLIConfig) -> vf.ProgramConfig:
        """Resolve the program settings in *config* for its configured version."""
        program_settings = config.program
        return program_settings.resolve(version=config.version)


def load_harness(config: CodexCLIConfig) -> CodexCLI:
    """Build a CodexCLI harness from *config*."""
    harness = CodexCLI(config=config)
    return harness
2 changes: 2 additions & 0 deletions tests/test_imports.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@


PACKAGE_SYMBOLS = {
"CodexCLI",
"CodexCLIConfig",
"HarborTaskset",
"HarborTasksetConfig",
"MiniSWEAgent",
Expand Down
95 changes: 95 additions & 0 deletions tests/test_v1_codex_cli.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
from typing import Any, cast

import pytest
import verifiers as vf
from harnesses import CodexCLI, CodexCLIConfig, CodexCLIProgramConfig
from harnesses.codex_cli import codex_chatgpt_auth_json


def test_codex_cli_builds_openai_api_sandbox_program() -> None:
    """The default API-key configuration yields the expected sandbox program."""
    program_settings = CodexCLIProgramConfig(agent_workdir="/workspace")
    harness = CodexCLI(
        config=CodexCLIConfig(system_prompt="Use tests.", program=program_settings)
    )
    program = cast(dict[str, Any], harness.program_config.data())
    run_script = cast(list[str], program["command"])[-1]
    setup = cast(str, program["setup"])
    files = cast(dict[str, object], program["files"])
    artifacts = cast(dict[str, object], program["artifacts"])
    env = cast(dict[str, object], program["env"])

    assert isinstance(harness, vf.Harness)
    assert program["sandbox"] is not False

    # Installer and OS prerequisites.
    for setup_fragment in (
        "CODEX_RELEASE=latest",
        "https://chatgpt.com/codex/install.sh",
        "apt-get -o Acquire::Retries=3 update",
        "apt-get -o Acquire::Retries=3 install",
        "python3",
        "/usr/local/bin/python",
    ):
        assert setup_fragment in setup

    # Declared files, artifacts, and environment.
    for file_path in ("/codex-cli/instruction.txt", "/codex-cli/system.txt"):
        assert file_path in files
    for artifact_name in ("codex_cli_log", "codex_cli_last_message"):
        assert artifact_name in artifacts
    assert env["OPENAI_MODEL"] == "runtime.model"

    # Run script contents for API-key auth.
    for script_fragment in (
        "printf '%s' \"$OPENAI_API_KEY\" | codex login --with-api-key",
        'model_provider="openai"',
        "openai_base_url=",
        "developer_instructions=",
        "--dangerously-bypass-approvals-and-sandbox",
        "/workspace",
    ):
        assert script_fragment in run_script


def test_codex_cli_version_spec_sets_installer_release() -> None:
    """A pinned ``codex@<version>`` spec is forwarded as CODEX_RELEASE."""
    pinned_config = CodexCLIConfig(version="codex@0.137.0")
    harness = CodexCLI(config=pinned_config)
    program = cast(dict[str, Any], harness.program_config.data())

    assert "CODEX_RELEASE=0.137.0" in cast(str, program["setup"])


def test_codex_cli_chatgpt_auth_uses_forwarded_auth_json_secret() -> None:
    """ChatGPT auth forwards auth.json through the env and skips API-key login."""
    chatgpt_program = CodexCLIProgramConfig(auth_mode="chatgpt")
    harness = CodexCLI(config=CodexCLIConfig(program=chatgpt_program))
    program = cast(dict[str, Any], harness.program_config.data())
    env = cast(dict[str, object], program["env"])
    run_script = cast(list[str], program["command"])[-1]

    expected_secret_binding = {
        "fn": "harnesses.codex_cli:codex_chatgpt_auth_json",
        "auth_json_var": "CODEX_AUTH_JSON",
    }

    assert "dirs" not in program
    assert cast(dict[str, object], env["CODEX_AUTH_JSON"]) == expected_secret_binding
    assert 'printf \'%s\' "$CODEX_AUTH_JSON" > "$CODEX_HOME/auth.json"' in run_script
    # API-key specific pieces must be absent in chatgpt mode.
    for api_key_only_fragment in ("codex login --with-api-key", "openai_base_url="):
        assert api_key_only_fragment not in run_script


def test_codex_chatgpt_auth_json_reads_configured_env_var(
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """The helper returns the content of whichever variable name it is given."""
    monkeypatch.setenv("CUSTOM_CODEX_AUTH_JSON", '{"auth_mode":"chatgpt"}')

    loaded = codex_chatgpt_auth_json("CUSTOM_CODEX_AUTH_JSON")

    assert loaded == '{"auth_mode":"chatgpt"}'


def test_codex_chatgpt_auth_json_requires_configured_env_var(
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """A missing variable is reported with a RuntimeError naming it."""
    env_name = "CODEX_AUTH_JSON"
    monkeypatch.delenv(env_name, raising=False)

    with pytest.raises(RuntimeError, match="CODEX_AUTH_JSON must contain"):
        codex_chatgpt_auth_json(env_name)


def test_codex_cli_imports_from_package() -> None:
    """CodexCLI is importable from the top-level ``harnesses`` package."""
    exported = CodexCLI
    assert exported
6 changes: 6 additions & 0 deletions tests/test_v1_harbor_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,9 @@

import verifiers as vf
from harnesses import (
CodexCLI,
CodexCLIConfig,
CodexCLIProgramConfig,
MiniSWEAgent,
MiniSWEAgentConfig,
MiniSWEAgentProgramConfig,
Expand Down Expand Up @@ -250,6 +253,8 @@ async def test_harbor_reward_uses_background_job_for_tests(


def test_packaged_harbor_and_opencode_imports_are_available_from_packages() -> None:
assert CodexCLI
assert CodexCLIConfig
assert OpenCode
assert OpenCodeConfig
assert Pi
Expand Down Expand Up @@ -324,6 +329,7 @@ def test_opencode_custom_version_uses_versioned_release() -> None:
@pytest.mark.parametrize(
("harness_cls", "config_cls", "program_cls"),
[
(CodexCLI, CodexCLIConfig, CodexCLIProgramConfig),
(OpenCode, OpenCodeConfig, OpenCodeProgramConfig),
(MiniSWEAgent, MiniSWEAgentConfig, MiniSWEAgentProgramConfig),
(Pi, PiConfig, PiProgramConfig),
Expand Down
Loading