diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 18587a9..5ad38a3 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -1,4 +1,4 @@
-# Contributing to __________
+# Contributing to ProgramBench
We want to make contributing to this project as easy and transparent as
possible.
@@ -35,5 +35,5 @@ outlined on that page and do not file a public issue.
* ...
## License
-By contributing to __________, you agree that your contributions will be licensed
+By contributing to ProgramBench, you agree that your contributions will be licensed
under the LICENSE file in the root directory of this source tree.
diff --git a/docs/README.md b/docs/README.md
index a0b1af6..0b501f4 100644
--- a/docs/README.md
+++ b/docs/README.md
@@ -8,7 +8,7 @@
## Inference
Please use the images with tag `task_cleanroom` from `https://hub.docker.com/orgs/programbench/repositories`.
-E.g., to solve the task `ffmpeg__ffmpeg.360a402`, use the followoing image:
+E.g., to solve the task `ffmpeg__ffmpeg.360a402`, use the following image:
```
https://hub.docker.com/repository/docker/programbench/ffmpeg_1776_ffmpeg.360a402/tags/task_cleanroom/
@@ -41,9 +41,9 @@ We expect to release our baseline system in `mini-swe-agent` this week.
## Evaluation
-Evaluation your agent run is the main function performed by the `ProgramBench` repository.
+Evaluating your agent run is the main function performed by the `ProgramBench` repository.
-After following the installation instructions from the [README](https://github.com/SWE-agent/ProgramBench#installation), you can run the evaluation with:
+After following the installation instructions from the [README](../README.md#quickstart), you can run the evaluation with:
```
uv run programbench eval /path/to/my-amazing-agent-run
diff --git a/tests/test_cli.py b/tests/test_cli.py
new file mode 100644
index 0000000..594bd9d
--- /dev/null
+++ b/tests/test_cli.py
@@ -0,0 +1,39 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""Smoke tests for CLI subcommands."""
+
+from typer.testing import CliRunner
+
+from programbench.cli.main import app
+
+runner = CliRunner()  # shared by every test below; each one calls runner.invoke(app, [...])
+
+
+def test_top_level_help():  # top-level --help exits 0 and mentions eval, blob and info
+    outcome = runner.invoke(app, ["--help"])
+    assert outcome.exit_code == 0
+    # Each expected subcommand name must appear somewhere in the help output.
+    for subcommand in ("eval", "blob", "info"):
+        assert subcommand in outcome.output
+
+
+def test_info_help():  # `info --help` exits 0 and mentions run-dir (either spelling, any case)
+    outcome = runner.invoke(app, ["info", "--help"])
+    assert outcome.exit_code == 0
+    assert any(spelling in outcome.output.lower() for spelling in ("run-dir", "run_dir"))
+
+
+def test_blob_help():  # `blob --help` exits 0 and its output contains "sync"
+    outcome = runner.invoke(app, ["blob", "--help"])
+    assert outcome.exit_code == 0
+    assert "sync" in outcome.output
+
+
+def test_blob_sync_help():  # `blob sync --help` exits 0 and mentions "instance" (any case)
+    outcome = runner.invoke(app, ["blob", "sync", "--help"])
+    assert outcome.exit_code == 0
+    assert "instance" in outcome.output.lower()
diff --git a/tests/test_eval_extras.py b/tests/test_eval_extras.py
new file mode 100644
index 0000000..8a009e8
--- /dev/null
+++ b/tests/test_eval_extras.py
@@ -0,0 +1,115 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""Tests for eval functions not covered by test_eval.py."""
+
+import pytest
+
+from programbench.eval.eval import (
+ EvaluationResult,
+ TestBranchError,
+ TestResult,
+ count_testcases,
+)
+from programbench.eval.eval_batch import _can_reprocess
+
+
+JUNIT_XML_THREE_CASES = """\
+<?xml version="1.0"?>
+<testsuites>
+  <testsuite name="suite" tests="3">
+    <testcase classname="suite" name="t1"/>
+    <testcase classname="suite" name="t2"/>
+    <testcase classname="suite" name="t3"/>
+  </testsuite>
+</testsuites>
+"""
+
+
+class TestCountTestcases:  # count_testcases() on empty, blank, case-free and three-case input
+    @pytest.mark.parametrize(
+        ("xml", "expected"),
+        [
+            ("", 0),
+            (" \n ", 0),
+            ("<testsuites></testsuites>", 0),
+            (JUNIT_XML_THREE_CASES, 3),
+        ],
+    )
+    def test_counts(self, xml, expected):
+        assert count_testcases(xml) == expected
+
+
+class TestEvaluationResultSummarize:  # substrings EvaluationResult.summarize() must contain
+    def test_clean_run(self):
+        evaluation = EvaluationResult(
+            test_results=[
+                TestResult(name="t1", branch="b1", status="passed", extra={}),
+                TestResult(name="t2", branch="b1", status="passed", extra={}),
+            ],
+            solution_branch="submission",
+        )
+        summary = evaluation.summarize()
+        assert "100" in summary
+        assert "2/2" in summary
+        assert "submission" in summary
+
+    def test_with_error_code(self):
+        evaluation = EvaluationResult(error_code="compile_failed", error_details="gcc not found")
+        summary = evaluation.summarize()
+        assert "compile_failed" in summary
+        assert "gcc not found" in summary
+
+    def test_with_branch_errors(self):
+        evaluation = EvaluationResult(
+            test_results=[TestResult(name="t1", branch="b1", status="passed", extra={})],
+            test_branch_errors={"b2": [TestBranchError(error_code="timeout", error_details="")]},
+        )
+        assert "b2" in evaluation.summarize()
+
+    def test_with_system_errors(self):
+        evaluation = EvaluationResult(
+            test_results=[TestResult(name="t1", branch="b1", status="system_error", extra={})],
+        )
+        assert "system_errors=1" in evaluation.summarize()
+
+    def test_with_warnings(self):
+        evaluation = EvaluationResult(warnings=["something unexpected"])
+        assert "warnings=1" in evaluation.summarize()
+
+
+class TestCanReprocess:  # _can_reprocess() for error results and per-branch "results_read" log entries
+    def test_error_code_is_reprocessable(self):
+        assert _can_reprocess(EvaluationResult(error_code="compile_failed"))
+
+    def test_all_branches_tagged_in_log(self):
+        evaluation = EvaluationResult(
+            test_branches=["b1", "b2"],
+            log=[
+                {"step": "results_read", "branch": "b1", "returncode": 0, "output": ""},
+                {"step": "results_read", "branch": "b2", "returncode": 0, "output": ""},
+            ],
+        )
+        assert _can_reprocess(evaluation)
+
+    def test_missing_branch_in_log_not_reprocessable(self):
+        evaluation = EvaluationResult(
+            test_branches=["b1", "b2"],
+            log=[  # b2 has no "results_read" entry
+                {"step": "results_read", "branch": "b1", "returncode": 0, "output": ""},
+            ],
+        )
+        assert not _can_reprocess(evaluation)
+
+    def test_branch_with_error_excluded_from_check(self):
+        evaluation = EvaluationResult(
+            test_branches=["b1", "b2"],
+            test_branch_errors={"b2": [TestBranchError(error_code="fail", error_details="")]},
+            log=[  # b2 is absent from the log but carries a recorded branch error
+                {"step": "results_read", "branch": "b1", "returncode": 0, "output": ""},
+            ],
+        )
+        assert _can_reprocess(evaluation)
diff --git a/tests/test_instance_filters.py b/tests/test_instance_filters.py
new file mode 100644
index 0000000..41f5071
--- /dev/null
+++ b/tests/test_instance_filters.py
@@ -0,0 +1,51 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+from programbench.utils.instance_filters import filter_instances
+
+
+def _inst(iid: str, branches: dict | None = None) -> dict:  # build a minimal instance record
+    return {"instance_id": iid, "branches": branches if branches else {}}
+
+
+INSTANCES = [  # shared fixture: two instances with one branch each, one with no branches
+    _inst("alpha__foo.abc", {"b1": {"tests": ["t1"]}}),
+    _inst("beta__bar.def", {"b2": {"tests": ["t2"]}}),
+    _inst("gamma__baz.ghi"),
+]
+
+
+class TestFilterInstances:  # filter_instances() with regex, slice, has_test_branch and shuffle options
+    def test_no_filters_returns_all(self):
+        assert filter_instances(INSTANCES) == INSTANCES
+
+    def test_regex_filter(self):
+        assert [inst["instance_id"] for inst in filter_instances(INSTANCES, filter_spec="alpha.*")] == ["alpha__foo.abc"]
+
+    def test_regex_filter_no_match(self):
+        assert filter_instances(INSTANCES, filter_spec="nonexistent") == []
+
+    def test_slice_spec(self):
+        assert [inst["instance_id"] for inst in filter_instances(INSTANCES, slice_spec="0:2")] == [
+            "alpha__foo.abc",
+            "beta__bar.def",
+        ]
+
+    def test_slice_from_end(self):
+        assert [inst["instance_id"] for inst in filter_instances(INSTANCES, slice_spec="-1:")] == ["gamma__baz.ghi"]
+
+    def test_has_test_branch(self):
+        selected = filter_instances(INSTANCES, has_test_branch=True)
+        assert [inst["instance_id"] for inst in selected] == ["alpha__foo.abc", "beta__bar.def"]
+
+    def test_filter_and_slice_combined(self):
+        selected = filter_instances(INSTANCES, filter_spec="(alpha|beta).*", slice_spec="0:1")
+        assert [inst["instance_id"] for inst in selected] == ["alpha__foo.abc"]
+
+    def test_shuffle_is_deterministic(self):
+        first = [inst["instance_id"] for inst in filter_instances(INSTANCES, shuffle=True)]
+        second = [inst["instance_id"] for inst in filter_instances(INSTANCES, shuffle=True)]
+        assert first == second