Map pydantic ValidationError from the QUEST structured judge call to
vf.InvalidModelResponseError, and add a regression test for it.

The test stubs the OpenAI client's beta.chat.completions.parse() so that it
raises a ValidationError (JSON with an invalid "\q" escape) and checks that
QuestOpenAIClient.async_response() re-raises it as InvalidModelResponseError
with the ValidationError preserved as __cause__.

NOTE(review): this patch was recovered from a copy whose line breaks and
indentation had been collapsed. Indentation of the context lines in the
taskset.py hunks is reconstructed by convention; confirm it against the
target file (or apply with whitespace-tolerant matching) before relying on it.

diff --git a/tests/test_quest_taskset.py b/tests/test_quest_taskset.py
new file mode 100644
index 0000000000..48e50b16da
--- /dev/null
+++ b/tests/test_quest_taskset.py
@@ -0,0 +1,46 @@
+import pytest
+import verifiers as vf
+from pydantic import BaseModel, ValidationError
+
+from verifiers.envs.experimental.composable.tasksets.search.quest.taskset import (
+    QuestOpenAIClient,
+)
+
+
+class _BinaryResult(BaseModel):
+    reasoning: str
+    result: bool
+
+
+class _FakeStructuredCompletions:
+    async def parse(self, **kwargs):
+        response_format = kwargs["response_format"]
+        return response_format.model_validate_json(
+            r'{"reasoning": "bad \q escape", "result": true}'
+        )
+
+
+class _FakeChat:
+    completions = _FakeStructuredCompletions()
+
+
+class _FakeBeta:
+    chat = _FakeChat()
+
+
+class _FakeOpenAIClient:
+    beta = _FakeBeta()
+
+
+@pytest.mark.asyncio
+async def test_quest_structured_parse_error_becomes_invalid_model_response():
+    client = QuestOpenAIClient(client=_FakeOpenAIClient(), model="judge-model")
+
+    with pytest.raises(vf.InvalidModelResponseError) as exc_info:
+        await client.async_response(
+            messages=[{"role": "user", "content": "judge this"}],
+            response_format=_BinaryResult,
+        )
+
+    assert "QUEST judge returned invalid structured response" in str(exc_info.value)
+    assert isinstance(exc_info.value.__cause__, ValidationError)
diff --git a/verifiers/envs/experimental/composable/tasksets/search/quest/taskset.py b/verifiers/envs/experimental/composable/tasksets/search/quest/taskset.py
index 4eaa3cbe86..9cd7f31611 100644
--- a/verifiers/envs/experimental/composable/tasksets/search/quest/taskset.py
+++ b/verifiers/envs/experimental/composable/tasksets/search/quest/taskset.py
@@ -34,7 +34,7 @@
     RateLimitError,
     UnprocessableEntityError,
 )
-from pydantic import BaseModel
+from pydantic import BaseModel, ValidationError
 from verifiers.envs.experimental.composable import SandboxSpec, SandboxTaskSet
 from verifiers.types import ClientConfig
 from verifiers.utils.client_utils import setup_openai_client
@@ -188,6 +188,10 @@ async def async_response(self, *, count_token: bool = False, **kwargs: Any) -> A
             )
         except _QUEST_JUDGE_ERROR_TYPES as exc:
             _raise_quest_judge_error(exc, model=model)
+        except ValidationError as exc:
+            raise vf.InvalidModelResponseError(
+                f"QUEST judge returned invalid structured response for {model}: {exc}"
+            ) from exc
         choice = _single_choice(response, context="structured")
         parsed = choice.message.parsed
         if parsed is None: