diff --git a/pyrit/score/scorer.py b/pyrit/score/scorer.py index 031c7c1553..950c867b29 100644 --- a/pyrit/score/scorer.py +++ b/pyrit/score/scorer.py @@ -17,6 +17,7 @@ ) from pyrit.exceptions import ( + BadRequestException, InvalidJsonException, PyritException, pyrit_json_retry, @@ -729,6 +730,7 @@ async def _score_value_with_llm_async( score_value still needs to be normalized and validated. Raises: + BadRequestException: If the scorer's LLM response is blocked by content filtering. ValueError: If required keys are missing from the response or if the response format is invalid. InvalidJsonException: If the response is not valid JSON. Exception: For other unexpected errors during scoring. @@ -781,6 +783,16 @@ async def _score_value_with_llm_async( response_json: str = "" try: + # Check if the scorer's own LLM response was blocked by content filtering + if all(piece.is_blocked() for piece in response[0].message_pieces): + raise BadRequestException( + message=( + f"The scorer's LLM request was blocked by content filtering while scoring " + f"prompt ID: {scored_prompt_id}. Consider using a scorer endpoint with " + f"content filtering disabled for red-teaming workflows." + ) + ) + # Get the text piece which contains the JSON response containing the score_value and rationale from the LLM text_piece = next( piece for piece in response[0].message_pieces if piece.converted_value_data_type == "text" diff --git a/tests/unit/score/test_scorer.py b/tests/unit/score/test_scorer.py index 6491ccefeb..9afde12082 100644 --- a/tests/unit/score/test_scorer.py +++ b/tests/unit/score/test_scorer.py @@ -1548,6 +1548,39 @@ async def test_score_value_with_llm_skips_reasoning_piece(good_json): assert result.score_rationale == "Valid response" +async def test_score_value_with_llm_raises_when_scorer_response_blocked(): + """When the scorer's own LLM response is blocked by content filtering, raise BadRequestException.""" + from pyrit.exceptions import BadRequestException + + chat_target = MagicMock(PromptTarget) + chat_target.get_identifier.return_value = get_mock_target_identifier("MockChatTarget") + + blocked_piece = MessagePiece( + role="assistant", + original_value="", + original_value_data_type="error", + converted_value="", + converted_value_data_type="error", + conversation_id="test-convo", + response_error="blocked", + ) + blocked_response = Message(message_pieces=[blocked_piece]) + chat_target.send_prompt_async = AsyncMock(return_value=[blocked_response]) + + scorer = MockScorer() + + with pytest.raises(BadRequestException, match="blocked by content filtering"): + await scorer._score_value_with_llm_async( + prompt_target=chat_target, + system_prompt="system_prompt", + message_value="message_value", + message_data_type="text", + scored_prompt_id="test-prompt-id", + category="category", + objective="task", + ) + + # ── Helpers for score_blocked_content tests ──────────────────────────────────