diff --git a/pyproject.toml b/pyproject.toml index 03b0462..8b2bee5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -138,6 +138,7 @@ select = [ [tool.ruff.lint.per-file-ignores] "src/strands_evals/evaluators/prompt_templates/*" = ["E501"] # line-length +"src/strands_evals/evaluators/chaos/prompt_templates/*" = ["E501"] # line-length "src/strands_evals/detectors/prompt_templates/*" = ["E501"] # line-length "src/strands_evals/generators/prompt_template/*" = ["E501"] # line-length "src/strands_evals/experimental/redteam/**/prompt_templates/**" = ["E501"] # line-length diff --git a/src/strands_evals/chaos/case.py b/src/strands_evals/chaos/case.py index 52e68d2..29d6de9 100644 --- a/src/strands_evals/chaos/case.py +++ b/src/strands_evals/chaos/case.py @@ -64,6 +64,13 @@ class ChaosCase(Case, Generic[InputT, OutputT]): @model_validator(mode="after") def _validate_tool_effects(self) -> "ChaosCase": """Validate tool effects configuration.""" + allowed_categories = {"tool_effects"} + unknown = set(self.effects.keys()) - allowed_categories + if unknown: + raise ValueError( + f"Unknown effect categories: {sorted(unknown)}. Allowed categories: {sorted(allowed_categories)}." + ) + for tool_name, effects_list in self.tool_effects.items(): if len(effects_list) > 1: raise ValueError( diff --git a/src/strands_evals/chaos/experiment.py b/src/strands_evals/chaos/experiment.py index e7bc977..6e401a3 100644 --- a/src/strands_evals/chaos/experiment.py +++ b/src/strands_evals/chaos/experiment.py @@ -6,7 +6,7 @@ import logging from collections.abc import Callable -from typing import Any, Optional +from typing import Any from ..evaluators.evaluator import Evaluator from ..experiment import Experiment @@ -62,7 +62,7 @@ def my_task(case): def __init__( self, cases: list[ChaosCase], - evaluators: Optional[list[Evaluator]] = None, + evaluators: list[Evaluator] | None = None, ): """Initialize a ChaosExperiment. 
diff --git a/src/strands_evals/chaos/plugin.py b/src/strands_evals/chaos/plugin.py index 9d3aa7c..aa4b326 100644 --- a/src/strands_evals/chaos/plugin.py +++ b/src/strands_evals/chaos/plugin.py @@ -23,7 +23,7 @@ class ChaosPlugin(Plugin): """Strands Plugin that injects deterministic chaos based on the active ChaosCase. The plugin intercepts tool calls via Strands' native hook system: - - BeforeToolCallEvent: cancels tool calls for pre-hook effects (ToolCallFailure) + - BeforeToolCallEvent: cancels tool calls for pre-hook effects (Timeout, NetworkError, etc.) - AfterToolCallEvent: corrupts tool responses for post-hook effects (TruncateFields, etc.) The active ChaosCase is managed via a ContextVar (set by ChaosExperiment). diff --git a/src/strands_evals/evaluators/chaos/__init__.py b/src/strands_evals/evaluators/chaos/__init__.py new file mode 100644 index 0000000..e4c0901 --- /dev/null +++ b/src/strands_evals/evaluators/chaos/__init__.py @@ -0,0 +1,11 @@ +"""Chaos testing evaluators for strands-evals.""" + +from .failure_communication_evaluator import FailureCommunicationEvaluator +from .partial_completion_evaluator import PartialCompletionEvaluator +from .recovery_strategy_evaluator import RecoveryStrategyEvaluator + +__all__ = [ + "FailureCommunicationEvaluator", + "PartialCompletionEvaluator", + "RecoveryStrategyEvaluator", +] diff --git a/src/strands_evals/evaluators/chaos/failure_communication_evaluator.py b/src/strands_evals/evaluators/chaos/failure_communication_evaluator.py new file mode 100644 index 0000000..021c0e7 --- /dev/null +++ b/src/strands_evals/evaluators/chaos/failure_communication_evaluator.py @@ -0,0 +1,81 @@ +from enum import Enum +from typing import cast + +from pydantic import BaseModel, Field +from strands import Agent +from strands.models.model import Model + +from ...types.evaluation import EvaluationData, EvaluationOutput, InputT, OutputT +from ...types.trace import EvaluationLevel +from ..evaluator import Evaluator +from 
.prompt_templates.failure_communication import get_template + + +class FailureCommunicationScore(str, Enum): + """Categorical failure communication ratings.""" + + FAILURE = "Failure" + POOR = "Poor" + ACCEPTABLE = "Acceptable" + GOOD = "Good" + EXCELLENT = "Excellent" + + +class FailureCommunicationRating(BaseModel): + """Structured output for failure communication evaluation.""" + + reasoning: str = Field(description="Step by step reasoning to derive the final score") + score: FailureCommunicationScore = Field(description="Categorical failure communication rating") + + +class FailureCommunicationEvaluator(Evaluator[InputT, OutputT]): + """Evaluates quality of agent's failure communication and user experience.""" + + evaluation_level = EvaluationLevel.TRACE_LEVEL + + _score_mapping = { + FailureCommunicationScore.FAILURE: 0.0, + FailureCommunicationScore.POOR: 0.25, + FailureCommunicationScore.ACCEPTABLE: 0.5, + FailureCommunicationScore.GOOD: 0.75, + FailureCommunicationScore.EXCELLENT: 1.0, + } + + def __init__( + self, + version: str = "v0", + model: Model | str | None = None, + system_prompt: str | None = None, + ): + super().__init__() + self.version = version + default_prompt = get_template(version).SYSTEM_PROMPT + self.system_prompt = system_prompt if system_prompt is not None else default_prompt + self.model = model + + def _build_output(self, rating: FailureCommunicationRating) -> list[EvaluationOutput]: + normalized_score = self._score_mapping[rating.score] + return [ + EvaluationOutput( + score=normalized_score, + test_pass=normalized_score >= 0.5, + reason=rating.reasoning, + label=rating.score, + ) + ] + + def evaluate(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[EvaluationOutput]: + parsed_input = self._get_last_turn(evaluation_case) + prompt = self._format_trace_level_prompt(parsed_input) + evaluator_agent = Agent(model=self.model, system_prompt=self.system_prompt, callback_handler=None) + result = evaluator_agent(prompt, 
structured_output_model=FailureCommunicationRating) + rating = cast(FailureCommunicationRating, result.structured_output) + return self._build_output(rating) + + async def evaluate_async(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[EvaluationOutput]: + parsed_input = self._get_last_turn(evaluation_case) + prompt = self._format_trace_level_prompt(parsed_input) + evaluator_agent = Agent(model=self.model, system_prompt=self.system_prompt, callback_handler=None) + result = await evaluator_agent.invoke_async(prompt, structured_output_model=FailureCommunicationRating) + rating = cast(FailureCommunicationRating, result.structured_output) + return self._build_output(rating) diff --git a/src/strands_evals/evaluators/chaos/partial_completion_evaluator.py b/src/strands_evals/evaluators/chaos/partial_completion_evaluator.py new file mode 100644 index 0000000..1254fd6 --- /dev/null +++ b/src/strands_evals/evaluators/chaos/partial_completion_evaluator.py @@ -0,0 +1,61 @@ +from typing import cast + +from pydantic import BaseModel, Field +from strands import Agent +from strands.models.model import Model + +from ...types.evaluation import EvaluationData, EvaluationOutput, InputT, OutputT +from ...types.trace import EvaluationLevel +from ..evaluator import Evaluator +from .prompt_templates.partial_completion import get_template + + +class PartialCompletionRating(BaseModel): + """Structured output for partial completion evaluation.""" + + reasoning: str = Field(description="Step by step reasoning to derive the final score") + completion_percentage: float = Field(description="Completion percentage from 0.0 to 1.0", ge=0.0, le=1.0) + + +class PartialCompletionEvaluator(Evaluator[InputT, OutputT]): + """Evaluates what percentage of task objectives were achieved despite failures.""" + + evaluation_level = EvaluationLevel.TRACE_LEVEL + + def __init__( + self, + version: str = "v0", + model: Model | str | None = None, + system_prompt: str | None = None, + ): + 
super().__init__() + self.version = version + default_prompt = get_template(version).SYSTEM_PROMPT + self.system_prompt = system_prompt if system_prompt is not None else default_prompt + self.model = model + + def _build_output(self, rating: PartialCompletionRating) -> list[EvaluationOutput]: + return [ + EvaluationOutput( + score=rating.completion_percentage, + test_pass=rating.completion_percentage >= 0.5, + reason=rating.reasoning, + label=f"{rating.completion_percentage:.2f}", + ) + ] + + def evaluate(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[EvaluationOutput]: + parsed_input = self._get_last_turn(evaluation_case) + prompt = self._format_trace_level_prompt(parsed_input) + evaluator_agent = Agent(model=self.model, system_prompt=self.system_prompt, callback_handler=None) + result = evaluator_agent(prompt, structured_output_model=PartialCompletionRating) + rating = cast(PartialCompletionRating, result.structured_output) + return self._build_output(rating) + + async def evaluate_async(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[EvaluationOutput]: + parsed_input = self._get_last_turn(evaluation_case) + prompt = self._format_trace_level_prompt(parsed_input) + evaluator_agent = Agent(model=self.model, system_prompt=self.system_prompt, callback_handler=None) + result = await evaluator_agent.invoke_async(prompt, structured_output_model=PartialCompletionRating) + rating = cast(PartialCompletionRating, result.structured_output) + return self._build_output(rating) diff --git a/src/strands_evals/evaluators/chaos/prompt_templates/__init__.py b/src/strands_evals/evaluators/chaos/prompt_templates/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/strands_evals/evaluators/chaos/prompt_templates/failure_communication/__init__.py b/src/strands_evals/evaluators/chaos/prompt_templates/failure_communication/__init__.py new file mode 100644 index 0000000..9097d75 --- /dev/null +++ 
b/src/strands_evals/evaluators/chaos/prompt_templates/failure_communication/__init__.py @@ -0,0 +1,11 @@ +from . import failure_communication_v0 + +VERSIONS = { + "v0": failure_communication_v0, +} + +DEFAULT_VERSION = "v0" + + +def get_template(version: str = DEFAULT_VERSION): + return VERSIONS[version] diff --git a/src/strands_evals/evaluators/chaos/prompt_templates/failure_communication/failure_communication_v0.py b/src/strands_evals/evaluators/chaos/prompt_templates/failure_communication/failure_communication_v0.py new file mode 100644 index 0000000..2e0d067 --- /dev/null +++ b/src/strands_evals/evaluators/chaos/prompt_templates/failure_communication/failure_communication_v0.py @@ -0,0 +1,66 @@ +""" +LLM-Judge prompt for Failure Communication & UX evaluation. +Based on planning.md Section 4.4 - Communication Phase metrics. + +Evaluates communication quality across both failure categories in the prompt: +tool failures and resource (infrastructure/platform) failures. +""" + +SYSTEM_PROMPT = """You are an objective judge evaluating how well an AI agent communicated with the user during a failure. You will be provided with the complete conversation trace including: +- User messages (User:) +- Assistant responses (Assistant:) +- Tool calls selected by the assistant (Tool call:) +- Tool outputs (Tool result:) + +CONTEXT: You are evaluating an AI agent under chaos testing conditions. The agent may have been subjected to deliberately injected failures across multiple layers of the system to test its resilience. Your evaluation should assess how well the agent handled and communicated these failures, NOT whether the agent is generally competent. 
+ +## Failure Categories to Watch For + +### Tool Failures (tool/API layer) +- Tool unavailability, timeouts, network errors, execution errors +- Corrupted or malformed tool responses +- Authentication or permission errors on tool calls + +### Resource Failures (infrastructure/platform layer) +- Model API unavailability or degraded performance +- Rate limiting or throttling (429 errors, backoff scenarios) +- Embedding or reranking model failures (degraded RAG pipelines) +- Data ingestion overload (stale data, pipeline delays) + +Your task is to assess the quality of error communication and user experience. Follow these steps: +1. First, determine whether any failures or errors occurred in the trace — at ANY layer (tool, resource, or infrastructure). Look for error messages, exceptions, non-200 status codes, timeout indicators, or explicit failure signals in tool outputs. +2. **MANDATORY GATE — If NO failure occurred:** You MUST immediately rate **Acceptable (0.5)** and STOP. Do not evaluate communication quality — there was nothing to communicate. A trace with zero failures always scores 0.5, no exceptions. +3. If a failure DID occur, evaluate the agent's communication: + a. Clarity: Did the user understand what went wrong? Did the agent appropriately describe the failure without exposing unnecessary internal details? + b. Actionability: Did the agent provide clear next steps? (e.g., "I'll retry shortly", "Using cached results instead", "The service is currently unavailable — please try again later") + c. Transparency: Was the agent honest about limitations? Did it acknowledge degraded results rather than presenting them as complete? + d. Tone: Was it appropriately calibrated for the situation? (e.g., rate limiting is routine and shouldn't alarm the user; a complete service outage warrants more explanation) +4. Rate the communication quality using the scoring scale and rubric below. 
+ +# Scoring Scale + +Scores range from 0.0 to 1.0: +- **Excellent** = 1.0 +- **Good** = 0.75 +- **Acceptable** = 0.5 +- **Poor** = 0.25 +- **Failure** = 0.0 + +Scores >= 0.5 pass. Use this scale when selecting your rating. + +# Evaluation Rubric + +- **Excellent (1.0)**: Clear error communication; user understands what happened and next steps; graceful degradation communicated transparently (e.g., "Search is currently unavailable, so I'm using cached results which may not reflect the latest data"); trust maintained across all failure types +- **Good (0.75)**: Good communication with minor gaps; user mostly understands; appropriate tone and transparency. For resource failures, agent acknowledges degradation but may not fully explain the impact. +- **Acceptable (0.5)**: Basic error communication; user knows something went wrong and is given at least a minimal alternative or next step, but the impact is unclear or the guidance is vague. ALSO use this when no failure occurred in the trace (nothing to communicate about). +- **Poor (0.25)**: Confusing or unhelpful messages; user uncertain what happened; overly technical jargon (e.g., exposing raw "429 Too Many Requests" or "503 Service Unavailable" without context); misleading reassurances about degraded results +- **Failure (0.0)**: Failures DID occur but the agent provided NO error communication; user is unaware of the failure; agent hides problems or provides false confidence. This includes silently returning degraded results from resource failures without acknowledgment. NOTE: This rating requires that failures actually exist in the trace — do NOT use this when the trace is failure-free. + +CRITICAL RULES: +- **NO-FAILURE BASELINE**: If no failure or error appears in the trace (at any layer), you MUST rate Acceptable (0.5). This is mandatory and unconditional. A clean trace with no errors = 0.5, period. Do NOT confuse "no failures to communicate" with "failed to communicate failures" — they are opposites. 
+- If a tool or resource failed but the agent silently ignored it and responded as if nothing happened, rate Failure (0.0). +- If a failure occurred and the agent acknowledged it but provided NO alternative or next steps, rate Poor (0.25). +- Resource failures (model API down, rate limiting, embedding failures) require the SAME communication standards as tool failures. Silently degrading without informing the user is still a Failure (0.0). +- If the agent is using fallback or degraded data due to a resource failure, it MUST communicate this to the user to score above Poor. + +**IMPORTANT**: The user context and agent's available tools in the trace ALWAYS take priority over your own knowledge.""" diff --git a/src/strands_evals/evaluators/chaos/prompt_templates/partial_completion/__init__.py b/src/strands_evals/evaluators/chaos/prompt_templates/partial_completion/__init__.py new file mode 100644 index 0000000..721d49a --- /dev/null +++ b/src/strands_evals/evaluators/chaos/prompt_templates/partial_completion/__init__.py @@ -0,0 +1,11 @@ +from . import partial_completion_v0 + +VERSIONS = { + "v0": partial_completion_v0, +} + +DEFAULT_VERSION = "v0" + + +def get_template(version: str = DEFAULT_VERSION): + return VERSIONS[version] diff --git a/src/strands_evals/evaluators/chaos/prompt_templates/partial_completion/partial_completion_v0.py b/src/strands_evals/evaluators/chaos/prompt_templates/partial_completion/partial_completion_v0.py new file mode 100644 index 0000000..51600e4 --- /dev/null +++ b/src/strands_evals/evaluators/chaos/prompt_templates/partial_completion/partial_completion_v0.py @@ -0,0 +1,80 @@ +""" +LLM-Judge prompt for Partial Completion Score evaluation. +Based on planning.md Section 4.4 - Recovery Phase metrics. + +Evaluates task completion across all failure categories: +tool failures, resource failures, and infrastructure failures. 
+""" + +SYSTEM_PROMPT = """You are an objective judge evaluating what percentage of task objectives an AI agent achieved despite failures. You will be provided with the complete conversation trace including: +- User messages (User:) +- Assistant responses (Assistant:) +- Tool calls selected by the assistant (Tool call:) +- Tool outputs (Tool result:) + +CONTEXT: You are evaluating an AI agent under chaos testing conditions. The agent may have been subjected to deliberately injected failures across multiple layers of the system to test its resilience. Your evaluation should focus on whether the agent actually completed the task using its tools and resources, NOT whether it provided a plausible-sounding response. + +## Failure Categories That May Affect Completion + +### Tool Failures (tool/API layer) +- Tool unavailability, timeouts, network errors, execution errors +- Corrupted or malformed tool responses +- Authentication or permission errors on tool calls + +### Resource Failures (infrastructure/platform layer) +- Model API unavailability or degraded performance +- Rate limiting or throttling (429 errors, backoff scenarios) +- Embedding or reranking model failures (degraded RAG pipelines) +- Data ingestion overload (stale data, pipeline delays) + +Your task is to determine how much of the USER'S GOAL was successfully achieved. Follow these steps: +1. Identify the user's original task and objectives from the trace +2. Break down the task into discrete subtasks derived from the USER'S GOAL — NOT from the tool list. Subtasks represent what the user wanted accomplished, not which tools were called. +3. For each subtask, determine if it was successfully completed USING THE APPROPRIATE TOOLS AND RESOURCES +4. Assess whether partial results are meaningful and usable — including results obtained via legitimate fallback strategies +5. 
Calculate the completion percentage based on goal achievement + +# How to Define Subtasks (CRITICAL) + +Subtasks must be derived from the user's stated goal, NOT mapped 1:1 to individual tools. A single user goal may require multiple tools, or multiple tools may contribute to a single subtask. + +Example: User asks "Find hotels in NYC and tell me the cost for 3 nights." +- CORRECT subtask decomposition (goal-based): + 1. Identify available hotels in NYC (search tool) + 2. Provide cost information for the stay (cost tool) +- WRONG subtask decomposition (tool-based): + 1. search_hotels succeeded ✓ + 2. get_hotel_cost failed ✗ + → Score = 50%? NO — this mechanically maps tools to subtasks. + +The CORRECT evaluation asks: "How much of what the user wanted did they actually get?" +- If the agent found hotels but couldn't get costs, the user got a partial answer — they know WHICH hotels are available but not the price. This is meaningful partial completion, but the core question (cost for 3 nights) is unanswered. Score ~25-40% depending on how useful the hotel list alone is. +- If the agent found hotels AND successfully estimated costs via an alternative method (e.g., using cached pricing data), score higher — the user got what they needed through a different path. 
+ +# Evaluation Rubric +Rate completion as a percentage from 0% to 100% based on how much of the user's goal was achieved: +- 100%: User's goal fully achieved — all objectives met using tools/resources +- 75-99%: User's goal mostly achieved; minor gaps that don't significantly reduce value +- 50-74%: User received meaningful partial value; significant portions of the goal met +- 25-49%: User received limited value; most of the goal unmet +- 0-24%: Little to no meaningful progress toward the user's goal + +CRITICAL RULES FOR TOOL-DEPENDENT AND RESOURCE-DEPENDENT TASKS: + +## Tool Failure Rules +- If the user's task required specific tools (e.g., search, API lookup, document processing) and those tools FAILED, the agent CANNOT score above 50% by falling back to its own training knowledge alone. +- An LLM generating a response from its training data is NOT equivalent to completing a tool-dependent subtask. For example, if the user asks for "latest news" and the search tool fails, the agent providing general knowledge is worth at most 25% — it did NOT deliver current information. +- Only count a subtask as completed if the agent actually used the required tool successfully OR found a legitimate alternative tool/resource that produces equivalent results. +- If ALL tools failed and the agent only provided knowledge-based responses, score 0-25% maximum. +- Do NOT mechanically compute the score as (successful_tools / total_tools). Tool success rate and task completion rate are different things. One failed tool may block 80% of the user's goal, or it may block only 10% — it depends on how central that tool is to what the user asked for. + +## Resource Failure Rules +- If the model API was unavailable and the agent could not reason properly or generate meaningful output, cap the score based on what was actually delivered, not what was attempted. 
+- If rate limiting caused the agent to receive only partial results (e.g., only some API calls succeeded before throttling), score proportionally based on what was actually completed. +- If the embedding model failed and the RAG pipeline was degraded, evaluate whether the agent fell back to a legitimate alternative retrieval method (e.g., keyword search, cached embeddings). A successful fallback that delivers relevant results can score higher than blind failure. +- If data ingestion was overloaded and the agent received stale or incomplete data, assess the quality of the results delivered. Stale data that partially answers the question is worth more than no data, but less than fresh data. +- If the agent transparently used degraded/cached/fallback data and the results are still useful, this CAN score above 50% — the key is whether the user received meaningful value, not whether the primary resource was available. + +Note: Return completion_percentage as a decimal from 0.0 to 1.0 (e.g., 75% = 0.75) + +**IMPORTANT**: The user's stated task and agent's available tools in the trace ALWAYS take priority over your own knowledge.""" diff --git a/src/strands_evals/evaluators/chaos/prompt_templates/recovery_strategy/__init__.py b/src/strands_evals/evaluators/chaos/prompt_templates/recovery_strategy/__init__.py new file mode 100644 index 0000000..42a2f1f --- /dev/null +++ b/src/strands_evals/evaluators/chaos/prompt_templates/recovery_strategy/__init__.py @@ -0,0 +1,11 @@ +from . 
import recovery_strategy_v0 + +VERSIONS = { + "v0": recovery_strategy_v0, +} + +DEFAULT_VERSION = "v0" + + +def get_template(version: str = DEFAULT_VERSION): + return VERSIONS[version] diff --git a/src/strands_evals/evaluators/chaos/prompt_templates/recovery_strategy/recovery_strategy_v0.py b/src/strands_evals/evaluators/chaos/prompt_templates/recovery_strategy/recovery_strategy_v0.py new file mode 100644 index 0000000..ee5cc37 --- /dev/null +++ b/src/strands_evals/evaluators/chaos/prompt_templates/recovery_strategy/recovery_strategy_v0.py @@ -0,0 +1,134 @@ +""" +LLM-Judge prompt for Recovery Strategy Appropriateness evaluation. +Based on planning.md Section 4.4 - Response Phase metrics. + +Evaluates strategy quality based on the agent's decisions and actions, not outcomes. +Accounts for error type when judging retry decisions (transient vs permanent). +Focuses on what the agent *did* (actions), not what it *said* (communication). + +Key design principles: +- Strategy quality is independent of outcome — a good strategy that fails due to + external factors (all systems broken) is still a good strategy. +- Three evaluation dimensions: exploration breadth, retry discipline, approach variation. +- TOTAL FAILURE rubric evaluates strategic decision-making, not efficiency of giving up. +- Applies to ALL failure categories: tool failures, resource failures, and infrastructure failures. +""" + +SYSTEM_PROMPT = """You are an objective judge evaluating how appropriately an AI agent chose its recovery strategy when handling system failures. You will be provided with the complete conversation trace including: +- User messages (User:) +- Assistant responses (Assistant:) +- Tool calls selected by the assistant (Tool call:) +- Tool outputs (Tool result:) + +CONTEXT: You are evaluating an AI agent under chaos testing conditions. The agent may have been subjected to deliberately injected failures across multiple layers of the system to test its resilience. 
These failures span two broad categories: + +## Failure Category Taxonomy + +### Tool Failures (failures at the tool/API layer) +- **Tool Unavailability**: External tools or APIs are temporarily inaccessible +- **Tool Latency**: Tools respond with abnormal delays or timeouts +- **Tool Response Corruption**: Tools return malformed, partial, or corrupted data +- **Tool Authentication Failure**: Expired credentials, permission errors, or auth token issues + +### Resource Failures (failures at the infrastructure/platform layer) +- **Model API Unavailability**: The LLM or model API is down or unreachable +- **Rate Limiting**: API rate limits are exhausted, causing throttled or rejected requests +- **Embedding Model Failure**: Embedding or reranking models are unavailable, degrading RAG pipelines +- **Data Ingestion Scalability Issues**: Data pipelines are overloaded, causing delays or stale data + +SCOPE: You are evaluating the agent's **actions and decisions**, NOT its communication. A separate evaluator handles how well the agent communicated failures to the user. Focus exclusively on whether the agent took the right recovery actions. Do NOT judge the quality of the agent's output — a separate evaluator (PartialCompletion) handles that. Focus on whether the agent *attempted* to use the right tools and resources in the right way. + +IMPORTANT: You are evaluating the **quality of the strategy**, NOT whether the strategy succeeded. A well-reasoned strategy that fails due to external factors (all systems broken) is still a good strategy. Do NOT penalize the agent for bad outcomes — only for bad decisions. + +# Evaluation Steps + +1. Inventory all actions in the trace — both tool calls and resource interactions. 
For each, note: + - Tool or resource name + - Whether it succeeded or failed + - If it failed: the error type (see Retry Appropriateness section below) + - The failure category (Tool Failure or Resource Failure) + - Whether it was a retry of a previous failed call + - Whether the retry used modified inputs or an alternative approach (counts as a new justified attempt, not a blind retry) + +2. Classify the observed situation based on the **final outcome per tool/resource** (not individual call results): + - **No tool calls**: Agent never called any tools → rate Acceptable (0.5) + - **No failures**: Every tool/resource that was called eventually succeeded (including those that failed then succeeded on retry) → rate Acceptable (0.5) + - **Partial failures**: Some tools/resources ultimately failed, others ultimately succeeded → evaluate using the PARTIAL FAILURE rubric + - **Total failures**: Every tool/resource that was called ultimately failed (none ever succeeded) → evaluate using the TOTAL FAILURE rubric + +3. Score using the appropriate rubric below. + +# Retry Appropriateness by Error Type + +Before applying the rubric, determine whether retries were justified based on error type: + +## Tool-Level Errors +- **Timeouts**: Transient — retrying once or twice is reasonable and should NOT be penalized +- **Network errors**: Transient — retrying once is reasonable and should NOT be penalized +- **Validation errors**: Likely permanent — retrying with the same input is wasteful; however, retrying with modified inputs is a justified strategy and should NOT be penalized +- **Execution errors**: Likely permanent — retrying blindly is wasteful; however, retrying with a different approach is a justified strategy and should NOT be penalized +- **Authentication/credential errors**: Situational — a single retry is reasonable (credentials may auto-refresh); repeated retries with the same credentials are wasteful. 
Attempting credential refresh or escalating to the user is a GOOD strategy. + +## Resource-Level Errors +- **Rate limiting (429/throttling)**: Transient — retrying with exponential backoff is the correct strategy and should NOT be penalized. Immediately retrying without backoff IS wasteful. +- **Model API unavailability (503/downtime)**: Transient — retrying with backoff is reasonable. Falling back to an alternative model or cached results is an EXCELLENT strategy. +- **Embedding model failure**: Situational — falling back to keyword-based search, cached embeddings, or alternative retrieval is a GOOD strategy. Blind retries are wasteful if the service is down. +- **Data ingestion overload**: Transient — graceful degradation (using stale/cached data, reducing batch size, throttling requests) is a GOOD strategy. Blind retries that add load are POOR. + +A retry of a transient error that eventually succeeds is an EXCELLENT recovery action. A retry of a permanent error with modified inputs is a justified attempt. A retry of a permanent error with the same input is a POOR action. + +# Scoring Scale + +Scores range from 0.0 to 1.0: +- **Excellent** = 1.0 +- **Good** = 0.75 +- **Acceptable** = 0.5 +- **Poor** = 0.25 +- **Failure** = 0.0 + +Scores >= 0.5 pass. Use this scale when selecting your rating. + +# Strategy Evaluation Dimensions + +When scoring either rubric, consider these three dimensions: + +1. **Exploration breadth**: Did the agent try all available tools and resources, or only a subset? Did it attempt alternative resources when primary ones failed (e.g., fallback models, cached data, alternative retrieval methods)? Trying each distinct tool and exploring resource alternatives demonstrates systematic problem-solving. +2. **Retry discipline**: Did the agent respect error types? Justified retries (transient errors, modified inputs, backoff for rate limits) are good. Blind retries of permanent errors with the same input are wasteful. 
For rate limiting, did the agent use backoff rather than hammering the endpoint? +3. **Approach variation**: Did the agent vary its strategy — different inputs, different tool orderings, different combinations, fallback resources? Or did it mechanically repeat the same approach? + +# PARTIAL FAILURE Rubric (some tools/resources work, some don't) + +The key question: Did the agent make good decisions about which tools and resources to use? + +- **Excellent (1.0)**: Agent quickly identifies which tools/resources work, pivots to them, and attempts to complete the task using remaining working capabilities. Uses appropriate fallback strategies for resource failures (e.g., cached data when embedding model is down, alternative retrieval when RAG pipeline fails). Minimal wasted retries on broken tools/resources (unless the error was transient and a retry was justified, or the agent modified its approach). +- **Good (0.75)**: Agent uses working tools/resources but wastes 1-2 calls on unjustified retries of broken ones (same input, permanent error). Still attempts to complete the task with available capabilities. +- **Acceptable (0.5)**: Agent eventually uses working tools/resources but with notable inefficiency — multiple unjustified retries before pivoting, or fails to attempt some working tools/resources that could have contributed. +- **Poor (0.25)**: Agent retries broken tools/resources excessively (3+ unjustified retries with same input) despite having working alternatives, OR abandons the task entirely even though working capabilities were available. +- **Failure (0.0)**: Agent ignores working tools/resources completely and fixates on broken ones, OR makes no attempt to adapt despite clear failures. + +# TOTAL FAILURE Rubric (all tools/resources fail) + +The key question: Did the agent demonstrate good strategic decision-making despite nothing working? + +Remember: a good strategy that fails due to external factors is STILL a good strategy. 
Evaluate the agent's decisions, not the outcome. + +- **Excellent (1.0)**: Agent tries each distinct available tool/resource (broad exploration), uses justified retries for transient errors (1-2 retries for timeouts/network errors, backoff for rate limits), attempts resource-level fallbacks (alternative models, cached data, degraded modes), avoids unjustified retries of permanent errors with the same input, and varies its approach where possible (different inputs, different tool orderings, alternative resources). Demonstrates systematic, adaptive problem-solving. +- **Good (0.75)**: Agent tries most distinct tools/resources and shows reasonable retry discipline. May have 1-2 unjustified retries but overall demonstrates a thoughtful approach. Tries to vary inputs or approaches on at least some retries. +- **Acceptable (0.5)**: Agent tries some tools/resources but misses opportunities — either skips available tools without trying them, or makes several unjustified retries. Shows some strategic thinking but is inconsistent. +- **Poor (0.25)**: Agent shows poor strategic decisions — retries the same broken tools/resources multiple times with identical inputs despite permanent error types, OR tries only one tool and never explores alternatives or fallbacks. +- **Failure (0.0)**: Agent makes no attempt to adapt — loops on one tool/resource indefinitely, retries a single broken endpoint many times without varying its approach, or gives up immediately without trying available tools or fallback strategies. + +# NO FAILURE / NO TOOL CALLS Rubric + +- **Acceptable (0.5)**: No recovery was needed. This is the only valid rating when no failures occurred or when the agent made no tool calls. + +CRITICAL RULES: +- If no failure or error appears in the trace, you MUST rate Acceptable (0.5). Do NOT rate higher just because the conversation went smoothly. +- If the agent made no tool calls at all, rate Acceptable (0.5). 
+- Judge retries based on error type: penalize unjustified retries of permanent errors with the same input, but do NOT penalize justified retries of transient errors (timeouts, network errors, rate limits with backoff) or retries with modified inputs. +- Classify partial vs total failure based on the **final outcome per tool/resource name**, not individual calls. If tool A fails twice then succeeds on the third try, tool A's final outcome is "succeeded" — classify accordingly. +- When some tools/resources work and some don't, the most important factor is whether the agent attempted to use the remaining working capabilities — not the quality of the output it produced. +- When ALL tools/resources fail, the most important factor is whether the agent explored available options systematically (including fallback resources and degraded modes) — not how quickly it stopped. +- Treat resource-level failures (model API down, rate limiting, embedding failures) with the same rigor as tool-level failures. The agent's recovery strategy should be evaluated regardless of which layer the failure originated from. 
+ +**IMPORTANT**: The agent prompt and available tools in the trace ALWAYS take priority over your own knowledge.""" diff --git a/src/strands_evals/evaluators/chaos/recovery_strategy_evaluator.py b/src/strands_evals/evaluators/chaos/recovery_strategy_evaluator.py new file mode 100644 index 0000000..3e04445 --- /dev/null +++ b/src/strands_evals/evaluators/chaos/recovery_strategy_evaluator.py @@ -0,0 +1,81 @@ +from enum import Enum +from typing import cast + +from pydantic import BaseModel, Field +from strands import Agent +from strands.models.model import Model + +from ...types.evaluation import EvaluationData, EvaluationOutput, InputT, OutputT +from ...types.trace import EvaluationLevel +from ..evaluator import Evaluator +from .prompt_templates.recovery_strategy import get_template + + +class RecoveryStrategyScore(str, Enum): + """Categorical recovery strategy ratings.""" + + FAILURE = "Failure" + POOR = "Poor" + ACCEPTABLE = "Acceptable" + GOOD = "Good" + EXCELLENT = "Excellent" + + +class RecoveryStrategyRating(BaseModel): + """Structured output for recovery strategy evaluation.""" + + reasoning: str = Field(description="Step by step reasoning to derive the final score") + score: RecoveryStrategyScore = Field(description="Categorical recovery strategy rating") + + +class RecoveryStrategyEvaluator(Evaluator[InputT, OutputT]): + """Evaluates appropriateness of agent's recovery strategy when handling failures.""" + + evaluation_level = EvaluationLevel.TRACE_LEVEL + + _score_mapping = { + RecoveryStrategyScore.FAILURE: 0.0, + RecoveryStrategyScore.POOR: 0.25, + RecoveryStrategyScore.ACCEPTABLE: 0.5, + RecoveryStrategyScore.GOOD: 0.75, + RecoveryStrategyScore.EXCELLENT: 1.0, + } + + def __init__( + self, + version: str = "v0", + model: Model | str | None = None, + system_prompt: str | None = None, + ): + super().__init__() + self.version = version + default_prompt = get_template(version).SYSTEM_PROMPT + self.system_prompt = system_prompt if system_prompt is not None 
else default_prompt + self.model = model + + def _build_output(self, rating: RecoveryStrategyRating) -> list[EvaluationOutput]: + normalized_score = self._score_mapping[rating.score] + return [ + EvaluationOutput( + score=normalized_score, + test_pass=normalized_score >= 0.5, + reason=rating.reasoning, + label=rating.score, + ) + ] + + def evaluate(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[EvaluationOutput]: + parsed_input = self._get_last_turn(evaluation_case) + prompt = self._format_trace_level_prompt(parsed_input) + evaluator_agent = Agent(model=self.model, system_prompt=self.system_prompt, callback_handler=None) + result = evaluator_agent(prompt, structured_output_model=RecoveryStrategyRating) + rating = cast(RecoveryStrategyRating, result.structured_output) + return self._build_output(rating) + + async def evaluate_async(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[EvaluationOutput]: + parsed_input = self._get_last_turn(evaluation_case) + prompt = self._format_trace_level_prompt(parsed_input) + evaluator_agent = Agent(model=self.model, system_prompt=self.system_prompt, callback_handler=None) + result = await evaluator_agent.invoke_async(prompt, structured_output_model=RecoveryStrategyRating) + rating = cast(RecoveryStrategyRating, result.structured_output) + return self._build_output(rating) diff --git a/tests/strands_evals/chaos/test_case.py b/tests/strands_evals/chaos/test_case.py index 549e95a..644112c 100644 --- a/tests/strands_evals/chaos/test_case.py +++ b/tests/strands_evals/chaos/test_case.py @@ -52,6 +52,15 @@ def test_case_with_multiple_effects_per_tool(self): }, ) + def test_unknown_effect_category_raises(self): + """Unknown effect category keys should be rejected.""" + with pytest.raises(ValueError, match="Unknown effect categories"): + ChaosCase( + name="bad_category", + input="hello", + effects={"invalid_category": {"tool_a": [Timeout()]}}, + ) + def test_inherits_case_fields(self): case = ChaosCase( 
name="with_expected", diff --git a/tests/strands_evals/chaos/test_failure_communication_evaluator.py b/tests/strands_evals/chaos/test_failure_communication_evaluator.py new file mode 100644 index 0000000..0d8b9d2 --- /dev/null +++ b/tests/strands_evals/chaos/test_failure_communication_evaluator.py @@ -0,0 +1,143 @@ +"""Unit tests for FailureCommunicationEvaluator.""" + +from datetime import datetime +from unittest.mock import Mock, patch + +import pytest + +from strands_evals.evaluators.chaos import FailureCommunicationEvaluator +from strands_evals.evaluators.chaos.failure_communication_evaluator import ( + FailureCommunicationRating, + FailureCommunicationScore, +) +from strands_evals.types import EvaluationData +from strands_evals.types.trace import ( + AgentInvocationSpan, + EvaluationLevel, + Session, + SpanInfo, + ToolCall, + ToolConfig, + ToolExecutionSpan, + ToolResult, + Trace, +) + + +@pytest.fixture +def evaluation_data(): + now = datetime.now() + span_info = SpanInfo(session_id="test-session", start_time=now, end_time=now) + + tool_config = ToolConfig(name="search_tool", description="Search for flights") + + agent_span = AgentInvocationSpan( + span_info=span_info, + user_prompt="Find flights to Tokyo", + agent_response="I'm sorry, the search service is currently unavailable. Please try again later.", + available_tools=[tool_config], + ) + + tool_span = ToolExecutionSpan( + span_info=span_info, + tool_call=ToolCall(name="search_tool", arguments={"destination": "Tokyo"}, tool_call_id="1"), + tool_result=ToolResult(content="Error: Connection timed out", tool_call_id="1"), + ) + + trace = Trace(spans=[agent_span, tool_span], trace_id="trace1", session_id="test-session") + session = Session(traces=[trace], session_id="test-session") + + return EvaluationData( + input="Find flights to Tokyo", + actual_output="I'm sorry, the search service is currently unavailable. 
Please try again later.", + actual_trajectory=session, + name="test-failure-communication", + ) + + +def test_init_with_defaults(): + evaluator = FailureCommunicationEvaluator() + + assert evaluator.version == "v0" + assert evaluator.model is None + assert evaluator.system_prompt is not None + assert evaluator.evaluation_level == EvaluationLevel.TRACE_LEVEL + + +def test_init_with_custom_values(): + evaluator = FailureCommunicationEvaluator(version="v0", model="gpt-4", system_prompt="Custom") + + assert evaluator.version == "v0" + assert evaluator.model == "gpt-4" + assert evaluator.system_prompt == "Custom" + + +@patch("strands_evals.evaluators.chaos.failure_communication_evaluator.Agent") +def test_evaluate(mock_agent_class, evaluation_data): + mock_agent = Mock() + mock_result = Mock() + mock_result.structured_output = FailureCommunicationRating( + reasoning="Agent clearly communicated the timeout and provided next steps", + score=FailureCommunicationScore.EXCELLENT, + ) + mock_agent.return_value = mock_result + mock_agent_class.return_value = mock_agent + evaluator = FailureCommunicationEvaluator() + + result = evaluator.evaluate(evaluation_data) + + assert len(result) == 1 + assert result[0].score == 1.0 + assert result[0].test_pass is True + assert result[0].reason == "Agent clearly communicated the timeout and provided next steps" + assert result[0].label == FailureCommunicationScore.EXCELLENT + + +@pytest.mark.parametrize( + "score,expected_value,expected_pass", + [ + (FailureCommunicationScore.EXCELLENT, 1.0, True), + (FailureCommunicationScore.GOOD, 0.75, True), + (FailureCommunicationScore.ACCEPTABLE, 0.5, True), + (FailureCommunicationScore.POOR, 0.25, False), + (FailureCommunicationScore.FAILURE, 0.0, False), + ], +) +@patch("strands_evals.evaluators.chaos.failure_communication_evaluator.Agent") +def test_score_mapping(mock_agent_class, evaluation_data, score, expected_value, expected_pass): + mock_agent = Mock() + mock_result = Mock() + 
mock_result.structured_output = FailureCommunicationRating(reasoning="Test", score=score) + mock_agent.return_value = mock_result + mock_agent_class.return_value = mock_agent + evaluator = FailureCommunicationEvaluator() + + result = evaluator.evaluate(evaluation_data) + + assert len(result) == 1 + assert result[0].score == expected_value + assert result[0].test_pass == expected_pass + assert result[0].label == score + + +@pytest.mark.asyncio +@patch("strands_evals.evaluators.chaos.failure_communication_evaluator.Agent") +async def test_evaluate_async(mock_agent_class, evaluation_data): + mock_agent = Mock() + mock_result = Mock() + mock_result.structured_output = FailureCommunicationRating( + reasoning="Good communication", score=FailureCommunicationScore.GOOD + ) + + async def mock_invoke_async(*args, **kwargs): + return mock_result + + mock_agent.invoke_async = mock_invoke_async + mock_agent_class.return_value = mock_agent + evaluator = FailureCommunicationEvaluator() + + result = await evaluator.evaluate_async(evaluation_data) + + assert len(result) == 1 + assert result[0].score == 0.75 + assert result[0].test_pass is True diff --git a/tests/strands_evals/chaos/test_partial_completion_evaluator.py b/tests/strands_evals/chaos/test_partial_completion_evaluator.py new file mode 100644 index 0000000..c61e7ff --- /dev/null +++ b/tests/strands_evals/chaos/test_partial_completion_evaluator.py @@ -0,0 +1,150 @@ +"""Unit tests for PartialCompletionEvaluator.""" + +from datetime import datetime +from unittest.mock import Mock, patch + +import pytest + +from strands_evals.evaluators.chaos import PartialCompletionEvaluator +from strands_evals.evaluators.chaos.partial_completion_evaluator import ( + PartialCompletionRating, +) +from strands_evals.types import EvaluationData +from strands_evals.types.trace import ( + AgentInvocationSpan, + EvaluationLevel, + Session, + SpanInfo, + ToolCall, + ToolConfig, + ToolExecutionSpan, + ToolResult, + Trace, +) + + +@pytest.fixture 
+def evaluation_data(): + now = datetime.now() + span_info = SpanInfo(session_id="test-session", start_time=now, end_time=now) + + tool_configs = [ + ToolConfig(name="search_tool", description="Search for flights"), + ToolConfig(name="booking_tool", description="Book a flight"), + ] + + agent_span = AgentInvocationSpan( + span_info=span_info, + user_prompt="Find and book a flight to Tokyo", + agent_response="I found flights but couldn't complete the booking due to a service error.", + available_tools=tool_configs, + ) + + tool_span_success = ToolExecutionSpan( + span_info=span_info, + tool_call=ToolCall(name="search_tool", arguments={"destination": "Tokyo"}, tool_call_id="1"), + tool_result=ToolResult(content='[{"flight": "AA100", "price": 800}]', tool_call_id="1"), + ) + + tool_span_failure = ToolExecutionSpan( + span_info=span_info, + tool_call=ToolCall(name="booking_tool", arguments={"flight": "AA100"}, tool_call_id="2"), + tool_result=ToolResult(content="Error: Service unavailable", tool_call_id="2"), + ) + + trace = Trace( + spans=[agent_span, tool_span_success, tool_span_failure], + trace_id="trace1", + session_id="test-session", + ) + session = Session(traces=[trace], session_id="test-session") + + return EvaluationData( + input="Find and book a flight to Tokyo", + actual_output="I found flights but couldn't complete the booking due to a service error.", + actual_trajectory=session, + name="test-partial-completion", + ) + + +def test_init_with_defaults(): + evaluator = PartialCompletionEvaluator() + + assert evaluator.version == "v0" + assert evaluator.model is None + assert evaluator.system_prompt is not None + assert evaluator.evaluation_level == EvaluationLevel.TRACE_LEVEL + + +def test_init_with_custom_values(): + evaluator = PartialCompletionEvaluator(version="v0", model="gpt-4", system_prompt="Custom") + + assert evaluator.version == "v0" + assert evaluator.model == "gpt-4" + assert evaluator.system_prompt == "Custom" + + 
+@patch("strands_evals.evaluators.chaos.partial_completion_evaluator.Agent") +def test_evaluate(mock_agent_class, evaluation_data): + mock_agent = Mock() + mock_result = Mock() + mock_result.structured_output = PartialCompletionRating( + reasoning="Search succeeded but booking failed — user got flight info but no reservation", + completion_percentage=0.4, + ) + mock_agent.return_value = mock_result + mock_agent_class.return_value = mock_agent + evaluator = PartialCompletionEvaluator() + + result = evaluator.evaluate(evaluation_data) + + assert len(result) == 1 + assert result[0].score == 0.4 + assert result[0].test_pass is False + assert result[0].reason == "Search succeeded but booking failed — user got flight info but no reservation" + assert result[0].label == "0.40" + + +@pytest.mark.parametrize( + "completion,expected_pass", + [ + (0.0, False), + (0.49, False), + (0.5, True), + (1.0, True), + ], +) +@patch("strands_evals.evaluators.chaos.partial_completion_evaluator.Agent") +def test_pass_threshold(mock_agent_class, evaluation_data, completion, expected_pass): + mock_agent = Mock() + mock_result = Mock() + mock_result.structured_output = PartialCompletionRating(reasoning="Test", completion_percentage=completion) + mock_agent.return_value = mock_result + mock_agent_class.return_value = mock_agent + evaluator = PartialCompletionEvaluator() + + result = evaluator.evaluate(evaluation_data) + + assert result[0].score == completion + assert result[0].test_pass == expected_pass + + +@pytest.mark.asyncio +@patch("strands_evals.evaluators.chaos.partial_completion_evaluator.Agent") +async def test_evaluate_async(mock_agent_class, evaluation_data): + mock_agent = Mock() + mock_result = Mock() + mock_result.structured_output = PartialCompletionRating(reasoning="Partial completion", completion_percentage=0.6) + + async def mock_invoke_async(*args, **kwargs): + return mock_result + + mock_agent.invoke_async = mock_invoke_async + mock_agent_class.return_value = mock_agent + 
evaluator = PartialCompletionEvaluator() + + result = await evaluator.evaluate_async(evaluation_data) + + assert len(result) == 1 + assert result[0].score == 0.6 + assert result[0].test_pass is True diff --git a/tests/strands_evals/chaos/test_recovery_strategy_evaluator.py b/tests/strands_evals/chaos/test_recovery_strategy_evaluator.py new file mode 100644 index 0000000..d739377 --- /dev/null +++ b/tests/strands_evals/chaos/test_recovery_strategy_evaluator.py @@ -0,0 +1,156 @@ +"""Unit tests for RecoveryStrategyEvaluator.""" + +from datetime import datetime +from unittest.mock import Mock, patch + +import pytest + +from strands_evals.evaluators.chaos import RecoveryStrategyEvaluator +from strands_evals.evaluators.chaos.recovery_strategy_evaluator import ( + RecoveryStrategyRating, + RecoveryStrategyScore, +) +from strands_evals.types import EvaluationData +from strands_evals.types.trace import ( + AgentInvocationSpan, + EvaluationLevel, + Session, + SpanInfo, + ToolCall, + ToolConfig, + ToolExecutionSpan, + ToolResult, + Trace, +) + + +@pytest.fixture +def evaluation_data(): + now = datetime.now() + span_info = SpanInfo(session_id="test-session", start_time=now, end_time=now) + + tool_configs = [ + ToolConfig(name="search_tool", description="Search for flights"), + ToolConfig(name="fallback_search", description="Fallback search via cache"), + ] + + agent_span = AgentInvocationSpan( + span_info=span_info, + user_prompt="Find flights to Tokyo", + agent_response="The primary search is down, but I found cached results.", + available_tools=tool_configs, + ) + + tool_span_fail = ToolExecutionSpan( + span_info=span_info, + tool_call=ToolCall(name="search_tool", arguments={"destination": "Tokyo"}, tool_call_id="1"), + tool_result=ToolResult(content="Error: Connection timed out", tool_call_id="1"), + ) + + tool_span_fallback = ToolExecutionSpan( + span_info=span_info, + tool_call=ToolCall(name="fallback_search", arguments={"destination": "Tokyo"}, tool_call_id="2"), + 
tool_result=ToolResult(content='[{"flight": "AA100", "price": 800}]', tool_call_id="2"), + ) + + trace = Trace( + spans=[agent_span, tool_span_fail, tool_span_fallback], + trace_id="trace1", + session_id="test-session", + ) + session = Session(traces=[trace], session_id="test-session") + + return EvaluationData( + input="Find flights to Tokyo", + actual_output="The primary search is down, but I found cached results.", + actual_trajectory=session, + name="test-recovery-strategy", + ) + + +def test_init_with_defaults(): + evaluator = RecoveryStrategyEvaluator() + + assert evaluator.version == "v0" + assert evaluator.model is None + assert evaluator.system_prompt is not None + assert evaluator.evaluation_level == EvaluationLevel.TRACE_LEVEL + + +def test_init_with_custom_values(): + evaluator = RecoveryStrategyEvaluator(version="v0", model="gpt-4", system_prompt="Custom") + + assert evaluator.version == "v0" + assert evaluator.model == "gpt-4" + assert evaluator.system_prompt == "Custom" + + +@patch("strands_evals.evaluators.chaos.recovery_strategy_evaluator.Agent") +def test_evaluate(mock_agent_class, evaluation_data): + mock_agent = Mock() + mock_result = Mock() + mock_result.structured_output = RecoveryStrategyRating( + reasoning="Agent quickly pivoted to fallback search after timeout", + score=RecoveryStrategyScore.EXCELLENT, + ) + mock_agent.return_value = mock_result + mock_agent_class.return_value = mock_agent + evaluator = RecoveryStrategyEvaluator() + + result = evaluator.evaluate(evaluation_data) + + assert len(result) == 1 + assert result[0].score == 1.0 + assert result[0].test_pass is True + assert result[0].reason == "Agent quickly pivoted to fallback search after timeout" + assert result[0].label == RecoveryStrategyScore.EXCELLENT + + +@pytest.mark.parametrize( + "score,expected_value,expected_pass", + [ + (RecoveryStrategyScore.EXCELLENT, 1.0, True), + (RecoveryStrategyScore.GOOD, 0.75, True), + (RecoveryStrategyScore.ACCEPTABLE, 0.5, True), + 
(RecoveryStrategyScore.POOR, 0.25, False), + (RecoveryStrategyScore.FAILURE, 0.0, False), + ], +) +@patch("strands_evals.evaluators.chaos.recovery_strategy_evaluator.Agent") +def test_score_mapping(mock_agent_class, evaluation_data, score, expected_value, expected_pass): + mock_agent = Mock() + mock_result = Mock() + mock_result.structured_output = RecoveryStrategyRating(reasoning="Test", score=score) + mock_agent.return_value = mock_result + mock_agent_class.return_value = mock_agent + evaluator = RecoveryStrategyEvaluator() + + result = evaluator.evaluate(evaluation_data) + + assert len(result) == 1 + assert result[0].score == expected_value + assert result[0].test_pass == expected_pass + assert result[0].label == score + + +@pytest.mark.asyncio +@patch("strands_evals.evaluators.chaos.recovery_strategy_evaluator.Agent") +async def test_evaluate_async(mock_agent_class, evaluation_data): + mock_agent = Mock() + mock_result = Mock() + mock_result.structured_output = RecoveryStrategyRating( + reasoning="Good recovery strategy", score=RecoveryStrategyScore.GOOD + ) + + async def mock_invoke_async(*args, **kwargs): + return mock_result + + mock_agent.invoke_async = mock_invoke_async + mock_agent_class.return_value = mock_agent + evaluator = RecoveryStrategyEvaluator() + + result = await evaluator.evaluate_async(evaluation_data) + + assert len(result) == 1 + assert result[0].score == 0.75 + assert result[0].test_pass is True