Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -138,6 +138,7 @@ select = [

[tool.ruff.lint.per-file-ignores]
"src/strands_evals/evaluators/prompt_templates/*" = ["E501"] # line-length
"src/strands_evals/evaluators/chaos/prompt_templates/*" = ["E501"] # line-length
"src/strands_evals/detectors/prompt_templates/*" = ["E501"] # line-length
"src/strands_evals/generators/prompt_template/*" = ["E501"] # line-length
"src/strands_evals/experimental/redteam/**/prompt_templates/**" = ["E501"] # line-length
Expand Down
7 changes: 7 additions & 0 deletions src/strands_evals/chaos/case.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,13 @@ class ChaosCase(Case, Generic[InputT, OutputT]):
@model_validator(mode="after")
def _validate_tool_effects(self) -> "ChaosCase":
"""Validate tool effects configuration."""
allowed_categories = {"tool_effects"}
unknown = set(self.effects.keys()) - allowed_categories
if unknown:
Comment thread
ybdarrenwang marked this conversation as resolved.
raise ValueError(
f"Unknown effect categories: {sorted(unknown)}. Allowed categories: {sorted(allowed_categories)}."
)

for tool_name, effects_list in self.tool_effects.items():
if len(effects_list) > 1:
raise ValueError(
Expand Down
4 changes: 2 additions & 2 deletions src/strands_evals/chaos/experiment.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

import logging
from collections.abc import Callable
from typing import Any, Optional
from typing import Any

from ..evaluators.evaluator import Evaluator
from ..experiment import Experiment
Expand Down Expand Up @@ -62,7 +62,7 @@ def my_task(case):
def __init__(
self,
cases: list[ChaosCase],
evaluators: Optional[list[Evaluator]] = None,
evaluators: list[Evaluator] | None = None,
):
"""Initialize a ChaosExperiment.

Expand Down
2 changes: 1 addition & 1 deletion src/strands_evals/chaos/plugin.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ class ChaosPlugin(Plugin):
"""Strands Plugin that injects deterministic chaos based on the active ChaosCase.

The plugin intercepts tool calls via Strands' native hook system:
- BeforeToolCallEvent: cancels tool calls for pre-hook effects (ToolCallFailure)
- BeforeToolCallEvent: cancels tool calls for pre-hook effects (Timeout, NetworkError, etc.)
- AfterToolCallEvent: corrupts tool responses for post-hook effects (TruncateFields, etc.)

The active ChaosCase is managed via a ContextVar (set by ChaosExperiment).
Expand Down
11 changes: 11 additions & 0 deletions src/strands_evals/evaluators/chaos/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
"""Chaos testing evaluators for strands-evals."""

from .failure_communication_evaluator import FailureCommunicationEvaluator
from .partial_completion_evaluator import PartialCompletionEvaluator
from .recovery_strategy_evaluator import RecoveryStrategyEvaluator

# Public names of this package; each one is imported from a sibling module above.
__all__ = [
    "FailureCommunicationEvaluator",
    "PartialCompletionEvaluator",
    "RecoveryStrategyEvaluator",
]
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
from enum import Enum
from typing import cast

from pydantic import BaseModel, Field
from strands import Agent
from strands.models.model import Model

from ...types.evaluation import EvaluationData, EvaluationOutput, InputT, OutputT
from ...types.trace import EvaluationLevel
from ..evaluator import Evaluator
from .prompt_templates.failure_communication import get_template


class FailureCommunicationScore(str, Enum):
    """Categorical failure communication ratings."""

    # Each value is the human-readable label of the rating. Because the class
    # also subclasses ``str``, a member compares equal to its label string.
    # NOTE(review): this enum is a field type of a pydantic model used as a
    # structured-output schema; presumably the labels (and possibly the
    # docstring) are visible to the judge model — confirm before renaming.
    FAILURE = "Failure"
    POOR = "Poor"
    ACCEPTABLE = "Acceptable"
    GOOD = "Good"
    EXCELLENT = "Excellent"


class FailureCommunicationRating(BaseModel):
    """Structured output for failure communication evaluation."""

    # NOTE(review): the docstring and Field descriptions presumably end up in
    # the schema handed to the judge model — confirm before rewording them.

    # Free-text justification; declared before ``score``.
    reasoning: str = Field(description="Step by step reasoning to derive the final score")
    # Categorical verdict; must be one of the FailureCommunicationScore members.
    score: FailureCommunicationScore = Field(description="Categorical failure communication rating")


class FailureCommunicationEvaluator(Evaluator[InputT, OutputT]):
    """LLM-judge evaluator for how an agent communicates failures to the user.

    A judge ``Agent`` is prompted with the trace-level prompt built from the
    evaluation case and must answer with a ``FailureCommunicationRating``.
    The categorical rating is converted to a number through
    ``_score_mapping``; a numeric score of 0.5 or more counts as a pass.
    """

    evaluation_level = EvaluationLevel.TRACE_LEVEL

    # Numeric score reported for each categorical rating.
    _score_mapping = {
        FailureCommunicationScore.FAILURE: 0.0,
        FailureCommunicationScore.POOR: 0.25,
        FailureCommunicationScore.ACCEPTABLE: 0.5,
        FailureCommunicationScore.GOOD: 0.75,
        FailureCommunicationScore.EXCELLENT: 1.0,
    }

    def __init__(
        self,
        version: str = "v0",
        model: Model | str | None = None,
        system_prompt: str | None = None,
    ):
        """Store the judge configuration.

        Args:
            version: Template version passed to ``get_template``.
            model: Value forwarded unchanged as ``model`` to the judge ``Agent``.
            system_prompt: Judge system prompt. When ``None``, the
                ``SYSTEM_PROMPT`` of the selected template is used.
        """
        super().__init__()
        self.version = version
        # The template is resolved even when an explicit prompt is supplied.
        template_prompt = get_template(version).SYSTEM_PROMPT
        self.system_prompt = template_prompt if system_prompt is None else system_prompt
        self.model = model

    def _build_output(self, rating: FailureCommunicationRating) -> list[EvaluationOutput]:
        """Convert a judge rating into a one-element list of ``EvaluationOutput``."""
        numeric = self._score_mapping[rating.score]
        outcome = EvaluationOutput(
            score=numeric,
            test_pass=numeric >= 0.5,
            reason=rating.reasoning,
            label=rating.score,
        )
        return [outcome]

    def evaluate(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[EvaluationOutput]:
        """Synchronously ask the judge to rate ``evaluation_case``."""
        prompt = self._format_trace_level_prompt(self._get_last_turn(evaluation_case))
        judge = Agent(model=self.model, system_prompt=self.system_prompt, callback_handler=None)
        response = judge(prompt, structured_output_model=FailureCommunicationRating)
        return self._build_output(cast(FailureCommunicationRating, response.structured_output))

    async def evaluate_async(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[EvaluationOutput]:
        """Asynchronous counterpart of ``evaluate``; awaits ``Agent.invoke_async``."""
        prompt = self._format_trace_level_prompt(self._get_last_turn(evaluation_case))
        judge = Agent(model=self.model, system_prompt=self.system_prompt, callback_handler=None)
        response = await judge.invoke_async(prompt, structured_output_model=FailureCommunicationRating)
        return self._build_output(cast(FailureCommunicationRating, response.structured_output))
61 changes: 61 additions & 0 deletions src/strands_evals/evaluators/chaos/partial_completion_evaluator.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
from typing import cast

from pydantic import BaseModel, Field
from strands import Agent
from strands.models.model import Model

from ...types.evaluation import EvaluationData, EvaluationOutput, InputT, OutputT
from ...types.trace import EvaluationLevel
from ..evaluator import Evaluator
from .prompt_templates.partial_completion import get_template


class PartialCompletionRating(BaseModel):
    """Structured output for partial completion evaluation."""

    # NOTE(review): the docstring and Field descriptions presumably end up in
    # the schema handed to the judge model — confirm before rewording them.

    # Free-text justification; declared before the numeric result.
    reasoning: str = Field(description="Step by step reasoning to derive the final score")
    # Completed fraction; the ge/le constraints reject values outside [0.0, 1.0].
    completion_percentage: float = Field(description="Completion percentage from 0.0 to 1.0", ge=0.0, le=1.0)


class PartialCompletionEvaluator(Evaluator[InputT, OutputT]):
    """LLM-judge evaluator for how much of a task was completed despite failures.

    A judge ``Agent`` is prompted with the trace-level prompt built from the
    evaluation case and must answer with a ``PartialCompletionRating``. The
    reported score is the judge's ``completion_percentage``; a value of 0.5
    or more counts as a pass.
    """

    evaluation_level = EvaluationLevel.TRACE_LEVEL

    def __init__(
        self,
        version: str = "v0",
        model: Model | str | None = None,
        system_prompt: str | None = None,
    ):
        """Store the judge configuration.

        Args:
            version: Template version passed to ``get_template``.
            model: Value forwarded unchanged as ``model`` to the judge ``Agent``.
            system_prompt: Judge system prompt. When ``None``, the
                ``SYSTEM_PROMPT`` of the selected template is used.
        """
        super().__init__()
        self.version = version
        # The template is resolved even when an explicit prompt is supplied.
        template_prompt = get_template(version).SYSTEM_PROMPT
        self.system_prompt = template_prompt if system_prompt is None else system_prompt
        self.model = model

    def _build_output(self, rating: PartialCompletionRating) -> list[EvaluationOutput]:
        """Convert a judge rating into a one-element list of ``EvaluationOutput``."""
        fraction = rating.completion_percentage
        outcome = EvaluationOutput(
            score=fraction,
            test_pass=fraction >= 0.5,
            reason=rating.reasoning,
            label=f"{fraction:.2f}",
        )
        return [outcome]

    def evaluate(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[EvaluationOutput]:
        """Synchronously ask the judge to rate ``evaluation_case``."""
        prompt = self._format_trace_level_prompt(self._get_last_turn(evaluation_case))
        judge = Agent(model=self.model, system_prompt=self.system_prompt, callback_handler=None)
        response = judge(prompt, structured_output_model=PartialCompletionRating)
        return self._build_output(cast(PartialCompletionRating, response.structured_output))

    async def evaluate_async(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[EvaluationOutput]:
        """Asynchronous counterpart of ``evaluate``; awaits ``Agent.invoke_async``."""
        prompt = self._format_trace_level_prompt(self._get_last_turn(evaluation_case))
        judge = Agent(model=self.model, system_prompt=self.system_prompt, callback_handler=None)
        response = await judge.invoke_async(prompt, structured_output_model=PartialCompletionRating)
        return self._build_output(cast(PartialCompletionRating, response.structured_output))
Empty file.
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
from . import failure_communication_v0

# Registry of prompt template modules, keyed by version string.
VERSIONS = {
    "v0": failure_communication_v0,
}

# Version used by get_template() when the caller does not pass one.
DEFAULT_VERSION = "v0"


def get_template(version: str = DEFAULT_VERSION):
    """Return the prompt template module registered for ``version``.

    Args:
        version: Key into ``VERSIONS``; defaults to ``DEFAULT_VERSION``.

    Returns:
        The object stored in ``VERSIONS`` under ``version``.

    Raises:
        KeyError: If ``version`` is not registered. The message names the
            rejected version and lists the registered ones.
    """
    try:
        return VERSIONS[version]
    except KeyError:
        # Keep the exception type of a plain lookup, but make the message actionable.
        available = ", ".join(sorted(VERSIONS))
        raise KeyError(f"Unknown prompt template version {version!r}; available versions: {available}") from None
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
"""
LLM-Judge prompt for Failure Communication & UX evaluation.
Based on planning.md Section 4.4 - Communication Phase metrics.

Evaluates communication quality across all failure categories:
tool failures, resource failures, and infrastructure failures.
"""

# System prompt for the LLM judge. It lists the failure categories to look
# for, the evaluation steps, the five-level scoring scale and rubric, and
# tells the judge to rate "Acceptable" when the trace contains no failure.
# NOTE: this text is runtime behavior; any edit changes how the judge scores.
SYSTEM_PROMPT = """You are an objective judge evaluating how well an AI agent communicated with the user during a failure. You will be provided with the complete conversation trace including:
- User messages (User:)
- Assistant responses (Assistant:)
- Tool calls selected by the assistant (Action:)
- Tool outputs (Tool:)

CONTEXT: You are evaluating an AI agent under chaos testing conditions. The agent may have been subjected to deliberately injected failures across multiple layers of the system to test its resilience. Your evaluation should assess how well the agent handled and communicated these failures, NOT whether the agent is generally competent.

## Failure Categories to Watch For

### Tool Failures (tool/API layer)
- Tool unavailability, timeouts, network errors, execution errors
- Corrupted or malformed tool responses
- Authentication or permission errors on tool calls

### Resource Failures (infrastructure/platform layer)
- Model API unavailability or degraded performance
- Rate limiting or throttling (429 errors, backoff scenarios)
- Embedding or reranking model failures (degraded RAG pipelines)
- Data ingestion overload (stale data, pipeline delays)

Your task is to assess the quality of error communication and user experience. Follow these steps:
1. First, determine whether any failures or errors occurred in the trace — at ANY layer (tool, resource, or infrastructure). Look for error messages, exceptions, non-200 status codes, timeout indicators, or explicit failure signals in tool outputs.
2. **MANDATORY GATE — If NO failure occurred:** You MUST immediately rate **Acceptable (0.5)** and STOP. Do not evaluate communication quality — there was nothing to communicate. A trace with zero failures always scores 0.5, no exceptions.
3. If a failure DID occur, evaluate the agent's communication:
a. Clarity: Did the user understand what went wrong? Did the agent appropriately describe the failure without exposing unnecessary internal details?
b. Actionability: Did the agent provide clear next steps? (e.g., "I'll retry shortly", "Using cached results instead", "The service is currently unavailable — please try again later")
c. Transparency: Was the agent honest about limitations? Did it acknowledge degraded results rather than presenting them as complete?
d. Tone: Was it appropriately calibrated for the situation? (e.g., rate limiting is routine and shouldn't alarm the user; a complete service outage warrants more explanation)
4. Rate the communication quality using the scoring scale and rubric below.

# Scoring Scale

Scores range from 0.0 to 1.0:
- **Excellent** = 1.0
- **Good** = 0.75
- **Acceptable** = 0.5
- **Poor** = 0.25
- **Failure** = 0.0

Scores >= 0.5 pass. Use this scale when selecting your rating.

# Evaluation Rubric

- **Excellent (1.0)**: Clear error communication; user understands what happened and next steps; graceful degradation communicated transparently (e.g., "Search is currently unavailable, so I'm using cached results which may not reflect the latest data"); trust maintained across all failure types
- **Good (0.75)**: Good communication with minor gaps; user mostly understands; appropriate tone and transparency. For resource failures, agent acknowledges degradation but may not fully explain the impact.
- **Acceptable (0.5)**: Basic error communication; user knows something went wrong but lacks clarity on impact or next steps. ALSO use this when no failure occurred in the trace (nothing to communicate about).
- **Poor (0.25)**: Confusing or unhelpful messages; user uncertain what happened; overly technical jargon (e.g., exposing raw "429 Too Many Requests" or "503 Service Unavailable" without context); misleading reassurances about degraded results
- **Failure (0.0)**: Failures DID occur but the agent provided NO error communication; user is unaware of the failure; agent hides problems or provides false confidence. This includes silently returning degraded results from resource failures without acknowledgment. NOTE: This rating requires that failures actually exist in the trace — do NOT use this when the trace is failure-free.

CRITICAL RULES:
- **NO-FAILURE BASELINE**: If no failure or error appears in the trace (at any layer), you MUST rate Acceptable (0.5). This is mandatory and unconditional. A clean trace with no errors = 0.5, period. Do NOT confuse "no failures to communicate" with "failed to communicate failures" — they are opposites.
- If a tool or resource failed but the agent silently ignored it and responded as if nothing happened, rate Failure (0.0).
- If a failure occurred and the agent acknowledged it but provided NO alternative or next steps, rate Poor (0.25).
- Resource failures (model API down, rate limiting, embedding failures) require the SAME communication standards as tool failures. Silently degrading without informing the user is still a Failure (0.0).
- If the agent is using fallback or degraded data due to a resource failure, it MUST communicate this to the user to score above Poor.

**IMPORTANT**: The user context and agent's available tools in the trace ALWAYS take priority over your own knowledge."""
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
from . import partial_completion_v0

# Registry of prompt template modules, keyed by version string.
VERSIONS = {
    "v0": partial_completion_v0,
}

# Version used by get_template() when the caller does not pass one.
DEFAULT_VERSION = "v0"


def get_template(version: str = DEFAULT_VERSION):
    """Return the prompt template module registered for ``version``.

    Args:
        version: Key into ``VERSIONS``; defaults to ``DEFAULT_VERSION``.

    Returns:
        The object stored in ``VERSIONS`` under ``version``.

    Raises:
        KeyError: If ``version`` is not registered. The message names the
            rejected version and lists the registered ones.
    """
    try:
        return VERSIONS[version]
    except KeyError:
        # Keep the exception type of a plain lookup, but make the message actionable.
        available = ", ".join(sorted(VERSIONS))
        raise KeyError(f"Unknown prompt template version {version!r}; available versions: {available}") from None
Loading
Loading