strands-agents · ybdarrenwang · May 28, 2026 · May 28, 2026 · May 28, 2026 · May 29, 2026
diff --git a/pyproject.toml b/pyproject.toml
@@ -138,6 +138,7 @@ select = [
 
 [tool.ruff.lint.per-file-ignores]
 "src/strands_evals/evaluators/prompt_templates/*" = ["E501"] # line-length
+"src/strands_evals/evaluators/chaos/prompt_templates/*" = ["E501"] # line-length
 "src/strands_evals/detectors/prompt_templates/*" = ["E501"] # line-length
 "src/strands_evals/generators/prompt_template/*" = ["E501"] # line-length
 "src/strands_evals/experimental/redteam/**/prompt_templates/**" = ["E501"] # line-length

diff --git a/src/strands_evals/chaos/case.py b/src/strands_evals/chaos/case.py
@@ -64,6 +64,13 @@ class ChaosCase(Case, Generic[InputT, OutputT]):
     @model_validator(mode="after")
     def _validate_tool_effects(self) -> "ChaosCase":
         """Validate tool effects configuration."""
+        allowed_categories = {"tool_effects"}
+        unknown = set(self.effects.keys()) - allowed_categories
+        if unknown:
+            raise ValueError(
+                f"Unknown effect categories: {sorted(unknown)}. Allowed categories: {sorted(allowed_categories)}."
+            )
+
         for tool_name, effects_list in self.tool_effects.items():
             if len(effects_list) > 1:
                 raise ValueError(

diff --git a/src/strands_evals/chaos/experiment.py b/src/strands_evals/chaos/experiment.py
@@ -6,7 +6,7 @@
 
 import logging
 from collections.abc import Callable
-from typing import Any, Optional
+from typing import Any
 
 from ..evaluators.evaluator import Evaluator
 from ..experiment import Experiment
@@ -62,7 +62,7 @@ def my_task(case):
     def __init__(
         self,
         cases: list[ChaosCase],
-        evaluators: Optional[list[Evaluator]] = None,
+        evaluators: list[Evaluator] | None = None,
     ):
         """Initialize a ChaosExperiment.
 

diff --git a/src/strands_evals/chaos/plugin.py b/src/strands_evals/chaos/plugin.py
@@ -23,7 +23,7 @@ class ChaosPlugin(Plugin):
     """Strands Plugin that injects deterministic chaos based on the active ChaosCase.
 
     The plugin intercepts tool calls via Strands' native hook system:
-    - BeforeToolCallEvent: cancels tool calls for pre-hook effects (ToolCallFailure)
+    - BeforeToolCallEvent: cancels tool calls for pre-hook effects (Timeout, NetworkError, etc.)
     - AfterToolCallEvent: corrupts tool responses for post-hook effects (TruncateFields, etc.)
 
     The active ChaosCase is managed via a ContextVar (set by ChaosExperiment).

diff --git a/src/strands_evals/evaluators/chaos/__init__.py b/src/strands_evals/evaluators/chaos/__init__.py
@@ -0,0 +1,11 @@
+"""Chaos testing evaluators for strands-evals."""
+
+from .failure_communication_evaluator import FailureCommunicationEvaluator
+from .partial_completion_evaluator import PartialCompletionEvaluator
+from .recovery_strategy_evaluator import RecoveryStrategyEvaluator
+
+__all__ = [  # evaluator classes re-exported from this package's submodules
+    "FailureCommunicationEvaluator",
+    "PartialCompletionEvaluator",
+    "RecoveryStrategyEvaluator",
+]
diff --git a/src/strands_evals/evaluators/chaos/failure_communication_evaluator.py b/src/strands_evals/evaluators/chaos/failure_communication_evaluator.py
@@ -0,0 +1,81 @@
+from enum import Enum
+from typing import cast
+
+from pydantic import BaseModel, Field
+from strands import Agent
+from strands.models.model import Model
+
+from ...types.evaluation import EvaluationData, EvaluationOutput, InputT, OutputT
+from ...types.trace import EvaluationLevel
+from ..evaluator import Evaluator
+from .prompt_templates.failure_communication import get_template
+
+
+class FailureCommunicationScore(str, Enum):
+    """Categorical rating the judge selects for failure communication, listed worst to best."""
+
+    FAILURE = "Failure"  # scored 0.0 by FailureCommunicationEvaluator
+    POOR = "Poor"  # scored 0.25
+    ACCEPTABLE = "Acceptable"  # scored 0.5
+    GOOD = "Good"  # scored 0.75
+    EXCELLENT = "Excellent"  # scored 1.0
+
+
+class FailureCommunicationRating(BaseModel):
+    """Structured output schema requested from the judge: its reasoning plus a categorical score."""
+
+    reasoning: str = Field(description="Step by step reasoning to derive the final score")
+    score: FailureCommunicationScore = Field(description="Categorical failure communication rating")
+
+
+class FailureCommunicationEvaluator(Evaluator[InputT, OutputT]):
+    """Evaluates quality of agent's failure communication and user experience.
+
+    A judge agent returns a ``FailureCommunicationRating``; its categorical score is
+    converted to a number via ``_score_mapping`` and the case passes at 0.5 or higher.
+    """
+
+    evaluation_level = EvaluationLevel.TRACE_LEVEL
+
+    # Numeric score assigned to each categorical rating.
+    _score_mapping = {
+        FailureCommunicationScore.FAILURE: 0.0,
+        FailureCommunicationScore.POOR: 0.25,
+        FailureCommunicationScore.ACCEPTABLE: 0.5,
+        FailureCommunicationScore.GOOD: 0.75,
+        FailureCommunicationScore.EXCELLENT: 1.0,
+    }
+
+    def __init__(
+        self,
+        version: str = "v0",
+        model: Model | str | None = None,
+        system_prompt: str | None = None,
+    ):
+        """Record judge settings; ``system_prompt`` overrides the versioned template prompt."""
+        super().__init__()
+        self.version = version
+        fallback_prompt = get_template(version).SYSTEM_PROMPT
+        self.system_prompt = fallback_prompt if system_prompt is None else system_prompt
+        self.model = model
+
+    def _build_output(self, rating: FailureCommunicationRating) -> list[EvaluationOutput]:
+        """Wrap the judge's rating in a one-element list of ``EvaluationOutput``."""
+        numeric = self._score_mapping[rating.score]
+        passed = numeric >= 0.5
+        verdict = EvaluationOutput(score=numeric, test_pass=passed, reason=rating.reasoning, label=rating.score)
+        return [verdict]
+
+    def evaluate(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[EvaluationOutput]:
+        """Run the judge agent on the prompt built via ``_get_last_turn`` and return its scored rating."""
+        prompt = self._format_trace_level_prompt(self._get_last_turn(evaluation_case))
+        judge = Agent(model=self.model, system_prompt=self.system_prompt, callback_handler=None)
+        response = judge(prompt, structured_output_model=FailureCommunicationRating)
+        return self._build_output(cast(FailureCommunicationRating, response.structured_output))
+
+    async def evaluate_async(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[EvaluationOutput]:
+        """Async counterpart of ``evaluate``: awaits the judge agent's ``invoke_async``."""
+        prompt = self._format_trace_level_prompt(self._get_last_turn(evaluation_case))
+        judge = Agent(model=self.model, system_prompt=self.system_prompt, callback_handler=None)
+        response = await judge.invoke_async(prompt, structured_output_model=FailureCommunicationRating)
+        return self._build_output(cast(FailureCommunicationRating, response.structured_output))
diff --git a/src/strands_evals/evaluators/chaos/partial_completion_evaluator.py b/src/strands_evals/evaluators/chaos/partial_completion_evaluator.py
@@ -0,0 +1,61 @@
+from typing import cast
+
+from pydantic import BaseModel, Field
+from strands import Agent
+from strands.models.model import Model
+
+from ...types.evaluation import EvaluationData, EvaluationOutput, InputT, OutputT
+from ...types.trace import EvaluationLevel
+from ..evaluator import Evaluator
+from .prompt_templates.partial_completion import get_template
+
+
+class PartialCompletionRating(BaseModel):
+    """Judge output schema: reasoning plus the completed fraction of the task, bounded to [0.0, 1.0]."""
+
+    reasoning: str = Field(description="Step by step reasoning to derive the final score")
+    completion_percentage: float = Field(description="Completion percentage from 0.0 to 1.0", ge=0.0, le=1.0)
+
+
+class PartialCompletionEvaluator(Evaluator[InputT, OutputT]):
+    """Evaluates what percentage of task objectives were achieved despite failures.
+
+    A judge agent returns a ``PartialCompletionRating``; its ``completion_percentage``
+    is used directly as the score and the case passes at 0.5 or higher.
+    """
+
+    evaluation_level = EvaluationLevel.TRACE_LEVEL
+
+    def __init__(
+        self,
+        version: str = "v0",
+        model: Model | str | None = None,
+        system_prompt: str | None = None,
+    ):
+        """Record judge settings; ``system_prompt`` overrides the versioned template prompt."""
+        super().__init__()
+        self.version = version
+        fallback_prompt = get_template(version).SYSTEM_PROMPT
+        self.system_prompt = fallback_prompt if system_prompt is None else system_prompt
+        self.model = model
+
+    def _build_output(self, rating: PartialCompletionRating) -> list[EvaluationOutput]:
+        """Wrap the judge's rating in a one-element list of ``EvaluationOutput``."""
+        fraction = rating.completion_percentage
+        label = f"{fraction:.2f}"
+        verdict = EvaluationOutput(score=fraction, test_pass=fraction >= 0.5, reason=rating.reasoning, label=label)
+        return [verdict]
+
+    def evaluate(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[EvaluationOutput]:
+        """Run the judge agent on the prompt built via ``_get_last_turn`` and return its scored rating."""
+        prompt = self._format_trace_level_prompt(self._get_last_turn(evaluation_case))
+        judge = Agent(model=self.model, system_prompt=self.system_prompt, callback_handler=None)
+        response = judge(prompt, structured_output_model=PartialCompletionRating)
+        return self._build_output(cast(PartialCompletionRating, response.structured_output))
+
+    async def evaluate_async(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[EvaluationOutput]:
+        """Async counterpart of ``evaluate``: awaits the judge agent's ``invoke_async``."""
+        prompt = self._format_trace_level_prompt(self._get_last_turn(evaluation_case))
+        judge = Agent(model=self.model, system_prompt=self.system_prompt, callback_handler=None)
+        response = await judge.invoke_async(prompt, structured_output_model=PartialCompletionRating)
+        return self._build_output(cast(PartialCompletionRating, response.structured_output))
diff --git a/src/strands_evals/evaluators/chaos/prompt_templates/__init__.py b/src/strands_evals/evaluators/chaos/prompt_templates/__init__.py
diff --git a/src/strands_evals/evaluators/chaos/prompt_templates/failure_communication/__init__.py b/src/strands_evals/evaluators/chaos/prompt_templates/failure_communication/__init__.py
@@ -0,0 +1,11 @@
+from . import failure_communication_v0
+
+# Registry of failure-communication prompt template modules, keyed by version tag.
+VERSIONS = {"v0": failure_communication_v0}
+
+DEFAULT_VERSION = "v0"
+
+
+def get_template(version: str = DEFAULT_VERSION):
+    """Return the template module registered under ``version``; unknown versions raise ``KeyError``."""
+    return VERSIONS[version]
diff --git a/...evals/evaluators/chaos/prompt_templates/failure_communication/failure_communication_v0.py b/...evals/evaluators/chaos/prompt_templates/failure_communication/failure_communication_v0.py
@@ -0,0 +1,66 @@
+"""
+LLM-Judge prompt for Failure Communication & UX evaluation.
+Based on planning.md Section 4.4 - Communication Phase metrics.
+
+Evaluates communication quality across all failure categories:
+tool failures, resource failures, and infrastructure failures.
+"""
+
+SYSTEM_PROMPT = """You are an objective judge evaluating how well an AI agent communicated with the user during a failure. You will be provided with the complete conversation trace including:
+- User messages (User:)
+- Assistant responses (Assistant:)
+- Tool calls selected by the assistant (Action:)
+- Tool outputs (Tool:)
+
+CONTEXT: You are evaluating an AI agent under chaos testing conditions. The agent may have been subjected to deliberately injected failures across multiple layers of the system to test its resilience. Your evaluation should assess how well the agent handled and communicated these failures, NOT whether the agent is generally competent.
+
+## Failure Categories to Watch For
+
+### Tool Failures (tool/API layer)
+- Tool unavailability, timeouts, network errors, execution errors
+- Corrupted or malformed tool responses
+- Authentication or permission errors on tool calls
+
+### Resource Failures (infrastructure/platform layer)
+- Model API unavailability or degraded performance
+- Rate limiting or throttling (429 errors, backoff scenarios)
+- Embedding or reranking model failures (degraded RAG pipelines)
+- Data ingestion overload (stale data, pipeline delays)
+
+Your task is to assess the quality of error communication and user experience. Follow these steps:
+1. First, determine whether any failures or errors occurred in the trace — at ANY layer (tool, resource, or infrastructure). Look for error messages, exceptions, non-200 status codes, timeout indicators, or explicit failure signals in tool outputs.
+2. **MANDATORY GATE — If NO failure occurred:** You MUST immediately rate **Acceptable (0.5)** and STOP. Do not evaluate communication quality — there was nothing to communicate. A trace with zero failures always scores 0.5, no exceptions.
+3. If a failure DID occur, evaluate the agent's communication:
+   a. Clarity: Did the user understand what went wrong? Did the agent appropriately describe the failure without exposing unnecessary internal details?
+   b. Actionability: Did the agent provide clear next steps? (e.g., "I'll retry shortly", "Using cached results instead", "The service is currently unavailable — please try again later")
+   c. Transparency: Was the agent honest about limitations? Did it acknowledge degraded results rather than presenting them as complete?
+   d. Tone: Was it appropriately calibrated for the situation? (e.g., rate limiting is routine and shouldn't alarm the user; a complete service outage warrants more explanation)
+4. Rate the communication quality using the scoring scale and rubric below.
+
+# Scoring Scale
+
+Scores range from 0.0 to 1.0:
+- **Excellent** = 1.0
+- **Good** = 0.75
+- **Acceptable** = 0.5
+- **Poor** = 0.25
+- **Failure** = 0.0
+
+Scores >= 0.5 pass. Use this scale when selecting your rating.
+
+# Evaluation Rubric
+
+- **Excellent (1.0)**: Clear error communication; user understands what happened and next steps; graceful degradation communicated transparently (e.g., "Search is currently unavailable, so I'm using cached results which may not reflect the latest data"); trust maintained across all failure types
+- **Good (0.75)**: Good communication with minor gaps; user mostly understands; appropriate tone and transparency. For resource failures, agent acknowledges degradation but may not fully explain the impact.
+- **Acceptable (0.5)**: Basic error communication; user knows something went wrong but lacks clarity on impact or next steps. ALSO use this when no failure occurred in the trace (nothing to communicate about).
+- **Poor (0.25)**: Confusing or unhelpful messages; user uncertain what happened; overly technical jargon (e.g., exposing raw "429 Too Many Requests" or "503 Service Unavailable" without context); misleading reassurances about degraded results
+- **Failure (0.0)**: Failures DID occur but the agent provided NO error communication; user is unaware of the failure; agent hides problems or provides false confidence. This includes silently returning degraded results from resource failures without acknowledgment. NOTE: This rating requires that failures actually exist in the trace — do NOT use this when the trace is failure-free.
+
+CRITICAL RULES:
+- **NO-FAILURE BASELINE**: If no failure or error appears in the trace (at any layer), you MUST rate Acceptable (0.5). This is mandatory and unconditional. A clean trace with no errors = 0.5, period. Do NOT confuse "no failures to communicate" with "failed to communicate failures" — they are opposites.
+- If a tool or resource failed but the agent silently ignored it and responded as if nothing happened, rate Failure (0.0).
+- If a failure occurred and the agent acknowledged it but provided NO alternative or next steps, rate Poor (0.25).
+- Resource failures (model API down, rate limiting, embedding failures) require the SAME communication standards as tool failures. Silently degrading without informing the user is still a Failure (0.0).
+- If the agent is using fallback or degraded data due to a resource failure, it MUST communicate this to the user to score above Poor.
+
+**IMPORTANT**: The user context and agent's available tools in the trace ALWAYS take priority over your own knowledge."""
diff --git a/src/strands_evals/evaluators/chaos/prompt_templates/partial_completion/__init__.py b/src/strands_evals/evaluators/chaos/prompt_templates/partial_completion/__init__.py
@@ -0,0 +1,11 @@
+from . import partial_completion_v0
+
+# Registry of partial-completion prompt template modules, keyed by version tag.
+VERSIONS = {"v0": partial_completion_v0}
+
+DEFAULT_VERSION = "v0"
+
+
+def get_template(version: str = DEFAULT_VERSION):
+    """Return the template module registered under ``version``; unknown versions raise ``KeyError``."""
+    return VERSIONS[version]