From 74a363791405bcd7279ca8f9ebf50021f8446236 Mon Sep 17 00:00:00 2001
From: Darren Wang <ybwang@amazon.com>
Date: Wed, 6 May 2026 17:16:32 +0000
Subject: [PATCH 1/6] add chaos testing example script

---
 .../chaos_testing_with_simulated_tools.py     | 175 ++++++++++++++++++
 1 file changed, 175 insertions(+)
 create mode 100644 site/docs/examples/evals-sdk/chaos_testing_with_simulated_tools.py

diff --git a/site/docs/examples/evals-sdk/chaos_testing_with_simulated_tools.py b/site/docs/examples/evals-sdk/chaos_testing_with_simulated_tools.py
new file mode 100644
index 000000000..8ce4d584c
--- /dev/null
+++ b/site/docs/examples/evals-sdk/chaos_testing_with_simulated_tools.py
@@ -0,0 +1,175 @@
+import logging
+from typing import Any
+
+from pydantic import BaseModel, Field
+
+from strands import Agent
+from strands_evals import Case
+from strands_evals.chaos import (
+    ChaosExperiment,
+    ChaosPlugin,
+    ChaosScenario,
+    CorruptValues,
+    RemoveFields,
+    ToolCallFailure,
+    TruncateFields,
+)
+from strands_evals.evaluators import GoalSuccessRateEvaluator
+from strands_evals.mappers import StrandsInMemorySessionMapper
+from strands_evals.simulation.tool_simulator import ToolSimulator
+from strands_evals.telemetry import StrandsEvalsTelemetry
+
+logging.basicConfig(level=logging.INFO, format="%(message)s")
+logger = logging.getLogger(__name__)
+
+# Setup telemetry
+telemetry = StrandsEvalsTelemetry().setup_in_memory_exporter()
+memory_exporter = telemetry.in_memory_exporter
+
+# 1. Set up ToolSimulator and register tools
+tool_simulator = ToolSimulator()
+
+class FlightSearchResponse(BaseModel):
+    """Response from the flight search tool."""
+
+    flights: list[dict[str, Any]] = Field(default_factory=list, description="List of available flights")
+    total_results: int = Field(default=0, description="Total number of results found")
+    status: str = Field(default="success", description="Operation status")
+
+class BookFlightResponse(BaseModel):
+    """Response from the flight booking tool."""
+
+    booking_id: str = Field(default="", description="Booking confirmation ID")
+    flight_id: str = Field(default="", description="The booked flight ID")
+    status: str = Field(default="success", description="Booking status")
+    message: str = Field(default="", description="Status message")
+
+class BookingConfirmationResponse(BaseModel):
+    """Response from the booking confirmation tool."""
+
+    confirmation_sent: bool = Field(default=False, description="Whether confirmation was sent")
+    method: str = Field(default="email", description="Delivery method")
+    message: str = Field(default="", description="Confirmation details")
+
+@tool_simulator.tool(output_schema=FlightSearchResponse)
+def search_flights(origin: str, destination: str, date: str) -> dict[str, Any]:
+    """Search for available flights between two cities on a given date."""
+    pass
+
+@tool_simulator.tool(output_schema=BookFlightResponse)
+def book_flight(flight_id: str) -> dict[str, Any]:
+    """Book a specific flight by its flight ID. Returns booking confirmation."""
+    pass
+
+@tool_simulator.tool(output_schema=BookingConfirmationResponse)
+def send_booking_confirmation(booking_id: str = "", flight_id: str = "", method: str = "email") -> dict[str, Any]:
+    """Send booking confirmation or fallback link to the user via email or SMS."""
+    pass
+
+# 2. Create the ChaosPlugin
+chaos_plugin = ChaosPlugin()
+
+# 3. Define chaos scenarios
+scenarios = [
+    # Single-tool, pre-hook: tool call is cancelled before execution
+    ChaosScenario(
+        name="search_timeout",
+        description="Search tool times out — agent must handle a hard failure",
+        effects={"search_flights": [ToolCallFailure(error_type="timeout")]},
+    ),
+    # Two-tool, post-hook: tools execute but responses are silently corrupted
+    ChaosScenario(
+        name="book_corrupt_and_confirm_truncated",
+        description="Booking returns garbage data while confirmation is truncated",
+        effects={
+            "book_flight": [CorruptValues(corrupt_ratio=0.8)],
+            "send_booking_confirmation": [TruncateFields(max_length=5)],
+        },
+    ),
+    # All-tool, mixed pre+post: combines hard failures with silent corruption
+    ChaosScenario(
+        name="total_chaos",
+        description="Search network error (pre), book execution error (pre), confirm fields removed (post)",
+        effects={
+            "search_flights": [ToolCallFailure(error_type="network_error")],
+            "book_flight": [ToolCallFailure(error_type="execution_error")],
+            "send_booking_confirmation": [RemoveFields(remove_ratio=0.7)],
+        },
+    ),
+]
+
+# 4. Define the task function
+# Pre-create tool instances once (avoids registry issues across runs)
+_search_tool = tool_simulator.get_tool("search_flights")
+_book_tool = tool_simulator.get_tool("book_flight")
+_confirm_tool = tool_simulator.get_tool("send_booking_confirmation")
+
+def travel_agent_task(case: Case) -> dict:
+    """Run the travel agent with a single user query."""
+    logger.info(f"\n{'─'*60}")
+    logger.info(f"  Case: {case.name}")
+    logger.info(f"  User: {case.input}")
+
+    agent = Agent(
+        system_prompt=(
+            "You are a travel booking assistant. You help users search for flights, "
+            "book them, and send confirmations. Use the available tools to complete "
+            "the user's request. Today's date is May 18, 2025.\n\n"
+            "Always use the tools directly — do not ask the user for clarification "
+            "if you can infer reasonable values from context.\n\n"
+            "If a tool fails or returns an error:\n"
+            "- Acknowledge the failure honestly to the user\n"
+            "- Try an alternative approach if possible\n"
+            "- Do NOT hallucinate successful results\n"
+            "- Do NOT retry more than once\n\n"
+            "If tool results look suspicious (e.g., $0 fares, past dates):\n"
+            "- Inform the user that results seem unreliable\n"
+            "- Suggest alternatives"
+        ),
+        tools=[_search_tool, _book_tool, _confirm_tool],
+        plugins=[chaos_plugin],
+        callback_handler=None,
+        trace_attributes={"gen_ai.conversation.id": case.session_id, "session.id": case.session_id},
+    )
+
+    memory_exporter.clear()
+    try:
+        result = agent(case.input)
+        output = str(result)
+    except Exception as e:
+        output = f"Agent failed with error: {type(e).__name__}: {str(e)[:200]}"
+
+    logger.info(f"  Agent: {output[:300]}{'...' if len(output) > 300 else ''}")
+    logger.info(f"{'─'*60}")
+
+    finished_spans = memory_exporter.get_finished_spans()
+    mapper = StrandsInMemorySessionMapper()
+    session = mapper.map_to_session(finished_spans, session_id=case.session_id)
+
+    return {"output": output, "trajectory": session}
+
+# 5. Define test cases
+test_cases = [
+    Case(
+        name="book_a_flight",
+        input="Find me a flight from SFO to JFK on May 20, book the cheapest one, and send me a confirmation.",
+    ),
+    Case(
+        name="search_and_confirm",
+        input="Search for flights from Seattle to Tokyo next Tuesday, book one, and email me the confirmation.",
+    ),
+]
+
+# 6. Create and run the ChaosExperiment
+evaluators = [GoalSuccessRateEvaluator()]
+
+experiment = ChaosExperiment(
+    cases=test_cases,
+    scenarios=scenarios,
+    evaluators=evaluators,
+    include_baseline=True,
+)
+
+# Run: (1 baseline + 3 scenarios) × 2 cases = 8 runs
+reports = experiment.run_evaluations(task=travel_agent_task)
+reports[0].run_display()

From f5033bf9a37e09dbbaec5c3c01fd1360187dd111 Mon Sep 17 00:00:00 2001
From: Darren Wang <ybwang@amazon.com>
Date: Fri, 15 May 2026 22:00:45 +0000
Subject: [PATCH 2/6] replace chaos scenario with chaos case

---
 .../chaos_testing_with_simulated_tools.py     | 55 ++++++++-----------
 1 file changed, 23 insertions(+), 32 deletions(-)

diff --git a/site/docs/examples/evals-sdk/chaos_testing_with_simulated_tools.py b/site/docs/examples/evals-sdk/chaos_testing_with_simulated_tools.py
index 8ce4d584c..fe5ababea 100644
--- a/site/docs/examples/evals-sdk/chaos_testing_with_simulated_tools.py
+++ b/site/docs/examples/evals-sdk/chaos_testing_with_simulated_tools.py
@@ -6,9 +6,9 @@
 from strands import Agent
 from strands_evals import Case
 from strands_evals.chaos import (
+    ChaosCase,
     ChaosExperiment,
     ChaosPlugin,
-    ChaosScenario,
     CorruptValues,
     RemoveFields,
     ToolCallFailure,
@@ -69,34 +69,24 @@ def send_booking_confirmation(booking_id: str = "", flight_id: str = "", method:
 # 2. Create the ChaosPlugin
 chaos_plugin = ChaosPlugin()
 
-# 3. Define chaos scenarios
-scenarios = [
+# 3. Define named effect maps
+effect_maps = {
     # Single-tool, pre-hook: tool call is cancelled before execution
-    ChaosScenario(
-        name="search_timeout",
-        description="Search tool times out — agent must handle a hard failure",
-        effects={"search_flights": [ToolCallFailure(error_type="timeout")]},
-    ),
+    "search_timeout": {
+        "search_flights": [ToolCallFailure(error_type="timeout")],
+    },
     # Two-tool, post-hook: tools execute but responses are silently corrupted
-    ChaosScenario(
-        name="book_corrupt_and_confirm_truncated",
-        description="Booking returns garbage data while confirmation is truncated",
-        effects={
-            "book_flight": [CorruptValues(corrupt_ratio=0.8)],
-            "send_booking_confirmation": [TruncateFields(max_length=5)],
-        },
-    ),
+    "book_corrupt_and_confirm_truncated": {
+        "book_flight": [CorruptValues(corrupt_ratio=0.8)],
+        "send_booking_confirmation": [TruncateFields(max_length=5)],
+    },
     # All-tool, mixed pre+post: combines hard failures with silent corruption
-    ChaosScenario(
-        name="total_chaos",
-        description="Search network error (pre), book execution error (pre), confirm fields removed (post)",
-        effects={
-            "search_flights": [ToolCallFailure(error_type="network_error")],
-            "book_flight": [ToolCallFailure(error_type="execution_error")],
-            "send_booking_confirmation": [RemoveFields(remove_ratio=0.7)],
-        },
-    ),
-]
+    "total_chaos": {
+        "search_flights": [ToolCallFailure(error_type="network_error")],
+        "book_flight": [ToolCallFailure(error_type="execution_error")],
+        "send_booking_confirmation": [RemoveFields(remove_ratio=0.7)],
+    },
+}
 
 # 4. Define the task function
 # Pre-create tool instances once (avoids registry issues across runs)
@@ -104,7 +94,7 @@ def send_booking_confirmation(booking_id: str = "", flight_id: str = "", method:
 _book_tool = tool_simulator.get_tool("book_flight")
 _confirm_tool = tool_simulator.get_tool("send_booking_confirmation")
 
-def travel_agent_task(case: Case) -> dict:
+def travel_agent_task(case: ChaosCase) -> dict:
     """Run the travel agent with a single user query."""
     logger.info(f"\n{'─'*60}")
     logger.info(f"  Case: {case.name}")
@@ -148,7 +138,7 @@ def travel_agent_task(case: Case) -> dict:
 
     return {"output": output, "trajectory": session}
 
-# 5. Define test cases
+# 5. Define test cases and expand with effect maps
 test_cases = [
     Case(
         name="book_a_flight",
@@ -160,16 +150,17 @@ def travel_agent_task(case: Case) -> dict:
     ),
 ]
 
+# Expand: 2 cases × (3 effect maps + 1 baseline) = 8 ChaosCase objects
+chaos_cases = ChaosCase.expand(test_cases, effect_maps, include_no_effect_baseline=True)
+
 # 6. Create and run the ChaosExperiment
 evaluators = [GoalSuccessRateEvaluator()]
 
 experiment = ChaosExperiment(
-    cases=test_cases,
-    scenarios=scenarios,
+    cases=chaos_cases,
     evaluators=evaluators,
-    include_baseline=True,
 )
 
-# Run: (1 baseline + 3 scenarios) × 2 cases = 8 runs
+# Run: 8 chaos cases = 8 agent invocations
 reports = experiment.run_evaluations(task=travel_agent_task)
 reports[0].run_display()

From e9877bc46cfda592406b1df97fbd3b66c7ffb47c Mon Sep 17 00:00:00 2001
From: Darren Wang <ybwang@amazon.com>
Date: Thu, 21 May 2026 23:45:19 +0000
Subject: [PATCH 3/6] update chaos effect map format; rename script

---
 ...th_simulated_tools.py => chaos_testing.py} | 20 ++++++++++++-------
 1 file changed, 13 insertions(+), 7 deletions(-)
 rename site/docs/examples/evals-sdk/{chaos_testing_with_simulated_tools.py => chaos_testing.py} (92%)

diff --git a/site/docs/examples/evals-sdk/chaos_testing_with_simulated_tools.py b/site/docs/examples/evals-sdk/chaos_testing.py
similarity index 92%
rename from site/docs/examples/evals-sdk/chaos_testing_with_simulated_tools.py
rename to site/docs/examples/evals-sdk/chaos_testing.py
index fe5ababea..b3e167789 100644
--- a/site/docs/examples/evals-sdk/chaos_testing_with_simulated_tools.py
+++ b/site/docs/examples/evals-sdk/chaos_testing.py
@@ -10,10 +10,12 @@
     ChaosExperiment,
     ChaosPlugin,
     CorruptValues,
+    NetworkError,
     RemoveFields,
-    ToolCallFailure,
+    Timeout,
     TruncateFields,
 )
+from strands_evals.chaos.effects import ExecutionError
 from strands_evals.evaluators import GoalSuccessRateEvaluator
 from strands_evals.mappers import StrandsInMemorySessionMapper
 from strands_evals.simulation.tool_simulator import ToolSimulator
@@ -73,18 +75,22 @@ def send_booking_confirmation(booking_id: str = "", flight_id: str = "", method:
 effect_maps = {
     # Single-tool, pre-hook: tool call is cancelled before execution
     "search_timeout": {
-        "search_flights": [ToolCallFailure(error_type="timeout")],
+        "tool_effects": {"search_flights": [Timeout()]},
     },
     # Two-tool, post-hook: tools execute but responses are silently corrupted
     "book_corrupt_and_confirm_truncated": {
-        "book_flight": [CorruptValues(corrupt_ratio=0.8)],
-        "send_booking_confirmation": [TruncateFields(max_length=5)],
+        "tool_effects": {
+            "book_flight": [CorruptValues(corrupt_ratio=0.8)],
+            "send_booking_confirmation": [TruncateFields(max_length=5)],
+        },
     },
     # All-tool, mixed pre+post: combines hard failures with silent corruption
     "total_chaos": {
-        "search_flights": [ToolCallFailure(error_type="network_error")],
-        "book_flight": [ToolCallFailure(error_type="execution_error")],
-        "send_booking_confirmation": [RemoveFields(remove_ratio=0.7)],
+        "tool_effects": {
+            "search_flights": [NetworkError()],
+            "book_flight": [ExecutionError()],
+            "send_booking_confirmation": [RemoveFields(remove_ratio=0.7)],
+        },
     },
 }
 

From 7798e6eacffd79f8db60ab37221a013215bba55a Mon Sep 17 00:00:00 2001
From: Darren Wang <ybwang@amazon.com>
Date: Fri, 29 May 2026 17:01:44 +0000
Subject: [PATCH 4/6] add resilience evaluator examples

---
 .../evals-sdk/chaos_failure_communication.py  | 120 +++++++++++++++
 .../evals-sdk/chaos_partial_completion.py     | 137 ++++++++++++++++++
 .../evals-sdk/chaos_recovery_strategy.py      | 133 +++++++++++++++++
 3 files changed, 390 insertions(+)
 create mode 100644 site/docs/examples/evals-sdk/chaos_failure_communication.py
 create mode 100644 site/docs/examples/evals-sdk/chaos_partial_completion.py
 create mode 100644 site/docs/examples/evals-sdk/chaos_recovery_strategy.py

diff --git a/site/docs/examples/evals-sdk/chaos_failure_communication.py b/site/docs/examples/evals-sdk/chaos_failure_communication.py
new file mode 100644
index 000000000..3808fc8c8
--- /dev/null
+++ b/site/docs/examples/evals-sdk/chaos_failure_communication.py
@@ -0,0 +1,120 @@
+import logging
+from typing import Any
+
+from pydantic import BaseModel, Field
+
+from strands import Agent
+from strands_evals.chaos import ChaosCase, ChaosExperiment, ChaosPlugin, Timeout
+from strands_evals.chaos.effects import NetworkError
+from strands_evals.chaos.evaluators import FailureCommunicationEvaluator
+from strands_evals.mappers import StrandsInMemorySessionMapper
+from strands_evals.simulation.tool_simulator import ToolSimulator
+from strands_evals.telemetry import StrandsEvalsTelemetry
+
+logging.basicConfig(level=logging.INFO, format="%(message)s")
+logger = logging.getLogger(__name__)
+
+telemetry = StrandsEvalsTelemetry().setup_in_memory_exporter()
+memory_exporter = telemetry.in_memory_exporter
+
+tool_simulator = ToolSimulator()
+
+
+class FlightSearchResponse(BaseModel):
+    flights: list[dict[str, Any]] = Field(default_factory=list)
+    total_results: int = Field(default=0)
+    status: str = Field(default="success")
+
+
+class BookFlightResponse(BaseModel):
+    booking_id: str = Field(default="")
+    flight_id: str = Field(default="")
+    status: str = Field(default="success")
+    message: str = Field(default="")
+
+
+@tool_simulator.tool(output_schema=FlightSearchResponse)
+def search_flights(origin: str, destination: str, date: str) -> dict[str, Any]:
+    """Search for available flights between two cities on a given date."""
+    pass
+
+
+@tool_simulator.tool(output_schema=BookFlightResponse)
+def book_flight(flight_id: str) -> dict[str, Any]:
+    """Book a specific flight by its flight ID."""
+    pass
+
+
+chaos_plugin = ChaosPlugin()
+
+# Two cases that test communication quality:
+# 1. Search times out — agent must inform user about the failure
+# 2. Both tools fail — agent must communicate multiple failures clearly
+chaos_cases = [
+    ChaosCase(
+        name="search_timeout",
+        input="Find me a flight from SFO to JFK on May 20 and book the cheapest one.",
+        effects={"tool_effects": {"search_flights": [Timeout(error_message="Tool call timed out after 30s")]}},
+    ),
+    ChaosCase(
+        name="all_tools_down",
+        input="Search for flights from Seattle to Tokyo next Tuesday and book one.",
+        effects={
+            "tool_effects": {
+                "search_flights": [NetworkError(error_message="DNS resolution failed")],
+                "book_flight": [NetworkError(error_message="Connection refused")],
+            },
+        },
+    ),
+]
+
+_search_tool = tool_simulator.get_tool("search_flights")
+_book_tool = tool_simulator.get_tool("book_flight")
+
+
+def travel_agent_task(case: ChaosCase) -> dict:
+    """Run the travel agent under chaos and return output + trajectory."""
+    logger.info(f"\n{'─'*60}")
+    logger.info(f"  Case: {case.name}")
+    logger.info(f"  User: {case.input}")
+
+    agent = Agent(
+        system_prompt=(
+            "You are a travel booking assistant. Use the available tools to complete "
+            "the user's request. Today's date is May 18, 2025.\n\n"
+            "If a tool fails or returns an error:\n"
+            "- Acknowledge the failure honestly to the user\n"
+            "- Explain what went wrong in plain language\n"
+            "- Suggest next steps (retry later, try alternative)\n"
+            "- Do NOT hallucinate successful results"
+        ),
+        tools=[_search_tool, _book_tool],
+        plugins=[chaos_plugin],
+        callback_handler=None,
+        trace_attributes={"gen_ai.conversation.id": case.session_id, "session.id": case.session_id},
+    )
+
+    memory_exporter.clear()
+    try:
+        result = agent(case.input)
+        output = str(result)
+    except Exception as e:
+        output = f"Agent failed with error: {type(e).__name__}: {str(e)[:200]}"
+
+    logger.info(f"  Agent: {output[:300]}{'...' if len(output) > 300 else ''}")
+    logger.info(f"{'─'*60}")
+
+    finished_spans = memory_exporter.get_finished_spans()
+    mapper = StrandsInMemorySessionMapper()
+    session = mapper.map_to_session(finished_spans, session_id=case.session_id)
+
+    return {"output": output, "trajectory": session}
+
+
+experiment = ChaosExperiment(
+    cases=chaos_cases,
+    evaluators=[FailureCommunicationEvaluator()],
+)
+
+reports = experiment.run_evaluations(task=travel_agent_task)
+reports[0].run_display()
diff --git a/site/docs/examples/evals-sdk/chaos_partial_completion.py b/site/docs/examples/evals-sdk/chaos_partial_completion.py
new file mode 100644
index 000000000..d77f46df2
--- /dev/null
+++ b/site/docs/examples/evals-sdk/chaos_partial_completion.py
@@ -0,0 +1,137 @@
+import logging
+from typing import Any
+
+from pydantic import BaseModel, Field
+
+from strands import Agent
+from strands_evals.chaos import ChaosCase, ChaosExperiment, ChaosPlugin, TruncateFields
+from strands_evals.chaos.effects import NetworkError
+from strands_evals.chaos.evaluators import PartialCompletionEvaluator
+from strands_evals.mappers import StrandsInMemorySessionMapper
+from strands_evals.simulation.tool_simulator import ToolSimulator
+from strands_evals.telemetry import StrandsEvalsTelemetry
+
+logging.basicConfig(level=logging.INFO, format="%(message)s")
+logger = logging.getLogger(__name__)
+
+telemetry = StrandsEvalsTelemetry().setup_in_memory_exporter()
+memory_exporter = telemetry.in_memory_exporter
+
+tool_simulator = ToolSimulator()
+
+
+class FlightSearchResponse(BaseModel):
+    flights: list[dict[str, Any]] = Field(default_factory=list)
+    total_results: int = Field(default=0)
+    status: str = Field(default="success")
+
+
+class BookFlightResponse(BaseModel):
+    booking_id: str = Field(default="")
+    flight_id: str = Field(default="")
+    status: str = Field(default="success")
+    message: str = Field(default="")
+
+
+class BookingConfirmationResponse(BaseModel):
+    confirmation_sent: bool = Field(default=False)
+    method: str = Field(default="email")
+    message: str = Field(default="")
+
+
+@tool_simulator.tool(output_schema=FlightSearchResponse)
+def search_flights(origin: str, destination: str, date: str) -> dict[str, Any]:
+    """Search for available flights between two cities on a given date."""
+    pass
+
+
+@tool_simulator.tool(output_schema=BookFlightResponse)
+def book_flight(flight_id: str) -> dict[str, Any]:
+    """Book a specific flight by its flight ID."""
+    pass
+
+
+@tool_simulator.tool(output_schema=BookingConfirmationResponse)
+def send_booking_confirmation(booking_id: str = "", flight_id: str = "", method: str = "email") -> dict[str, Any]:
+    """Send booking confirmation to the user via email or SMS."""
+    pass
+
+
+chaos_plugin = ChaosPlugin()
+
+# Two cases that test partial completion:
+# 1. Search works (truncated) but booking fails — user gets degraded flight info but no reservation
+# 2. Search and booking work but confirmation fails — user gets most of what they asked for
+chaos_cases = [
+    ChaosCase(
+        name="search_degraded_booking_fails",
+        input="Find me a flight from SFO to JFK on May 20, book the cheapest one, and send me a confirmation.",
+        effects={
+            "tool_effects": {
+                "search_flights": [TruncateFields(max_length=5)],
+                "book_flight": [NetworkError(error_message="Connection reset by peer")],
+            },
+        },
+    ),
+    ChaosCase(
+        name="confirmation_fails",
+        input="Search for flights from Seattle to Tokyo next Tuesday, book one, and email me the confirmation.",
+        effects={
+            "tool_effects": {
+                "send_booking_confirmation": [NetworkError(error_message="SMTP server unreachable")],
+            },
+        },
+    ),
+]
+
+_search_tool = tool_simulator.get_tool("search_flights")
+_book_tool = tool_simulator.get_tool("book_flight")
+_confirm_tool = tool_simulator.get_tool("send_booking_confirmation")
+
+
+def travel_agent_task(case: ChaosCase) -> dict:
+    """Run the travel agent under chaos and return output + trajectory."""
+    logger.info(f"\n{'─'*60}")
+    logger.info(f"  Case: {case.name}")
+    logger.info(f"  User: {case.input}")
+
+    agent = Agent(
+        system_prompt=(
+            "You are a travel booking assistant. Use the available tools to complete "
+            "the user's request. Today's date is May 18, 2025.\n\n"
+            "If a tool fails or returns an error:\n"
+            "- Acknowledge the failure honestly\n"
+            "- Complete as much of the request as possible\n"
+            "- Do NOT hallucinate successful results\n"
+            "- Do NOT retry more than once"
+        ),
+        tools=[_search_tool, _book_tool, _confirm_tool],
+        plugins=[chaos_plugin],
+        callback_handler=None,
+        trace_attributes={"gen_ai.conversation.id": case.session_id, "session.id": case.session_id},
+    )
+
+    memory_exporter.clear()
+    try:
+        result = agent(case.input)
+        output = str(result)
+    except Exception as e:
+        output = f"Agent failed with error: {type(e).__name__}: {str(e)[:200]}"
+
+    logger.info(f"  Agent: {output[:300]}{'...' if len(output) > 300 else ''}")
+    logger.info(f"{'─'*60}")
+
+    finished_spans = memory_exporter.get_finished_spans()
+    mapper = StrandsInMemorySessionMapper()
+    session = mapper.map_to_session(finished_spans, session_id=case.session_id)
+
+    return {"output": output, "trajectory": session}
+
+
+experiment = ChaosExperiment(
+    cases=chaos_cases,
+    evaluators=[PartialCompletionEvaluator()],
+)
+
+reports = experiment.run_evaluations(task=travel_agent_task)
+reports[0].run_display()
diff --git a/site/docs/examples/evals-sdk/chaos_recovery_strategy.py b/site/docs/examples/evals-sdk/chaos_recovery_strategy.py
new file mode 100644
index 000000000..d3e30963d
--- /dev/null
+++ b/site/docs/examples/evals-sdk/chaos_recovery_strategy.py
@@ -0,0 +1,133 @@
+import logging
+from typing import Any
+
+from pydantic import BaseModel, Field
+
+from strands import Agent
+from strands_evals.chaos import ChaosCase, ChaosExperiment, ChaosPlugin, Timeout
+from strands_evals.chaos.effects import ExecutionError
+from strands_evals.chaos.evaluators import RecoveryStrategyEvaluator
+from strands_evals.mappers import StrandsInMemorySessionMapper
+from strands_evals.simulation.tool_simulator import ToolSimulator
+from strands_evals.telemetry import StrandsEvalsTelemetry
+
+logging.basicConfig(level=logging.INFO, format="%(message)s")
+logger = logging.getLogger(__name__)
+
+telemetry = StrandsEvalsTelemetry().setup_in_memory_exporter()
+memory_exporter = telemetry.in_memory_exporter
+
+tool_simulator = ToolSimulator()
+
+
+class FlightSearchResponse(BaseModel):
+    flights: list[dict[str, Any]] = Field(default_factory=list)
+    total_results: int = Field(default=0)
+    status: str = Field(default="success")
+
+
+class HotelSearchResponse(BaseModel):
+    hotels: list[dict[str, Any]] = Field(default_factory=list)
+    total_results: int = Field(default=0)
+    status: str = Field(default="success")
+
+
+class BookFlightResponse(BaseModel):
+    booking_id: str = Field(default="")
+    flight_id: str = Field(default="")
+    status: str = Field(default="success")
+    message: str = Field(default="")
+
+
+@tool_simulator.tool(output_schema=FlightSearchResponse)
+def search_flights(origin: str, destination: str, date: str) -> dict[str, Any]:
+    """Search for available flights between two cities on a given date."""
+    pass
+
+
+@tool_simulator.tool(output_schema=HotelSearchResponse)
+def search_hotels(city: str, check_in: str, check_out: str) -> dict[str, Any]:
+    """Search for available hotels in a city for given dates."""
+    pass
+
+
+@tool_simulator.tool(output_schema=BookFlightResponse)
+def book_flight(flight_id: str) -> dict[str, Any]:
+    """Book a specific flight by its flight ID."""
+    pass
+
+
+chaos_plugin = ChaosPlugin()
+
+# Two cases that test recovery strategy:
+# 1. Flight search times out but hotel search works — agent should pivot to hotel search
+# 2. Flight search and booking both fail — agent should retry each at most once, then move on
+chaos_cases = [
+    ChaosCase(
+        name="flight_timeout_hotel_available",
+        input="Plan my trip to Tokyo: find flights from SFO and hotels for May 20-23.",
+        effects={"tool_effects": {"search_flights": [Timeout()]}},
+    ),
+    ChaosCase(
+        name="flight_and_booking_fail",
+        input="Find a flight from NYC to London on June 1 and book the cheapest option.",
+        effects={
+            "tool_effects": {
+                "search_flights": [ExecutionError(error_message="Internal server error")],
+                "book_flight": [ExecutionError(error_message="Service unavailable")],
+            },
+        },
+    ),
+]
+
+_search_flights_tool = tool_simulator.get_tool("search_flights")
+_search_hotels_tool = tool_simulator.get_tool("search_hotels")
+_book_tool = tool_simulator.get_tool("book_flight")
+
+
+def travel_agent_task(case: ChaosCase) -> dict:
+    """Run the travel agent under chaos and return output + trajectory."""
+    logger.info(f"\n{'─'*60}")
+    logger.info(f"  Case: {case.name}")
+    logger.info(f"  User: {case.input}")
+
+    agent = Agent(
+        system_prompt=(
+            "You are a travel planning assistant. Use the available tools to complete "
+            "the user's request. Today's date is May 18, 2025.\n\n"
+            "If a tool fails:\n"
+            "- Try alternative tools that can partially fulfill the request\n"
+            "- Do NOT retry the same failed tool more than once\n"
+            "- Do NOT hallucinate results\n"
+            "- Complete as much of the request as possible with working tools"
+        ),
+        tools=[_search_flights_tool, _search_hotels_tool, _book_tool],
+        plugins=[chaos_plugin],
+        callback_handler=None,
+        trace_attributes={"gen_ai.conversation.id": case.session_id, "session.id": case.session_id},
+    )
+
+    memory_exporter.clear()
+    try:
+        result = agent(case.input)
+        output = str(result)
+    except Exception as e:
+        output = f"Agent failed with error: {type(e).__name__}: {str(e)[:200]}"
+
+    logger.info(f"  Agent: {output[:300]}{'...' if len(output) > 300 else ''}")
+    logger.info(f"{'─'*60}")
+
+    finished_spans = memory_exporter.get_finished_spans()
+    mapper = StrandsInMemorySessionMapper()
+    session = mapper.map_to_session(finished_spans, session_id=case.session_id)
+
+    return {"output": output, "trajectory": session}
+
+
+experiment = ChaosExperiment(
+    cases=chaos_cases,
+    evaluators=[RecoveryStrategyEvaluator()],
+)
+
+reports = experiment.run_evaluations(task=travel_agent_task)
+reports[0].run_display()

From a13deef56b49cb92c62c8c2e8f5209d7b731f753 Mon Sep 17 00:00:00 2001
From: Darren Wang <ybwang@amazon.com>
Date: Fri, 29 May 2026 18:08:03 +0000
Subject: [PATCH 5/6] add chaos testing webpages

---
 site/src/config/navigation.yml                |   4 +
 .../evals-sdk/chaos_testing/chaos_testing.mdx | 472 ++++++++++++++++++
 .../evals-sdk/chaos_testing/index.mdx         | 286 +++++++++++
 3 files changed, 762 insertions(+)
 create mode 100644 site/src/content/docs/user-guide/evals-sdk/chaos_testing/chaos_testing.mdx
 create mode 100644 site/src/content/docs/user-guide/evals-sdk/chaos_testing/index.mdx

diff --git a/site/src/config/navigation.yml b/site/src/config/navigation.yml
index de9969f12..ec8e3d22f 100644
--- a/site/src/config/navigation.yml
+++ b/site/src/config/navigation.yml
@@ -211,6 +211,10 @@ sidebar:
           - label: Remote Trace Providers
             items:
               - docs/user-guide/evals-sdk/how-to/trace_providers
+          - label: Chaos Testing
+            items:
+              - docs/user-guide/evals-sdk/chaos_testing
+              - docs/user-guide/evals-sdk/chaos_testing/chaos_testing
           - label: How-To Guides
             items:
               - docs/user-guide/evals-sdk/how-to/eval_task
diff --git a/site/src/content/docs/user-guide/evals-sdk/chaos_testing/chaos_testing.mdx b/site/src/content/docs/user-guide/evals-sdk/chaos_testing/chaos_testing.mdx
new file mode 100644
index 000000000..ede3449af
--- /dev/null
+++ b/site/src/content/docs/user-guide/evals-sdk/chaos_testing/chaos_testing.mdx
@@ -0,0 +1,472 @@
+---
+title: Chaos Testing Guide
+tags: [error-handling, tool-evaluation, simulation]
+sidebar:
+  label: "Chaos Testing Guide"
+---
+
+## Overview
+
+This guide covers the complete chaos testing workflow: defining effects, expanding test cases, running experiments with `ChaosPlugin`, and evaluating agent resilience with specialized evaluators. Chaos testing uses Strands' native plugin system to inject failures transparently — your task function code stays chaos-free.
+
+```python
+from strands import Agent
+from strands_evals import Case
+from strands_evals.chaos import ChaosCase, ChaosExperiment, ChaosPlugin
+from strands_evals.chaos.effects import Timeout, NetworkError, CorruptValues
+from strands_evals.evaluators import GoalSuccessRateEvaluator
+
+# Define base cases
+base_cases = [
+    Case(
+        name="flight-booking",
+        input="Book me a flight to Paris next Tuesday",
+        metadata={"task_description": "Flight booked with confirmation number"}
+    )
+]
+
+# Define named effect maps
+effect_maps = {
+    "booking_timeout": {
+        "tool_effects": {"book_flight": Timeout()}
+    },
+    "search_corrupted": {
+        "tool_effects": {"search_flights": CorruptValues(corrupt_ratio=0.8)}
+    },
+}
+
+# Generate chaos cases
+chaos_cases = ChaosCase.expand(base_cases, effect_maps, include_baseline=True)
+
+# Run experiment
+def task_function(case: Case) -> dict:
+    agent = Agent(
+        system_prompt="You are a travel booking assistant.",
+        plugins=[ChaosPlugin()],
+        callback_handler=None
+    )
+    response = agent(case.input)
+    return {"output": str(response)}
+
+experiment = ChaosExperiment(
+    cases=chaos_cases,
+    evaluators=[GoalSuccessRateEvaluator()]
+)
+reports = experiment.run_evaluations(task_function)
+```
+
+## How It Works
+
+1. **Case Expansion**: `ChaosCase.expand()` generates the Cartesian product of base cases × named effect maps, producing `ChaosCase` objects with an `effects` field.
+2. **Context Injection**: `ChaosExperiment` sets a `ContextVar` with the active `ChaosCase` before each task execution, ensuring thread/async safety.
+3. **Plugin Interception**: `ChaosPlugin` reads the active case from the `ContextVar` and applies effects via `BeforeToolCallEvent` (pre-hook) or `AfterToolCallEvent` (post-hook).
+4. **Transparent Execution**: Your task function code has zero chaos concepts — just add `ChaosPlugin()` to the agent's plugins list.
+
+## Defining Effects
+
+### Pre-hook Effects
+
+Pre-hook effects cancel the tool call and return an error message to the agent:
+
+```python
+from strands_evals.chaos.effects import (
+    Timeout,
+    NetworkError,
+    ExecutionError,
+    ValidationError,
+)
+
+effect_maps = {
+    "timeout": {"tool_effects": {"my_tool": Timeout()}},
+    "network": {"tool_effects": {"my_tool": NetworkError()}},
+    "execution": {"tool_effects": {"my_tool": ExecutionError()}},
+    "validation": {"tool_effects": {"my_tool": ValidationError()}},
+}
+```
+
+### Post-hook Effects
+
+Post-hook effects let the tool execute normally but corrupt the response:
+
+```python
+from strands_evals.chaos.effects import (
+    TruncateFields,
+    RemoveFields,
+    CorruptValues,
+)
+
+effect_maps = {
+    "truncated": {
+        "tool_effects": {"my_tool": TruncateFields(max_length=10)}
+    },
+    "missing_fields": {
+        "tool_effects": {"my_tool": RemoveFields(remove_ratio=0.5)}
+    },
+    "corrupted": {
+        "tool_effects": {"my_tool": CorruptValues(corrupt_ratio=0.3)}
+    },
+}
+```
+
+### Compound Effects (Multiple Tools)
+
+Target multiple tools in a single effect map to simulate cascading failures:
+
+```python
+effect_maps = {
+    "total_chaos": {
+        "tool_effects": {
+            "search_flights": Timeout(),
+            "book_flight": NetworkError(),
+            "send_confirmation": CorruptValues(corrupt_ratio=0.5),
+        }
+    },
+}
+```
+
+## ChaosCase
+
+`ChaosCase` extends `Case` with an `effects` field. The `effects` dict keys are restricted to known categories (currently `"tool_effects"`):
+
+```python
+from strands_evals.chaos import ChaosCase
+from strands_evals.chaos.effects import Timeout
+
+# Manual construction
+chaos_case = ChaosCase(
+    name="timeout-test",
+    input="Book a flight",
+    effects={"tool_effects": {"book_flight": Timeout()}},
+    metadata={"task_description": "Flight booked"}
+)
+
+# Expansion from base cases (preferred)
+chaos_cases = ChaosCase.expand(
+    cases=[Case(name="test", input="Book a flight")],
+    effect_maps={"timeout": {"tool_effects": {"book_flight": Timeout()}}},
+    include_baseline=True
+)
+```
+
+## ChaosPlugin
+
+`ChaosPlugin` hooks into Strands' event system. Add it to your agent's plugins list:
+
+```python
+from strands import Agent
+from strands_evals.chaos import ChaosPlugin
+
+agent = Agent(
+    system_prompt="You are a helpful assistant.",
+    plugins=[ChaosPlugin()],
+    callback_handler=None
+)
+```
+
+The plugin reads the active `ChaosCase` from a `ContextVar` (managed by `ChaosExperiment`) and applies effects only to tools listed in the case's `effects["tool_effects"]` dict. Tools not listed execute normally.
+
+## ChaosExperiment
+
+`ChaosExperiment` composes the base `Experiment` class and manages the `ContextVar` lifecycle:
+
+```python
+from strands_evals.chaos import ChaosExperiment
+
+experiment = ChaosExperiment(
+    cases=chaos_cases,
+    evaluators=evaluators
+)
+
+# Sync execution
+reports = experiment.run_evaluations(task_function)
+
+# Async execution
+reports = await experiment.run_evaluations_async(async_task_function)
+```
+
+## Resilience Evaluators
+
+### FailureCommunicationEvaluator
+
+Scores how well the agent communicates failures to the user across four dimensions: clarity, actionability, transparency, and tone.
+
+```python
+from strands_evals.chaos.evaluators import FailureCommunicationEvaluator
+
+evaluator = FailureCommunicationEvaluator()
+```
+
+**Scoring criteria:**
+- Does the agent acknowledge the failure clearly?
+- Does it suggest actionable next steps?
+- Is it transparent about what went wrong (without exposing internals)?
+- Is the tone appropriate (not dismissive, not alarming)?
+
+### PartialCompletionEvaluator
+
+Scores how much of the user's goal was achieved despite failures, returning a continuous score from 0.0 (nothing achieved) to 1.0 (fully achieved):
+
+```python
+from strands_evals.chaos.evaluators import PartialCompletionEvaluator
+
+evaluator = PartialCompletionEvaluator()
+```
+
+**Example scores:**
+- `1.0` — Full goal achieved despite failures
+- `0.7` — Most sub-goals completed, one blocked by failure
+- `0.0` — Agent gave up entirely or crashed
+
+### RecoveryStrategyEvaluator
+
+Scores the quality of the agent's recovery actions when tools fail:
+
+```python
+from strands_evals.chaos.evaluators import RecoveryStrategyEvaluator
+
+evaluator = RecoveryStrategyEvaluator()
+```
+
+**Scoring criteria:**
+- Exploration breadth — Did the agent try alternative tools or approaches?
+- Retry discipline — Did it retry appropriately (not excessively)?
+- Approach variation — Did retries use different strategies?
+
+## Complete Example: Multi-Tool Chaos with Resilience Evaluation
+
+```python
+from typing import Any
+from pydantic import BaseModel, Field
+from strands import Agent
+from strands_evals import Case
+from strands_evals.chaos import ChaosCase, ChaosExperiment, ChaosPlugin
+from strands_evals.chaos.effects import Timeout, NetworkError, CorruptValues
+from strands_evals.chaos.evaluators import (
+    FailureCommunicationEvaluator,
+    PartialCompletionEvaluator,
+    RecoveryStrategyEvaluator,
+)
+from strands_evals.evaluators import GoalSuccessRateEvaluator
+from strands_evals.simulation.tool_simulator import ToolSimulator
+
+# Setup tool simulator for reproducible responses
+tool_simulator = ToolSimulator()
+
+class FlightResult(BaseModel):
+    airline: str = Field(..., description="Airline name")
+    price: float = Field(..., description="Price in USD")
+    departure: str = Field(..., description="Departure time")
+
+class BookingConfirmation(BaseModel):
+    confirmation_id: str = Field(..., description="Booking confirmation ID")
+    status: str = Field(..., description="Booking status")
+
+@tool_simulator.tool(
+    share_state_id="travel",
+    initial_state_description="Available flights: AA101 $450 8am, UA202 $380 2pm, DL303 $520 6pm",
+    output_schema=FlightResult,
+)
+def search_flights(destination: str, date: str) -> dict[str, Any]:
+    """Search for available flights."""
+    pass
+
+@tool_simulator.tool(
+    share_state_id="travel",
+    output_schema=BookingConfirmation,
+)
+def book_flight(flight_id: str, passenger_name: str) -> dict[str, Any]:
+    """Book a specific flight."""
+    pass
+
+# Define effect maps
+effect_maps = {
+    "search_timeout": {
+        "tool_effects": {"search_flights": Timeout()}
+    },
+    "booking_network_error": {
+        "tool_effects": {"book_flight": NetworkError()}
+    },
+    "corrupted_search": {
+        "tool_effects": {"search_flights": CorruptValues(corrupt_ratio=0.8)}
+    },
+    "total_chaos": {
+        "tool_effects": {
+            "search_flights": Timeout(),
+            "book_flight": NetworkError(),
+        }
+    },
+}
+
+# Define base cases
+base_cases = [
+    Case(
+        name="book-cheapest",
+        input="Find the cheapest flight to Paris next Tuesday and book it for John Smith",
+        metadata={"task_description": "Flight searched, cheapest option identified, booking confirmed"}
+    ),
+    Case(
+        name="book-morning",
+        input="I need a morning flight to Tokyo on Friday",
+        metadata={"task_description": "Morning flight found and booked"}
+    ),
+]
+
+# Expand into chaos cases
+chaos_cases = ChaosCase.expand(base_cases, effect_maps, include_baseline=True)
+
+# Task function — no chaos concepts here
+def task_function(case: Case) -> dict:
+    search_tool = tool_simulator.get_tool("search_flights")
+    booking_tool = tool_simulator.get_tool("book_flight")
+
+    agent = Agent(
+        system_prompt="You are a travel booking assistant. Help users find and book flights.",
+        tools=[search_tool, booking_tool],
+        plugins=[ChaosPlugin()],
+        callback_handler=None,
+    )
+    response = agent(case.input)
+    return {"output": str(response)}
+
+# Run with all evaluators
+evaluators = [
+    GoalSuccessRateEvaluator(),
+    FailureCommunicationEvaluator(),
+    PartialCompletionEvaluator(),
+    RecoveryStrategyEvaluator(),
+]
+
+experiment = ChaosExperiment(cases=chaos_cases, evaluators=evaluators)
+reports = experiment.run_evaluations(task_function)
+
+# Display results
+for report in reports:
+    print(f"\n{'='*60}")
+    print(f"Evaluator: {report.evaluator_name}")
+    print(f"{'='*60}")
+    report.run_display()
+```
+
+## Advanced Patterns
+
+### Pattern 1: Comparing Agent Configurations Under Chaos
+
+```python
+def compare_agents_under_chaos(chaos_cases, configs):
+    """Compare how different agent configs handle the same failures."""
+    results = {}
+
+    for config_name, system_prompt in configs.items():
+        def make_task(prompt):
+            def task_function(case: Case) -> dict:
+                agent = Agent(
+                    system_prompt=prompt,
+                    plugins=[ChaosPlugin()],
+                    callback_handler=None,
+                )
+                response = agent(case.input)
+                return {"output": str(response)}
+            return task_function
+
+        experiment = ChaosExperiment(
+            cases=chaos_cases,
+            evaluators=[PartialCompletionEvaluator()]
+        )
+        reports = experiment.run_evaluations(make_task(system_prompt))
+        results[config_name] = reports
+
+    return results
+```
+
+### Pattern 2: Progressive Failure Escalation
+
+```python
+# Test increasing severity
+effect_maps = {
+    "mild": {
+        "tool_effects": {"search": CorruptValues(corrupt_ratio=0.2)}
+    },
+    "moderate": {
+        "tool_effects": {"search": CorruptValues(corrupt_ratio=0.5)}
+    },
+    "severe": {
+        "tool_effects": {"search": CorruptValues(corrupt_ratio=0.9)}
+    },
+    "total_failure": {
+        "tool_effects": {"search": Timeout()}
+    },
+}
+```
+
+### Pattern 3: Chaos with User Simulation
+
+Combine chaos testing with user simulation for multi-turn resilience evaluation:
+
+```python
+from strands_evals import ActorSimulator
+
+def task_function(case: Case) -> dict:
+    user_sim = ActorSimulator.from_case_for_user_simulator(
+        case=case, max_turns=8
+    )
+
+    agent = Agent(
+        system_prompt="You are a helpful assistant.",
+        plugins=[ChaosPlugin()],
+        callback_handler=None,
+    )
+
+    user_message = case.input
+    while user_sim.has_next():
+        agent_response = agent(user_message)
+        user_result = user_sim.act(str(agent_response))
+        user_message = str(user_result.structured_output.message)
+
+    return {"output": str(agent_response)}
+```
+
+## Troubleshooting
+
+### Issue: Effects Not Being Applied
+
+Ensure `ChaosPlugin()` is in the agent's plugins list and you're using `ChaosExperiment` (not base `Experiment`):
+
+```python
+# Correct
+agent = Agent(plugins=[ChaosPlugin()], ...)
+experiment = ChaosExperiment(cases=chaos_cases, ...)
+
+# Wrong — base Experiment doesn't set the ContextVar
+experiment = Experiment(cases=chaos_cases, ...)
+```
+
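+### Issue: Effect Not Applied to the Intended Tool
+
+Effects are matched to tools by name, and tools not listed in `tool_effects` execute normally. Check that your effect map keys match the exact tool function names: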
+
+```python
+# If your tool is defined as:
+def search_flights(...): ...
+
+# The effect map key must be "search_flights", not "searchFlights" or "search"
+effect_maps = {"test": {"tool_effects": {"search_flights": Timeout()}}}
+```
+
+### Issue: Async Task Errors
+
+`ChaosExperiment` supports both sync and async tasks. Use the appropriate method:
+
+```python
+# Sync
+reports = experiment.run_evaluations(sync_task_function)
+
+# Async
+reports = await experiment.run_evaluations_async(async_task_function)
+```
+
+## Related Documentation
+
+- [Chaos Testing Overview](/docs/user-guide/evals-sdk/chaos_testing/): Overview and quick start
+- [Tool Simulation](/docs/user-guide/evals-sdk/simulators/tool_simulation/): Simulate tool behavior
+- [Goal Success Rate Evaluator](/docs/user-guide/evals-sdk/evaluators/goal_success_rate_evaluator/): Assess goal completion
+- [Simulators Overview](/docs/user-guide/evals-sdk/simulators/): Simulator framework
diff --git a/site/src/content/docs/user-guide/evals-sdk/chaos_testing/index.mdx b/site/src/content/docs/user-guide/evals-sdk/chaos_testing/index.mdx
new file mode 100644
index 000000000..4dd523b09
--- /dev/null
+++ b/site/src/content/docs/user-guide/evals-sdk/chaos_testing/index.mdx
@@ -0,0 +1,286 @@
+---
+title: Chaos Testing
+tags: [error-handling, simulation]
+sidebar:
+  label: "Overview"
+---
+
+## Overview
+
+Chaos testing systematically evaluates agent resilience by injecting controlled failures into tool execution. Using the `ChaosPlugin`, `ChaosCase`, and `ChaosExperiment` classes, you can test how agents handle tool timeouts, network errors, and corrupted responses — without modifying agent code.
+
+This enables you to answer questions like:
+- Does the agent gracefully communicate failures to users?
+- Can the agent achieve partial goals when some tools fail?
+- Does the agent employ effective recovery strategies?
+
+## Why Chaos Testing?
+
+Traditional evaluation tests agents under ideal conditions. In production, tools fail unpredictably:
+
+**Standard Evaluation:**
+- Tools always return correct responses
+- No network failures or timeouts
+- Cannot reveal fragile error handling
+- Misses degraded-mode behavior
+
+**Chaos Testing:**
+- Injects realistic tool failures (timeouts, network errors, validation errors)
+- Corrupts tool responses (truncated fields, removed data, corrupted values)
+- Tests agent resilience without live infrastructure failures
+- Measures graceful degradation and recovery behavior
+- Quantifies partial goal completion under failure
+
+## When to Use Chaos Testing
+
+Use chaos testing when you need to:
+- **Evaluate Resilience**: Test how agents handle tool failures gracefully
+- **Assess Recovery**: Verify agents try alternative approaches when tools fail
+- **Measure Degradation**: Quantify how much of a goal agents achieve despite failures
+- **Test Communication**: Ensure agents inform users clearly about failures
+- **Validate Robustness**: Confirm agents don't crash or loop on corrupted data
+
+## Architecture
+
+Chaos testing integrates with Strands' plugin system via `BeforeToolCallEvent` and `AfterToolCallEvent` hooks:
+
+1. **ChaosCase** — Extends `Case` with an `effects` field that maps an effect category (currently `"tool_effects"`) to a dict of tool names and their failure effects
+2. **ChaosPlugin** — A Strands plugin that intercepts tool calls and applies effects transparently
+3. **ChaosExperiment** — Composes the base `Experiment` to manage chaos context per case
+4. **ChaosEffect** — A hierarchy of pre-hook effects (cancel tool calls) and post-hook effects (corrupt responses)
+
+## Effect Types
+
+### Pre-hook Effects (Tool Call Failures)
+
+These effects cancel the tool call entirely and return an error:
+
+| Effect | Description |
+| :------- | :------------ |
+| `Timeout` | Simulates a tool execution timeout |
+| `NetworkError` | Simulates a network connectivity failure |
+| `ExecutionError` | Simulates a runtime error during tool execution |
+| `ValidationError` | Simulates invalid input/output validation failure |
+
+### Post-hook Effects (Response Corruption)
+
+These effects let the tool execute but corrupt the response:
+
+| Effect | Description | Parameters |
+| :------- | :------------ | :----------- |
+| `TruncateFields` | Truncates string fields in the response | `max_length` |
+| `RemoveFields` | Randomly removes fields from the response | `remove_ratio` |
+| `CorruptValues` | Corrupts field values with garbage data | `corrupt_ratio` |
+
+## Basic Usage
+
+```python
+from strands import Agent
+from strands_evals import Case
+from strands_evals.chaos import ChaosCase, ChaosExperiment, ChaosPlugin
+from strands_evals.chaos.effects import Timeout, NetworkError
+from strands_evals.evaluators import GoalSuccessRateEvaluator
+
+# Define base test cases
+base_cases = [
+    Case(
+        name="weather-lookup",
+        input="What's the weather in Seattle?",
+        metadata={"task_description": "Weather information provided"}
+    )
+]
+
+# Define named effect maps
+effect_maps = {
+    "search_timeout": {
+        "tool_effects": {"get_weather": Timeout()}
+    },
+    "network_failure": {
+        "tool_effects": {"get_weather": NetworkError()}
+    },
+}
+
+# Expand cases into Cartesian product (base cases × effect maps + baseline)
+chaos_cases = ChaosCase.expand(base_cases, effect_maps, include_baseline=True)
+
+# Run chaos experiment
+experiment = ChaosExperiment(
+    cases=chaos_cases,
+    evaluators=[GoalSuccessRateEvaluator()]
+)
+
+def task_function(case: Case) -> dict:
+    agent = Agent(
+        system_prompt="You are a helpful weather assistant.",
+        plugins=[ChaosPlugin()],
+        callback_handler=None
+    )
+    response = agent(case.input)
+    return {"output": str(response)}
+
+reports = experiment.run_evaluations(task_function)
+```
+
+## ChaosCase.expand()
+
+The `expand()` class method generates the Cartesian product of base cases and effect maps, optionally including a baseline (no effects) for comparison:
+
+```python
+chaos_cases = ChaosCase.expand(
+    cases=base_cases,        # List of base Case objects
+    effect_maps=effect_maps, # Dict of named effect configurations
+    include_baseline=True    # Include cases with no effects for comparison
+)
+```
+
+For 2 base cases and 3 effect maps with `include_baseline=True`, this produces `2 × (3 + 1) = 8` chaos cases.
+
+## Integration with ToolSimulator
+
+Chaos testing works naturally with `ToolSimulator` for fully controlled evaluation — simulated tools provide reproducible responses, and chaos effects inject failures on top:
+
+```python
+from strands import Agent
+from strands_evals import Case
+from strands_evals.chaos import ChaosCase, ChaosExperiment, ChaosPlugin
+from strands_evals.chaos.effects import Timeout, CorruptValues
+from strands_evals.evaluators import GoalSuccessRateEvaluator
+from strands_evals.simulation.tool_simulator import ToolSimulator
+from pydantic import BaseModel, Field
+
+tool_simulator = ToolSimulator()
+
+class SearchResult(BaseModel):
+    title: str = Field(..., description="Result title")
+    snippet: str = Field(..., description="Result snippet")
+
+@tool_simulator.tool(output_schema=SearchResult)
+def web_search(query: str) -> dict:
+    """Search the web for information."""
+    pass
+
+# Define effect maps
+effect_maps = {
+    "search_timeout": {
+        "tool_effects": {"web_search": Timeout()}
+    },
+    "corrupted_results": {
+        "tool_effects": {"web_search": CorruptValues(corrupt_ratio=0.5)}
+    },
+}
+
+base_cases = [
+    Case(name="research", input="Find recent news about AI agents")
+]
+
+chaos_cases = ChaosCase.expand(base_cases, effect_maps, include_baseline=True)
+
+def task_function(case: Case) -> dict:
+    search_tool = tool_simulator.get_tool("web_search")
+    agent = Agent(
+        tools=[search_tool],
+        plugins=[ChaosPlugin()],
+        callback_handler=None
+    )
+    response = agent(case.input)
+    return {"output": str(response)}
+
+experiment = ChaosExperiment(
+    cases=chaos_cases,
+    evaluators=[GoalSuccessRateEvaluator()]
+)
+reports = experiment.run_evaluations(task_function)
+```
+
+## Resilience Evaluators
+
+Chaos testing ships with three specialized evaluators designed to assess agent behavior under failure:
+
+| Evaluator | What It Measures |
+| :---------- | :---------------- |
+| `FailureCommunicationEvaluator` | Clarity, actionability, transparency, and tone of failure messages |
+| `PartialCompletionEvaluator` | Fraction of the user's goal achieved despite failures (0.0–1.0) |
+| `RecoveryStrategyEvaluator` | Quality of recovery actions: exploration breadth, retry discipline, approach variation |
+
+```python
+from strands_evals.chaos.evaluators import (
+    FailureCommunicationEvaluator,
+    PartialCompletionEvaluator,
+    RecoveryStrategyEvaluator,
+)
+
+evaluators = [
+    GoalSuccessRateEvaluator(),
+    FailureCommunicationEvaluator(),
+    PartialCompletionEvaluator(),
+    RecoveryStrategyEvaluator(),
+]
+
+experiment = ChaosExperiment(cases=chaos_cases, evaluators=evaluators)
+```
+
+[Complete Chaos Testing Guide →](/docs/user-guide/evals-sdk/chaos_testing/chaos_testing/)
+
+## Chaos Testing vs Simulators
+
+| Aspect | Simulators | Chaos Testing |
+| :------- | :---------- | :-------------- |
+| **Role** | Replace tool execution entirely | Inject failures into tool execution |
+| **Scope** | All tool calls are simulated | Only targeted tools are affected |
+| **Use Case** | Test without infrastructure | Test resilience under failure |
+| **Combination** | Can be used together | Chaos effects apply on top of simulated tools |
+
+## Best Practices
+
+### 1. Start with Baseline Comparisons
+
+Always pass `include_baseline=True` so you can compare agent performance with and without failures:
+
+```python
+chaos_cases = ChaosCase.expand(cases, effect_maps, include_baseline=True)
+```
+
+### 2. Test One Failure at a Time First
+
+Start with single-tool failures before testing compound chaos:
+
+```python
+# Single failure
+effect_maps = {
+    "search_fails": {"tool_effects": {"search": Timeout()}},
+}
+
+# Compound (test after single failures are understood)
+effect_maps = {
+    "total_chaos": {
+        "tool_effects": {
+            "search": Timeout(),
+            "database": NetworkError(),
+        }
+    },
+}
+```
+
+### 3. Use Resilience Evaluators Together
+
+Combine all three resilience evaluators for a complete picture:
+
+```python
+evaluators = [
+    FailureCommunicationEvaluator(),  # Did the agent tell the user?
+    PartialCompletionEvaluator(),     # How much was achieved?
+    RecoveryStrategyEvaluator(),      # Did it try alternatives?
+]
+```
+
+## Next Steps
+
+- [Chaos Testing Guide](/docs/user-guide/evals-sdk/chaos_testing/chaos_testing/): Complete guide with advanced patterns
+- [Tool Simulation](/docs/user-guide/evals-sdk/simulators/tool_simulation/): Simulate tool behavior
+- [Goal Success Rate Evaluator](/docs/user-guide/evals-sdk/evaluators/goal_success_rate_evaluator/): Assess goal completion
+
+## Related Documentation
+
+- [Simulators Overview](/docs/user-guide/evals-sdk/simulators/): Simulator framework
+- [Evaluators](/docs/user-guide/evals-sdk/evaluators/): All available evaluators
+- [Quickstart Guide](/docs/user-guide/evals-sdk/quickstart/): Get started with Strands Evals

From 5acf8675a57a3f1ddd3f597bad0c7cacc0abf8a8 Mon Sep 17 00:00:00 2001
From: Darren Wang <ybwang@amazon.com>
Date: Tue, 2 Jun 2026 17:59:32 +0000
Subject: [PATCH 6/6] revert chaos webpage; use flatten in example

---
 ... chaos_failure_communication_evaluator.py} |   5 +-
 ... => chaos_partial_completion_evaluator.py} |   5 +-
 ...y => chaos_recovery_strategy_evaluator.py} |   5 +-
 site/docs/examples/evals-sdk/chaos_testing.py |   3 +-
 site/src/config/navigation.yml                |   4 -
 .../evals-sdk/chaos_testing/chaos_testing.mdx | 472 ------------------
 .../evals-sdk/chaos_testing/index.mdx         | 286 -----------
 7 files changed, 11 insertions(+), 769 deletions(-)
 rename site/docs/examples/evals-sdk/{chaos_failure_communication.py => chaos_failure_communication_evaluator.py} (95%)
 rename site/docs/examples/evals-sdk/{chaos_partial_completion.py => chaos_partial_completion_evaluator.py} (96%)
 rename site/docs/examples/evals-sdk/{chaos_recovery_strategy.py => chaos_recovery_strategy_evaluator.py} (96%)
 delete mode 100644 site/src/content/docs/user-guide/evals-sdk/chaos_testing/chaos_testing.mdx
 delete mode 100644 site/src/content/docs/user-guide/evals-sdk/chaos_testing/index.mdx

diff --git a/site/docs/examples/evals-sdk/chaos_failure_communication.py b/site/docs/examples/evals-sdk/chaos_failure_communication_evaluator.py
similarity index 95%
rename from site/docs/examples/evals-sdk/chaos_failure_communication.py
rename to site/docs/examples/evals-sdk/chaos_failure_communication_evaluator.py
index 3808fc8c8..d78ab0430 100644
--- a/site/docs/examples/evals-sdk/chaos_failure_communication.py
+++ b/site/docs/examples/evals-sdk/chaos_failure_communication_evaluator.py
@@ -6,10 +6,11 @@
 from strands import Agent
 from strands_evals.chaos import ChaosCase, ChaosExperiment, ChaosPlugin, Timeout
 from strands_evals.chaos.effects import NetworkError
-from strands_evals.chaos.evaluators import FailureCommunicationEvaluator
+from strands_evals.evaluators.chaos import FailureCommunicationEvaluator
 from strands_evals.mappers import StrandsInMemorySessionMapper
 from strands_evals.simulation.tool_simulator import ToolSimulator
 from strands_evals.telemetry import StrandsEvalsTelemetry
+from strands_evals.types.evaluation_report import EvaluationReport
 
 logging.basicConfig(level=logging.INFO, format="%(message)s")
 logger = logging.getLogger(__name__)
@@ -117,4 +118,4 @@ def travel_agent_task(case: ChaosCase) -> dict:
 )
 
 reports = experiment.run_evaluations(task=travel_agent_task)
-reports[0].run_display()
+EvaluationReport.flatten(reports).run_display()
diff --git a/site/docs/examples/evals-sdk/chaos_partial_completion.py b/site/docs/examples/evals-sdk/chaos_partial_completion_evaluator.py
similarity index 96%
rename from site/docs/examples/evals-sdk/chaos_partial_completion.py
rename to site/docs/examples/evals-sdk/chaos_partial_completion_evaluator.py
index d77f46df2..d247ae0aa 100644
--- a/site/docs/examples/evals-sdk/chaos_partial_completion.py
+++ b/site/docs/examples/evals-sdk/chaos_partial_completion_evaluator.py
@@ -6,10 +6,11 @@
 from strands import Agent
 from strands_evals.chaos import ChaosCase, ChaosExperiment, ChaosPlugin, TruncateFields
 from strands_evals.chaos.effects import NetworkError
-from strands_evals.chaos.evaluators import PartialCompletionEvaluator
+from strands_evals.evaluators.chaos import PartialCompletionEvaluator
 from strands_evals.mappers import StrandsInMemorySessionMapper
 from strands_evals.simulation.tool_simulator import ToolSimulator
 from strands_evals.telemetry import StrandsEvalsTelemetry
+from strands_evals.types.evaluation_report import EvaluationReport
 
 logging.basicConfig(level=logging.INFO, format="%(message)s")
 logger = logging.getLogger(__name__)
@@ -134,4 +135,4 @@ def travel_agent_task(case: ChaosCase) -> dict:
 )
 
 reports = experiment.run_evaluations(task=travel_agent_task)
-reports[0].run_display()
+EvaluationReport.flatten(reports).run_display()
diff --git a/site/docs/examples/evals-sdk/chaos_recovery_strategy.py b/site/docs/examples/evals-sdk/chaos_recovery_strategy_evaluator.py
similarity index 96%
rename from site/docs/examples/evals-sdk/chaos_recovery_strategy.py
rename to site/docs/examples/evals-sdk/chaos_recovery_strategy_evaluator.py
index d3e30963d..fc2903e11 100644
--- a/site/docs/examples/evals-sdk/chaos_recovery_strategy.py
+++ b/site/docs/examples/evals-sdk/chaos_recovery_strategy_evaluator.py
@@ -6,10 +6,11 @@
 from strands import Agent
 from strands_evals.chaos import ChaosCase, ChaosExperiment, ChaosPlugin, Timeout
 from strands_evals.chaos.effects import ExecutionError
-from strands_evals.chaos.evaluators import RecoveryStrategyEvaluator
+from strands_evals.evaluators.chaos import RecoveryStrategyEvaluator
 from strands_evals.mappers import StrandsInMemorySessionMapper
 from strands_evals.simulation.tool_simulator import ToolSimulator
 from strands_evals.telemetry import StrandsEvalsTelemetry
+from strands_evals.types.evaluation_report import EvaluationReport
 
 logging.basicConfig(level=logging.INFO, format="%(message)s")
 logger = logging.getLogger(__name__)
@@ -130,4 +131,4 @@ def travel_agent_task(case: ChaosCase) -> dict:
 )
 
 reports = experiment.run_evaluations(task=travel_agent_task)
-reports[0].run_display()
+EvaluationReport.flatten(reports).run_display()
diff --git a/site/docs/examples/evals-sdk/chaos_testing.py b/site/docs/examples/evals-sdk/chaos_testing.py
index b3e167789..86e366103 100644
--- a/site/docs/examples/evals-sdk/chaos_testing.py
+++ b/site/docs/examples/evals-sdk/chaos_testing.py
@@ -20,6 +20,7 @@
 from strands_evals.mappers import StrandsInMemorySessionMapper
 from strands_evals.simulation.tool_simulator import ToolSimulator
 from strands_evals.telemetry import StrandsEvalsTelemetry
+from strands_evals.types.evaluation_report import EvaluationReport
 
 logging.basicConfig(level=logging.INFO, format="%(message)s")
 logger = logging.getLogger(__name__)
@@ -169,4 +170,4 @@ def travel_agent_task(case: ChaosCase) -> dict:
 
 # Run: 8 chaos cases = 8 agent invocations
 reports = experiment.run_evaluations(task=travel_agent_task)
-reports[0].run_display()
+EvaluationReport.flatten(reports).run_display()
diff --git a/site/src/config/navigation.yml b/site/src/config/navigation.yml
index ec8e3d22f..de9969f12 100644
--- a/site/src/config/navigation.yml
+++ b/site/src/config/navigation.yml
@@ -211,10 +211,6 @@ sidebar:
           - label: Remote Trace Providers
             items:
               - docs/user-guide/evals-sdk/how-to/trace_providers
-          - label: Chaos Testing
-            items:
-              - docs/user-guide/evals-sdk/chaos_testing
-              - docs/user-guide/evals-sdk/chaos_testing/chaos_testing
           - label: How-To Guides
             items:
               - docs/user-guide/evals-sdk/how-to/eval_task
diff --git a/site/src/content/docs/user-guide/evals-sdk/chaos_testing/chaos_testing.mdx b/site/src/content/docs/user-guide/evals-sdk/chaos_testing/chaos_testing.mdx
deleted file mode 100644
index ede3449af..000000000
--- a/site/src/content/docs/user-guide/evals-sdk/chaos_testing/chaos_testing.mdx
+++ /dev/null
@@ -1,472 +0,0 @@
----
-title: Chaos Testing Guide
-tags: [error-handling, tool-evaluation, simulation]
-sidebar:
-  label: "Chaos Testing Guide"
----
-
-## Overview
-
-This guide covers the complete chaos testing workflow: defining effects, expanding test cases, running experiments with `ChaosPlugin`, and evaluating agent resilience with specialized evaluators. Chaos testing uses Strands' native plugin system to inject failures transparently — your task function code stays chaos-free.
-
-```python
-from strands import Agent
-from strands_evals import Case
-from strands_evals.chaos import ChaosCase, ChaosExperiment, ChaosPlugin
-from strands_evals.chaos.effects import Timeout, NetworkError, CorruptValues
-from strands_evals.evaluators import GoalSuccessRateEvaluator
-
-# Define base cases
-base_cases = [
-    Case(
-        name="flight-booking",
-        input="Book me a flight to Paris next Tuesday",
-        metadata={"task_description": "Flight booked with confirmation number"}
-    )
-]
-
-# Define named effect maps
-effect_maps = {
-    "booking_timeout": {
-        "tool_effects": {"book_flight": Timeout()}
-    },
-    "search_corrupted": {
-        "tool_effects": {"search_flights": CorruptValues(corrupt_ratio=0.8)}
-    },
-}
-
-# Generate chaos cases
-chaos_cases = ChaosCase.expand(base_cases, effect_maps, include_baseline=True)
-
-# Run experiment
-def task_function(case: Case) -> dict:
-    agent = Agent(
-        system_prompt="You are a travel booking assistant.",
-        plugins=[ChaosPlugin()],
-        callback_handler=None
-    )
-    response = agent(case.input)
-    return {"output": str(response)}
-
-experiment = ChaosExperiment(
-    cases=chaos_cases,
-    evaluators=[GoalSuccessRateEvaluator()]
-)
-reports = experiment.run_evaluations(task_function)
-```
-
-## How It Works
-
-1. **Case Expansion**: `ChaosCase.expand()` generates the Cartesian product of base cases × named effect maps, producing `ChaosCase` objects with an `effects` field.
-2. **Context Injection**: `ChaosExperiment` sets a `ContextVar` with the active `ChaosCase` before each task execution, ensuring thread/async safety.
-3. **Plugin Interception**: `ChaosPlugin` reads the active case from the `ContextVar` and applies effects via `BeforeToolCallEvent` (pre-hook) or `AfterToolCallEvent` (post-hook).
-4. **Transparent Execution**: Your task function code has zero chaos concepts — just add `ChaosPlugin()` to the agent's plugins list.
-
-## Defining Effects
-
-### Pre-hook Effects
-
-Pre-hook effects cancel the tool call and return an error message to the agent:
-
-```python
-from strands_evals.chaos.effects import (
-    Timeout,
-    NetworkError,
-    ExecutionError,
-    ValidationError,
-)
-
-effect_maps = {
-    "timeout": {"tool_effects": {"my_tool": Timeout()}},
-    "network": {"tool_effects": {"my_tool": NetworkError()}},
-    "execution": {"tool_effects": {"my_tool": ExecutionError()}},
-    "validation": {"tool_effects": {"my_tool": ValidationError()}},
-}
-```
-
-### Post-hook Effects
-
-Post-hook effects let the tool execute normally but corrupt the response:
-
-```python
-from strands_evals.chaos.effects import (
-    TruncateFields,
-    RemoveFields,
-    CorruptValues,
-)
-
-effect_maps = {
-    "truncated": {
-        "tool_effects": {"my_tool": TruncateFields(max_length=10)}
-    },
-    "missing_fields": {
-        "tool_effects": {"my_tool": RemoveFields(remove_ratio=0.5)}
-    },
-    "corrupted": {
-        "tool_effects": {"my_tool": CorruptValues(corrupt_ratio=0.3)}
-    },
-}
-```
-
-### Compound Effects (Multiple Tools)
-
-Target multiple tools in a single effect map to simulate cascading failures:
-
-```python
-effect_maps = {
-    "total_chaos": {
-        "tool_effects": {
-            "search_flights": Timeout(),
-            "book_flight": NetworkError(),
-            "send_confirmation": CorruptValues(corrupt_ratio=0.5),
-        }
-    },
-}
-```
-
-## ChaosCase
-
-`ChaosCase` extends `Case` with an `effects` field. The `effects` dict keys are restricted to known categories (currently `"tool_effects"`):
-
-```python
-from strands_evals.chaos import ChaosCase
-from strands_evals.chaos.effects import Timeout
-
-# Manual construction
-chaos_case = ChaosCase(
-    name="timeout-test",
-    input="Book a flight",
-    effects={"tool_effects": {"book_flight": Timeout()}},
-    metadata={"task_description": "Flight booked"}
-)
-
-# Expansion from base cases (preferred)
-chaos_cases = ChaosCase.expand(
-    cases=[Case(name="test", input="Book a flight")],
-    effect_maps={"timeout": {"tool_effects": {"book_flight": Timeout()}}},
-    include_baseline=True
-)
-```
-
-## ChaosPlugin
-
-`ChaosPlugin` hooks into Strands' event system. Add it to your agent's plugins list:
-
-```python
-from strands import Agent
-from strands_evals.chaos import ChaosPlugin
-
-agent = Agent(
-    system_prompt="You are a helpful assistant.",
-    plugins=[ChaosPlugin()],
-    callback_handler=None
-)
-```
-
-The plugin reads the active `ChaosCase` from a `ContextVar` (managed by `ChaosExperiment`) and applies effects only to tools listed in the case's `effects["tool_effects"]` dict. Tools not listed execute normally.
-
-## ChaosExperiment
-
-`ChaosExperiment` composes the base `Experiment` class and manages the `ContextVar` lifecycle:
-
-```python
-from strands_evals.chaos import ChaosExperiment
-
-experiment = ChaosExperiment(
-    cases=chaos_cases,
-    evaluators=evaluators
-)
-
-# Sync execution
-reports = experiment.run_evaluations(task_function)
-
-# Async execution
-reports = await experiment.run_evaluations_async(async_task_function)
-```
-
-## Resilience Evaluators
-
-### FailureCommunicationEvaluator
-
-Scores how well the agent communicates failures to the user across four dimensions: clarity, actionability, transparency, and tone.
-
-```python
-from strands_evals.chaos.evaluators import FailureCommunicationEvaluator
-
-evaluator = FailureCommunicationEvaluator()
-```
-
-**Scoring criteria:**
-- Does the agent acknowledge the failure clearly?
-- Does it suggest actionable next steps?
-- Is it transparent about what went wrong (without exposing internals)?
-- Is the tone appropriate (not dismissive, not alarming)?
-
-### PartialCompletionEvaluator
-
-Scores what percentage of the user's goal was achieved despite failures, returning a continuous 0.0–1.0 score:
-
-```python
-from strands_evals.chaos.evaluators import PartialCompletionEvaluator
-
-evaluator = PartialCompletionEvaluator()
-```
-
-**Example scores:**
-- `1.0` — Full goal achieved despite failures
-- `0.7` — Most sub-goals completed, one blocked by failure
-- `0.0` — Agent gave up entirely or crashed
-
-### RecoveryStrategyEvaluator
-
-Scores the quality of the agent's recovery actions when tools fail:
-
-```python
-from strands_evals.chaos.evaluators import RecoveryStrategyEvaluator
-
-evaluator = RecoveryStrategyEvaluator()
-```
-
-**Scoring criteria:**
-- Exploration breadth — Did the agent try alternative tools or approaches?
-- Retry discipline — Did it retry appropriately (not excessively)?
-- Approach variation — Did retries use different strategies?
-
-## Complete Example: Multi-Tool Chaos with Resilience Evaluation
-
-```python
-from typing import Any
-from pydantic import BaseModel, Field
-from strands import Agent
-from strands_evals import Case
-from strands_evals.chaos import ChaosCase, ChaosExperiment, ChaosPlugin
-from strands_evals.chaos.effects import Timeout, NetworkError, CorruptValues
-from strands_evals.chaos.evaluators import (
-    FailureCommunicationEvaluator,
-    PartialCompletionEvaluator,
-    RecoveryStrategyEvaluator,
-)
-from strands_evals.evaluators import GoalSuccessRateEvaluator
-from strands_evals.simulation.tool_simulator import ToolSimulator
-
-# Setup tool simulator for reproducible responses
-tool_simulator = ToolSimulator()
-
-class FlightResult(BaseModel):
-    airline: str = Field(..., description="Airline name")
-    price: float = Field(..., description="Price in USD")
-    departure: str = Field(..., description="Departure time")
-
-class BookingConfirmation(BaseModel):
-    confirmation_id: str = Field(..., description="Booking confirmation ID")
-    status: str = Field(..., description="Booking status")
-
-@tool_simulator.tool(
-    share_state_id="travel",
-    initial_state_description="Available flights: AA101 $450 8am, UA202 $380 2pm, DL303 $520 6pm",
-    output_schema=FlightResult,
-)
-def search_flights(destination: str, date: str) -> dict[str, Any]:
-    """Search for available flights."""
-    pass
-
-@tool_simulator.tool(
-    share_state_id="travel",
-    output_schema=BookingConfirmation,
-)
-def book_flight(flight_id: str, passenger_name: str) -> dict[str, Any]:
-    """Book a specific flight."""
-    pass
-
-# Define effect maps
-effect_maps = {
-    "search_timeout": {
-        "tool_effects": {"search_flights": Timeout()}
-    },
-    "booking_network_error": {
-        "tool_effects": {"book_flight": NetworkError()}
-    },
-    "corrupted_search": {
-        "tool_effects": {"search_flights": CorruptValues(corrupt_ratio=0.8)}
-    },
-    "total_chaos": {
-        "tool_effects": {
-            "search_flights": Timeout(),
-            "book_flight": NetworkError(),
-        }
-    },
-}
-
-# Define base cases
-base_cases = [
-    Case(
-        name="book-cheapest",
-        input="Find the cheapest flight to Paris next Tuesday and book it for John Smith",
-        metadata={"task_description": "Flight searched, cheapest option identified, booking confirmed"}
-    ),
-    Case(
-        name="book-morning",
-        input="I need a morning flight to Tokyo on Friday",
-        metadata={"task_description": "Morning flight found and booked"}
-    ),
-]
-
-# Expand into chaos cases
-chaos_cases = ChaosCase.expand(base_cases, effect_maps, include_baseline=True)
-
-# Task function — no chaos concepts here
-def task_function(case: Case) -> dict:
-    search_tool = tool_simulator.get_tool("search_flights")
-    booking_tool = tool_simulator.get_tool("book_flight")
-
-    agent = Agent(
-        system_prompt="You are a travel booking assistant. Help users find and book flights.",
-        tools=[search_tool, booking_tool],
-        plugins=[ChaosPlugin()],
-        callback_handler=None,
-    )
-    response = agent(case.input)
-    return {"output": str(response)}
-
-# Run with all evaluators
-evaluators = [
-    GoalSuccessRateEvaluator(),
-    FailureCommunicationEvaluator(),
-    PartialCompletionEvaluator(),
-    RecoveryStrategyEvaluator(),
-]
-
-experiment = ChaosExperiment(cases=chaos_cases, evaluators=evaluators)
-reports = experiment.run_evaluations(task_function)
-
-# Display results
-for report in reports:
-    print(f"\n{'='*60}")
-    print(f"Evaluator: {report.evaluator_name}")
-    print(f"{'='*60}")
-    report.run_display()
-```
-
-## Advanced Patterns
-
-### Pattern 1: Comparing Agent Configurations Under Chaos
-
-```python
-def compare_agents_under_chaos(chaos_cases, configs):
-    """Compare how different agent configs handle the same failures."""
-    results = {}
-
-    for config_name, system_prompt in configs.items():
-        def make_task(prompt):
-            def task_function(case: Case) -> dict:
-                agent = Agent(
-                    system_prompt=prompt,
-                    plugins=[ChaosPlugin()],
-                    callback_handler=None,
-                )
-                response = agent(case.input)
-                return {"output": str(response)}
-            return task_function
-
-        experiment = ChaosExperiment(
-            cases=chaos_cases,
-            evaluators=[PartialCompletionEvaluator()]
-        )
-        reports = experiment.run_evaluations(make_task(system_prompt))
-        results[config_name] = reports
-
-    return results
-```
-
-### Pattern 2: Progressive Failure Escalation
-
-```python
-# Test increasing severity
-effect_maps = {
-    "mild": {
-        "tool_effects": {"search": CorruptValues(corrupt_ratio=0.2)}
-    },
-    "moderate": {
-        "tool_effects": {"search": CorruptValues(corrupt_ratio=0.5)}
-    },
-    "severe": {
-        "tool_effects": {"search": CorruptValues(corrupt_ratio=0.9)}
-    },
-    "total_failure": {
-        "tool_effects": {"search": Timeout()}
-    },
-}
-```
-
-### Pattern 3: Chaos with User Simulation
-
-Combine chaos testing with user simulation for multi-turn resilience evaluation:
-
-```python
-from strands_evals import ActorSimulator
-
-def task_function(case: Case) -> dict:
-    user_sim = ActorSimulator.from_case_for_user_simulator(
-        case=case, max_turns=8
-    )
-
-    agent = Agent(
-        system_prompt="You are a helpful assistant.",
-        plugins=[ChaosPlugin()],
-        callback_handler=None,
-    )
-
-    user_message = case.input
-    while user_sim.has_next():
-        agent_response = agent(user_message)
-        user_result = user_sim.act(str(agent_response))
-        user_message = str(user_result.structured_output.message)
-
-    return {"output": str(agent_response)}
-```
-
-## Troubleshooting
-
-### Issue: Effects Not Being Applied
-
-Ensure `ChaosPlugin()` is in the agent's plugins list and you're using `ChaosExperiment` (not base `Experiment`):
-
-```python
-# Correct
-agent = Agent(plugins=[ChaosPlugin()], ...)
-experiment = ChaosExperiment(cases=chaos_cases, ...)
-
-# Wrong — base Experiment doesn't set the ContextVar
-experiment = Experiment(cases=chaos_cases, ...)
-```
-
-### Issue: All Tools Failing
-
-Check that your effect map keys match the exact tool function names:
-
-```python
-# If your tool is defined as:
-def search_flights(...): ...
-
-# The effect map key must be "search_flights", not "searchFlights" or "search"
-effect_maps = {"test": {"tool_effects": {"search_flights": Timeout()}}}
-```
-
-### Issue: Async Task Errors
-
-`ChaosExperiment` supports both sync and async tasks. Use the appropriate method:
-
-```python
-# Sync
-reports = experiment.run_evaluations(sync_task_function)
-
-# Async
-reports = await experiment.run_evaluations_async(async_task_function)
-```
-
-## Related Documentation
-
-- [Chaos Testing Overview](/docs/user-guide/evals-sdk/chaos_testing/): Overview and quick start
-- [Tool Simulation](/docs/user-guide/evals-sdk/simulators/tool_simulation/): Simulate tool behavior
-- [Goal Success Rate Evaluator](/docs/user-guide/evals-sdk/evaluators/goal_success_rate_evaluator/): Assess goal completion
-- [Simulators Overview](/docs/user-guide/evals-sdk/simulators/): Simulator framework
diff --git a/site/src/content/docs/user-guide/evals-sdk/chaos_testing/index.mdx b/site/src/content/docs/user-guide/evals-sdk/chaos_testing/index.mdx
deleted file mode 100644
index 4dd523b09..000000000
--- a/site/src/content/docs/user-guide/evals-sdk/chaos_testing/index.mdx
+++ /dev/null
@@ -1,286 +0,0 @@
----
-title: Chaos Testing
-tags: [error-handling, simulation]
-sidebar:
-  label: "Overview"
----
-
-## Overview
-
-Chaos testing systematically evaluates agent resilience by injecting controlled failures into tool execution. Using the `ChaosPlugin`, `ChaosCase`, and `ChaosExperiment` classes, you can test how agents handle tool timeouts, network errors, and corrupted responses — without modifying agent code.
-
-This enables you to answer questions like:
-- Does the agent gracefully communicate failures to users?
-- Can the agent achieve partial goals when some tools fail?
-- Does the agent employ effective recovery strategies?
-
-## Why Chaos Testing?
-
-Traditional evaluation tests agents under ideal conditions. In production, tools fail unpredictably:
-
-**Standard Evaluation:**
-- Tools always return correct responses
-- No network failures or timeouts
-- Cannot reveal fragile error handling
-- Misses degraded-mode behavior
-
-**Chaos Testing:**
-- Injects realistic tool failures (timeouts, network errors, validation errors)
-- Corrupts tool responses (truncated fields, removed data, corrupted values)
-- Tests agent resilience without live infrastructure failures
-- Measures graceful degradation and recovery behavior
-- Quantifies partial goal completion under failure
-
-## When to Use Chaos Testing
-
-Use chaos testing when you need to:
-- **Evaluate Resilience**: Test how agents handle tool failures gracefully
-- **Assess Recovery**: Verify agents try alternative approaches when tools fail
-- **Measure Degradation**: Quantify how much of a goal agents achieve despite failures
-- **Test Communication**: Ensure agents inform users clearly about failures
-- **Validate Robustness**: Confirm agents don't crash or loop on corrupted data
-
-## Architecture
-
-Chaos testing integrates with Strands' plugin system via `BeforeToolCallEvent` and `AfterToolCallEvent` hooks:
-
-1. **ChaosCase** — Extends `Case` with an `effects` field mapping tool names to failure effects
-2. **ChaosPlugin** — A Strands plugin that intercepts tool calls and applies effects transparently
-3. **ChaosExperiment** — Composes the base `Experiment` to manage chaos context per case
-4. **ChaosEffect** — A hierarchy of pre-hook effects (cancel tool calls) and post-hook effects (corrupt responses)
-
-## Effect Types
-
-### Pre-hook Effects (Tool Call Failures)
-
-These effects cancel the tool call entirely and return an error:
-
-| Effect | Description |
-| :------- | :------------ |
-| `Timeout` | Simulates a tool execution timeout |
-| `NetworkError` | Simulates a network connectivity failure |
-| `ExecutionError` | Simulates a runtime error during tool execution |
-| `ValidationError` | Simulates invalid input/output validation failure |
-
-### Post-hook Effects (Response Corruption)
-
-These effects let the tool execute but corrupt the response:
-
-| Effect | Description | Parameters |
-| :------- | :------------ | :----------- |
-| `TruncateFields` | Truncates string fields in the response | `max_length` |
-| `RemoveFields` | Randomly removes fields from the response | `remove_ratio` |
-| `CorruptValues` | Corrupts field values with garbage data | `corrupt_ratio` |
-
-## Basic Usage
-
-```python
-from strands import Agent
-from strands_evals import Case
-from strands_evals.chaos import ChaosCase, ChaosExperiment, ChaosPlugin
-from strands_evals.chaos.effects import Timeout, NetworkError
-from strands_evals.evaluators import GoalSuccessRateEvaluator
-
-# Define base test cases
-base_cases = [
-    Case(
-        name="weather-lookup",
-        input="What's the weather in Seattle?",
-        metadata={"task_description": "Weather information provided"}
-    )
-]
-
-# Define named effect maps
-effect_maps = {
-    "search_timeout": {
-        "tool_effects": {"get_weather": Timeout()}
-    },
-    "network_failure": {
-        "tool_effects": {"get_weather": NetworkError()}
-    },
-}
-
-# Expand cases into Cartesian product (base cases × effect maps + baseline)
-chaos_cases = ChaosCase.expand(base_cases, effect_maps, include_baseline=True)
-
-# Run chaos experiment
-experiment = ChaosExperiment(
-    cases=chaos_cases,
-    evaluators=[GoalSuccessRateEvaluator()]
-)
-
-def task_function(case: Case) -> dict:
-    agent = Agent(
-        system_prompt="You are a helpful weather assistant.",
-        plugins=[ChaosPlugin()],
-        callback_handler=None
-    )
-    response = agent(case.input)
-    return {"output": str(response)}
-
-reports = experiment.run_evaluations(task_function)
-```
-
-## ChaosCase.expand()
-
-The `expand()` class method generates the Cartesian product of base cases and effect maps, optionally including a baseline (no effects) for comparison:
-
-```python
-chaos_cases = ChaosCase.expand(
-    cases=base_cases,        # List of base Case objects
-    effect_maps=effect_maps, # Dict of named effect configurations
-    include_baseline=True    # Include cases with no effects for comparison
-)
-```
-
-For 2 base cases and 3 effect maps with `include_baseline=True`, this produces `2 × (3 + 1) = 8` chaos cases.
-
-## Integration with ToolSimulator
-
-Chaos testing works naturally with `ToolSimulator` for fully controlled evaluation — simulated tools provide reproducible responses, and chaos effects inject failures on top:
-
-```python
-from strands import Agent
-from strands_evals import Case
-from strands_evals.chaos import ChaosCase, ChaosExperiment, ChaosPlugin
-from strands_evals.chaos.effects import Timeout, CorruptValues
-from strands_evals.evaluators import GoalSuccessRateEvaluator
-from strands_evals.simulation.tool_simulator import ToolSimulator
-from pydantic import BaseModel, Field
-
-tool_simulator = ToolSimulator()
-
-class SearchResult(BaseModel):
-    title: str = Field(..., description="Result title")
-    snippet: str = Field(..., description="Result snippet")
-
-@tool_simulator.tool(output_schema=SearchResult)
-def web_search(query: str) -> dict:
-    """Search the web for information."""
-    pass
-
-# Define effect maps
-effect_maps = {
-    "search_timeout": {
-        "tool_effects": {"web_search": Timeout()}
-    },
-    "corrupted_results": {
-        "tool_effects": {"web_search": CorruptValues(corrupt_ratio=0.5)}
-    },
-}
-
-base_cases = [
-    Case(name="research", input="Find recent news about AI agents")
-]
-
-chaos_cases = ChaosCase.expand(base_cases, effect_maps, include_baseline=True)
-
-def task_function(case: Case) -> dict:
-    search_tool = tool_simulator.get_tool("web_search")
-    agent = Agent(
-        tools=[search_tool],
-        plugins=[ChaosPlugin()],
-        callback_handler=None
-    )
-    response = agent(case.input)
-    return {"output": str(response)}
-
-experiment = ChaosExperiment(
-    cases=chaos_cases,
-    evaluators=[GoalSuccessRateEvaluator()]
-)
-reports = experiment.run_evaluations(task_function)
-```
-
-## Resilience Evaluators
-
-Chaos testing ships with three specialized evaluators designed to assess agent behavior under failure:
-
-| Evaluator | What It Measures |
-| :---------- | :---------------- |
-| `FailureCommunicationEvaluator` | Clarity, actionability, transparency, and tone of failure messages |
-| `PartialCompletionEvaluator` | Percentage of user goal achieved despite failures (0.0–1.0) |
-| `RecoveryStrategyEvaluator` | Quality of recovery actions: exploration breadth, retry discipline, approach variation |
-
-```python
-from strands_evals.chaos.evaluators import (
-    FailureCommunicationEvaluator,
-    PartialCompletionEvaluator,
-    RecoveryStrategyEvaluator,
-)
-
-evaluators = [
-    GoalSuccessRateEvaluator(),
-    FailureCommunicationEvaluator(),
-    PartialCompletionEvaluator(),
-    RecoveryStrategyEvaluator(),
-]
-
-experiment = ChaosExperiment(cases=chaos_cases, evaluators=evaluators)
-```
-
-[Complete Chaos Testing Guide →](/docs/user-guide/evals-sdk/chaos_testing/chaos_testing/)
-
-## Chaos Testing vs Simulators
-
-| Aspect | Simulators | Chaos Testing |
-| :------- | :---------- | :-------------- |
-| **Role** | Replace tool execution entirely | Inject failures into tool execution |
-| **Scope** | All tool calls are simulated | Only targeted tools are affected |
-| **Use Case** | Test without infrastructure | Test resilience under failure |
-| **Combination** | Can be used together | Chaos effects apply on top of simulated tools |
-
-## Best Practices
-
-### 1. Start with Baseline Comparisons
-
-Always include `include_baseline=True` to compare agent performance with and without failures:
-
-```python
-chaos_cases = ChaosCase.expand(cases, effect_maps, include_baseline=True)
-```
-
-### 2. Test One Failure at a Time First
-
-Start with single-tool failures before testing compound chaos:
-
-```python
-# Single failure
-effect_maps = {
-    "search_fails": {"tool_effects": {"search": Timeout()}},
-}
-
-# Compound (test after single failures are understood)
-effect_maps = {
-    "total_chaos": {
-        "tool_effects": {
-            "search": Timeout(),
-            "database": NetworkError(),
-        }
-    },
-}
-```
-
-### 3. Use Resilience Evaluators Together
-
-Combine all three resilience evaluators for a complete picture:
-
-```python
-evaluators = [
-    FailureCommunicationEvaluator(),  # Did the agent tell the user?
-    PartialCompletionEvaluator(),     # How much was achieved?
-    RecoveryStrategyEvaluator(),      # Did it try alternatives?
-]
-```
-
-## Next Steps
-
-- [Chaos Testing Guide](/docs/user-guide/evals-sdk/chaos_testing/chaos_testing/): Complete guide with advanced patterns
-- [Tool Simulation](/docs/user-guide/evals-sdk/simulators/tool_simulation/): Simulate tool behavior
-- [Goal Success Rate Evaluator](/docs/user-guide/evals-sdk/evaluators/goal_success_rate_evaluator/): Assess goal completion
-
-## Related Documentation
-
-- [Simulators Overview](/docs/user-guide/evals-sdk/simulators/): Simulator framework
-- [Evaluators](/docs/user-guide/evals-sdk/evaluators/): All available evaluators
-- [Quickstart Guide](/docs/user-guide/evals-sdk/quickstart/): Get started with Strands Evals