From 74a363791405bcd7279ca8f9ebf50021f8446236 Mon Sep 17 00:00:00 2001 From: Darren Wang Date: Wed, 6 May 2026 17:16:32 +0000 Subject: [PATCH 1/6] add chaos testing example script --- .../chaos_testing_with_simulated_tools.py | 175 ++++++++++++++++++ 1 file changed, 175 insertions(+) create mode 100644 site/docs/examples/evals-sdk/chaos_testing_with_simulated_tools.py diff --git a/site/docs/examples/evals-sdk/chaos_testing_with_simulated_tools.py b/site/docs/examples/evals-sdk/chaos_testing_with_simulated_tools.py new file mode 100644 index 000000000..8ce4d584c --- /dev/null +++ b/site/docs/examples/evals-sdk/chaos_testing_with_simulated_tools.py @@ -0,0 +1,175 @@ +import logging +from typing import Any + +from pydantic import BaseModel, Field + +from strands import Agent +from strands_evals import Case +from strands_evals.chaos import ( + ChaosExperiment, + ChaosPlugin, + ChaosScenario, + CorruptValues, + RemoveFields, + ToolCallFailure, + TruncateFields, +) +from strands_evals.evaluators import GoalSuccessRateEvaluator +from strands_evals.mappers import StrandsInMemorySessionMapper +from strands_evals.simulation.tool_simulator import ToolSimulator +from strands_evals.telemetry import StrandsEvalsTelemetry + +logging.basicConfig(level=logging.INFO, format="%(message)s") +logger = logging.getLogger(__name__) + +# Setup telemetry +telemetry = StrandsEvalsTelemetry().setup_in_memory_exporter() +memory_exporter = telemetry.in_memory_exporter + +# 1. 
Set up ToolSimulator and register tools +tool_simulator = ToolSimulator() + +class FlightSearchResponse(BaseModel): + """Response from the flight search tool.""" + + flights: list[dict[str, Any]] = Field(default_factory=list, description="List of available flights") + total_results: int = Field(default=0, description="Total number of results found") + status: str = Field(default="success", description="Operation status") + +class BookFlightResponse(BaseModel): + """Response from the flight booking tool.""" + + booking_id: str = Field(default="", description="Booking confirmation ID") + flight_id: str = Field(default="", description="The booked flight ID") + status: str = Field(default="success", description="Booking status") + message: str = Field(default="", description="Status message") + +class BookingConfirmationResponse(BaseModel): + """Response from the booking confirmation tool.""" + + confirmation_sent: bool = Field(default=False, description="Whether confirmation was sent") + method: str = Field(default="email", description="Delivery method") + message: str = Field(default="", description="Confirmation details") + +@tool_simulator.tool(output_schema=FlightSearchResponse) +def search_flights(origin: str, destination: str, date: str) -> dict[str, Any]: + """Search for available flights between two cities on a given date.""" + pass + +@tool_simulator.tool(output_schema=BookFlightResponse) +def book_flight(flight_id: str) -> dict[str, Any]: + """Book a specific flight by its flight ID. Returns booking confirmation.""" + pass + +@tool_simulator.tool(output_schema=BookingConfirmationResponse) +def send_booking_confirmation(booking_id: str = "", flight_id: str = "", method: str = "email") -> dict[str, Any]: + """Send booking confirmation or fallback link to the user via email or SMS.""" + pass + +# 2. Create the ChaosPlugin +chaos_plugin = ChaosPlugin() + +# 3. 
Define chaos scenarios +scenarios = [ + # Single-tool, pre-hook: tool call is cancelled before execution + ChaosScenario( + name="search_timeout", + description="Search tool times out — agent must handle a hard failure", + effects={"search_flights": [ToolCallFailure(error_type="timeout")]}, + ), + # Two-tool, post-hook: tools execute but responses are silently corrupted + ChaosScenario( + name="book_corrupt_and_confirm_truncated", + description="Booking returns garbage data while confirmation is truncated", + effects={ + "book_flight": [CorruptValues(corrupt_ratio=0.8)], + "send_booking_confirmation": [TruncateFields(max_length=5)], + }, + ), + # All-tool, mixed pre+post: combines hard failures with silent corruption + ChaosScenario( + name="total_chaos", + description="Search network error (pre), book execution error (pre), confirm fields removed (post)", + effects={ + "search_flights": [ToolCallFailure(error_type="network_error")], + "book_flight": [ToolCallFailure(error_type="execution_error")], + "send_booking_confirmation": [RemoveFields(remove_ratio=0.7)], + }, + ), +] + +# 4. Define the task function +# Pre-create tool instances once (avoids registry issues across runs) +_search_tool = tool_simulator.get_tool("search_flights") +_book_tool = tool_simulator.get_tool("book_flight") +_confirm_tool = tool_simulator.get_tool("send_booking_confirmation") + +def travel_agent_task(case: Case) -> dict: + """Run the travel agent with a single user query.""" + logger.info(f"\n{'─'*60}") + logger.info(f" Case: {case.name}") + logger.info(f" User: {case.input}") + + agent = Agent( + system_prompt=( + "You are a travel booking assistant. You help users search for flights, " + "book them, and send confirmations. Use the available tools to complete " + "the user's request. 
Today's date is May 18, 2025.\n\n" + "Always use the tools directly — do not ask the user for clarification " + "if you can infer reasonable values from context.\n\n" + "If a tool fails or returns an error:\n" + "- Acknowledge the failure honestly to the user\n" + "- Try an alternative approach if possible\n" + "- Do NOT hallucinate successful results\n" + "- Do NOT retry more than once\n\n" + "If tool results look suspicious (e.g., $0 fares, past dates):\n" + "- Inform the user that results seem unreliable\n" + "- Suggest alternatives" + ), + tools=[_search_tool, _book_tool, _confirm_tool], + plugins=[chaos_plugin], + callback_handler=None, + trace_attributes={"gen_ai.conversation.id": case.session_id, "session.id": case.session_id}, + ) + + memory_exporter.clear() + try: + result = agent(case.input) + output = str(result) + except Exception as e: + output = f"Agent failed with error: {type(e).__name__}: {str(e)[:200]}" + + logger.info(f" Agent: {output[:300]}{'...' if len(output) > 300 else ''}") + logger.info(f"{'─'*60}") + + finished_spans = memory_exporter.get_finished_spans() + mapper = StrandsInMemorySessionMapper() + session = mapper.map_to_session(finished_spans, session_id=case.session_id) + + return {"output": output, "trajectory": session} + +# 5. Define test cases +test_cases = [ + Case( + name="book_a_flight", + input="Find me a flight from SFO to JFK on May 20, book the cheapest one, and send me a confirmation.", + ), + Case( + name="search_and_confirm", + input="Search for flights from Seattle to Tokyo next Tuesday, book one, and email me the confirmation.", + ), +] + +# 6. 
Create and run the ChaosExperiment +evaluators = [GoalSuccessRateEvaluator()] + +experiment = ChaosExperiment( + cases=test_cases, + scenarios=scenarios, + evaluators=evaluators, + include_baseline=True, +) + +# Run: (1 baseline + 3 scenarios) × 2 cases = 8 runs +reports = experiment.run_evaluations(task=travel_agent_task) +reports[0].run_display() From f5033bf9a37e09dbbaec5c3c01fd1360187dd111 Mon Sep 17 00:00:00 2001 From: Darren Wang Date: Fri, 15 May 2026 22:00:45 +0000 Subject: [PATCH 2/6] replace chaos scenario with chaos case --- .../chaos_testing_with_simulated_tools.py | 55 ++++++++----------- 1 file changed, 23 insertions(+), 32 deletions(-) diff --git a/site/docs/examples/evals-sdk/chaos_testing_with_simulated_tools.py b/site/docs/examples/evals-sdk/chaos_testing_with_simulated_tools.py index 8ce4d584c..fe5ababea 100644 --- a/site/docs/examples/evals-sdk/chaos_testing_with_simulated_tools.py +++ b/site/docs/examples/evals-sdk/chaos_testing_with_simulated_tools.py @@ -6,9 +6,9 @@ from strands import Agent from strands_evals import Case from strands_evals.chaos import ( + ChaosCase, ChaosExperiment, ChaosPlugin, - ChaosScenario, CorruptValues, RemoveFields, ToolCallFailure, @@ -69,34 +69,24 @@ def send_booking_confirmation(booking_id: str = "", flight_id: str = "", method: # 2. Create the ChaosPlugin chaos_plugin = ChaosPlugin() -# 3. Define chaos scenarios -scenarios = [ +# 3. 
Define named effect maps +effect_maps = { # Single-tool, pre-hook: tool call is cancelled before execution - ChaosScenario( - name="search_timeout", - description="Search tool times out — agent must handle a hard failure", - effects={"search_flights": [ToolCallFailure(error_type="timeout")]}, - ), + "search_timeout": { + "search_flights": [ToolCallFailure(error_type="timeout")], + }, # Two-tool, post-hook: tools execute but responses are silently corrupted - ChaosScenario( - name="book_corrupt_and_confirm_truncated", - description="Booking returns garbage data while confirmation is truncated", - effects={ - "book_flight": [CorruptValues(corrupt_ratio=0.8)], - "send_booking_confirmation": [TruncateFields(max_length=5)], - }, - ), + "book_corrupt_and_confirm_truncated": { + "book_flight": [CorruptValues(corrupt_ratio=0.8)], + "send_booking_confirmation": [TruncateFields(max_length=5)], + }, # All-tool, mixed pre+post: combines hard failures with silent corruption - ChaosScenario( - name="total_chaos", - description="Search network error (pre), book execution error (pre), confirm fields removed (post)", - effects={ - "search_flights": [ToolCallFailure(error_type="network_error")], - "book_flight": [ToolCallFailure(error_type="execution_error")], - "send_booking_confirmation": [RemoveFields(remove_ratio=0.7)], - }, - ), -] + "total_chaos": { + "search_flights": [ToolCallFailure(error_type="network_error")], + "book_flight": [ToolCallFailure(error_type="execution_error")], + "send_booking_confirmation": [RemoveFields(remove_ratio=0.7)], + }, +} # 4. 
Define the task function # Pre-create tool instances once (avoids registry issues across runs) @@ -104,7 +94,7 @@ def send_booking_confirmation(booking_id: str = "", flight_id: str = "", method: _book_tool = tool_simulator.get_tool("book_flight") _confirm_tool = tool_simulator.get_tool("send_booking_confirmation") -def travel_agent_task(case: Case) -> dict: +def travel_agent_task(case: ChaosCase) -> dict: """Run the travel agent with a single user query.""" logger.info(f"\n{'─'*60}") logger.info(f" Case: {case.name}") @@ -148,7 +138,7 @@ def travel_agent_task(case: Case) -> dict: return {"output": output, "trajectory": session} -# 5. Define test cases +# 5. Define test cases and expand with effect maps test_cases = [ Case( name="book_a_flight", @@ -160,16 +150,17 @@ def travel_agent_task(case: Case) -> dict: ), ] +# Expand: 2 cases × (3 effect maps + 1 baseline) = 8 ChaosCase objects +chaos_cases = ChaosCase.expand(test_cases, effect_maps, include_no_effect_baseline=True) + # 6. Create and run the ChaosExperiment evaluators = [GoalSuccessRateEvaluator()] experiment = ChaosExperiment( - cases=test_cases, - scenarios=scenarios, + cases=chaos_cases, evaluators=evaluators, - include_baseline=True, ) -# Run: (1 baseline + 3 scenarios) × 2 cases = 8 runs +# Run: 8 chaos cases = 8 agent invocations reports = experiment.run_evaluations(task=travel_agent_task) reports[0].run_display() From e9877bc46cfda592406b1df97fbd3b66c7ffb47c Mon Sep 17 00:00:00 2001 From: Darren Wang Date: Thu, 21 May 2026 23:45:19 +0000 Subject: [PATCH 3/6] update chaos effect map format; rename script --- ...th_simulated_tools.py => chaos_testing.py} | 20 ++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) rename site/docs/examples/evals-sdk/{chaos_testing_with_simulated_tools.py => chaos_testing.py} (92%) diff --git a/site/docs/examples/evals-sdk/chaos_testing_with_simulated_tools.py b/site/docs/examples/evals-sdk/chaos_testing.py similarity index 92% rename from 
site/docs/examples/evals-sdk/chaos_testing_with_simulated_tools.py rename to site/docs/examples/evals-sdk/chaos_testing.py index fe5ababea..b3e167789 100644 --- a/site/docs/examples/evals-sdk/chaos_testing_with_simulated_tools.py +++ b/site/docs/examples/evals-sdk/chaos_testing.py @@ -10,10 +10,12 @@ ChaosExperiment, ChaosPlugin, CorruptValues, + NetworkError, RemoveFields, - ToolCallFailure, + Timeout, TruncateFields, ) +from strands_evals.chaos.effects import ExecutionError from strands_evals.evaluators import GoalSuccessRateEvaluator from strands_evals.mappers import StrandsInMemorySessionMapper from strands_evals.simulation.tool_simulator import ToolSimulator @@ -73,18 +75,22 @@ def send_booking_confirmation(booking_id: str = "", flight_id: str = "", method: effect_maps = { # Single-tool, pre-hook: tool call is cancelled before execution "search_timeout": { - "search_flights": [ToolCallFailure(error_type="timeout")], + "tool_effects": {"search_flights": [Timeout()]}, }, # Two-tool, post-hook: tools execute but responses are silently corrupted "book_corrupt_and_confirm_truncated": { - "book_flight": [CorruptValues(corrupt_ratio=0.8)], - "send_booking_confirmation": [TruncateFields(max_length=5)], + "tool_effects": { + "book_flight": [CorruptValues(corrupt_ratio=0.8)], + "send_booking_confirmation": [TruncateFields(max_length=5)], + }, }, # All-tool, mixed pre+post: combines hard failures with silent corruption "total_chaos": { - "search_flights": [ToolCallFailure(error_type="network_error")], - "book_flight": [ToolCallFailure(error_type="execution_error")], - "send_booking_confirmation": [RemoveFields(remove_ratio=0.7)], + "tool_effects": { + "search_flights": [NetworkError()], + "book_flight": [ExecutionError()], + "send_booking_confirmation": [RemoveFields(remove_ratio=0.7)], + }, }, } From 7798e6eacffd79f8db60ab37221a013215bba55a Mon Sep 17 00:00:00 2001 From: Darren Wang Date: Fri, 29 May 2026 17:01:44 +0000 Subject: [PATCH 4/6] add resilience evaluator 
examples --- .../evals-sdk/chaos_failure_communication.py | 120 +++++++++++++++ .../evals-sdk/chaos_partial_completion.py | 137 ++++++++++++++++++ .../evals-sdk/chaos_recovery_strategy.py | 133 +++++++++++++++++ 3 files changed, 390 insertions(+) create mode 100644 site/docs/examples/evals-sdk/chaos_failure_communication.py create mode 100644 site/docs/examples/evals-sdk/chaos_partial_completion.py create mode 100644 site/docs/examples/evals-sdk/chaos_recovery_strategy.py diff --git a/site/docs/examples/evals-sdk/chaos_failure_communication.py b/site/docs/examples/evals-sdk/chaos_failure_communication.py new file mode 100644 index 000000000..3808fc8c8 --- /dev/null +++ b/site/docs/examples/evals-sdk/chaos_failure_communication.py @@ -0,0 +1,120 @@ +import logging +from typing import Any + +from pydantic import BaseModel, Field + +from strands import Agent +from strands_evals.chaos import ChaosCase, ChaosExperiment, ChaosPlugin, Timeout +from strands_evals.chaos.effects import NetworkError +from strands_evals.chaos.evaluators import FailureCommunicationEvaluator +from strands_evals.mappers import StrandsInMemorySessionMapper +from strands_evals.simulation.tool_simulator import ToolSimulator +from strands_evals.telemetry import StrandsEvalsTelemetry + +logging.basicConfig(level=logging.INFO, format="%(message)s") +logger = logging.getLogger(__name__) + +telemetry = StrandsEvalsTelemetry().setup_in_memory_exporter() +memory_exporter = telemetry.in_memory_exporter + +tool_simulator = ToolSimulator() + + +class FlightSearchResponse(BaseModel): + flights: list[dict[str, Any]] = Field(default_factory=list) + total_results: int = Field(default=0) + status: str = Field(default="success") + + +class BookFlightResponse(BaseModel): + booking_id: str = Field(default="") + flight_id: str = Field(default="") + status: str = Field(default="success") + message: str = Field(default="") + + +@tool_simulator.tool(output_schema=FlightSearchResponse) +def search_flights(origin: str, 
destination: str, date: str) -> dict[str, Any]: + """Search for available flights between two cities on a given date.""" + pass + + +@tool_simulator.tool(output_schema=BookFlightResponse) +def book_flight(flight_id: str) -> dict[str, Any]: + """Book a specific flight by its flight ID.""" + pass + + +chaos_plugin = ChaosPlugin() + +# Two cases that test communication quality: +# 1. Search times out — agent must inform user about the failure +# 2. Both tools fail — agent must communicate multiple failures clearly +chaos_cases = [ + ChaosCase( + name="search_timeout", + input="Find me a flight from SFO to JFK on May 20 and book the cheapest one.", + effects={"tool_effects": {"search_flights": [Timeout(error_message="Tool call timed out after 30s")]}}, + ), + ChaosCase( + name="all_tools_down", + input="Search for flights from Seattle to Tokyo next Tuesday and book one.", + effects={ + "tool_effects": { + "search_flights": [NetworkError(error_message="DNS resolution failed")], + "book_flight": [NetworkError(error_message="Connection refused")], + }, + }, + ), +] + +_search_tool = tool_simulator.get_tool("search_flights") +_book_tool = tool_simulator.get_tool("book_flight") + + +def travel_agent_task(case: ChaosCase) -> dict: + """Run the travel agent under chaos and return output + trajectory.""" + logger.info(f"\n{'─'*60}") + logger.info(f" Case: {case.name}") + logger.info(f" User: {case.input}") + + agent = Agent( + system_prompt=( + "You are a travel booking assistant. Use the available tools to complete " + "the user's request. 
Today's date is May 18, 2025.\n\n" + "If a tool fails or returns an error:\n" + "- Acknowledge the failure honestly to the user\n" + "- Explain what went wrong in plain language\n" + "- Suggest next steps (retry later, try alternative)\n" + "- Do NOT hallucinate successful results" + ), + tools=[_search_tool, _book_tool], + plugins=[chaos_plugin], + callback_handler=None, + trace_attributes={"gen_ai.conversation.id": case.session_id, "session.id": case.session_id}, + ) + + memory_exporter.clear() + try: + result = agent(case.input) + output = str(result) + except Exception as e: + output = f"Agent failed with error: {type(e).__name__}: {str(e)[:200]}" + + logger.info(f" Agent: {output[:300]}{'...' if len(output) > 300 else ''}") + logger.info(f"{'─'*60}") + + finished_spans = memory_exporter.get_finished_spans() + mapper = StrandsInMemorySessionMapper() + session = mapper.map_to_session(finished_spans, session_id=case.session_id) + + return {"output": output, "trajectory": session} + + +experiment = ChaosExperiment( + cases=chaos_cases, + evaluators=[FailureCommunicationEvaluator()], +) + +reports = experiment.run_evaluations(task=travel_agent_task) +reports[0].run_display() diff --git a/site/docs/examples/evals-sdk/chaos_partial_completion.py b/site/docs/examples/evals-sdk/chaos_partial_completion.py new file mode 100644 index 000000000..d77f46df2 --- /dev/null +++ b/site/docs/examples/evals-sdk/chaos_partial_completion.py @@ -0,0 +1,137 @@ +import logging +from typing import Any + +from pydantic import BaseModel, Field + +from strands import Agent +from strands_evals.chaos import ChaosCase, ChaosExperiment, ChaosPlugin, TruncateFields +from strands_evals.chaos.effects import NetworkError +from strands_evals.chaos.evaluators import PartialCompletionEvaluator +from strands_evals.mappers import StrandsInMemorySessionMapper +from strands_evals.simulation.tool_simulator import ToolSimulator +from strands_evals.telemetry import StrandsEvalsTelemetry + 
+logging.basicConfig(level=logging.INFO, format="%(message)s") +logger = logging.getLogger(__name__) + +telemetry = StrandsEvalsTelemetry().setup_in_memory_exporter() +memory_exporter = telemetry.in_memory_exporter + +tool_simulator = ToolSimulator() + + +class FlightSearchResponse(BaseModel): + flights: list[dict[str, Any]] = Field(default_factory=list) + total_results: int = Field(default=0) + status: str = Field(default="success") + + +class BookFlightResponse(BaseModel): + booking_id: str = Field(default="") + flight_id: str = Field(default="") + status: str = Field(default="success") + message: str = Field(default="") + + +class BookingConfirmationResponse(BaseModel): + confirmation_sent: bool = Field(default=False) + method: str = Field(default="email") + message: str = Field(default="") + + +@tool_simulator.tool(output_schema=FlightSearchResponse) +def search_flights(origin: str, destination: str, date: str) -> dict[str, Any]: + """Search for available flights between two cities on a given date.""" + pass + + +@tool_simulator.tool(output_schema=BookFlightResponse) +def book_flight(flight_id: str) -> dict[str, Any]: + """Book a specific flight by its flight ID.""" + pass + + +@tool_simulator.tool(output_schema=BookingConfirmationResponse) +def send_booking_confirmation(booking_id: str = "", flight_id: str = "", method: str = "email") -> dict[str, Any]: + """Send booking confirmation to the user via email or SMS.""" + pass + + +chaos_plugin = ChaosPlugin() + +# Two cases that test partial completion: +# 1. Search works (truncated) but booking fails — user gets degraded flight info but no reservation +# 2. 
Search and booking work but confirmation fails — user gets most of what they asked for +chaos_cases = [ + ChaosCase( + name="search_degraded_booking_fails", + input="Find me a flight from SFO to JFK on May 20, book the cheapest one, and send me a confirmation.", + effects={ + "tool_effects": { + "search_flights": [TruncateFields(max_length=5)], + "book_flight": [NetworkError(error_message="Connection reset by peer")], + }, + }, + ), + ChaosCase( + name="confirmation_fails", + input="Search for flights from Seattle to Tokyo next Tuesday, book one, and email me the confirmation.", + effects={ + "tool_effects": { + "send_booking_confirmation": [NetworkError(error_message="SMTP server unreachable")], + }, + }, + ), +] + +_search_tool = tool_simulator.get_tool("search_flights") +_book_tool = tool_simulator.get_tool("book_flight") +_confirm_tool = tool_simulator.get_tool("send_booking_confirmation") + + +def travel_agent_task(case: ChaosCase) -> dict: + """Run the travel agent under chaos and return output + trajectory.""" + logger.info(f"\n{'─'*60}") + logger.info(f" Case: {case.name}") + logger.info(f" User: {case.input}") + + agent = Agent( + system_prompt=( + "You are a travel booking assistant. Use the available tools to complete " + "the user's request. Today's date is May 18, 2025.\n\n" + "If a tool fails or returns an error:\n" + "- Acknowledge the failure honestly\n" + "- Complete as much of the request as possible\n" + "- Do NOT hallucinate successful results\n" + "- Do NOT retry more than once" + ), + tools=[_search_tool, _book_tool, _confirm_tool], + plugins=[chaos_plugin], + callback_handler=None, + trace_attributes={"gen_ai.conversation.id": case.session_id, "session.id": case.session_id}, + ) + + memory_exporter.clear() + try: + result = agent(case.input) + output = str(result) + except Exception as e: + output = f"Agent failed with error: {type(e).__name__}: {str(e)[:200]}" + + logger.info(f" Agent: {output[:300]}{'...' 
if len(output) > 300 else ''}") + logger.info(f"{'─'*60}") + + finished_spans = memory_exporter.get_finished_spans() + mapper = StrandsInMemorySessionMapper() + session = mapper.map_to_session(finished_spans, session_id=case.session_id) + + return {"output": output, "trajectory": session} + + +experiment = ChaosExperiment( + cases=chaos_cases, + evaluators=[PartialCompletionEvaluator()], +) + +reports = experiment.run_evaluations(task=travel_agent_task) +reports[0].run_display() diff --git a/site/docs/examples/evals-sdk/chaos_recovery_strategy.py b/site/docs/examples/evals-sdk/chaos_recovery_strategy.py new file mode 100644 index 000000000..d3e30963d --- /dev/null +++ b/site/docs/examples/evals-sdk/chaos_recovery_strategy.py @@ -0,0 +1,133 @@ +import logging +from typing import Any + +from pydantic import BaseModel, Field + +from strands import Agent +from strands_evals.chaos import ChaosCase, ChaosExperiment, ChaosPlugin, Timeout +from strands_evals.chaos.effects import ExecutionError +from strands_evals.chaos.evaluators import RecoveryStrategyEvaluator +from strands_evals.mappers import StrandsInMemorySessionMapper +from strands_evals.simulation.tool_simulator import ToolSimulator +from strands_evals.telemetry import StrandsEvalsTelemetry + +logging.basicConfig(level=logging.INFO, format="%(message)s") +logger = logging.getLogger(__name__) + +telemetry = StrandsEvalsTelemetry().setup_in_memory_exporter() +memory_exporter = telemetry.in_memory_exporter + +tool_simulator = ToolSimulator() + + +class FlightSearchResponse(BaseModel): + flights: list[dict[str, Any]] = Field(default_factory=list) + total_results: int = Field(default=0) + status: str = Field(default="success") + + +class HotelSearchResponse(BaseModel): + hotels: list[dict[str, Any]] = Field(default_factory=list) + total_results: int = Field(default=0) + status: str = Field(default="success") + + +class BookFlightResponse(BaseModel): + booking_id: str = Field(default="") + flight_id: str = 
Field(default="") + status: str = Field(default="success") + message: str = Field(default="") + + +@tool_simulator.tool(output_schema=FlightSearchResponse) +def search_flights(origin: str, destination: str, date: str) -> dict[str, Any]: + """Search for available flights between two cities on a given date.""" + pass + + +@tool_simulator.tool(output_schema=HotelSearchResponse) +def search_hotels(city: str, check_in: str, check_out: str) -> dict[str, Any]: + """Search for available hotels in a city for given dates.""" + pass + + +@tool_simulator.tool(output_schema=BookFlightResponse) +def book_flight(flight_id: str) -> dict[str, Any]: + """Book a specific flight by its flight ID.""" + pass + + +chaos_plugin = ChaosPlugin() + +# Two cases that test recovery strategy: +# 1. Flight search times out but hotel search works — agent should pivot to hotel search +# 2. Flight search fails permanently — agent should try once, then move on +chaos_cases = [ + ChaosCase( + name="flight_timeout_hotel_available", + input="Plan my trip to Tokyo: find flights from SFO and hotels for May 20-23.", + effects={"tool_effects": {"search_flights": [Timeout()]}}, + ), + ChaosCase( + name="flight_and_booking_fail", + input="Find a flight from NYC to London on June 1 and book the cheapest option.", + effects={ + "tool_effects": { + "search_flights": [ExecutionError(error_message="Internal server error")], + "book_flight": [ExecutionError(error_message="Service unavailable")], + }, + }, + ), +] + +_search_flights_tool = tool_simulator.get_tool("search_flights") +_search_hotels_tool = tool_simulator.get_tool("search_hotels") +_book_tool = tool_simulator.get_tool("book_flight") + + +def travel_agent_task(case: ChaosCase) -> dict: + """Run the travel agent under chaos and return output + trajectory.""" + logger.info(f"\n{'─'*60}") + logger.info(f" Case: {case.name}") + logger.info(f" User: {case.input}") + + agent = Agent( + system_prompt=( + "You are a travel planning assistant. 
Use the available tools to complete " + "the user's request. Today's date is May 18, 2025.\n\n" + "If a tool fails:\n" + "- Try alternative tools that can partially fulfill the request\n" + "- Do NOT retry the same failed tool more than once\n" + "- Do NOT hallucinate results\n" + "- Complete as much of the request as possible with working tools" + ), + tools=[_search_flights_tool, _search_hotels_tool, _book_tool], + plugins=[chaos_plugin], + callback_handler=None, + trace_attributes={"gen_ai.conversation.id": case.session_id, "session.id": case.session_id}, + ) + + memory_exporter.clear() + try: + result = agent(case.input) + output = str(result) + except Exception as e: + output = f"Agent failed with error: {type(e).__name__}: {str(e)[:200]}" + + logger.info(f" Agent: {output[:300]}{'...' if len(output) > 300 else ''}") + logger.info(f"{'─'*60}") + + finished_spans = memory_exporter.get_finished_spans() + mapper = StrandsInMemorySessionMapper() + session = mapper.map_to_session(finished_spans, session_id=case.session_id) + + return {"output": output, "trajectory": session} + + +experiment = ChaosExperiment( + cases=chaos_cases, + evaluators=[RecoveryStrategyEvaluator()], +) + +reports = experiment.run_evaluations(task=travel_agent_task) +reports[0].run_display() From a13deef56b49cb92c62c8c2e8f5209d7b731f753 Mon Sep 17 00:00:00 2001 From: Darren Wang Date: Fri, 29 May 2026 18:08:03 +0000 Subject: [PATCH 5/6] add chaos testing webpages --- site/src/config/navigation.yml | 4 + .../evals-sdk/chaos_testing/chaos_testing.mdx | 472 ++++++++++++++++++ .../evals-sdk/chaos_testing/index.mdx | 286 +++++++++++ 3 files changed, 762 insertions(+) create mode 100644 site/src/content/docs/user-guide/evals-sdk/chaos_testing/chaos_testing.mdx create mode 100644 site/src/content/docs/user-guide/evals-sdk/chaos_testing/index.mdx diff --git a/site/src/config/navigation.yml b/site/src/config/navigation.yml index de9969f12..ec8e3d22f 100644 --- a/site/src/config/navigation.yml +++ 
b/site/src/config/navigation.yml @@ -211,6 +211,10 @@ sidebar: - label: Remote Trace Providers items: - docs/user-guide/evals-sdk/how-to/trace_providers + - label: Chaos Testing + items: + - docs/user-guide/evals-sdk/chaos_testing + - docs/user-guide/evals-sdk/chaos_testing/chaos_testing - label: How-To Guides items: - docs/user-guide/evals-sdk/how-to/eval_task diff --git a/site/src/content/docs/user-guide/evals-sdk/chaos_testing/chaos_testing.mdx b/site/src/content/docs/user-guide/evals-sdk/chaos_testing/chaos_testing.mdx new file mode 100644 index 000000000..ede3449af --- /dev/null +++ b/site/src/content/docs/user-guide/evals-sdk/chaos_testing/chaos_testing.mdx @@ -0,0 +1,472 @@ +--- +title: Chaos Testing Guide +tags: [error-handling, tool-evaluation, simulation] +sidebar: + label: "Chaos Testing Guide" +--- + +## Overview + +This guide covers the complete chaos testing workflow: defining effects, expanding test cases, running experiments with `ChaosPlugin`, and evaluating agent resilience with specialized evaluators. Chaos testing uses Strands' native plugin system to inject failures transparently — your task function code stays chaos-free. 
+ +```python +from strands import Agent +from strands_evals import Case +from strands_evals.chaos import ChaosCase, ChaosExperiment, ChaosPlugin +from strands_evals.chaos.effects import Timeout, NetworkError, CorruptValues +from strands_evals.evaluators import GoalSuccessRateEvaluator + +# Define base cases +base_cases = [ + Case( + name="flight-booking", + input="Book me a flight to Paris next Tuesday", + metadata={"task_description": "Flight booked with confirmation number"} + ) +] + +# Define named effect maps +effect_maps = { + "booking_timeout": { + "tool_effects": {"book_flight": [Timeout()]} + }, + "search_corrupted": { + "tool_effects": {"search_flights": [CorruptValues(corrupt_ratio=0.8)]} + }, +} + +# Generate chaos cases +chaos_cases = ChaosCase.expand(base_cases, effect_maps, include_no_effect_baseline=True) + +# Run experiment +def task_function(case: Case) -> dict: + agent = Agent( + system_prompt="You are a travel booking assistant.", + plugins=[ChaosPlugin()], + callback_handler=None + ) + response = agent(case.input) + return {"output": str(response)} + +experiment = ChaosExperiment( + cases=chaos_cases, + evaluators=[GoalSuccessRateEvaluator()] +) +reports = experiment.run_evaluations(task_function) +``` + +## How It Works + +1. **Case Expansion**: `ChaosCase.expand()` generates the Cartesian product of base cases × named effect maps, producing `ChaosCase` objects with an `effects` field. +2. **Context Injection**: `ChaosExperiment` sets a `ContextVar` with the active `ChaosCase` before each task execution, ensuring thread/async safety. +3. **Plugin Interception**: `ChaosPlugin` reads the active case from the `ContextVar` and applies effects via `BeforeToolCallEvent` (pre-hook) or `AfterToolCallEvent` (post-hook). +4. **Transparent Execution**: Your task function code has zero chaos concepts — just add `ChaosPlugin()` to the agent's plugins list. 
+ +## Defining Effects + +### Pre-hook Effects + +Pre-hook effects cancel the tool call and return an error message to the agent: + +```python +from strands_evals.chaos.effects import ( + Timeout, + NetworkError, + ExecutionError, + ValidationError, +) + +effect_maps = { + "timeout": {"tool_effects": {"my_tool": [Timeout()]}}, + "network": {"tool_effects": {"my_tool": [NetworkError()]}}, + "execution": {"tool_effects": {"my_tool": [ExecutionError()]}}, + "validation": {"tool_effects": {"my_tool": [ValidationError()]}}, +} +``` + +### Post-hook Effects + +Post-hook effects let the tool execute normally but corrupt the response: + +```python +from strands_evals.chaos.effects import ( + TruncateFields, + RemoveFields, + CorruptValues, +) + +effect_maps = { + "truncated": { + "tool_effects": {"my_tool": [TruncateFields(max_length=10)]} + }, + "missing_fields": { + "tool_effects": {"my_tool": [RemoveFields(remove_ratio=0.5)]} + }, + "corrupted": { + "tool_effects": {"my_tool": [CorruptValues(corrupt_ratio=0.3)]} + }, +} +``` + +### Compound Effects (Multiple Tools) + +Target multiple tools in a single effect map to simulate cascading failures: + +```python +effect_maps = { + "total_chaos": { + "tool_effects": { + "search_flights": [Timeout()], + "book_flight": [NetworkError()], + "send_confirmation": [CorruptValues(corrupt_ratio=0.5)], + } + }, +} +``` + +## ChaosCase + +`ChaosCase` extends `Case` with an `effects` field. 
The `effects` dict keys are restricted to known categories (currently `"tool_effects"`): + +```python +from strands_evals.chaos import ChaosCase +from strands_evals.chaos.effects import Timeout + +# Manual construction +chaos_case = ChaosCase( + name="timeout-test", + input="Book a flight", + effects={"tool_effects": {"book_flight": Timeout()}}, + metadata={"task_description": "Flight booked"} +) + +# Expansion from base cases (preferred) +chaos_cases = ChaosCase.expand( + cases=[Case(name="test", input="Book a flight")], + effect_maps={"timeout": {"tool_effects": {"book_flight": Timeout()}}}, + include_baseline=True +) +``` + +## ChaosPlugin + +`ChaosPlugin` hooks into Strands' event system. Add it to your agent's plugins list: + +```python +from strands import Agent +from strands_evals.chaos import ChaosPlugin + +agent = Agent( + system_prompt="You are a helpful assistant.", + plugins=[ChaosPlugin()], + callback_handler=None +) +``` + +The plugin reads the active `ChaosCase` from a `ContextVar` (managed by `ChaosExperiment`) and applies effects only to tools listed in the case's `effects["tool_effects"]` dict. Tools not listed execute normally. + +## ChaosExperiment + +`ChaosExperiment` composes the base `Experiment` class and manages the `ContextVar` lifecycle: + +```python +from strands_evals.chaos import ChaosExperiment + +experiment = ChaosExperiment( + cases=chaos_cases, + evaluators=evaluators +) + +# Sync execution +reports = experiment.run_evaluations(task_function) + +# Async execution +reports = await experiment.run_evaluations_async(async_task_function) +``` + +## Resilience Evaluators + +### FailureCommunicationEvaluator + +Scores how well the agent communicates failures to the user across four dimensions: clarity, actionability, transparency, and tone. 
+ +```python +from strands_evals.chaos.evaluators import FailureCommunicationEvaluator + +evaluator = FailureCommunicationEvaluator() +``` + +**Scoring criteria:** +- Does the agent acknowledge the failure clearly? +- Does it suggest actionable next steps? +- Is it transparent about what went wrong (without exposing internals)? +- Is the tone appropriate (not dismissive, not alarming)? + +### PartialCompletionEvaluator + +Scores what percentage of the user's goal was achieved despite failures, returning a continuous 0.0–1.0 score: + +```python +from strands_evals.chaos.evaluators import PartialCompletionEvaluator + +evaluator = PartialCompletionEvaluator() +``` + +**Example scores:** +- `1.0` — Full goal achieved despite failures +- `0.7` — Most sub-goals completed, one blocked by failure +- `0.0` — Agent gave up entirely or crashed + +### RecoveryStrategyEvaluator + +Scores the quality of the agent's recovery actions when tools fail: + +```python +from strands_evals.chaos.evaluators import RecoveryStrategyEvaluator + +evaluator = RecoveryStrategyEvaluator() +``` + +**Scoring criteria:** +- Exploration breadth — Did the agent try alternative tools or approaches? +- Retry discipline — Did it retry appropriately (not excessively)? +- Approach variation — Did retries use different strategies? 
+ +## Complete Example: Multi-Tool Chaos with Resilience Evaluation + +```python +from typing import Any +from pydantic import BaseModel, Field +from strands import Agent +from strands_evals import Case +from strands_evals.chaos import ChaosCase, ChaosExperiment, ChaosPlugin +from strands_evals.chaos.effects import Timeout, NetworkError, CorruptValues +from strands_evals.chaos.evaluators import ( + FailureCommunicationEvaluator, + PartialCompletionEvaluator, + RecoveryStrategyEvaluator, +) +from strands_evals.evaluators import GoalSuccessRateEvaluator +from strands_evals.simulation.tool_simulator import ToolSimulator + +# Setup tool simulator for reproducible responses +tool_simulator = ToolSimulator() + +class FlightResult(BaseModel): + airline: str = Field(..., description="Airline name") + price: float = Field(..., description="Price in USD") + departure: str = Field(..., description="Departure time") + +class BookingConfirmation(BaseModel): + confirmation_id: str = Field(..., description="Booking confirmation ID") + status: str = Field(..., description="Booking status") + +@tool_simulator.tool( + share_state_id="travel", + initial_state_description="Available flights: AA101 $450 8am, UA202 $380 2pm, DL303 $520 6pm", + output_schema=FlightResult, +) +def search_flights(destination: str, date: str) -> dict[str, Any]: + """Search for available flights.""" + pass + +@tool_simulator.tool( + share_state_id="travel", + output_schema=BookingConfirmation, +) +def book_flight(flight_id: str, passenger_name: str) -> dict[str, Any]: + """Book a specific flight.""" + pass + +# Define effect maps +effect_maps = { + "search_timeout": { + "tool_effects": {"search_flights": Timeout()} + }, + "booking_network_error": { + "tool_effects": {"book_flight": NetworkError()} + }, + "corrupted_search": { + "tool_effects": {"search_flights": CorruptValues(corrupt_ratio=0.8)} + }, + "total_chaos": { + "tool_effects": { + "search_flights": Timeout(), + "book_flight": NetworkError(), + } + 
}, +} + +# Define base cases +base_cases = [ + Case( + name="book-cheapest", + input="Find the cheapest flight to Paris next Tuesday and book it for John Smith", + metadata={"task_description": "Flight searched, cheapest option identified, booking confirmed"} + ), + Case( + name="book-morning", + input="I need a morning flight to Tokyo on Friday", + metadata={"task_description": "Morning flight found and booked"} + ), +] + +# Expand into chaos cases +chaos_cases = ChaosCase.expand(base_cases, effect_maps, include_baseline=True) + +# Task function — no chaos concepts here +def task_function(case: Case) -> dict: + search_tool = tool_simulator.get_tool("search_flights") + booking_tool = tool_simulator.get_tool("book_flight") + + agent = Agent( + system_prompt="You are a travel booking assistant. Help users find and book flights.", + tools=[search_tool, booking_tool], + plugins=[ChaosPlugin()], + callback_handler=None, + ) + response = agent(case.input) + return {"output": str(response)} + +# Run with all evaluators +evaluators = [ + GoalSuccessRateEvaluator(), + FailureCommunicationEvaluator(), + PartialCompletionEvaluator(), + RecoveryStrategyEvaluator(), +] + +experiment = ChaosExperiment(cases=chaos_cases, evaluators=evaluators) +reports = experiment.run_evaluations(task_function) + +# Display results +for report in reports: + print(f"\n{'='*60}") + print(f"Evaluator: {report.evaluator_name}") + print(f"{'='*60}") + report.run_display() +``` + +## Advanced Patterns + +### Pattern 1: Comparing Agent Configurations Under Chaos + +```python +def compare_agents_under_chaos(chaos_cases, configs): + """Compare how different agent configs handle the same failures.""" + results = {} + + for config_name, system_prompt in configs.items(): + def make_task(prompt): + def task_function(case: Case) -> dict: + agent = Agent( + system_prompt=prompt, + plugins=[ChaosPlugin()], + callback_handler=None, + ) + response = agent(case.input) + return {"output": str(response)} + return 
task_function + + experiment = ChaosExperiment( + cases=chaos_cases, + evaluators=[PartialCompletionEvaluator()] + ) + reports = experiment.run_evaluations(make_task(system_prompt)) + results[config_name] = reports + + return results +``` + +### Pattern 2: Progressive Failure Escalation + +```python +# Test increasing severity +effect_maps = { + "mild": { + "tool_effects": {"search": CorruptValues(corrupt_ratio=0.2)} + }, + "moderate": { + "tool_effects": {"search": CorruptValues(corrupt_ratio=0.5)} + }, + "severe": { + "tool_effects": {"search": CorruptValues(corrupt_ratio=0.9)} + }, + "total_failure": { + "tool_effects": {"search": Timeout()} + }, +} +``` + +### Pattern 3: Chaos with User Simulation + +Combine chaos testing with user simulation for multi-turn resilience evaluation: + +```python +from strands_evals import ActorSimulator + +def task_function(case: Case) -> dict: + user_sim = ActorSimulator.from_case_for_user_simulator( + case=case, max_turns=8 + ) + + agent = Agent( + system_prompt="You are a helpful assistant.", + plugins=[ChaosPlugin()], + callback_handler=None, + ) + + user_message = case.input + while user_sim.has_next(): + agent_response = agent(user_message) + user_result = user_sim.act(str(agent_response)) + user_message = str(user_result.structured_output.message) + + return {"output": str(agent_response)} +``` + +## Troubleshooting + +### Issue: Effects Not Being Applied + +Ensure `ChaosPlugin()` is in the agent's plugins list and you're using `ChaosExperiment` (not base `Experiment`): + +```python +# Correct +agent = Agent(plugins=[ChaosPlugin()], ...) +experiment = ChaosExperiment(cases=chaos_cases, ...) + +# Wrong — base Experiment doesn't set the ContextVar +experiment = Experiment(cases=chaos_cases, ...) +``` + +### Issue: All Tools Failing + +Check that your effect map keys match the exact tool function names: + +```python +# If your tool is defined as: +def search_flights(...): ... 
+ +# The effect map key must be "search_flights", not "searchFlights" or "search" +effect_maps = {"test": {"tool_effects": {"search_flights": Timeout()}}} +``` + +### Issue: Async Task Errors + +`ChaosExperiment` supports both sync and async tasks. Use the appropriate method: + +```python +# Sync +reports = experiment.run_evaluations(sync_task_function) + +# Async +reports = await experiment.run_evaluations_async(async_task_function) +``` + +## Related Documentation + +- [Chaos Testing Overview](/docs/user-guide/evals-sdk/chaos_testing/): Overview and quick start +- [Tool Simulation](/docs/user-guide/evals-sdk/simulators/tool_simulation/): Simulate tool behavior +- [Goal Success Rate Evaluator](/docs/user-guide/evals-sdk/evaluators/goal_success_rate_evaluator/): Assess goal completion +- [Simulators Overview](/docs/user-guide/evals-sdk/simulators/): Simulator framework diff --git a/site/src/content/docs/user-guide/evals-sdk/chaos_testing/index.mdx b/site/src/content/docs/user-guide/evals-sdk/chaos_testing/index.mdx new file mode 100644 index 000000000..4dd523b09 --- /dev/null +++ b/site/src/content/docs/user-guide/evals-sdk/chaos_testing/index.mdx @@ -0,0 +1,286 @@ +--- +title: Chaos Testing +tags: [error-handling, simulation] +sidebar: + label: "Overview" +--- + +## Overview + +Chaos testing systematically evaluates agent resilience by injecting controlled failures into tool execution. Using the `ChaosPlugin`, `ChaosCase`, and `ChaosExperiment` classes, you can test how agents handle tool timeouts, network errors, and corrupted responses — without modifying agent code. + +This enables you to answer questions like: +- Does the agent gracefully communicate failures to users? +- Can the agent achieve partial goals when some tools fail? +- Does the agent employ effective recovery strategies? + +## Why Chaos Testing? + +Traditional evaluation tests agents under ideal conditions. 
In production, tools fail unpredictably: + +**Standard Evaluation:** +- Tools always return correct responses +- No network failures or timeouts +- Cannot reveal fragile error handling +- Misses degraded-mode behavior + +**Chaos Testing:** +- Injects realistic tool failures (timeouts, network errors, validation errors) +- Corrupts tool responses (truncated fields, removed data, corrupted values) +- Tests agent resilience without live infrastructure failures +- Measures graceful degradation and recovery behavior +- Quantifies partial goal completion under failure + +## When to Use Chaos Testing + +Use chaos testing when you need to: +- **Evaluate Resilience**: Test how agents handle tool failures gracefully +- **Assess Recovery**: Verify agents try alternative approaches when tools fail +- **Measure Degradation**: Quantify how much of a goal agents achieve despite failures +- **Test Communication**: Ensure agents inform users clearly about failures +- **Validate Robustness**: Confirm agents don't crash or loop on corrupted data + +## Architecture + +Chaos testing integrates with Strands' plugin system via `BeforeToolCallEvent` and `AfterToolCallEvent` hooks: + +1. **ChaosCase** — Extends `Case` with an `effects` field mapping tool names to failure effects +2. **ChaosPlugin** — A Strands plugin that intercepts tool calls and applies effects transparently +3. **ChaosExperiment** — Composes the base `Experiment` to manage chaos context per case +4. 
**ChaosEffect** — A hierarchy of pre-hook effects (cancel tool calls) and post-hook effects (corrupt responses) + +## Effect Types + +### Pre-hook Effects (Tool Call Failures) + +These effects cancel the tool call entirely and return an error: + +| Effect | Description | +| :------- | :------------ | +| `Timeout` | Simulates a tool execution timeout | +| `NetworkError` | Simulates a network connectivity failure | +| `ExecutionError` | Simulates a runtime error during tool execution | +| `ValidationError` | Simulates invalid input/output validation failure | + +### Post-hook Effects (Response Corruption) + +These effects let the tool execute but corrupt the response: + +| Effect | Description | Parameters | +| :------- | :------------ | :----------- | +| `TruncateFields` | Truncates string fields in the response | `max_length` | +| `RemoveFields` | Randomly removes fields from the response | `remove_ratio` | +| `CorruptValues` | Corrupts field values with garbage data | `corrupt_ratio` | + +## Basic Usage + +```python +from strands import Agent +from strands_evals import Case +from strands_evals.chaos import ChaosCase, ChaosExperiment, ChaosPlugin +from strands_evals.chaos.effects import Timeout, NetworkError +from strands_evals.evaluators import GoalSuccessRateEvaluator + +# Define base test cases +base_cases = [ + Case( + name="weather-lookup", + input="What's the weather in Seattle?", + metadata={"task_description": "Weather information provided"} + ) +] + +# Define named effect maps +effect_maps = { + "search_timeout": { + "tool_effects": {"get_weather": Timeout()} + }, + "network_failure": { + "tool_effects": {"get_weather": NetworkError()} + }, +} + +# Expand cases into Cartesian product (base cases × effect maps + baseline) +chaos_cases = ChaosCase.expand(base_cases, effect_maps, include_baseline=True) + +# Run chaos experiment +experiment = ChaosExperiment( + cases=chaos_cases, + evaluators=[GoalSuccessRateEvaluator()] +) + +def task_function(case: Case) -> 
dict: + agent = Agent( + system_prompt="You are a helpful weather assistant.", + plugins=[ChaosPlugin()], + callback_handler=None + ) + response = agent(case.input) + return {"output": str(response)} + +reports = experiment.run_evaluations(task_function) +``` + +## ChaosCase.expand() + +The `expand()` class method generates the Cartesian product of base cases and effect maps, optionally including a baseline (no effects) for comparison: + +```python +chaos_cases = ChaosCase.expand( + cases=base_cases, # List of base Case objects + effect_maps=effect_maps, # Dict of named effect configurations + include_baseline=True # Include cases with no effects for comparison +) +``` + +For 2 base cases and 3 effect maps with `include_baseline=True`, this produces `2 × (3 + 1) = 8` chaos cases. + +## Integration with ToolSimulator + +Chaos testing works naturally with `ToolSimulator` for fully controlled evaluation — simulated tools provide reproducible responses, and chaos effects inject failures on top: + +```python +from strands import Agent +from strands_evals import Case +from strands_evals.chaos import ChaosCase, ChaosExperiment, ChaosPlugin +from strands_evals.chaos.effects import Timeout, CorruptValues +from strands_evals.evaluators import GoalSuccessRateEvaluator +from strands_evals.simulation.tool_simulator import ToolSimulator +from pydantic import BaseModel, Field + +tool_simulator = ToolSimulator() + +class SearchResult(BaseModel): + title: str = Field(..., description="Result title") + snippet: str = Field(..., description="Result snippet") + +@tool_simulator.tool(output_schema=SearchResult) +def web_search(query: str) -> dict: + """Search the web for information.""" + pass + +# Define effect maps +effect_maps = { + "search_timeout": { + "tool_effects": {"web_search": Timeout()} + }, + "corrupted_results": { + "tool_effects": {"web_search": CorruptValues(corrupt_ratio=0.5)} + }, +} + +base_cases = [ + Case(name="research", input="Find recent news about AI agents") 
+] + +chaos_cases = ChaosCase.expand(base_cases, effect_maps, include_baseline=True) + +def task_function(case: Case) -> dict: + search_tool = tool_simulator.get_tool("web_search") + agent = Agent( + tools=[search_tool], + plugins=[ChaosPlugin()], + callback_handler=None + ) + response = agent(case.input) + return {"output": str(response)} + +experiment = ChaosExperiment( + cases=chaos_cases, + evaluators=[GoalSuccessRateEvaluator()] +) +reports = experiment.run_evaluations(task_function) +``` + +## Resilience Evaluators + +Chaos testing ships with three specialized evaluators designed to assess agent behavior under failure: + +| Evaluator | What It Measures | +| :---------- | :---------------- | +| `FailureCommunicationEvaluator` | Clarity, actionability, transparency, and tone of failure messages | +| `PartialCompletionEvaluator` | Percentage of user goal achieved despite failures (0.0–1.0) | +| `RecoveryStrategyEvaluator` | Quality of recovery actions: exploration breadth, retry discipline, approach variation | + +```python +from strands_evals.chaos.evaluators import ( + FailureCommunicationEvaluator, + PartialCompletionEvaluator, + RecoveryStrategyEvaluator, +) + +evaluators = [ + GoalSuccessRateEvaluator(), + FailureCommunicationEvaluator(), + PartialCompletionEvaluator(), + RecoveryStrategyEvaluator(), +] + +experiment = ChaosExperiment(cases=chaos_cases, evaluators=evaluators) +``` + +[Complete Chaos Testing Guide →](/docs/user-guide/evals-sdk/chaos_testing/chaos_testing/) + +## Chaos Testing vs Simulators + +| Aspect | Simulators | Chaos Testing | +| :------- | :---------- | :-------------- | +| **Role** | Replace tool execution entirely | Inject failures into tool execution | +| **Scope** | All tool calls are simulated | Only targeted tools are affected | +| **Use Case** | Test without infrastructure | Test resilience under failure | +| **Combination** | Can be used together | Chaos effects apply on top of simulated tools | + +## Best Practices + +### 1. 
Start with Baseline Comparisons + +Always include `include_baseline=True` to compare agent performance with and without failures: + +```python +chaos_cases = ChaosCase.expand(cases, effect_maps, include_baseline=True) +``` + +### 2. Test One Failure at a Time First + +Start with single-tool failures before testing compound chaos: + +```python +# Single failure +effect_maps = { + "search_fails": {"tool_effects": {"search": Timeout()}}, +} + +# Compound (test after single failures are understood) +effect_maps = { + "total_chaos": { + "tool_effects": { + "search": Timeout(), + "database": NetworkError(), + } + }, +} +``` + +### 3. Use Resilience Evaluators Together + +Combine all three resilience evaluators for a complete picture: + +```python +evaluators = [ + FailureCommunicationEvaluator(), # Did the agent tell the user? + PartialCompletionEvaluator(), # How much was achieved? + RecoveryStrategyEvaluator(), # Did it try alternatives? +] +``` + +## Next Steps + +- [Chaos Testing Guide](/docs/user-guide/evals-sdk/chaos_testing/chaos_testing/): Complete guide with advanced patterns +- [Tool Simulation](/docs/user-guide/evals-sdk/simulators/tool_simulation/): Simulate tool behavior +- [Goal Success Rate Evaluator](/docs/user-guide/evals-sdk/evaluators/goal_success_rate_evaluator/): Assess goal completion + +## Related Documentation + +- [Simulators Overview](/docs/user-guide/evals-sdk/simulators/): Simulator framework +- [Evaluators](/docs/user-guide/evals-sdk/evaluators/): All available evaluators +- [Quickstart Guide](/docs/user-guide/evals-sdk/quickstart/): Get started with Strands Evals From 5acf8675a57a3f1ddd3f597bad0c7cacc0abf8a8 Mon Sep 17 00:00:00 2001 From: Darren Wang Date: Tue, 2 Jun 2026 17:59:32 +0000 Subject: [PATCH 6/6] revert chaos webpage; use flatten in example --- ... chaos_failure_communication_evaluator.py} | 5 +- ... 
=> chaos_partial_completion_evaluator.py} | 5 +- ...y => chaos_recovery_strategy_evaluator.py} | 5 +- site/docs/examples/evals-sdk/chaos_testing.py | 3 +- site/src/config/navigation.yml | 4 - .../evals-sdk/chaos_testing/chaos_testing.mdx | 472 ------------------ .../evals-sdk/chaos_testing/index.mdx | 286 ----------- 7 files changed, 11 insertions(+), 769 deletions(-) rename site/docs/examples/evals-sdk/{chaos_failure_communication.py => chaos_failure_communication_evaluator.py} (95%) rename site/docs/examples/evals-sdk/{chaos_partial_completion.py => chaos_partial_completion_evaluator.py} (96%) rename site/docs/examples/evals-sdk/{chaos_recovery_strategy.py => chaos_recovery_strategy_evaluator.py} (96%) delete mode 100644 site/src/content/docs/user-guide/evals-sdk/chaos_testing/chaos_testing.mdx delete mode 100644 site/src/content/docs/user-guide/evals-sdk/chaos_testing/index.mdx diff --git a/site/docs/examples/evals-sdk/chaos_failure_communication.py b/site/docs/examples/evals-sdk/chaos_failure_communication_evaluator.py similarity index 95% rename from site/docs/examples/evals-sdk/chaos_failure_communication.py rename to site/docs/examples/evals-sdk/chaos_failure_communication_evaluator.py index 3808fc8c8..d78ab0430 100644 --- a/site/docs/examples/evals-sdk/chaos_failure_communication.py +++ b/site/docs/examples/evals-sdk/chaos_failure_communication_evaluator.py @@ -6,10 +6,11 @@ from strands import Agent from strands_evals.chaos import ChaosCase, ChaosExperiment, ChaosPlugin, Timeout from strands_evals.chaos.effects import NetworkError -from strands_evals.chaos.evaluators import FailureCommunicationEvaluator +from strands_evals.evaluators.chaos import FailureCommunicationEvaluator from strands_evals.mappers import StrandsInMemorySessionMapper from strands_evals.simulation.tool_simulator import ToolSimulator from strands_evals.telemetry import StrandsEvalsTelemetry +from strands_evals.types.evaluation_report import EvaluationReport 
logging.basicConfig(level=logging.INFO, format="%(message)s") logger = logging.getLogger(__name__) @@ -117,4 +118,4 @@ def travel_agent_task(case: ChaosCase) -> dict: ) reports = experiment.run_evaluations(task=travel_agent_task) -reports[0].run_display() +EvaluationReport.flatten(reports).run_display() diff --git a/site/docs/examples/evals-sdk/chaos_partial_completion.py b/site/docs/examples/evals-sdk/chaos_partial_completion_evaluator.py similarity index 96% rename from site/docs/examples/evals-sdk/chaos_partial_completion.py rename to site/docs/examples/evals-sdk/chaos_partial_completion_evaluator.py index d77f46df2..d247ae0aa 100644 --- a/site/docs/examples/evals-sdk/chaos_partial_completion.py +++ b/site/docs/examples/evals-sdk/chaos_partial_completion_evaluator.py @@ -6,10 +6,11 @@ from strands import Agent from strands_evals.chaos import ChaosCase, ChaosExperiment, ChaosPlugin, TruncateFields from strands_evals.chaos.effects import NetworkError -from strands_evals.chaos.evaluators import PartialCompletionEvaluator +from strands_evals.evaluators.chaos import PartialCompletionEvaluator from strands_evals.mappers import StrandsInMemorySessionMapper from strands_evals.simulation.tool_simulator import ToolSimulator from strands_evals.telemetry import StrandsEvalsTelemetry +from strands_evals.types.evaluation_report import EvaluationReport logging.basicConfig(level=logging.INFO, format="%(message)s") logger = logging.getLogger(__name__) @@ -134,4 +135,4 @@ def travel_agent_task(case: ChaosCase) -> dict: ) reports = experiment.run_evaluations(task=travel_agent_task) -reports[0].run_display() +EvaluationReport.flatten(reports).run_display() diff --git a/site/docs/examples/evals-sdk/chaos_recovery_strategy.py b/site/docs/examples/evals-sdk/chaos_recovery_strategy_evaluator.py similarity index 96% rename from site/docs/examples/evals-sdk/chaos_recovery_strategy.py rename to site/docs/examples/evals-sdk/chaos_recovery_strategy_evaluator.py index d3e30963d..fc2903e11 
100644 --- a/site/docs/examples/evals-sdk/chaos_recovery_strategy.py +++ b/site/docs/examples/evals-sdk/chaos_recovery_strategy_evaluator.py @@ -6,10 +6,11 @@ from strands import Agent from strands_evals.chaos import ChaosCase, ChaosExperiment, ChaosPlugin, Timeout from strands_evals.chaos.effects import ExecutionError -from strands_evals.chaos.evaluators import RecoveryStrategyEvaluator +from strands_evals.evaluators.chaos import RecoveryStrategyEvaluator from strands_evals.mappers import StrandsInMemorySessionMapper from strands_evals.simulation.tool_simulator import ToolSimulator from strands_evals.telemetry import StrandsEvalsTelemetry +from strands_evals.types.evaluation_report import EvaluationReport logging.basicConfig(level=logging.INFO, format="%(message)s") logger = logging.getLogger(__name__) @@ -130,4 +131,4 @@ def travel_agent_task(case: ChaosCase) -> dict: ) reports = experiment.run_evaluations(task=travel_agent_task) -reports[0].run_display() +EvaluationReport.flatten(reports).run_display() diff --git a/site/docs/examples/evals-sdk/chaos_testing.py b/site/docs/examples/evals-sdk/chaos_testing.py index b3e167789..86e366103 100644 --- a/site/docs/examples/evals-sdk/chaos_testing.py +++ b/site/docs/examples/evals-sdk/chaos_testing.py @@ -20,6 +20,7 @@ from strands_evals.mappers import StrandsInMemorySessionMapper from strands_evals.simulation.tool_simulator import ToolSimulator from strands_evals.telemetry import StrandsEvalsTelemetry +from strands_evals.types.evaluation_report import EvaluationReport logging.basicConfig(level=logging.INFO, format="%(message)s") logger = logging.getLogger(__name__) @@ -169,4 +170,4 @@ def travel_agent_task(case: ChaosCase) -> dict: # Run: 8 chaos cases = 8 agent invocations reports = experiment.run_evaluations(task=travel_agent_task) -reports[0].run_display() +EvaluationReport.flatten(reports).run_display() diff --git a/site/src/config/navigation.yml b/site/src/config/navigation.yml index ec8e3d22f..de9969f12 100644 --- 
a/site/src/config/navigation.yml +++ b/site/src/config/navigation.yml @@ -211,10 +211,6 @@ sidebar: - label: Remote Trace Providers items: - docs/user-guide/evals-sdk/how-to/trace_providers - - label: Chaos Testing - items: - - docs/user-guide/evals-sdk/chaos_testing - - docs/user-guide/evals-sdk/chaos_testing/chaos_testing - label: How-To Guides items: - docs/user-guide/evals-sdk/how-to/eval_task diff --git a/site/src/content/docs/user-guide/evals-sdk/chaos_testing/chaos_testing.mdx b/site/src/content/docs/user-guide/evals-sdk/chaos_testing/chaos_testing.mdx deleted file mode 100644 index ede3449af..000000000 --- a/site/src/content/docs/user-guide/evals-sdk/chaos_testing/chaos_testing.mdx +++ /dev/null @@ -1,472 +0,0 @@ ---- -title: Chaos Testing Guide -tags: [error-handling, tool-evaluation, simulation] -sidebar: - label: "Chaos Testing Guide" ---- - -## Overview - -This guide covers the complete chaos testing workflow: defining effects, expanding test cases, running experiments with `ChaosPlugin`, and evaluating agent resilience with specialized evaluators. Chaos testing uses Strands' native plugin system to inject failures transparently — your task function code stays chaos-free. 
- -```python -from strands import Agent -from strands_evals import Case -from strands_evals.chaos import ChaosCase, ChaosExperiment, ChaosPlugin -from strands_evals.chaos.effects import Timeout, NetworkError, CorruptValues -from strands_evals.evaluators import GoalSuccessRateEvaluator - -# Define base cases -base_cases = [ - Case( - name="flight-booking", - input="Book me a flight to Paris next Tuesday", - metadata={"task_description": "Flight booked with confirmation number"} - ) -] - -# Define named effect maps -effect_maps = { - "booking_timeout": { - "tool_effects": {"book_flight": Timeout()} - }, - "search_corrupted": { - "tool_effects": {"search_flights": CorruptValues(corrupt_ratio=0.8)} - }, -} - -# Generate chaos cases -chaos_cases = ChaosCase.expand(base_cases, effect_maps, include_baseline=True) - -# Run experiment -def task_function(case: Case) -> dict: - agent = Agent( - system_prompt="You are a travel booking assistant.", - plugins=[ChaosPlugin()], - callback_handler=None - ) - response = agent(case.input) - return {"output": str(response)} - -experiment = ChaosExperiment( - cases=chaos_cases, - evaluators=[GoalSuccessRateEvaluator()] -) -reports = experiment.run_evaluations(task_function) -``` - -## How It Works - -1. **Case Expansion**: `ChaosCase.expand()` generates the Cartesian product of base cases × named effect maps, producing `ChaosCase` objects with an `effects` field. -2. **Context Injection**: `ChaosExperiment` sets a `ContextVar` with the active `ChaosCase` before each task execution, ensuring thread/async safety. -3. **Plugin Interception**: `ChaosPlugin` reads the active case from the `ContextVar` and applies effects via `BeforeToolCallEvent` (pre-hook) or `AfterToolCallEvent` (post-hook). -4. **Transparent Execution**: Your task function code has zero chaos concepts — just add `ChaosPlugin()` to the agent's plugins list. 
- -## Defining Effects - -### Pre-hook Effects - -Pre-hook effects cancel the tool call and return an error message to the agent: - -```python -from strands_evals.chaos.effects import ( - Timeout, - NetworkError, - ExecutionError, - ValidationError, -) - -effect_maps = { - "timeout": {"tool_effects": {"my_tool": Timeout()}}, - "network": {"tool_effects": {"my_tool": NetworkError()}}, - "execution": {"tool_effects": {"my_tool": ExecutionError()}}, - "validation": {"tool_effects": {"my_tool": ValidationError()}}, -} -``` - -### Post-hook Effects - -Post-hook effects let the tool execute normally but corrupt the response: - -```python -from strands_evals.chaos.effects import ( - TruncateFields, - RemoveFields, - CorruptValues, -) - -effect_maps = { - "truncated": { - "tool_effects": {"my_tool": TruncateFields(max_length=10)} - }, - "missing_fields": { - "tool_effects": {"my_tool": RemoveFields(remove_ratio=0.5)} - }, - "corrupted": { - "tool_effects": {"my_tool": CorruptValues(corrupt_ratio=0.3)} - }, -} -``` - -### Compound Effects (Multiple Tools) - -Target multiple tools in a single effect map to simulate cascading failures: - -```python -effect_maps = { - "total_chaos": { - "tool_effects": { - "search_flights": Timeout(), - "book_flight": NetworkError(), - "send_confirmation": CorruptValues(corrupt_ratio=0.5), - } - }, -} -``` - -## ChaosCase - -`ChaosCase` extends `Case` with an `effects` field. 
The `effects` dict keys are restricted to known categories (currently `"tool_effects"`): - -```python -from strands_evals.chaos import ChaosCase -from strands_evals.chaos.effects import Timeout - -# Manual construction -chaos_case = ChaosCase( - name="timeout-test", - input="Book a flight", - effects={"tool_effects": {"book_flight": Timeout()}}, - metadata={"task_description": "Flight booked"} -) - -# Expansion from base cases (preferred) -chaos_cases = ChaosCase.expand( - cases=[Case(name="test", input="Book a flight")], - effect_maps={"timeout": {"tool_effects": {"book_flight": Timeout()}}}, - include_baseline=True -) -``` - -## ChaosPlugin - -`ChaosPlugin` hooks into Strands' event system. Add it to your agent's plugins list: - -```python -from strands import Agent -from strands_evals.chaos import ChaosPlugin - -agent = Agent( - system_prompt="You are a helpful assistant.", - plugins=[ChaosPlugin()], - callback_handler=None -) -``` - -The plugin reads the active `ChaosCase` from a `ContextVar` (managed by `ChaosExperiment`) and applies effects only to tools listed in the case's `effects["tool_effects"]` dict. Tools not listed execute normally. - -## ChaosExperiment - -`ChaosExperiment` composes the base `Experiment` class and manages the `ContextVar` lifecycle: - -```python -from strands_evals.chaos import ChaosExperiment - -experiment = ChaosExperiment( - cases=chaos_cases, - evaluators=evaluators -) - -# Sync execution -reports = experiment.run_evaluations(task_function) - -# Async execution -reports = await experiment.run_evaluations_async(async_task_function) -``` - -## Resilience Evaluators - -### FailureCommunicationEvaluator - -Scores how well the agent communicates failures to the user across four dimensions: clarity, actionability, transparency, and tone. 
- -```python -from strands_evals.chaos.evaluators import FailureCommunicationEvaluator - -evaluator = FailureCommunicationEvaluator() -``` - -**Scoring criteria:** -- Does the agent acknowledge the failure clearly? -- Does it suggest actionable next steps? -- Is it transparent about what went wrong (without exposing internals)? -- Is the tone appropriate (not dismissive, not alarming)? - -### PartialCompletionEvaluator - -Scores what percentage of the user's goal was achieved despite failures, returning a continuous 0.0–1.0 score: - -```python -from strands_evals.chaos.evaluators import PartialCompletionEvaluator - -evaluator = PartialCompletionEvaluator() -``` - -**Example scores:** -- `1.0` — Full goal achieved despite failures -- `0.7` — Most sub-goals completed, one blocked by failure -- `0.0` — Agent gave up entirely or crashed - -### RecoveryStrategyEvaluator - -Scores the quality of the agent's recovery actions when tools fail: - -```python -from strands_evals.chaos.evaluators import RecoveryStrategyEvaluator - -evaluator = RecoveryStrategyEvaluator() -``` - -**Scoring criteria:** -- Exploration breadth — Did the agent try alternative tools or approaches? -- Retry discipline — Did it retry appropriately (not excessively)? -- Approach variation — Did retries use different strategies? 
- -## Complete Example: Multi-Tool Chaos with Resilience Evaluation - -```python -from typing import Any -from pydantic import BaseModel, Field -from strands import Agent -from strands_evals import Case -from strands_evals.chaos import ChaosCase, ChaosExperiment, ChaosPlugin -from strands_evals.chaos.effects import Timeout, NetworkError, CorruptValues -from strands_evals.chaos.evaluators import ( - FailureCommunicationEvaluator, - PartialCompletionEvaluator, - RecoveryStrategyEvaluator, -) -from strands_evals.evaluators import GoalSuccessRateEvaluator -from strands_evals.simulation.tool_simulator import ToolSimulator - -# Setup tool simulator for reproducible responses -tool_simulator = ToolSimulator() - -class FlightResult(BaseModel): - airline: str = Field(..., description="Airline name") - price: float = Field(..., description="Price in USD") - departure: str = Field(..., description="Departure time") - -class BookingConfirmation(BaseModel): - confirmation_id: str = Field(..., description="Booking confirmation ID") - status: str = Field(..., description="Booking status") - -@tool_simulator.tool( - share_state_id="travel", - initial_state_description="Available flights: AA101 $450 8am, UA202 $380 2pm, DL303 $520 6pm", - output_schema=FlightResult, -) -def search_flights(destination: str, date: str) -> dict[str, Any]: - """Search for available flights.""" - pass - -@tool_simulator.tool( - share_state_id="travel", - output_schema=BookingConfirmation, -) -def book_flight(flight_id: str, passenger_name: str) -> dict[str, Any]: - """Book a specific flight.""" - pass - -# Define effect maps -effect_maps = { - "search_timeout": { - "tool_effects": {"search_flights": Timeout()} - }, - "booking_network_error": { - "tool_effects": {"book_flight": NetworkError()} - }, - "corrupted_search": { - "tool_effects": {"search_flights": CorruptValues(corrupt_ratio=0.8)} - }, - "total_chaos": { - "tool_effects": { - "search_flights": Timeout(), - "book_flight": NetworkError(), - } - 
}, -} - -# Define base cases -base_cases = [ - Case( - name="book-cheapest", - input="Find the cheapest flight to Paris next Tuesday and book it for John Smith", - metadata={"task_description": "Flight searched, cheapest option identified, booking confirmed"} - ), - Case( - name="book-morning", - input="I need a morning flight to Tokyo on Friday", - metadata={"task_description": "Morning flight found and booked"} - ), -] - -# Expand into chaos cases -chaos_cases = ChaosCase.expand(base_cases, effect_maps, include_baseline=True) - -# Task function — no chaos concepts here -def task_function(case: Case) -> dict: - search_tool = tool_simulator.get_tool("search_flights") - booking_tool = tool_simulator.get_tool("book_flight") - - agent = Agent( - system_prompt="You are a travel booking assistant. Help users find and book flights.", - tools=[search_tool, booking_tool], - plugins=[ChaosPlugin()], - callback_handler=None, - ) - response = agent(case.input) - return {"output": str(response)} - -# Run with all evaluators -evaluators = [ - GoalSuccessRateEvaluator(), - FailureCommunicationEvaluator(), - PartialCompletionEvaluator(), - RecoveryStrategyEvaluator(), -] - -experiment = ChaosExperiment(cases=chaos_cases, evaluators=evaluators) -reports = experiment.run_evaluations(task_function) - -# Display results -for report in reports: - print(f"\n{'='*60}") - print(f"Evaluator: {report.evaluator_name}") - print(f"{'='*60}") - report.run_display() -``` - -## Advanced Patterns - -### Pattern 1: Comparing Agent Configurations Under Chaos - -```python -def compare_agents_under_chaos(chaos_cases, configs): - """Compare how different agent configs handle the same failures.""" - results = {} - - for config_name, system_prompt in configs.items(): - def make_task(prompt): - def task_function(case: Case) -> dict: - agent = Agent( - system_prompt=prompt, - plugins=[ChaosPlugin()], - callback_handler=None, - ) - response = agent(case.input) - return {"output": str(response)} - return 
task_function - - experiment = ChaosExperiment( - cases=chaos_cases, - evaluators=[PartialCompletionEvaluator()] - ) - reports = experiment.run_evaluations(make_task(system_prompt)) - results[config_name] = reports - - return results -``` - -### Pattern 2: Progressive Failure Escalation - -```python -# Test increasing severity -effect_maps = { - "mild": { - "tool_effects": {"search": CorruptValues(corrupt_ratio=0.2)} - }, - "moderate": { - "tool_effects": {"search": CorruptValues(corrupt_ratio=0.5)} - }, - "severe": { - "tool_effects": {"search": CorruptValues(corrupt_ratio=0.9)} - }, - "total_failure": { - "tool_effects": {"search": Timeout()} - }, -} -``` - -### Pattern 3: Chaos with User Simulation - -Combine chaos testing with user simulation for multi-turn resilience evaluation: - -```python -from strands_evals import ActorSimulator - -def task_function(case: Case) -> dict: - user_sim = ActorSimulator.from_case_for_user_simulator( - case=case, max_turns=8 - ) - - agent = Agent( - system_prompt="You are a helpful assistant.", - plugins=[ChaosPlugin()], - callback_handler=None, - ) - - user_message = case.input - while user_sim.has_next(): - agent_response = agent(user_message) - user_result = user_sim.act(str(agent_response)) - user_message = str(user_result.structured_output.message) - - return {"output": str(agent_response)} -``` - -## Troubleshooting - -### Issue: Effects Not Being Applied - -Ensure `ChaosPlugin()` is in the agent's plugins list and you're using `ChaosExperiment` (not base `Experiment`): - -```python -# Correct -agent = Agent(plugins=[ChaosPlugin()], ...) -experiment = ChaosExperiment(cases=chaos_cases, ...) - -# Wrong — base Experiment doesn't set the ContextVar -experiment = Experiment(cases=chaos_cases, ...) -``` - -### Issue: Effects Not Reaching the Intended Tool - -Check that your effect map keys match the exact tool function names: - -```python -# If your tool is defined as: -def search_flights(...): ... 
- -# The effect map key must be "search_flights", not "searchFlights" or "search" -effect_maps = {"test": {"tool_effects": {"search_flights": Timeout()}}} -``` - -### Issue: Async Task Errors - -`ChaosExperiment` supports both sync and async tasks. Use the appropriate method: - -```python -# Sync -reports = experiment.run_evaluations(sync_task_function) - -# Async -reports = await experiment.run_evaluations_async(async_task_function) -``` - -## Related Documentation - -- [Chaos Testing Overview](/docs/user-guide/evals-sdk/chaos_testing/): Overview and quick start -- [Tool Simulation](/docs/user-guide/evals-sdk/simulators/tool_simulation/): Simulate tool behavior -- [Goal Success Rate Evaluator](/docs/user-guide/evals-sdk/evaluators/goal_success_rate_evaluator/): Assess goal completion -- [Simulators Overview](/docs/user-guide/evals-sdk/simulators/): Simulator framework diff --git a/site/src/content/docs/user-guide/evals-sdk/chaos_testing/index.mdx b/site/src/content/docs/user-guide/evals-sdk/chaos_testing/index.mdx deleted file mode 100644 index 4dd523b09..000000000 --- a/site/src/content/docs/user-guide/evals-sdk/chaos_testing/index.mdx +++ /dev/null @@ -1,286 +0,0 @@ ---- -title: Chaos Testing -tags: [error-handling, simulation] -sidebar: - label: "Overview" ---- - -## Overview - -Chaos testing systematically evaluates agent resilience by injecting controlled failures into tool execution. Using the `ChaosPlugin`, `ChaosCase`, and `ChaosExperiment` classes, you can test how agents handle tool timeouts, network errors, and corrupted responses — without modifying agent code. - -This enables you to answer questions like: -- Does the agent gracefully communicate failures to users? -- Can the agent achieve partial goals when some tools fail? -- Does the agent employ effective recovery strategies? - -## Why Chaos Testing? - -Traditional evaluation tests agents under ideal conditions. 
In production, tools fail unpredictably: - -**Standard Evaluation:** -- Tools always return correct responses -- No network failures or timeouts -- Cannot reveal fragile error handling -- Misses degraded-mode behavior - -**Chaos Testing:** -- Injects realistic tool failures (timeouts, network errors, validation errors) -- Corrupts tool responses (truncated fields, removed data, corrupted values) -- Tests agent resilience without live infrastructure failures -- Measures graceful degradation and recovery behavior -- Quantifies partial goal completion under failure - -## When to Use Chaos Testing - -Use chaos testing when you need to: -- **Evaluate Resilience**: Test how agents handle tool failures gracefully -- **Assess Recovery**: Verify agents try alternative approaches when tools fail -- **Measure Degradation**: Quantify how much of a goal agents achieve despite failures -- **Test Communication**: Ensure agents inform users clearly about failures -- **Validate Robustness**: Confirm agents don't crash or loop on corrupted data - -## Architecture - -Chaos testing integrates with Strands' plugin system via `BeforeToolCallEvent` and `AfterToolCallEvent` hooks: - -1. **ChaosCase** — Extends `Case` with an `effects` field that maps an effect category (currently `tool_effects`) to a dict of tool names and their failure effects -2. **ChaosPlugin** — A Strands plugin that intercepts tool calls and applies effects transparently -3. **ChaosExperiment** — Composes the base `Experiment` to manage chaos context per case -4. 
**ChaosEffect** — A hierarchy of pre-hook effects (cancel tool calls) and post-hook effects (corrupt responses) - -## Effect Types - -### Pre-hook Effects (Tool Call Failures) - -These effects cancel the tool call entirely and return an error: - -| Effect | Description | -| :------- | :------------ | -| `Timeout` | Simulates a tool execution timeout | -| `NetworkError` | Simulates a network connectivity failure | -| `ExecutionError` | Simulates a runtime error during tool execution | -| `ValidationError` | Simulates invalid input/output validation failure | - -### Post-hook Effects (Response Corruption) - -These effects let the tool execute but corrupt the response: - -| Effect | Description | Parameters | -| :------- | :------------ | :----------- | -| `TruncateFields` | Truncates string fields in the response | `max_length` | -| `RemoveFields` | Randomly removes fields from the response | `remove_ratio` | -| `CorruptValues` | Corrupts field values with garbage data | `corrupt_ratio` | - -## Basic Usage - -```python -from strands import Agent -from strands_evals import Case -from strands_evals.chaos import ChaosCase, ChaosExperiment, ChaosPlugin -from strands_evals.chaos.effects import Timeout, NetworkError -from strands_evals.evaluators import GoalSuccessRateEvaluator - -# Define base test cases -base_cases = [ - Case( - name="weather-lookup", - input="What's the weather in Seattle?", - metadata={"task_description": "Weather information provided"} - ) -] - -# Define named effect maps -effect_maps = { - "search_timeout": { - "tool_effects": {"get_weather": Timeout()} - }, - "network_failure": { - "tool_effects": {"get_weather": NetworkError()} - }, -} - -# Expand cases into Cartesian product (base cases × effect maps + baseline) -chaos_cases = ChaosCase.expand(base_cases, effect_maps, include_baseline=True) - -# Run chaos experiment -experiment = ChaosExperiment( - cases=chaos_cases, - evaluators=[GoalSuccessRateEvaluator()] -) - -def task_function(case: Case) -> 
dict: - agent = Agent( - system_prompt="You are a helpful weather assistant.", - plugins=[ChaosPlugin()], - callback_handler=None - ) - response = agent(case.input) - return {"output": str(response)} - -reports = experiment.run_evaluations(task_function) -``` - -## ChaosCase.expand() - -The `expand()` class method generates the Cartesian product of base cases and effect maps, optionally including a baseline (no effects) for comparison: - -```python -chaos_cases = ChaosCase.expand( - cases=base_cases, # List of base Case objects - effect_maps=effect_maps, # Dict of named effect configurations - include_baseline=True # Include cases with no effects for comparison -) -``` - -For 2 base cases and 3 effect maps with `include_baseline=True`, this produces `2 × (3 + 1) = 8` chaos cases. - -## Integration with ToolSimulator - -Chaos testing works naturally with `ToolSimulator` for fully controlled evaluation — simulated tools provide reproducible responses, and chaos effects inject failures on top: - -```python -from strands import Agent -from strands_evals import Case -from strands_evals.chaos import ChaosCase, ChaosExperiment, ChaosPlugin -from strands_evals.chaos.effects import Timeout, CorruptValues -from strands_evals.evaluators import GoalSuccessRateEvaluator -from strands_evals.simulation.tool_simulator import ToolSimulator -from pydantic import BaseModel, Field - -tool_simulator = ToolSimulator() - -class SearchResult(BaseModel): - title: str = Field(..., description="Result title") - snippet: str = Field(..., description="Result snippet") - -@tool_simulator.tool(output_schema=SearchResult) -def web_search(query: str) -> dict: - """Search the web for information.""" - pass - -# Define effect maps -effect_maps = { - "search_timeout": { - "tool_effects": {"web_search": Timeout()} - }, - "corrupted_results": { - "tool_effects": {"web_search": CorruptValues(corrupt_ratio=0.5)} - }, -} - -base_cases = [ - Case(name="research", input="Find recent news about AI agents") 
-] - -chaos_cases = ChaosCase.expand(base_cases, effect_maps, include_baseline=True) - -def task_function(case: Case) -> dict: - search_tool = tool_simulator.get_tool("web_search") - agent = Agent( - tools=[search_tool], - plugins=[ChaosPlugin()], - callback_handler=None - ) - response = agent(case.input) - return {"output": str(response)} - -experiment = ChaosExperiment( - cases=chaos_cases, - evaluators=[GoalSuccessRateEvaluator()] -) -reports = experiment.run_evaluations(task_function) -``` - -## Resilience Evaluators - -Chaos testing ships with three specialized evaluators designed to assess agent behavior under failure: - -| Evaluator | What It Measures | -| :---------- | :---------------- | -| `FailureCommunicationEvaluator` | Clarity, actionability, transparency, and tone of failure messages | -| `PartialCompletionEvaluator` | Percentage of user goal achieved despite failures (0.0–1.0) | -| `RecoveryStrategyEvaluator` | Quality of recovery actions: exploration breadth, retry discipline, approach variation | - -```python -from strands_evals.chaos.evaluators import ( - FailureCommunicationEvaluator, - PartialCompletionEvaluator, - RecoveryStrategyEvaluator, -) - -evaluators = [ - GoalSuccessRateEvaluator(), - FailureCommunicationEvaluator(), - PartialCompletionEvaluator(), - RecoveryStrategyEvaluator(), -] - -experiment = ChaosExperiment(cases=chaos_cases, evaluators=evaluators) -``` - -[Complete Chaos Testing Guide →](/docs/user-guide/evals-sdk/chaos_testing/chaos_testing/) - -## Chaos Testing vs Simulators - -| Aspect | Simulators | Chaos Testing | -| :------- | :---------- | :-------------- | -| **Role** | Replace tool execution entirely | Inject failures into tool execution | -| **Scope** | All tool calls are simulated | Only targeted tools are affected | -| **Use Case** | Test without infrastructure | Test resilience under failure | -| **Combination** | Can be used together | Chaos effects apply on top of simulated tools | - -## Best Practices - -### 1. 
Start with Baseline Comparisons - -Always pass `include_baseline=True` to `ChaosCase.expand()` so you can compare agent performance with and without failures: - -```python -chaos_cases = ChaosCase.expand(cases, effect_maps, include_baseline=True) -``` - -### 2. Test One Failure at a Time First - -Start with single-tool failures before testing compound chaos: - -```python -# Single failure -effect_maps = { - "search_fails": {"tool_effects": {"search": Timeout()}}, -} - -# Compound (test after single failures are understood) -effect_maps = { - "total_chaos": { - "tool_effects": { - "search": Timeout(), - "database": NetworkError(), - } - }, -} -``` - -### 3. Use Resilience Evaluators Together - -Combine all three resilience evaluators for a complete picture: - -```python -evaluators = [ - FailureCommunicationEvaluator(), # Did the agent tell the user? - PartialCompletionEvaluator(), # How much was achieved? - RecoveryStrategyEvaluator(), # Did it try alternatives? -] -``` - -## Next Steps - -- [Chaos Testing Guide](/docs/user-guide/evals-sdk/chaos_testing/chaos_testing/): Complete guide with advanced patterns -- [Tool Simulation](/docs/user-guide/evals-sdk/simulators/tool_simulation/): Simulate tool behavior -- [Goal Success Rate Evaluator](/docs/user-guide/evals-sdk/evaluators/goal_success_rate_evaluator/): Assess goal completion - -## Related Documentation - -- [Simulators Overview](/docs/user-guide/evals-sdk/simulators/): Simulator framework -- [Evaluators](/docs/user-guide/evals-sdk/evaluators/): All available evaluators -- [Quickstart Guide](/docs/user-guide/evals-sdk/quickstart/): Get started with Strands Evals