This repository was archived by the owner on Jun 3, 2026. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 224
docs: add chaos testing doc and example script #836
Open
ybdarrenwang
wants to merge
6
commits into
strands-agents:main
Choose a base branch
from
ybdarrenwang:docs/chaos-tool
base: main
Could not load branches
Branch not found: {{ refName }}
Loading
Could not load tags
Nothing to show
Loading
Are you sure you want to change the base?
Some commits from the old base branch may be removed from the timeline,
and old review comments may become outdated.
+566
−0
Open
Changes from 5 commits
Commits
Show all changes
6 commits
Select commit
Hold shift + click to select a range
74a3637
add chaos testing example script
ybdarrenwang f5033bf
replace chaos scenario with chaos case
ybdarrenwang e9877bc
update chaos effect map format; rename script
ybdarrenwang 7798e6e
add resilience evaluator examples
ybdarrenwang a13deef
add chaos testing webpages
ybdarrenwang 5acf867
revert chaos webpage; use flatten in example
ybdarrenwang File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
120 changes: 120 additions & 0 deletions
120
site/docs/examples/evals-sdk/chaos_failure_communication.py
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,120 @@ | ||
| import logging | ||
| from typing import Any | ||
|
|
||
| from pydantic import BaseModel, Field | ||
|
|
||
| from strands import Agent | ||
| from strands_evals.chaos import ChaosCase, ChaosExperiment, ChaosPlugin, Timeout | ||
| from strands_evals.chaos.effects import NetworkError | ||
| from strands_evals.chaos.evaluators import FailureCommunicationEvaluator | ||
| from strands_evals.mappers import StrandsInMemorySessionMapper | ||
| from strands_evals.simulation.tool_simulator import ToolSimulator | ||
| from strands_evals.telemetry import StrandsEvalsTelemetry | ||
|
|
||
# Log bare messages only (no level/logger prefix) at INFO and above.
logging.basicConfig(level=logging.INFO, format="%(message)s")
logger = logging.getLogger(__name__)

# In-memory span exporter; the task function reads finished spans back from it.
telemetry = StrandsEvalsTelemetry().setup_in_memory_exporter()
memory_exporter = telemetry.in_memory_exporter

# Simulator that the tool decorators in this script register against.
tool_simulator = ToolSimulator()
|
|
||
|
|
||
# Output schema attached to the simulated `search_flights` tool.
# Documented with `#` comments on purpose: a class docstring would be emitted
# as the model's JSON-schema description.
class FlightSearchResponse(BaseModel):
    flights: list[dict[str, Any]] = Field(default_factory=list)
    total_results: int = 0
    status: str = "success"
|
|
||
|
|
||
# Output schema attached to the simulated `book_flight` tool.
# Documented with `#` comments on purpose: a class docstring would be emitted
# as the model's JSON-schema description.
class BookFlightResponse(BaseModel):
    booking_id: str = ""
    flight_id: str = ""
    status: str = "success"
    message: str = ""
|
|
||
|
|
||
# Registered with the simulator under its function name (fetched later via
# `tool_simulator.get_tool("search_flights")`). The body is intentionally empty.
# NOTE(review): presumably the simulator produces responses shaped like
# FlightSearchResponse — confirm against the ToolSimulator documentation.
@tool_simulator.tool(output_schema=FlightSearchResponse)
def search_flights(origin: str, destination: str, date: str) -> dict[str, Any]:
    """Search for available flights between two cities on a given date."""
|
|
||
|
|
||
# Registered with the simulator under its function name (fetched later via
# `tool_simulator.get_tool("book_flight")`). The body is intentionally empty.
# NOTE(review): presumably the simulator produces responses shaped like
# BookFlightResponse — confirm against the ToolSimulator documentation.
@tool_simulator.tool(output_schema=BookFlightResponse)
def book_flight(flight_id: str) -> dict[str, Any]:
    """Book a specific flight by its flight ID."""
|
|
||
|
|
||
chaos_plugin = ChaosPlugin()

# Cases exercising how well the agent communicates failures:
#   search_timeout - the search tool times out; the agent must tell the user it failed.
#   all_tools_down - search and booking both fail; the agent must report multiple failures clearly.
chaos_cases = [
    ChaosCase(
        name="search_timeout",
        input="Find me a flight from SFO to JFK on May 20 and book the cheapest one.",
        effects={
            "tool_effects": {
                "search_flights": [Timeout(error_message="Tool call timed out after 30s")],
            },
        },
    ),
    ChaosCase(
        name="all_tools_down",
        input="Search for flights from Seattle to Tokyo next Tuesday and book one.",
        effects={
            "tool_effects": {
                "search_flights": [NetworkError(error_message="DNS resolution failed")],
                "book_flight": [NetworkError(error_message="Connection refused")],
            },
        },
    ),
]

# Tool objects registered on the simulator, handed to the agent in travel_agent_task.
_search_tool = tool_simulator.get_tool("search_flights")
_book_tool = tool_simulator.get_tool("book_flight")
|
|
||
|
|
||
def travel_agent_task(case: ChaosCase) -> dict:
    """Execute a single chaos case against a freshly built travel agent.

    Args:
        case: The chaos case to run; its ``name``, ``input`` and
            ``session_id`` are read here.

    Returns:
        A dict with two keys: ``"output"`` holds the agent's reply as text
        (or a short error summary if the agent call raised), and
        ``"trajectory"`` holds the session mapped from the spans recorded
        during the run.
    """
    divider = "─" * 60
    logger.info(f"\n{divider}")
    logger.info(f" Case: {case.name}")
    logger.info(f" User: {case.input}")

    session_id = case.session_id
    instructions = (
        "You are a travel booking assistant. Use the available tools to complete "
        "the user's request. Today's date is May 18, 2025.\n\n"
        "If a tool fails or returns an error:\n"
        "- Acknowledge the failure honestly to the user\n"
        "- Explain what went wrong in plain language\n"
        "- Suggest next steps (retry later, try alternative)\n"
        "- Do NOT hallucinate successful results"
    )
    agent = Agent(
        system_prompt=instructions,
        tools=[_search_tool, _book_tool],
        plugins=[chaos_plugin],
        callback_handler=None,
        trace_attributes={"gen_ai.conversation.id": session_id, "session.id": session_id},
    )

    # The exporter is shared at module level, so empty it before this run.
    memory_exporter.clear()
    try:
        output = str(agent(case.input))
    except Exception as exc:
        # Keep the experiment going: the failure itself becomes the output to evaluate.
        output = f"Agent failed with error: {type(exc).__name__}: {str(exc)[:200]}"

    # Log at most the first 300 characters of the reply.
    preview = output[:300]
    ellipsis = "..." if len(output) > 300 else ""
    logger.info(f" Agent: {preview}{ellipsis}")
    logger.info(divider)

    spans = memory_exporter.get_finished_spans()
    trajectory = StrandsInMemorySessionMapper().map_to_session(spans, session_id=session_id)
    return {"output": output, "trajectory": trajectory}
|
|
||
|
|
||
# Evaluate every chaos case with the failure-communication evaluator.
experiment = ChaosExperiment(cases=chaos_cases, evaluators=[FailureCommunicationEvaluator()])

# Run the task for each case, then display the first report.
reports = experiment.run_evaluations(task=travel_agent_task)
reports[0].run_display()
137 changes: 137 additions & 0 deletions
137
site/docs/examples/evals-sdk/chaos_partial_completion.py
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,137 @@ | ||
| import logging | ||
| from typing import Any | ||
|
|
||
| from pydantic import BaseModel, Field | ||
|
|
||
| from strands import Agent | ||
| from strands_evals.chaos import ChaosCase, ChaosExperiment, ChaosPlugin, TruncateFields | ||
| from strands_evals.chaos.effects import NetworkError | ||
| from strands_evals.chaos.evaluators import PartialCompletionEvaluator | ||
| from strands_evals.mappers import StrandsInMemorySessionMapper | ||
| from strands_evals.simulation.tool_simulator import ToolSimulator | ||
| from strands_evals.telemetry import StrandsEvalsTelemetry | ||
|
|
||
# Log bare messages only (no level/logger prefix) at INFO and above.
logging.basicConfig(level=logging.INFO, format="%(message)s")
logger = logging.getLogger(__name__)

# In-memory span exporter; the task function reads finished spans back from it.
telemetry = StrandsEvalsTelemetry().setup_in_memory_exporter()
memory_exporter = telemetry.in_memory_exporter

# Simulator that the tool decorators in this script register against.
tool_simulator = ToolSimulator()
|
|
||
|
|
||
# Output schema attached to the simulated `search_flights` tool.
# Documented with `#` comments on purpose: a class docstring would be emitted
# as the model's JSON-schema description.
class FlightSearchResponse(BaseModel):
    flights: list[dict[str, Any]] = Field(default_factory=list)
    total_results: int = 0
    status: str = "success"
|
|
||
|
|
||
# Output schema attached to the simulated `book_flight` tool.
# Documented with `#` comments on purpose: a class docstring would be emitted
# as the model's JSON-schema description.
class BookFlightResponse(BaseModel):
    booking_id: str = ""
    flight_id: str = ""
    status: str = "success"
    message: str = ""
|
|
||
|
|
||
# Output schema attached to the simulated `send_booking_confirmation` tool.
# Documented with `#` comments on purpose: a class docstring would be emitted
# as the model's JSON-schema description.
class BookingConfirmationResponse(BaseModel):
    confirmation_sent: bool = False
    method: str = "email"
    message: str = ""
|
|
||
|
|
||
# Registered with the simulator under its function name (fetched later via
# `tool_simulator.get_tool("search_flights")`). The body is intentionally empty.
# NOTE(review): presumably the simulator produces responses shaped like
# FlightSearchResponse — confirm against the ToolSimulator documentation.
@tool_simulator.tool(output_schema=FlightSearchResponse)
def search_flights(origin: str, destination: str, date: str) -> dict[str, Any]:
    """Search for available flights between two cities on a given date."""
|
|
||
|
|
||
# Registered with the simulator under its function name (fetched later via
# `tool_simulator.get_tool("book_flight")`). The body is intentionally empty.
# NOTE(review): presumably the simulator produces responses shaped like
# BookFlightResponse — confirm against the ToolSimulator documentation.
@tool_simulator.tool(output_schema=BookFlightResponse)
def book_flight(flight_id: str) -> dict[str, Any]:
    """Book a specific flight by its flight ID."""
|
|
||
|
|
||
# Registered with the simulator under its function name (fetched later via
# `tool_simulator.get_tool("send_booking_confirmation")`). The body is intentionally empty.
# NOTE(review): presumably the simulator produces responses shaped like
# BookingConfirmationResponse — confirm against the ToolSimulator documentation.
@tool_simulator.tool(output_schema=BookingConfirmationResponse)
def send_booking_confirmation(
    booking_id: str = "",
    flight_id: str = "",
    method: str = "email",
) -> dict[str, Any]:
    """Send booking confirmation to the user via email or SMS."""
|
|
||
|
|
||
chaos_plugin = ChaosPlugin()

# Cases exercising partial completion:
#   search_degraded_booking_fails - search output is truncated and booking fails, so the
#       user gets degraded flight info but no reservation.
#   confirmation_fails - only the confirmation tool fails, so the user gets most of
#       what they asked for.
chaos_cases = [
    ChaosCase(
        name="search_degraded_booking_fails",
        input="Find me a flight from SFO to JFK on May 20, book the cheapest one, and send me a confirmation.",
        effects={
            "tool_effects": {
                "search_flights": [TruncateFields(max_length=5)],
                "book_flight": [NetworkError(error_message="Connection reset by peer")],
            },
        },
    ),
    ChaosCase(
        name="confirmation_fails",
        input="Search for flights from Seattle to Tokyo next Tuesday, book one, and email me the confirmation.",
        effects={
            "tool_effects": {
                "send_booking_confirmation": [NetworkError(error_message="SMTP server unreachable")],
            },
        },
    ),
]

# Tool objects registered on the simulator, handed to the agent in travel_agent_task.
_search_tool = tool_simulator.get_tool("search_flights")
_book_tool = tool_simulator.get_tool("book_flight")
_confirm_tool = tool_simulator.get_tool("send_booking_confirmation")
|
|
||
|
|
||
def travel_agent_task(case: ChaosCase) -> dict:
    """Execute a single chaos case against a freshly built travel agent.

    Args:
        case: The chaos case to run; its ``name``, ``input`` and
            ``session_id`` are read here.

    Returns:
        A dict with two keys: ``"output"`` holds the agent's reply as text
        (or a short error summary if the agent call raised), and
        ``"trajectory"`` holds the session mapped from the spans recorded
        during the run.
    """
    divider = "─" * 60
    logger.info(f"\n{divider}")
    logger.info(f" Case: {case.name}")
    logger.info(f" User: {case.input}")

    session_id = case.session_id
    instructions = (
        "You are a travel booking assistant. Use the available tools to complete "
        "the user's request. Today's date is May 18, 2025.\n\n"
        "If a tool fails or returns an error:\n"
        "- Acknowledge the failure honestly\n"
        "- Complete as much of the request as possible\n"
        "- Do NOT hallucinate successful results\n"
        "- Do NOT retry more than once"
    )
    agent = Agent(
        system_prompt=instructions,
        tools=[_search_tool, _book_tool, _confirm_tool],
        plugins=[chaos_plugin],
        callback_handler=None,
        trace_attributes={"gen_ai.conversation.id": session_id, "session.id": session_id},
    )

    # The exporter is shared at module level, so empty it before this run.
    memory_exporter.clear()
    try:
        output = str(agent(case.input))
    except Exception as exc:
        # Keep the experiment going: the failure itself becomes the output to evaluate.
        output = f"Agent failed with error: {type(exc).__name__}: {str(exc)[:200]}"

    # Log at most the first 300 characters of the reply.
    preview = output[:300]
    ellipsis = "..." if len(output) > 300 else ""
    logger.info(f" Agent: {preview}{ellipsis}")
    logger.info(divider)

    spans = memory_exporter.get_finished_spans()
    trajectory = StrandsInMemorySessionMapper().map_to_session(spans, session_id=session_id)
    return {"output": output, "trajectory": trajectory}
|
|
||
|
|
||
# Evaluate every chaos case with the partial-completion evaluator.
experiment = ChaosExperiment(cases=chaos_cases, evaluators=[PartialCompletionEvaluator()])

# Run the task for each case, then display the first report.
reports = experiment.run_evaluations(task=travel_agent_task)
reports[0].run_display()
133 changes: 133 additions & 0 deletions
133
site/docs/examples/evals-sdk/chaos_recovery_strategy.py
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,133 @@ | ||
| import logging | ||
| from typing import Any | ||
|
|
||
| from pydantic import BaseModel, Field | ||
|
|
||
| from strands import Agent | ||
| from strands_evals.chaos import ChaosCase, ChaosExperiment, ChaosPlugin, Timeout | ||
| from strands_evals.chaos.effects import ExecutionError | ||
| from strands_evals.chaos.evaluators import RecoveryStrategyEvaluator | ||
| from strands_evals.mappers import StrandsInMemorySessionMapper | ||
| from strands_evals.simulation.tool_simulator import ToolSimulator | ||
| from strands_evals.telemetry import StrandsEvalsTelemetry | ||
|
|
||
# Log bare messages only (no level/logger prefix) at INFO and above.
logging.basicConfig(level=logging.INFO, format="%(message)s")
logger = logging.getLogger(__name__)

# In-memory span exporter; the task function reads finished spans back from it.
telemetry = StrandsEvalsTelemetry().setup_in_memory_exporter()
memory_exporter = telemetry.in_memory_exporter

# Simulator that the tool decorators in this script register against.
tool_simulator = ToolSimulator()
|
|
||
|
|
||
# Output schema attached to the simulated `search_flights` tool.
# Documented with `#` comments on purpose: a class docstring would be emitted
# as the model's JSON-schema description.
class FlightSearchResponse(BaseModel):
    flights: list[dict[str, Any]] = Field(default_factory=list)
    total_results: int = 0
    status: str = "success"
|
|
||
|
|
||
# Output schema attached to the simulated `search_hotels` tool.
# Documented with `#` comments on purpose: a class docstring would be emitted
# as the model's JSON-schema description.
class HotelSearchResponse(BaseModel):
    hotels: list[dict[str, Any]] = Field(default_factory=list)
    total_results: int = 0
    status: str = "success"
|
|
||
|
|
||
# Output schema attached to the simulated `book_flight` tool.
# Documented with `#` comments on purpose: a class docstring would be emitted
# as the model's JSON-schema description.
class BookFlightResponse(BaseModel):
    booking_id: str = ""
    flight_id: str = ""
    status: str = "success"
    message: str = ""
|
|
||
|
|
||
# Registered with the simulator under its function name (fetched later via
# `tool_simulator.get_tool("search_flights")`). The body is intentionally empty.
# NOTE(review): presumably the simulator produces responses shaped like
# FlightSearchResponse — confirm against the ToolSimulator documentation.
@tool_simulator.tool(output_schema=FlightSearchResponse)
def search_flights(origin: str, destination: str, date: str) -> dict[str, Any]:
    """Search for available flights between two cities on a given date."""
|
|
||
|
|
||
# Registered with the simulator under its function name (fetched later via
# `tool_simulator.get_tool("search_hotels")`). The body is intentionally empty.
# NOTE(review): presumably the simulator produces responses shaped like
# HotelSearchResponse — confirm against the ToolSimulator documentation.
@tool_simulator.tool(output_schema=HotelSearchResponse)
def search_hotels(city: str, check_in: str, check_out: str) -> dict[str, Any]:
    """Search for available hotels in a city for given dates."""
|
|
||
|
|
||
# Registered with the simulator under its function name (fetched later via
# `tool_simulator.get_tool("book_flight")`). The body is intentionally empty.
# NOTE(review): presumably the simulator produces responses shaped like
# BookFlightResponse — confirm against the ToolSimulator documentation.
@tool_simulator.tool(output_schema=BookFlightResponse)
def book_flight(flight_id: str) -> dict[str, Any]:
    """Book a specific flight by its flight ID."""
|
|
||
|
|
||
chaos_plugin = ChaosPlugin()

# Cases exercising recovery strategy:
#   flight_timeout_hotel_available - flight search times out while hotel search is left
#       untouched; the agent should pivot to the hotel search.
#   flight_and_booking_fail - flight search and booking both raise execution errors;
#       the agent should try once, then move on.
chaos_cases = [
    ChaosCase(
        name="flight_timeout_hotel_available",
        input="Plan my trip to Tokyo: find flights from SFO and hotels for May 20-23.",
        effects={
            "tool_effects": {
                "search_flights": [Timeout()],
            },
        },
    ),
    ChaosCase(
        name="flight_and_booking_fail",
        input="Find a flight from NYC to London on June 1 and book the cheapest option.",
        effects={
            "tool_effects": {
                "search_flights": [ExecutionError(error_message="Internal server error")],
                "book_flight": [ExecutionError(error_message="Service unavailable")],
            },
        },
    ),
]

# Tool objects registered on the simulator, handed to the agent in travel_agent_task.
_search_flights_tool = tool_simulator.get_tool("search_flights")
_search_hotels_tool = tool_simulator.get_tool("search_hotels")
_book_tool = tool_simulator.get_tool("book_flight")
|
|
||
|
|
||
def travel_agent_task(case: ChaosCase) -> dict:
    """Execute a single chaos case against a freshly built travel agent.

    Args:
        case: The chaos case to run; its ``name``, ``input`` and
            ``session_id`` are read here.

    Returns:
        A dict with two keys: ``"output"`` holds the agent's reply as text
        (or a short error summary if the agent call raised), and
        ``"trajectory"`` holds the session mapped from the spans recorded
        during the run.
    """
    divider = "─" * 60
    logger.info(f"\n{divider}")
    logger.info(f" Case: {case.name}")
    logger.info(f" User: {case.input}")

    session_id = case.session_id
    instructions = (
        "You are a travel planning assistant. Use the available tools to complete "
        "the user's request. Today's date is May 18, 2025.\n\n"
        "If a tool fails:\n"
        "- Try alternative tools that can partially fulfill the request\n"
        "- Do NOT retry the same failed tool more than once\n"
        "- Do NOT hallucinate results\n"
        "- Complete as much of the request as possible with working tools"
    )
    agent = Agent(
        system_prompt=instructions,
        tools=[_search_flights_tool, _search_hotels_tool, _book_tool],
        plugins=[chaos_plugin],
        callback_handler=None,
        trace_attributes={"gen_ai.conversation.id": session_id, "session.id": session_id},
    )

    # The exporter is shared at module level, so empty it before this run.
    memory_exporter.clear()
    try:
        output = str(agent(case.input))
    except Exception as exc:
        # Keep the experiment going: the failure itself becomes the output to evaluate.
        output = f"Agent failed with error: {type(exc).__name__}: {str(exc)[:200]}"

    # Log at most the first 300 characters of the reply.
    preview = output[:300]
    ellipsis = "..." if len(output) > 300 else ""
    logger.info(f" Agent: {preview}{ellipsis}")
    logger.info(divider)

    spans = memory_exporter.get_finished_spans()
    trajectory = StrandsInMemorySessionMapper().map_to_session(spans, session_id=session_id)
    return {"output": output, "trajectory": trajectory}
|
|
||
|
|
||
# Evaluate every chaos case with the recovery-strategy evaluator.
experiment = ChaosExperiment(cases=chaos_cases, evaluators=[RecoveryStrategyEvaluator()])

# Run the task for each case, then display the first report.
reports = experiment.run_evaluations(task=travel_agent_task)
reports[0].run_display()
Oops, something went wrong.
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.