diff --git a/site/docs/examples/evals-sdk/chaos_failure_communication_evaluator.py b/site/docs/examples/evals-sdk/chaos_failure_communication_evaluator.py
new file mode 100644
index 000000000..d78ab0430
--- /dev/null
+++ b/site/docs/examples/evals-sdk/chaos_failure_communication_evaluator.py
@@ -0,0 +1,121 @@
+import logging
+from typing import Any
+
+from pydantic import BaseModel, Field
+
+from strands import Agent
+from strands_evals.chaos import ChaosCase, ChaosExperiment, ChaosPlugin, Timeout
+from strands_evals.chaos.effects import NetworkError
+from strands_evals.evaluators.chaos import FailureCommunicationEvaluator
+from strands_evals.mappers import StrandsInMemorySessionMapper
+from strands_evals.simulation.tool_simulator import ToolSimulator
+from strands_evals.telemetry import StrandsEvalsTelemetry
+from strands_evals.types.evaluation_report import EvaluationReport
+
+logging.basicConfig(level=logging.INFO, format="%(message)s")  # log lines carry the message text only
+logger = logging.getLogger(__name__)
+
+telemetry = StrandsEvalsTelemetry().setup_in_memory_exporter()
+memory_exporter = telemetry.in_memory_exporter  # travel_agent_task reads the finished spans back from this exporter
+
+tool_simulator = ToolSimulator()  # the @tool_simulator.tool stubs below are fetched from it by name via get_tool()
+
+
+class FlightSearchResponse(BaseModel):  # output_schema of the simulated search_flights tool
+    flights: list[dict[str, Any]] = Field(default_factory=list)
+    total_results: int = Field(default=0)
+    status: str = Field(default="success")
+
+
+class BookFlightResponse(BaseModel):  # output_schema of the simulated book_flight tool
+    booking_id: str = Field(default="")
+    flight_id: str = Field(default="")
+    status: str = Field(default="success")
+    message: str = Field(default="")
+
+
+@tool_simulator.tool(output_schema=FlightSearchResponse)
+def search_flights(origin: str, destination: str, date: str) -> dict[str, Any]:
+    """Search for available flights between two cities on a given date."""
+    pass  # stub only; NOTE(review): presumably ToolSimulator generates the response from output_schema - confirm
+
+
+@tool_simulator.tool(output_schema=BookFlightResponse)
+def book_flight(flight_id: str) -> dict[str, Any]:
+    """Book a specific flight by its flight ID."""
+    pass  # stub only; NOTE(review): presumably ToolSimulator generates the response from output_schema - confirm
+
+
+chaos_plugin = ChaosPlugin()  # handed to every Agent in travel_agent_task via plugins=[chaos_plugin]
+
+# Two cases that test communication quality:
+# 
1. Search times out — agent must inform user about the failure
+# 2. Both tools fail — agent must communicate multiple failures clearly
+chaos_cases = [
+    ChaosCase(
+        name="search_timeout",
+        input="Find me a flight from SFO to JFK on May 20 and book the cheapest one.",
+        effects={"tool_effects": {"search_flights": [Timeout(error_message="Tool call timed out after 30s")]}},  # book_flight has no effect configured here
+    ),
+    ChaosCase(
+        name="all_tools_down",
+        input="Search for flights from Seattle to Tokyo next Tuesday and book one.",
+        effects={
+            "tool_effects": {
+                "search_flights": [NetworkError(error_message="DNS resolution failed")],
+                "book_flight": [NetworkError(error_message="Connection refused")],
+            },
+        },
+    ),
+]
+
+_search_tool = tool_simulator.get_tool("search_flights")  # fetched once at module level and shared by every Agent built below
+_book_tool = tool_simulator.get_tool("book_flight")
+
+
+def travel_agent_task(case: ChaosCase) -> dict:
+    """Run the travel agent under chaos and return output + trajectory."""
+    logger.info(f"\n{'─'*60}")  # visual separator before each case
+    logger.info(f" Case: {case.name}")
+    logger.info(f" User: {case.input}")
+
+    agent = Agent(  # a new Agent is constructed on every call, i.e. once per case
+        system_prompt=(
+            "You are a travel booking assistant. Use the available tools to complete "
+            "the user's request. Today's date is May 18, 2025.\n\n"
+            "If a tool fails or returns an error:\n"
+            "- Acknowledge the failure honestly to the user\n"
+            "- Explain what went wrong in plain language\n"
+            "- Suggest next steps (retry later, try alternative)\n"
+            "- Do NOT hallucinate successful results"
+        ),
+        tools=[_search_tool, _book_tool],
+        plugins=[chaos_plugin],
+        callback_handler=None,  # NOTE(review): presumably turns off the default console callback - confirm
+        trace_attributes={"gen_ai.conversation.id": case.session_id, "session.id": case.session_id},  # same id is passed to the mapper below
+    )
+
+    memory_exporter.clear()  # called before the run so the exporter is cleared ahead of this case's spans
+    try:
+        result = agent(case.input)
+        output = str(result)
+    except Exception as e:  # deliberate catch-all: the failure text becomes this case's output instead of aborting the run
+        output = f"Agent failed with error: {type(e).__name__}: {str(e)[:200]}"  # exception text capped at 200 chars
+
+    logger.info(f" Agent: {output[:300]}{'...' if len(output) > 300 else ''}")  # log preview capped at 300 chars
+    logger.info(f"{'─'*60}")
+
+    finished_spans = memory_exporter.get_finished_spans()
+    mapper = StrandsInMemorySessionMapper()
+    session = mapper.map_to_session(finished_spans, session_id=case.session_id)
+
+    return {"output": output, "trajectory": session}  # text answer plus the session mapped from the spans
+
+
+experiment = ChaosExperiment(
+    cases=chaos_cases,
+    evaluators=[FailureCommunicationEvaluator()],
+)
+
+reports = experiment.run_evaluations(task=travel_agent_task)  # module-level call: executing this file runs the experiment
+EvaluationReport.flatten(reports).run_display()
diff --git a/site/docs/examples/evals-sdk/chaos_partial_completion_evaluator.py b/site/docs/examples/evals-sdk/chaos_partial_completion_evaluator.py
new file mode 100644
index 000000000..d247ae0aa
--- /dev/null
+++ b/site/docs/examples/evals-sdk/chaos_partial_completion_evaluator.py
@@ -0,0 +1,138 @@
+import logging
+from typing import Any
+
+from pydantic import BaseModel, Field
+
+from strands import Agent
+from strands_evals.chaos import ChaosCase, ChaosExperiment, ChaosPlugin, TruncateFields
+from strands_evals.chaos.effects import NetworkError
+from strands_evals.evaluators.chaos import PartialCompletionEvaluator
+from strands_evals.mappers import StrandsInMemorySessionMapper
+from strands_evals.simulation.tool_simulator import ToolSimulator
+from strands_evals.telemetry import StrandsEvalsTelemetry
+from strands_evals.types.evaluation_report import EvaluationReport
+
+logging.basicConfig(level=logging.INFO, format="%(message)s")  # log lines carry the message text only
+logger = logging.getLogger(__name__)
+
+telemetry = StrandsEvalsTelemetry().setup_in_memory_exporter()
+memory_exporter = telemetry.in_memory_exporter  # travel_agent_task reads the finished spans back from this exporter
+
+tool_simulator = ToolSimulator()  # the @tool_simulator.tool stubs below are fetched from it by name via get_tool()
+
+
+class FlightSearchResponse(BaseModel):  # output_schema of the simulated search_flights tool
+    flights: list[dict[str, Any]] = Field(default_factory=list)
+    total_results: int = Field(default=0)
+    status: str = Field(default="success")
+
+
+class BookFlightResponse(BaseModel):  # output_schema of the simulated book_flight tool
+    booking_id: str = Field(default="")
+    flight_id: str = Field(default="")
+    status: str = Field(default="success")
+    message: str = Field(default="")
+
+
+class BookingConfirmationResponse(BaseModel):  # output_schema of the simulated send_booking_confirmation tool
+    confirmation_sent: bool = Field(default=False)
+    method: str = Field(default="email")
+    message: str = Field(default="")
+
+
+@tool_simulator.tool(output_schema=FlightSearchResponse)
+def search_flights(origin: str, destination: str, date: str) -> dict[str, Any]:
+    """Search for available flights between two cities on a given date."""
+    pass  # stub only; NOTE(review): presumably ToolSimulator generates the response from output_schema - confirm
+
+
+@tool_simulator.tool(output_schema=BookFlightResponse)
+def book_flight(flight_id: str) -> dict[str, Any]:
+    """Book a specific flight by its flight ID."""
+    pass  # stub only; NOTE(review): presumably ToolSimulator generates the response from output_schema - confirm
+
+
+@tool_simulator.tool(output_schema=BookingConfirmationResponse)
+def send_booking_confirmation(booking_id: str = "", flight_id: str = "", method: str = "email") -> dict[str, Any]:
+    """Send booking confirmation to the user via email or SMS."""
+    pass  # stub only; NOTE(review): presumably ToolSimulator generates the response from output_schema - confirm
+
+
+chaos_plugin = ChaosPlugin()  # handed to every Agent in travel_agent_task via plugins=[chaos_plugin]
+
+# Two cases that test partial completion:
+# 1. Search works (truncated) but booking fails — user gets degraded flight info but no reservation
+# 2. 
Search and booking work but confirmation fails — user gets most of what they asked for
+chaos_cases = [
+    ChaosCase(
+        name="search_degraded_booking_fails",
+        input="Find me a flight from SFO to JFK on May 20, book the cheapest one, and send me a confirmation.",
+        effects={
+            "tool_effects": {
+                "search_flights": [TruncateFields(max_length=5)],
+                "book_flight": [NetworkError(error_message="Connection reset by peer")],
+            },
+        },
+    ),
+    ChaosCase(
+        name="confirmation_fails",
+        input="Search for flights from Seattle to Tokyo next Tuesday, book one, and email me the confirmation.",
+        effects={
+            "tool_effects": {
+                "send_booking_confirmation": [NetworkError(error_message="SMTP server unreachable")],  # the only tool with an effect in this case
+            },
+        },
+    ),
+]
+
+_search_tool = tool_simulator.get_tool("search_flights")  # fetched once at module level and shared by every Agent built below
+_book_tool = tool_simulator.get_tool("book_flight")
+_confirm_tool = tool_simulator.get_tool("send_booking_confirmation")
+
+
+def travel_agent_task(case: ChaosCase) -> dict:
+    """Run the travel agent under chaos and return output + trajectory."""
+    logger.info(f"\n{'─'*60}")  # visual separator before each case
+    logger.info(f" Case: {case.name}")
+    logger.info(f" User: {case.input}")
+
+    agent = Agent(  # a new Agent is constructed on every call, i.e. once per case
+        system_prompt=(
+            "You are a travel booking assistant. Use the available tools to complete "
+            "the user's request. Today's date is May 18, 2025.\n\n"
+            "If a tool fails or returns an error:\n"
+            "- Acknowledge the failure honestly\n"
+            "- Complete as much of the request as possible\n"
+            "- Do NOT hallucinate successful results\n"
+            "- Do NOT retry more than once"
+        ),
+        tools=[_search_tool, _book_tool, _confirm_tool],
+        plugins=[chaos_plugin],
+        callback_handler=None,  # NOTE(review): presumably turns off the default console callback - confirm
+        trace_attributes={"gen_ai.conversation.id": case.session_id, "session.id": case.session_id},  # same id is passed to the mapper below
+    )
+
+    memory_exporter.clear()  # called before the run so the exporter is cleared ahead of this case's spans
+    try:
+        result = agent(case.input)
+        output = str(result)
+    except Exception as e:  # deliberate catch-all: the failure text becomes this case's output instead of aborting the run
+        output = f"Agent failed with error: {type(e).__name__}: {str(e)[:200]}"  # exception text capped at 200 chars
+
+    logger.info(f" Agent: {output[:300]}{'...' if len(output) > 300 else ''}")  # log preview capped at 300 chars
+    logger.info(f"{'─'*60}")
+
+    finished_spans = memory_exporter.get_finished_spans()
+    mapper = StrandsInMemorySessionMapper()
+    session = mapper.map_to_session(finished_spans, session_id=case.session_id)
+
+    return {"output": output, "trajectory": session}  # text answer plus the session mapped from the spans
+
+
+experiment = ChaosExperiment(
+    cases=chaos_cases,
+    evaluators=[PartialCompletionEvaluator()],
+)
+
+reports = experiment.run_evaluations(task=travel_agent_task)  # module-level call: executing this file runs the experiment
+EvaluationReport.flatten(reports).run_display()
diff --git a/site/docs/examples/evals-sdk/chaos_recovery_strategy_evaluator.py b/site/docs/examples/evals-sdk/chaos_recovery_strategy_evaluator.py
new file mode 100644
index 000000000..fc2903e11
--- /dev/null
+++ b/site/docs/examples/evals-sdk/chaos_recovery_strategy_evaluator.py
@@ -0,0 +1,134 @@
+import logging
+from typing import Any
+
+from pydantic import BaseModel, Field
+
+from strands import Agent
+from strands_evals.chaos import ChaosCase, ChaosExperiment, ChaosPlugin, Timeout
+from strands_evals.chaos.effects import ExecutionError
+from strands_evals.evaluators.chaos import RecoveryStrategyEvaluator
+from strands_evals.mappers import StrandsInMemorySessionMapper
+from strands_evals.simulation.tool_simulator import ToolSimulator
+from strands_evals.telemetry import StrandsEvalsTelemetry
+from strands_evals.types.evaluation_report import EvaluationReport
+
+logging.basicConfig(level=logging.INFO, format="%(message)s")  # log lines carry the message text only
+logger = logging.getLogger(__name__)
+
+telemetry = StrandsEvalsTelemetry().setup_in_memory_exporter()
+memory_exporter = telemetry.in_memory_exporter  # travel_agent_task reads the finished spans back from this exporter
+
+tool_simulator = ToolSimulator()  # the @tool_simulator.tool stubs below are fetched from it by name via get_tool()
+
+
+class FlightSearchResponse(BaseModel):  # output_schema of the simulated search_flights tool
+    flights: list[dict[str, Any]] = Field(default_factory=list)
+    total_results: int = Field(default=0)
+    status: str = Field(default="success")
+
+
+class HotelSearchResponse(BaseModel):  # output_schema of the simulated search_hotels tool
+    hotels: list[dict[str, Any]] = Field(default_factory=list)
+    total_results: int = Field(default=0)
+    status: str = Field(default="success")
+
+
+class BookFlightResponse(BaseModel):  # output_schema of the simulated book_flight tool
+    booking_id: str = Field(default="")
+    flight_id: str = Field(default="")
+    status: str = Field(default="success")
+    message: str = Field(default="")
+
+
+@tool_simulator.tool(output_schema=FlightSearchResponse)
+def search_flights(origin: str, destination: str, date: str) -> dict[str, Any]:
+    """Search for available flights between two cities on a given date."""
+    pass  # stub only; NOTE(review): presumably ToolSimulator generates the response from output_schema - confirm
+
+
+@tool_simulator.tool(output_schema=HotelSearchResponse)
+def search_hotels(city: str, check_in: str, check_out: str) -> dict[str, Any]:
+    """Search for available hotels in a city for given dates."""
+    pass  # stub only; NOTE(review): presumably ToolSimulator generates the response from output_schema - confirm
+
+
+@tool_simulator.tool(output_schema=BookFlightResponse)
+def book_flight(flight_id: str) -> dict[str, Any]:
+    """Book a specific flight by its flight ID."""
+    pass  # stub only; NOTE(review): presumably ToolSimulator generates the response from output_schema - confirm
+
+
+chaos_plugin = ChaosPlugin()  # handed to every Agent in travel_agent_task via plugins=[chaos_plugin]
+
+# Two cases that test recovery strategy:
+# 1. Flight search times out but hotel search works — agent should pivot to hotel search
+# 2. Flight search and booking both fail with ExecutionError — agent should not retry a failed tool more than once or invent results
+chaos_cases = [
+    ChaosCase(
+        name="flight_timeout_hotel_available",
+        input="Plan my trip to Tokyo: find flights from SFO and hotels for May 20-23.",
+        effects={"tool_effects": {"search_flights": [Timeout()]}},  # search_hotels has no effect configured here
+    ),
+    ChaosCase(
+        name="flight_and_booking_fail",
+        input="Find a flight from NYC to London on June 1 and book the cheapest option.",
+        effects={
+            "tool_effects": {
+                "search_flights": [ExecutionError(error_message="Internal server error")],
+                "book_flight": [ExecutionError(error_message="Service unavailable")],
+            },
+        },
+    ),
+]
+
+_search_flights_tool = tool_simulator.get_tool("search_flights")  # fetched once at module level and shared by every Agent built below
+_search_hotels_tool = tool_simulator.get_tool("search_hotels")
+_book_tool = tool_simulator.get_tool("book_flight")
+
+
+def travel_agent_task(case: ChaosCase) -> dict:
+    """Run the travel agent under chaos and return output + trajectory."""
+    logger.info(f"\n{'─'*60}")  # visual separator before each case
+    logger.info(f" Case: {case.name}")
+    logger.info(f" User: {case.input}")
+
+    agent = Agent(  # a new Agent is constructed on every call, i.e. once per case
+        system_prompt=(
+            "You are a travel planning assistant. Use the available tools to complete "
+            "the user's request. Today's date is May 18, 2025.\n\n"
+            "If a tool fails:\n"
+            "- Try alternative tools that can partially fulfill the request\n"
+            "- Do NOT retry the same failed tool more than once\n"
+            "- Do NOT hallucinate results\n"
+            "- Complete as much of the request as possible with working tools"
+        ),
+        tools=[_search_flights_tool, _search_hotels_tool, _book_tool],
+        plugins=[chaos_plugin],
+        callback_handler=None,  # NOTE(review): presumably turns off the default console callback - confirm
+        trace_attributes={"gen_ai.conversation.id": case.session_id, "session.id": case.session_id},  # same id is passed to the mapper below
+    )
+
+    memory_exporter.clear()  # called before the run so the exporter is cleared ahead of this case's spans
+    try:
+        result = agent(case.input)
+        output = str(result)
+    except Exception as e:  # deliberate catch-all: the failure text becomes this case's output instead of aborting the run
+        output = f"Agent failed with error: {type(e).__name__}: {str(e)[:200]}"  # exception text capped at 200 chars
+
+    logger.info(f" Agent: {output[:300]}{'...' if len(output) > 300 else ''}")  # log preview capped at 300 chars
+    logger.info(f"{'─'*60}")
+
+    finished_spans = memory_exporter.get_finished_spans()
+    mapper = StrandsInMemorySessionMapper()
+    session = mapper.map_to_session(finished_spans, session_id=case.session_id)
+
+    return {"output": output, "trajectory": session}  # text answer plus the session mapped from the spans
+
+
+experiment = ChaosExperiment(
+    cases=chaos_cases,
+    evaluators=[RecoveryStrategyEvaluator()],
+)
+
+reports = experiment.run_evaluations(task=travel_agent_task)  # module-level call: executing this file runs the experiment
+EvaluationReport.flatten(reports).run_display()
diff --git a/site/docs/examples/evals-sdk/chaos_testing.py b/site/docs/examples/evals-sdk/chaos_testing.py
new file mode 100644
index 000000000..86e366103
--- /dev/null
+++ b/site/docs/examples/evals-sdk/chaos_testing.py
@@ -0,0 +1,173 @@
+import logging
+from typing import Any
+
+from pydantic import BaseModel, Field
+
+from strands import Agent
+from strands_evals import Case
+from strands_evals.chaos import (
+    ChaosCase,
+    ChaosExperiment,
+    ChaosPlugin,
+    CorruptValues,
+    NetworkError,
+    RemoveFields,
+    Timeout,
+    TruncateFields,
+)
+from strands_evals.chaos.effects 
import ExecutionError
+from strands_evals.evaluators import GoalSuccessRateEvaluator
+from strands_evals.mappers import StrandsInMemorySessionMapper
+from strands_evals.simulation.tool_simulator import ToolSimulator
+from strands_evals.telemetry import StrandsEvalsTelemetry
+from strands_evals.types.evaluation_report import EvaluationReport
+
+logging.basicConfig(level=logging.INFO, format="%(message)s")  # log lines carry the message text only
+logger = logging.getLogger(__name__)
+
+# Set up telemetry; travel_agent_task reads the finished spans back from memory_exporter
+telemetry = StrandsEvalsTelemetry().setup_in_memory_exporter()
+memory_exporter = telemetry.in_memory_exporter
+
+# 1. Set up ToolSimulator and register tools
+tool_simulator = ToolSimulator()
+
+class FlightSearchResponse(BaseModel):  # output_schema of the simulated search_flights tool
+    """Response from the flight search tool."""
+
+    flights: list[dict[str, Any]] = Field(default_factory=list, description="List of available flights")
+    total_results: int = Field(default=0, description="Total number of results found")
+    status: str = Field(default="success", description="Operation status")
+
+class BookFlightResponse(BaseModel):  # output_schema of the simulated book_flight tool
+    """Response from the flight booking tool."""
+
+    booking_id: str = Field(default="", description="Booking confirmation ID")
+    flight_id: str = Field(default="", description="The booked flight ID")
+    status: str = Field(default="success", description="Booking status")
+    message: str = Field(default="", description="Status message")
+
+class BookingConfirmationResponse(BaseModel):  # output_schema of the simulated send_booking_confirmation tool
+    """Response from the booking confirmation tool."""
+
+    confirmation_sent: bool = Field(default=False, description="Whether confirmation was sent")
+    method: str = Field(default="email", description="Delivery method")
+    message: str = Field(default="", description="Confirmation details")
+
+@tool_simulator.tool(output_schema=FlightSearchResponse)
+def search_flights(origin: str, destination: str, date: str) -> dict[str, Any]:
+    """Search for available flights between two cities on a given date."""
+    pass  # stub only; NOTE(review): presumably ToolSimulator generates the response from output_schema - confirm
+
+@tool_simulator.tool(output_schema=BookFlightResponse)
+def book_flight(flight_id: str) -> dict[str, Any]:
+    """Book a specific flight by its flight ID. Returns booking confirmation."""
+    pass  # stub only; NOTE(review): presumably ToolSimulator generates the response from output_schema - confirm
+
+@tool_simulator.tool(output_schema=BookingConfirmationResponse)
+def send_booking_confirmation(booking_id: str = "", flight_id: str = "", method: str = "email") -> dict[str, Any]:
+    """Send booking confirmation or fallback link to the user via email or SMS."""
+    pass  # stub only; NOTE(review): presumably ToolSimulator generates the response from output_schema - confirm
+
+# 2. Create the ChaosPlugin (handed to every Agent in travel_agent_task via plugins=[chaos_plugin])
+chaos_plugin = ChaosPlugin()
+
+# 3. Define named effect maps (each name maps to a {"tool_effects": {tool name: [effects]}} dict)
+effect_maps = {
+    # Single-tool, pre-hook: tool call is cancelled before execution
+    "search_timeout": {
+        "tool_effects": {"search_flights": [Timeout()]},
+    },
+    # Two-tool, post-hook: tools execute but responses are silently corrupted
+    "book_corrupt_and_confirm_truncated": {
+        "tool_effects": {
+            "book_flight": [CorruptValues(corrupt_ratio=0.8)],
+            "send_booking_confirmation": [TruncateFields(max_length=5)],
+        },
+    },
+    # All-tool, mixed pre+post: combines hard failures with silent corruption
+    "total_chaos": {
+        "tool_effects": {
+            "search_flights": [NetworkError()],
+            "book_flight": [ExecutionError()],
+            "send_booking_confirmation": [RemoveFields(remove_ratio=0.7)],
+        },
+    },
+}
+
+# 4. Define the task function
+# Pre-create tool instances once (avoids registry issues across runs)
+_search_tool = tool_simulator.get_tool("search_flights")
+_book_tool = tool_simulator.get_tool("book_flight")
+_confirm_tool = tool_simulator.get_tool("send_booking_confirmation")
+
+def travel_agent_task(case: ChaosCase) -> dict:
+    """Run the travel agent with a single user query."""
+    logger.info(f"\n{'─'*60}")  # visual separator before each case
+    logger.info(f" Case: {case.name}")
+    logger.info(f" User: {case.input}")
+
+    agent = Agent(  # a new Agent is constructed on every call, i.e. once per case
+        system_prompt=(
+            "You are a travel booking assistant. You help users search for flights, "
+            "book them, and send confirmations. Use the available tools to complete "
+            "the user's request. Today's date is May 18, 2025.\n\n"
+            "Always use the tools directly — do not ask the user for clarification "
+            "if you can infer reasonable values from context.\n\n"
+            "If a tool fails or returns an error:\n"
+            "- Acknowledge the failure honestly to the user\n"
+            "- Try an alternative approach if possible\n"
+            "- Do NOT hallucinate successful results\n"
+            "- Do NOT retry more than once\n\n"
+            "If tool results look suspicious (e.g., $0 fares, past dates):\n"
+            "- Inform the user that results seem unreliable\n"
+            "- Suggest alternatives"
+        ),
+        tools=[_search_tool, _book_tool, _confirm_tool],
+        plugins=[chaos_plugin],
+        callback_handler=None,  # NOTE(review): presumably turns off the default console callback - confirm
+        trace_attributes={"gen_ai.conversation.id": case.session_id, "session.id": case.session_id},  # same id is passed to the mapper below
+    )
+
+    memory_exporter.clear()  # called before the run so the exporter is cleared ahead of this case's spans
+    try:
+        result = agent(case.input)
+        output = str(result)
+    except Exception as e:  # deliberate catch-all: the failure text becomes this case's output instead of aborting the run
+        output = f"Agent failed with error: {type(e).__name__}: {str(e)[:200]}"  # exception text capped at 200 chars
+
+    logger.info(f" Agent: {output[:300]}{'...' if len(output) > 300 else ''}")  # log preview capped at 300 chars
+    logger.info(f"{'─'*60}")
+
+    finished_spans = memory_exporter.get_finished_spans()
+    mapper = StrandsInMemorySessionMapper()
+    session = mapper.map_to_session(finished_spans, session_id=case.session_id)
+
+    return {"output": output, "trajectory": session}  # text answer plus the session mapped from the spans
+
+# 5. Define test cases and expand with effect maps
+test_cases = [
+    Case(
+        name="book_a_flight",
+        input="Find me a flight from SFO to JFK on May 20, book the cheapest one, and send me a confirmation.",
+    ),
+    Case(
+        name="search_and_confirm",
+        input="Search for flights from Seattle to Tokyo next Tuesday, book one, and email me the confirmation.",
+    ),
+]
+
+# Expand: 2 cases × (3 effect maps + 1 baseline) = 8 ChaosCase objects
+chaos_cases = ChaosCase.expand(test_cases, effect_maps, include_no_effect_baseline=True)
+
+# 6. 
Create and run the ChaosExperiment
+evaluators = [GoalSuccessRateEvaluator()]  # the single evaluator applied to every chaos case
+
+experiment = ChaosExperiment(
+    cases=chaos_cases,
+    evaluators=evaluators,
+)
+
+# Run: 8 chaos cases = 8 agent invocations (module-level call: executing this file runs the experiment)
+reports = experiment.run_evaluations(task=travel_agent_task)
+EvaluationReport.flatten(reports).run_display()