Skip to content
This repository was archived by the owner on Jun 3, 2026. It is now read-only.
Open
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
120 changes: 120 additions & 0 deletions site/docs/examples/evals-sdk/chaos_failure_communication.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
import logging
from typing import Any

from pydantic import BaseModel, Field

from strands import Agent
from strands_evals.chaos import ChaosCase, ChaosExperiment, ChaosPlugin, Timeout
from strands_evals.chaos.effects import NetworkError
from strands_evals.chaos.evaluators import FailureCommunicationEvaluator
from strands_evals.mappers import StrandsInMemorySessionMapper
from strands_evals.simulation.tool_simulator import ToolSimulator
from strands_evals.telemetry import StrandsEvalsTelemetry

# Log bare messages (no level or timestamp prefix) at INFO and above.
logging.basicConfig(format="%(message)s", level=logging.INFO)
logger = logging.getLogger(__name__)

# Spans are collected in memory so the task function can read them back
# after each agent run.
telemetry = StrandsEvalsTelemetry().setup_in_memory_exporter()
memory_exporter = telemetry.in_memory_exporter

# Registry on which the simulated tools in this module are declared.
tool_simulator = ToolSimulator()


# Output schema handed to the simulated `search_flights` tool.
# Every field carries a default, so the model can be built with no arguments.
# (Kept as `#` comments rather than a class docstring so the generated schema
# is left untouched.)
class FlightSearchResponse(BaseModel):
    # A factory is used so each instance gets its own list.
    flights: list[dict[str, Any]] = Field(default_factory=list)
    total_results: int = 0
    status: str = "success"


# Output schema handed to the simulated `book_flight` tool.
# All fields are strings with defaults; only `status` defaults to a non-empty
# value ("success").
class BookFlightResponse(BaseModel):
    booking_id: str = ""
    flight_id: str = ""
    status: str = "success"
    message: str = ""


# Simulated tool registered on `tool_simulator` with FlightSearchResponse as
# its output schema. The body is intentionally empty; presumably the simulator
# supplies the response at call time — confirm against the ToolSimulator docs.
# The docstring is left verbatim because it likely serves as the tool
# description seen by the agent.
@tool_simulator.tool(output_schema=FlightSearchResponse)
def search_flights(origin: str, destination: str, date: str) -> dict[str, Any]:
    """Search for available flights between two cities on a given date."""


# Simulated tool registered on `tool_simulator` with BookFlightResponse as its
# output schema. The body is intentionally empty; presumably the simulator
# supplies the response at call time — confirm against the ToolSimulator docs.
# The docstring is left verbatim because it likely serves as the tool
# description seen by the agent.
@tool_simulator.tool(output_schema=BookFlightResponse)
def book_flight(flight_id: str) -> dict[str, Any]:
    """Book a specific flight by its flight ID."""


# A single plugin instance is passed to every agent built by the task function.
chaos_plugin = ChaosPlugin()

# Failure scenarios used to judge how well the agent communicates problems:
#   search_timeout - `search_flights` times out; the agent must tell the user
#                    about the failure.
#   all_tools_down - both tools hit network errors; the agent must communicate
#                    multiple failures clearly.
chaos_cases = [
    ChaosCase(
        name="search_timeout",
        input="Find me a flight from SFO to JFK on May 20 and book the cheapest one.",
        effects={
            "tool_effects": {
                "search_flights": [
                    Timeout(error_message="Tool call timed out after 30s"),
                ],
            },
        },
    ),
    ChaosCase(
        name="all_tools_down",
        input="Search for flights from Seattle to Tokyo next Tuesday and book one.",
        effects={
            "tool_effects": {
                "search_flights": [
                    NetworkError(error_message="DNS resolution failed"),
                ],
                "book_flight": [
                    NetworkError(error_message="Connection refused"),
                ],
            },
        },
    ),
]

# Look the registered tools up once, by name, in declaration order.
_search_tool, _book_tool = map(
    tool_simulator.get_tool,
    ("search_flights", "book_flight"),
)


def travel_agent_task(case: ChaosCase) -> dict:
    """Execute one chaos case against a freshly built travel agent.

    The case name and user input are logged, an ``Agent`` is created with the
    two simulated tools and the shared chaos plugin, and the in-memory span
    exporter is cleared before the agent is invoked with ``case.input``.

    Any exception raised while invoking the agent (or stringifying its result)
    is not propagated; instead the output becomes an error summary containing
    the exception type and the first 200 characters of its message.

    Args:
        case: The chaos case providing ``name``, ``input`` and ``session_id``.

    Returns:
        A dict with ``"output"`` (the agent's reply or the error summary) and
        ``"trajectory"`` (the session mapped from the finished spans).
    """
    divider = "─" * 60
    logger.info(f"\n{divider}")
    logger.info(f" Case: {case.name}")
    logger.info(f" User: {case.input}")

    instructions = (
        "You are a travel booking assistant. Use the available tools to complete "
        "the user's request. Today's date is May 18, 2025.\n\n"
        "If a tool fails or returns an error:\n"
        "- Acknowledge the failure honestly to the user\n"
        "- Explain what went wrong in plain language\n"
        "- Suggest next steps (retry later, try alternative)\n"
        "- Do NOT hallucinate successful results"
    )
    agent = Agent(
        system_prompt=instructions,
        tools=[_search_tool, _book_tool],
        plugins=[chaos_plugin],
        callback_handler=None,
        trace_attributes={
            "gen_ai.conversation.id": case.session_id,
            "session.id": case.session_id,
        },
    )

    # Drop spans left over from earlier cases so only this run is mapped below.
    memory_exporter.clear()
    try:
        output = str(agent(case.input))
    except Exception as exc:
        # A failed run is still a result the evaluator should see.
        output = f"Agent failed with error: {type(exc).__name__}: {str(exc)[:200]}"

    # Log at most 300 characters of the reply, marking truncation with "...".
    preview = output[:300]
    suffix = "..." if len(output) > 300 else ""
    logger.info(f" Agent: {preview}{suffix}")
    logger.info(divider)

    spans = memory_exporter.get_finished_spans()
    session = StrandsInMemorySessionMapper().map_to_session(spans, session_id=case.session_id)

    return {"output": output, "trajectory": session}


# Pair the chaos cases with the evaluator that scores failure communication.
experiment = ChaosExperiment(
    evaluators=[FailureCommunicationEvaluator()],
    cases=chaos_cases,
)

# Runs at module level: executing this file runs every case through
# `travel_agent_task`, then displays the first report returned.
reports = experiment.run_evaluations(task=travel_agent_task)
reports[0].run_display()
Comment thread
ybdarrenwang marked this conversation as resolved.
Outdated
137 changes: 137 additions & 0 deletions site/docs/examples/evals-sdk/chaos_partial_completion.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,137 @@
import logging
from typing import Any

from pydantic import BaseModel, Field

from strands import Agent
from strands_evals.chaos import ChaosCase, ChaosExperiment, ChaosPlugin, TruncateFields
from strands_evals.chaos.effects import NetworkError
from strands_evals.chaos.evaluators import PartialCompletionEvaluator
from strands_evals.mappers import StrandsInMemorySessionMapper
from strands_evals.simulation.tool_simulator import ToolSimulator
from strands_evals.telemetry import StrandsEvalsTelemetry

# Log bare messages (no level or timestamp prefix) at INFO and above.
logging.basicConfig(format="%(message)s", level=logging.INFO)
logger = logging.getLogger(__name__)

# Spans are collected in memory so the task function can read them back
# after each agent run.
telemetry = StrandsEvalsTelemetry().setup_in_memory_exporter()
memory_exporter = telemetry.in_memory_exporter

# Registry on which the simulated tools in this module are declared.
tool_simulator = ToolSimulator()


# Output schema handed to the simulated `search_flights` tool.
# Every field carries a default, so the model can be built with no arguments.
# (Kept as `#` comments rather than a class docstring so the generated schema
# is left untouched.)
class FlightSearchResponse(BaseModel):
    # A factory is used so each instance gets its own list.
    flights: list[dict[str, Any]] = Field(default_factory=list)
    total_results: int = 0
    status: str = "success"


# Output schema handed to the simulated `book_flight` tool.
# All fields are strings with defaults; only `status` defaults to a non-empty
# value ("success").
class BookFlightResponse(BaseModel):
    booking_id: str = ""
    flight_id: str = ""
    status: str = "success"
    message: str = ""


# Output schema handed to the simulated `send_booking_confirmation` tool.
# Defaults describe an unsent confirmation over the "email" method with an
# empty message.
class BookingConfirmationResponse(BaseModel):
    confirmation_sent: bool = False
    method: str = "email"
    message: str = ""


# Simulated tool registered on `tool_simulator` with FlightSearchResponse as
# its output schema. The body is intentionally empty; presumably the simulator
# supplies the response at call time — confirm against the ToolSimulator docs.
# The docstring is left verbatim because it likely serves as the tool
# description seen by the agent.
@tool_simulator.tool(output_schema=FlightSearchResponse)
def search_flights(origin: str, destination: str, date: str) -> dict[str, Any]:
    """Search for available flights between two cities on a given date."""


# Simulated tool registered on `tool_simulator` with BookFlightResponse as its
# output schema. The body is intentionally empty; presumably the simulator
# supplies the response at call time — confirm against the ToolSimulator docs.
# The docstring is left verbatim because it likely serves as the tool
# description seen by the agent.
@tool_simulator.tool(output_schema=BookFlightResponse)
def book_flight(flight_id: str) -> dict[str, Any]:
    """Book a specific flight by its flight ID."""


# Simulated tool registered on `tool_simulator` with
# BookingConfirmationResponse as its output schema. All three parameters are
# optional (empty ids, "email" method by default). The body is intentionally
# empty; presumably the simulator supplies the response at call time —
# confirm against the ToolSimulator docs. The docstring is left verbatim
# because it likely serves as the tool description seen by the agent.
@tool_simulator.tool(output_schema=BookingConfirmationResponse)
def send_booking_confirmation(
    booking_id: str = "",
    flight_id: str = "",
    method: str = "email",
) -> dict[str, Any]:
    """Send booking confirmation to the user via email or SMS."""


# A single plugin instance is passed to every agent built by the task function.
chaos_plugin = ChaosPlugin()

# Failure scenarios used to judge partial completion:
#   search_degraded_booking_fails - search results are truncated and booking
#       hits a network error, so the user gets degraded flight info but no
#       reservation.
#   confirmation_fails - only the confirmation tool fails, so the user gets
#       most of what they asked for.
chaos_cases = [
    ChaosCase(
        name="search_degraded_booking_fails",
        input="Find me a flight from SFO to JFK on May 20, book the cheapest one, and send me a confirmation.",
        effects={
            "tool_effects": {
                "search_flights": [
                    TruncateFields(max_length=5),
                ],
                "book_flight": [
                    NetworkError(error_message="Connection reset by peer"),
                ],
            },
        },
    ),
    ChaosCase(
        name="confirmation_fails",
        input="Search for flights from Seattle to Tokyo next Tuesday, book one, and email me the confirmation.",
        effects={
            "tool_effects": {
                "send_booking_confirmation": [
                    NetworkError(error_message="SMTP server unreachable"),
                ],
            },
        },
    ),
]

# Look the registered tools up once, by name, in declaration order.
_search_tool, _book_tool, _confirm_tool = map(
    tool_simulator.get_tool,
    ("search_flights", "book_flight", "send_booking_confirmation"),
)


def travel_agent_task(case: ChaosCase) -> dict:
    """Execute one chaos case against a freshly built travel agent.

    The case name and user input are logged, an ``Agent`` is created with the
    three simulated tools and the shared chaos plugin, and the in-memory span
    exporter is cleared before the agent is invoked with ``case.input``.

    Any exception raised while invoking the agent (or stringifying its result)
    is not propagated; instead the output becomes an error summary containing
    the exception type and the first 200 characters of its message.

    Args:
        case: The chaos case providing ``name``, ``input`` and ``session_id``.

    Returns:
        A dict with ``"output"`` (the agent's reply or the error summary) and
        ``"trajectory"`` (the session mapped from the finished spans).
    """
    divider = "─" * 60
    logger.info(f"\n{divider}")
    logger.info(f" Case: {case.name}")
    logger.info(f" User: {case.input}")

    instructions = (
        "You are a travel booking assistant. Use the available tools to complete "
        "the user's request. Today's date is May 18, 2025.\n\n"
        "If a tool fails or returns an error:\n"
        "- Acknowledge the failure honestly\n"
        "- Complete as much of the request as possible\n"
        "- Do NOT hallucinate successful results\n"
        "- Do NOT retry more than once"
    )
    agent = Agent(
        system_prompt=instructions,
        tools=[_search_tool, _book_tool, _confirm_tool],
        plugins=[chaos_plugin],
        callback_handler=None,
        trace_attributes={
            "gen_ai.conversation.id": case.session_id,
            "session.id": case.session_id,
        },
    )

    # Drop spans left over from earlier cases so only this run is mapped below.
    memory_exporter.clear()
    try:
        output = str(agent(case.input))
    except Exception as exc:
        # A failed run is still a result the evaluator should see.
        output = f"Agent failed with error: {type(exc).__name__}: {str(exc)[:200]}"

    # Log at most 300 characters of the reply, marking truncation with "...".
    preview = output[:300]
    suffix = "..." if len(output) > 300 else ""
    logger.info(f" Agent: {preview}{suffix}")
    logger.info(divider)

    spans = memory_exporter.get_finished_spans()
    session = StrandsInMemorySessionMapper().map_to_session(spans, session_id=case.session_id)

    return {"output": output, "trajectory": session}


# Pair the chaos cases with the evaluator that scores partial completion.
experiment = ChaosExperiment(
    evaluators=[PartialCompletionEvaluator()],
    cases=chaos_cases,
)

# Runs at module level: executing this file runs every case through
# `travel_agent_task`, then displays the first report returned.
reports = experiment.run_evaluations(task=travel_agent_task)
reports[0].run_display()
133 changes: 133 additions & 0 deletions site/docs/examples/evals-sdk/chaos_recovery_strategy.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,133 @@
import logging
from typing import Any

from pydantic import BaseModel, Field

from strands import Agent
from strands_evals.chaos import ChaosCase, ChaosExperiment, ChaosPlugin, Timeout
from strands_evals.chaos.effects import ExecutionError
from strands_evals.chaos.evaluators import RecoveryStrategyEvaluator
from strands_evals.mappers import StrandsInMemorySessionMapper
from strands_evals.simulation.tool_simulator import ToolSimulator
from strands_evals.telemetry import StrandsEvalsTelemetry

# Log bare messages (no level or timestamp prefix) at INFO and above.
logging.basicConfig(format="%(message)s", level=logging.INFO)
logger = logging.getLogger(__name__)

# Spans are collected in memory so the task function can read them back
# after each agent run.
telemetry = StrandsEvalsTelemetry().setup_in_memory_exporter()
memory_exporter = telemetry.in_memory_exporter

# Registry on which the simulated tools in this module are declared.
tool_simulator = ToolSimulator()


# Output schema handed to the simulated `search_flights` tool.
# Every field carries a default, so the model can be built with no arguments.
# (Kept as `#` comments rather than a class docstring so the generated schema
# is left untouched.)
class FlightSearchResponse(BaseModel):
    # A factory is used so each instance gets its own list.
    flights: list[dict[str, Any]] = Field(default_factory=list)
    total_results: int = 0
    status: str = "success"


# Output schema handed to the simulated `search_hotels` tool.
# Mirrors FlightSearchResponse, with `hotels` in place of `flights`.
# (Kept as `#` comments rather than a class docstring so the generated schema
# is left untouched.)
class HotelSearchResponse(BaseModel):
    # A factory is used so each instance gets its own list.
    hotels: list[dict[str, Any]] = Field(default_factory=list)
    total_results: int = 0
    status: str = "success"


# Output schema handed to the simulated `book_flight` tool.
# All fields are strings with defaults; only `status` defaults to a non-empty
# value ("success").
class BookFlightResponse(BaseModel):
    booking_id: str = ""
    flight_id: str = ""
    status: str = "success"
    message: str = ""


# Simulated tool registered on `tool_simulator` with FlightSearchResponse as
# its output schema. The body is intentionally empty; presumably the simulator
# supplies the response at call time — confirm against the ToolSimulator docs.
# The docstring is left verbatim because it likely serves as the tool
# description seen by the agent.
@tool_simulator.tool(output_schema=FlightSearchResponse)
def search_flights(origin: str, destination: str, date: str) -> dict[str, Any]:
    """Search for available flights between two cities on a given date."""


# Simulated tool registered on `tool_simulator` with HotelSearchResponse as
# its output schema. The body is intentionally empty; presumably the simulator
# supplies the response at call time — confirm against the ToolSimulator docs.
# The docstring is left verbatim because it likely serves as the tool
# description seen by the agent.
@tool_simulator.tool(output_schema=HotelSearchResponse)
def search_hotels(city: str, check_in: str, check_out: str) -> dict[str, Any]:
    """Search for available hotels in a city for given dates."""


# Simulated tool registered on `tool_simulator` with BookFlightResponse as its
# output schema. The body is intentionally empty; presumably the simulator
# supplies the response at call time — confirm against the ToolSimulator docs.
# The docstring is left verbatim because it likely serves as the tool
# description seen by the agent.
@tool_simulator.tool(output_schema=BookFlightResponse)
def book_flight(flight_id: str) -> dict[str, Any]:
    """Book a specific flight by its flight ID."""


# A single plugin instance is passed to every agent built by the task function.
chaos_plugin = ChaosPlugin()

# Failure scenarios used to judge the agent's recovery strategy:
#   flight_timeout_hotel_available - only `search_flights` times out while
#       `search_hotels` is left untouched; the agent should pivot to the
#       hotel search.
#   flight_and_booking_fail - both `search_flights` and `book_flight` raise
#       execution errors; the agent should try once, then move on.
chaos_cases = [
    ChaosCase(
        name="flight_timeout_hotel_available",
        input="Plan my trip to Tokyo: find flights from SFO and hotels for May 20-23.",
        effects={
            "tool_effects": {
                "search_flights": [Timeout()],
            },
        },
    ),
    ChaosCase(
        name="flight_and_booking_fail",
        input="Find a flight from NYC to London on June 1 and book the cheapest option.",
        effects={
            "tool_effects": {
                "search_flights": [
                    ExecutionError(error_message="Internal server error"),
                ],
                "book_flight": [
                    ExecutionError(error_message="Service unavailable"),
                ],
            },
        },
    ),
]

# Look the registered tools up once, by name, in declaration order.
_search_flights_tool, _search_hotels_tool, _book_tool = map(
    tool_simulator.get_tool,
    ("search_flights", "search_hotels", "book_flight"),
)


def travel_agent_task(case: ChaosCase) -> dict:
    """Execute one chaos case against a freshly built travel planning agent.

    The case name and user input are logged, an ``Agent`` is created with the
    flight-search, hotel-search and booking tools plus the shared chaos
    plugin, and the in-memory span exporter is cleared before the agent is
    invoked with ``case.input``.

    Any exception raised while invoking the agent (or stringifying its result)
    is not propagated; instead the output becomes an error summary containing
    the exception type and the first 200 characters of its message.

    Args:
        case: The chaos case providing ``name``, ``input`` and ``session_id``.

    Returns:
        A dict with ``"output"`` (the agent's reply or the error summary) and
        ``"trajectory"`` (the session mapped from the finished spans).
    """
    divider = "─" * 60
    logger.info(f"\n{divider}")
    logger.info(f" Case: {case.name}")
    logger.info(f" User: {case.input}")

    instructions = (
        "You are a travel planning assistant. Use the available tools to complete "
        "the user's request. Today's date is May 18, 2025.\n\n"
        "If a tool fails:\n"
        "- Try alternative tools that can partially fulfill the request\n"
        "- Do NOT retry the same failed tool more than once\n"
        "- Do NOT hallucinate results\n"
        "- Complete as much of the request as possible with working tools"
    )
    agent = Agent(
        system_prompt=instructions,
        tools=[_search_flights_tool, _search_hotels_tool, _book_tool],
        plugins=[chaos_plugin],
        callback_handler=None,
        trace_attributes={
            "gen_ai.conversation.id": case.session_id,
            "session.id": case.session_id,
        },
    )

    # Drop spans left over from earlier cases so only this run is mapped below.
    memory_exporter.clear()
    try:
        output = str(agent(case.input))
    except Exception as exc:
        # A failed run is still a result the evaluator should see.
        output = f"Agent failed with error: {type(exc).__name__}: {str(exc)[:200]}"

    # Log at most 300 characters of the reply, marking truncation with "...".
    preview = output[:300]
    suffix = "..." if len(output) > 300 else ""
    logger.info(f" Agent: {preview}{suffix}")
    logger.info(divider)

    spans = memory_exporter.get_finished_spans()
    session = StrandsInMemorySessionMapper().map_to_session(spans, session_id=case.session_id)

    return {"output": output, "trajectory": session}


# Pair the chaos cases with the evaluator that scores recovery strategy.
experiment = ChaosExperiment(
    evaluators=[RecoveryStrategyEvaluator()],
    cases=chaos_cases,
)

# Runs at module level: executing this file runs every case through
# `travel_agent_task`, then displays the first report returned.
reports = experiment.run_evaluations(task=travel_agent_task)
reports[0].run_display()
Loading
Loading