diff --git a/strix/llm/llm.py b/strix/llm/llm.py index 5e6a01f73..448aca6fe 100644 --- a/strix/llm/llm.py +++ b/strix/llm/llm.py @@ -159,12 +159,36 @@ async def generate( messages = self._prepare_messages(conversation_history) max_retries = int(Config.get("strix_llm_max_retries") or "5") + transient_thinking_retries = 0 + max_transient_thinking_retries = 6 + for attempt in range(max_retries + 1): try: async for response in self._stream(messages): yield response return # noqa: TRY300 except Exception as e: # noqa: BLE001 + # Bedrock occasionally returns a 400 claiming the assistant's thinking + # blocks have been modified — even when the payload is structurally + # valid and the same payload succeeds on replay. Observed on + # claude-sonnet-4-6 with adaptive thinking. Retry with exponential + # backoff (5s, 10s, 20s, 40s, 80s, 160s — ~5 min total budget), using + # an inner loop so the transient retry counter does not share slots + # with the outer max_retries budget. Once the inner budget is + # exhausted (or a non-transient error is raised), fall through to the + # generic retry path below. + while ( + self._is_transient_thinking_error(e) + and transient_thinking_retries < max_transient_thinking_retries + ): + transient_thinking_retries += 1 + await asyncio.sleep(min(240, 5 * (2 ** (transient_thinking_retries - 1)))) + try: + async for response in self._stream(messages): + yield response + return # noqa: TRY300 + except Exception as e2: # noqa: BLE001 + e = e2 if attempt >= max_retries or not self._should_retry(e): self._raise_error(e) wait = min(90, 2 * (2**attempt)) @@ -323,6 +347,22 @@ def _extract_cost(self, response: Any) -> float: except Exception: # noqa: BLE001 return 0.0 + @staticmethod + def _is_transient_thinking_error(e: Exception) -> bool: + """Detect Bedrock's transient 'thinking blocks cannot be modified' 400. + + Observed on claude-sonnet-4-6 with adaptive thinking: Bedrock occasionally + rejects a well-formed payload with this error, and the identical payload + succeeds on replay. Treat it as transient rather than a structural issue. + """ + code = getattr(e, "status_code", None) or getattr( + getattr(e, "response", None), "status_code", None + ) + if code != 400: + return False + message = str(e).lower() + return "thinking" in message and "cannot be modified" in message + def _should_retry(self, e: Exception) -> bool: code = getattr(e, "status_code", None) or getattr( getattr(e, "response", None), "status_code", None