From 6406a4d6e1cf5270475960cc7fdd44839d427dc9 Mon Sep 17 00:00:00 2001 From: Sean Turner Date: Mon, 27 Apr 2026 19:01:38 +0100 Subject: [PATCH 1/2] fix: retry transient 'thinking blocks cannot be modified' 400s from Bedrock MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bedrock occasionally returns HTTP 400 claiming the assistant's thinking blocks have been modified, even when the payload is structurally valid. The same payload replayed immediately succeeds, indicating a transient server-side condition rather than a client bug. Observed on bedrock/us.anthropic.claude-sonnet-4-6 with adaptive thinking (reasoning_effort=high maps to {type: "adaptive"} on claude-4-6). The error message is: messages.N.content.M: `thinking` or `redacted_thinking` blocks in the latest assistant message cannot be modified. These blocks must remain as they were in the original response. Reproduced during a large-scale multi-scan study (800+ Strix runs). Most failures clear within seconds, but we observed one case where the error persisted across ~2 minutes of backoff, so the retry budget allows up to ~5 minutes total. Fix: - Add _is_transient_thinking_error() helper that matches on the status code (400) AND the characteristic message ("thinking" + "cannot be modified"), to avoid treating unrelated 400s as transient. - On detection, retry with exponential backoff (5s, 10s, 20s, 40s, 80s, 160s — total ~5 min) before falling through to the generic retry path. - Self-contained: the detector checks the exception directly, so it works with or without other 400 handlers in place. Testing: - Replayed 33 captured payloads that appeared adjacent to a failure: all succeeded, confirming the condition is transient and not a client-side malformation. - In a multi-repo scan study, a 3-retry budget was insufficient in one case (exhausted retries over ~30s). The 6-retry budget implemented here succeeded on the same repo on a subsequent attempt. 
--- strix/llm/llm.py | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/strix/llm/llm.py b/strix/llm/llm.py index 5e6a01f73..8db71fa39 100644 --- a/strix/llm/llm.py +++ b/strix/llm/llm.py @@ -159,12 +159,26 @@ async def generate( messages = self._prepare_messages(conversation_history) max_retries = int(Config.get("strix_llm_max_retries") or "5") + transient_thinking_retries = 0 + for attempt in range(max_retries + 1): try: async for response in self._stream(messages): yield response return # noqa: TRY300 except Exception as e: # noqa: BLE001 + # Bedrock occasionally returns a 400 claiming the assistant's thinking + # blocks have been modified — even when the payload is structurally + # valid and the same payload succeeds on replay. Observed on + # claude-sonnet-4-6 with adaptive thinking. Retry with exponential + # backoff (5s, 10s, 20s, 40s, 80s, 160s — ~5 min total budget) before + # falling through to the generic retry path. + if self._is_transient_thinking_error(e) and transient_thinking_retries < 6: + transient_thinking_retries += 1 + if attempt >= max_retries: + self._raise_error(e) + await asyncio.sleep(min(240, 5 * (2 ** (transient_thinking_retries - 1)))) + continue if attempt >= max_retries or not self._should_retry(e): self._raise_error(e) wait = min(90, 2 * (2**attempt)) @@ -323,6 +337,22 @@ def _extract_cost(self, response: Any) -> float: except Exception: # noqa: BLE001 return 0.0 + @staticmethod + def _is_transient_thinking_error(e: Exception) -> bool: + """Detect Bedrock's transient 'thinking blocks cannot be modified' 400. + + Observed on claude-sonnet-4-6 with adaptive thinking: Bedrock occasionally + rejects a well-formed payload with this error, and the identical payload + succeeds on replay. Treat it as transient rather than a structural issue. 
+ """ + code = getattr(e, "status_code", None) or getattr( + getattr(e, "response", None), "status_code", None + ) + if code != 400: + return False + message = str(e).lower() + return "thinking" in message and "cannot be modified" in message + def _should_retry(self, e: Exception) -> bool: code = getattr(e, "status_code", None) or getattr( getattr(e, "response", None), "status_code", None From ba7df45027f980116504344f2662419c7265f1ce Mon Sep 17 00:00:00 2001 From: Sean Turner Date: Mon, 27 Apr 2026 22:50:11 +0100 Subject: [PATCH 2/2] fix: decouple transient-thinking retry budget from outer max_retries MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Greptile caught a real logic bug in the initial patch: the `continue` inside the transient-thinking branch advanced the outer `for attempt in range(max_retries + 1)` counter, so each transient retry consumed a generic-retry slot. With the default `max_retries=5`: - Loop runs 6 iterations (attempts 0–5) - Each transient retry burned one iteration - On attempt 5, `attempt >= max_retries` fired `_raise_error` before the 6th sleep (160s) ever ran - The documented "fall through to generic retry" path was unreachable Fix: inner `while` loop that does its own `_stream()` retry without advancing the outer `attempt` counter. The transient budget of 6 (5 / 10 / 20 / 40 / 80 / 160 s) is now independent of `max_retries`, and once the inner budget is exhausted the most recent exception falls through to the generic retry path as the PR originally intended. Also extracts the literal `6` into `max_transient_thinking_retries` for readability — still unconfigurable since the envelope budget is based on observed Bedrock transient durations, not something we want users tuning blindly. 
--- strix/llm/llm.py | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/strix/llm/llm.py b/strix/llm/llm.py index 8db71fa39..448aca6fe 100644 --- a/strix/llm/llm.py +++ b/strix/llm/llm.py @@ -160,6 +160,7 @@ async def generate( max_retries = int(Config.get("strix_llm_max_retries") or "5") transient_thinking_retries = 0 + max_transient_thinking_retries = 6 for attempt in range(max_retries + 1): try: @@ -171,14 +172,23 @@ async def generate( # blocks have been modified — even when the payload is structurally # valid and the same payload succeeds on replay. Observed on # claude-sonnet-4-6 with adaptive thinking. Retry with exponential - # backoff (5s, 10s, 20s, 40s, 80s, 160s — ~5 min total budget) before - # falling through to the generic retry path. - if self._is_transient_thinking_error(e) and transient_thinking_retries < 6: + # backoff (5s, 10s, 20s, 40s, 80s, 160s — ~5 min total budget), using + # an inner loop so the transient retry counter does not share slots + # with the outer max_retries budget. Once the inner budget is + # exhausted (or a non-transient error is raised), fall through to the + # generic retry path below. + while ( + self._is_transient_thinking_error(e) + and transient_thinking_retries < max_transient_thinking_retries + ): transient_thinking_retries += 1 - if attempt >= max_retries: - self._raise_error(e) await asyncio.sleep(min(240, 5 * (2 ** (transient_thinking_retries - 1)))) - continue + try: + async for response in self._stream(messages): + yield response + return # noqa: TRY300 + except Exception as e2: # noqa: BLE001 + e = e2 if attempt >= max_retries or not self._should_retry(e): self._raise_error(e) wait = min(90, 2 * (2**attempt))