From 6406a4d6e1cf5270475960cc7fdd44839d427dc9 Mon Sep 17 00:00:00 2001
From: Sean Turner <seanturner83@gmail.com>
Date: Mon, 27 Apr 2026 19:01:38 +0100
Subject: [PATCH 1/2] fix: retry transient 'thinking blocks cannot be modified'
 400s from Bedrock
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Bedrock occasionally returns HTTP 400 claiming the assistant's thinking
blocks have been modified, even when the payload is structurally valid.
The same payload replayed immediately succeeds, indicating a transient
server-side condition rather than a client bug.

Observed on bedrock/us.anthropic.claude-sonnet-4-6 with adaptive thinking
(reasoning_effort=high maps to {type: "adaptive"} on claude-4-6). The
error message is:

  messages.N.content.M: `thinking` or `redacted_thinking` blocks in the
  latest assistant message cannot be modified. These blocks must remain
  as they were in the original response.

Reproduced during a large-scale multi-scan study (800+ Strix runs). Most
failures clear within seconds but we observed one case where the error
persisted across ~2 minutes of backoff, so the retry budget allows up to
~5 minutes total.

Fix:
- Add _is_transient_thinking_error() helper that matches on the status
  code (400) AND the characteristic message ("thinking" + "cannot be
  modified"), to avoid treating unrelated 400s as transient.
- On detection, retry with exponential backoff (5s, 10s, 20s, 40s, 80s,
  160s — total ~5 min) before falling through to the generic retry path.
- Self-contained: the detector checks the exception directly, so it works
  with or without other 400 handlers in place.

Testing:
- Replayed 33 captured payloads that appeared adjacent to a failure: all
  succeeded, confirming the condition is transient and not a client-side
  malformation.
- In a multi-repo scan study, a 3-retry budget was insufficient in one
  case (retries exhausted after ~30s). The 6-retry budget implemented
  here succeeded on the same repo on a subsequent attempt.
---
 strix/llm/llm.py | 30 ++++++++++++++++++++++++++++++
 1 file changed, 30 insertions(+)

diff --git a/strix/llm/llm.py b/strix/llm/llm.py
index 5e6a01f73..8db71fa39 100644
--- a/strix/llm/llm.py
+++ b/strix/llm/llm.py
@@ -159,12 +159,26 @@ async def generate(
         messages = self._prepare_messages(conversation_history)
         max_retries = int(Config.get("strix_llm_max_retries") or "5")
 
+        transient_thinking_retries = 0
+
         for attempt in range(max_retries + 1):
             try:
                 async for response in self._stream(messages):
                     yield response
                 return  # noqa: TRY300
             except Exception as e:  # noqa: BLE001
+                # Bedrock occasionally returns a 400 claiming the assistant's thinking
+                # blocks have been modified — even when the payload is structurally
+                # valid and the same payload succeeds on replay. Observed on
+                # claude-sonnet-4-6 with adaptive thinking. Retry with exponential
+                # backoff (5s, 10s, 20s, 40s, 80s, 160s — ~5 min total budget) before
+                # falling through to the generic retry path.
+                if self._is_transient_thinking_error(e) and transient_thinking_retries < 6:
+                    transient_thinking_retries += 1
+                    if attempt >= max_retries:
+                        self._raise_error(e)
+                    await asyncio.sleep(min(240, 5 * (2 ** (transient_thinking_retries - 1))))
+                    continue
                 if attempt >= max_retries or not self._should_retry(e):
                     self._raise_error(e)
                 wait = min(90, 2 * (2**attempt))
@@ -323,6 +337,22 @@ def _extract_cost(self, response: Any) -> float:
         except Exception:  # noqa: BLE001
             return 0.0
 
+    @staticmethod
+    def _is_transient_thinking_error(e: Exception) -> bool:
+        """Detect Bedrock's transient 'thinking blocks cannot be modified' 400.
+
+        Observed on claude-sonnet-4-6 with adaptive thinking: Bedrock occasionally
+        rejects a well-formed payload with this error, and the identical payload
+        succeeds on replay. Treat it as transient rather than a structural issue.
+        """
+        code = getattr(e, "status_code", None) or getattr(
+            getattr(e, "response", None), "status_code", None
+        )
+        if code != 400:
+            return False
+        message = str(e).lower()
+        return "thinking" in message and "cannot be modified" in message
+
     def _should_retry(self, e: Exception) -> bool:
         code = getattr(e, "status_code", None) or getattr(
             getattr(e, "response", None), "status_code", None

From ba7df45027f980116504344f2662419c7265f1ce Mon Sep 17 00:00:00 2001
From: Sean Turner <seanturner83@gmail.com>
Date: Mon, 27 Apr 2026 22:50:11 +0100
Subject: [PATCH 2/2] fix: decouple transient-thinking retry budget from outer
 max_retries
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Greptile caught a real logic bug in the initial patch: the `continue`
inside the transient-thinking branch advanced the outer
`for attempt in range(max_retries + 1)` counter, so each transient retry
consumed a generic-retry slot. With the default `max_retries=5`:

- Loop runs 6 iterations (attempts 0–5)
- Each transient retry burned one iteration
- On attempt 5, `attempt >= max_retries` fired `_raise_error` before the
  6th sleep (160s) ever ran
- The documented "fall through to generic retry" path was unreachable

Fix: use an inner `while` loop that retries `_stream()` itself without
advancing the outer `attempt` counter. The transient budget of 6 retries
(5 / 10 / 20 / 40 / 80 / 160 s) is now independent of `max_retries`. The
counter is initialised once per `generate()` call, so this is a total
budget for the call rather than a per-attempt one. Once the inner budget
is exhausted, the most recent exception falls through to the generic
retry path, as the PR originally intended.

Also extracts the literal `6` into `max_transient_thinking_retries` for
readability. It is deliberately left as a hard-coded local rather than a
config option: the budget is derived from observed Bedrock transient
durations, not something we want users tuning blindly.
---
 strix/llm/llm.py | 22 ++++++++++++++++------
 1 file changed, 16 insertions(+), 6 deletions(-)

diff --git a/strix/llm/llm.py b/strix/llm/llm.py
index 8db71fa39..448aca6fe 100644
--- a/strix/llm/llm.py
+++ b/strix/llm/llm.py
@@ -160,6 +160,7 @@ async def generate(
         max_retries = int(Config.get("strix_llm_max_retries") or "5")
 
         transient_thinking_retries = 0
+        max_transient_thinking_retries = 6
 
         for attempt in range(max_retries + 1):
             try:
@@ -171,14 +172,23 @@ async def generate(
                 # blocks have been modified — even when the payload is structurally
                 # valid and the same payload succeeds on replay. Observed on
                 # claude-sonnet-4-6 with adaptive thinking. Retry with exponential
-                # backoff (5s, 10s, 20s, 40s, 80s, 160s — ~5 min total budget) before
-                # falling through to the generic retry path.
-                if self._is_transient_thinking_error(e) and transient_thinking_retries < 6:
+                # backoff (5s, 10s, 20s, 40s, 80s, 160s — ~5 min total budget), using
+                # an inner loop so the transient retry counter does not share slots
+                # with the outer max_retries budget. Once the inner budget is
+                # exhausted (or a non-transient error is raised), fall through to the
+                # generic retry path below.
+                while (
+                    self._is_transient_thinking_error(e)
+                    and transient_thinking_retries < max_transient_thinking_retries
+                ):
                     transient_thinking_retries += 1
-                    if attempt >= max_retries:
-                        self._raise_error(e)
                     await asyncio.sleep(min(240, 5 * (2 ** (transient_thinking_retries - 1))))
-                    continue
+                    try:
+                        async for response in self._stream(messages):
+                            yield response
+                        return  # noqa: TRY300
+                    except Exception as e2:  # noqa: BLE001
+                        e = e2
                 if attempt >= max_retries or not self._should_retry(e):
                     self._raise_error(e)
                 wait = min(90, 2 * (2**attempt))