THUDM · jingshenghang · Jun 1, 2026 · Jun 2, 2026 · Jun 4, 2026 · Jun 4, 2026
diff --git a/examples/coding_agent_rl/README.md b/examples/coding_agent_rl/README.md
@@ -5,7 +5,7 @@ This directory provides an example of running end-to-end **SWE (Software-Enginee
 Two example files and one shared adapter implement the loop:
 
 - `generate.py` — per-sample `generate()` registered via `--custom-generate-function-path`. Boots the sandbox, runs claude-code, captures the diff, scores it, and emits one or more `Sample`s back to slime.
-- `slime.agent.adapters.AnthropicAdapter` — the shared Anthropic Messages adapter. claude-code talks to it as if it were Anthropic; the adapter tokenizes the current message history each turn, records prompt/output token snapshots, preserves model-generated tokens (`loss_mask=1`) only while later prompts stitch onto them, masks template/observation tokens (`0`), and emits **three kinds of segments** per trajectory: `subagent` (completed `Task/Agent` dispatch), `wipe` (chain frozen by auto-compact), `final` (tail of the main chain).
+- `slime.agent.adapters.AnthropicAdapter` — the shared Anthropic Messages adapter. claude-code talks to it as if it were Anthropic; the adapter tokenizes the current message history each turn, records prompt/output token snapshots, preserves model-generated tokens (`loss_mask=1`) only while later prompts stitch onto them, and masks template/observation tokens (`0`). Each turn is routed into a per-session message tree inside `slime.agent.trajectory_manager.TrajectoryManager`; any divergence in the prompt prefix forks a new branch, so sub-agent dispatches and auto-compaction are handled as separate root-to-leaf chains. `get_trajectory` linearizes each leaf chain into one `Sample`.
 - `sandbox.py` — coding-agent/SWE helpers built on `slime.agent.sandbox`: install bootstraps, spawn claude-code, capture patches, and run the fresh-sandbox evaluator. The shared sandbox contract lives in `slime.agent.sandbox.Sandbox`.
 
 `generate.py` owns one `AnthropicAdapter` instance. For each sample it calls
@@ -145,8 +145,8 @@ The Anthropic adapter therefore follows a **string in, token out** contract:
 
 Multi-turn agents still force the adapter to tokenize later message
 histories, because tool observations and claude-code's own compacted messages
-arrive as strings. `slime.agent.trajectory.merge_turns` stitches those later
-prompts against the saved token stream:
+arrive as strings. `slime.agent.trajectory_manager.TrajectoryManager` matches
+those later prompts against the saved token stream:
 
 - New prompt suffixes that are tool/user/environment context are appended with
   `loss_mask=0`.
@@ -160,15 +160,15 @@ That last case is the important correctness guard. A re-tokenization mismatch
 can make a string-level conversation look continuous while token-level
 provenance is broken. slime keeps the context needed to continue the agent, but
 does not backprop through tokens whose sampled origin can no longer be proven.
-The unit tests in `tests/test_agent_trajectory.py` cover matched prefixes,
-skipped turns, split-output drift, changed token counts, and prompt-base
-restarts.
+The unit tests in `tests/test_agent/test_trajectory_manager.py` cover matched
+prefixes, skipped turns, split-output drift, changed token counts, and
+prompt-base restarts.
 
 ## Fan-out Semantics
 
-- `generate()` returns `list[Sample]` — one Sample per trajectory **segment** (`subagent` / `wipe` / `final`).
-- Per-trajectory reward is split as `reward / K` across segments; `rollout_id` is shared so the per-rollout-mean loss reducer still counts the trajectory once.
-- Sub-agent dispatch increases `K` (each completed `Agent` turn block becomes its own segment), so the effective batch after flatten can be much larger than `rollout_batch_size * n_samples_per_prompt`.
+- `generate()` returns `list[Sample]` — one Sample per root-to-leaf chain in the per-session message tree.
+- Per-trajectory reward is split as `reward / K` across chains; `rollout_id` is shared so the per-rollout-mean loss reducer still counts the trajectory once.
+- Sub-agent dispatch and auto-compaction increase `K` (each prompt-prefix divergence forks a new branch), so the effective batch after flatten can be much larger than `rollout_batch_size * n_samples_per_prompt`.
 
 ## Porting to a New Sandbox Backend
 

diff --git a/examples/coding_agent_rl/generate.py b/examples/coding_agent_rl/generate.py
@@ -9,8 +9,8 @@
     1. ``sandbox.run_claude_code`` prepares the agent sandbox and runs claude-code.
     2. ``sandbox.git_diff`` captures the model-produced patch.
     3. ``sandbox.evaluate`` scores that patch in a second clean sandbox.
-    4. ``_merge_samples`` combines reward + adapter ``TokenSegment``s,
-       delegating segment-to-``Sample`` fan-out to ``slime.agent.trajectory``.
+    4. ``adapter.finish_session`` drains the session tree into reward-weighted
+       ``Sample``s (``.response`` already decoded); ``generate`` only logs a summary.
 
 All sandbox-side details live in ``sandbox.py``; the LLM plumbing
-(Anthropic <-> SGLang /generate, token capture, 3-kind segment split) uses
+(Anthropic <-> SGLang /generate, token capture, per-session trajectory tree) uses
@@ -49,17 +49,15 @@
 import secrets
 import time
 import traceback
-from dataclasses import dataclass
 from typing import Any
 
 from slime.agent.adapters import AnthropicAdapter
-from slime.agent.trajectory import TokenSegment, fan_out_sample_segments
+from slime.agent.aiohttp_threaded import FilteredAccessLogger, run_app_in_thread
 from slime.utils.misc import SingletonMeta
 from slime.utils.processing_utils import load_tokenizer
 from slime.utils.types import Sample
 
 from . import sandbox
-from .aiohttp_threaded import run_app_in_thread
 
 logger = logging.getLogger(__name__)
 
@@ -97,11 +95,13 @@ def __init__(self, args) -> None:
                 "Without it the sandbox cannot dial back and the rollout will "
                 "silently abort."
             )
+        fork_merge_threshold = int(v) if (v := os.environ.get("SLIME_FORK_MERGE_MAX_RESPONSE_TOKENS")) else None
         self.adapter = AnthropicAdapter(
             tokenizer=self.tokenizer,
             sglang_url=sglang_url,
             tool_parser=self.tool_parser,
             reasoning_parser=self.reasoning_parser,
+            fork_threshold_tokens=fork_merge_threshold,
         )
         # handler_cancellation=True so a client disconnect cancels the handler
         # coroutine, arming the fire-and-forget /abort_request inside the
@@ -113,7 +113,10 @@ def __init__(self, args) -> None:
             host=SHIM_BIND_HOST,
             port=SHIM_PORT,
             thread_name="anthropic-adapter",
-            runner_kwargs={"handler_cancellation": True},
+            runner_kwargs={
+                "handler_cancellation": True,
+                "access_log_class": FilteredAccessLogger,
+            },
         )
         self.adapter_url = f"http://{public_host}:{self.app_handle.port}"
         logger.info(
@@ -127,18 +130,8 @@ def __init__(self, args) -> None:
 
 
 # ---------------------------------------------------------------------------
-# Trajectory -> Sample conversion
-# adapter.finish_session() returns TokenSegments. One trajectory yields >=1
-# segments because the agent may compact + reset mid-run; trajectory.py handles
-# the mechanical segment -> Sample fan-out.
+# Session setup
 # ---------------------------------------------------------------------------
-@dataclass(frozen=True)
-class RewardResult:
-    reward: float
-    is_solved: bool
-    applied_cleanly: bool
-
-
 def _start_session(
     state: _State,
     sample: Sample,
@@ -164,55 +157,11 @@ def _start_session(
     return session_id
 
 
-def _merge_samples(
-    *,
-    sample: Sample,
-    state: _State,
-    segments: list[TokenSegment],
-    reward_result: RewardResult,
-    elapsed_sec: float,
-    instance_id: str,
-):
-    if not segments:
-        return _abort_result(sample, "adapter_session_empty")
-
-    trajectory_metadata = {
-        **(sample.metadata or {}),
-        "instance_id": instance_id,
-        "is_solved": reward_result.is_solved,
-        "applied_cleanly": reward_result.applied_cleanly,
-        "elapsed_sec": elapsed_sec,
-    }
-
-    # All K samples share rollout_id so the loss reducer counts this
-    # trajectory once.
-    fanned = fan_out_sample_segments(
-        sample,
-        segments,
-        reward_result.reward,
-        state.tokenizer,
-        metadata=trajectory_metadata,
-    )
-    if not fanned:
-        raise ValueError("fan-out produced no samples")
-
-    logger.info(
-        "[coding_agent_rl] %s: reward=%.2f solved=%s applied=%s elapsed=%.1fs segments=%d",
-        instance_id,
-        reward_result.reward,
-        reward_result.is_solved,
-        reward_result.applied_cleanly,
-        elapsed_sec,
-        len(fanned),
-    )
-    return fanned
-
-
 # ---------------------------------------------------------------------------
 # Main per-sample agent function
 #
 # The four calls inside the timeout are the high-level rollout recipe:
-# run_claude_code -> git_diff -> sandbox.evaluate -> merge_samples.
+# run_claude_code -> git_diff -> sandbox.evaluate -> finish_session.
 # ---------------------------------------------------------------------------
 async def generate(args, sample: Sample, sampling_params: dict[str, Any]):
     """Per-sample agent function with wall-clock guard. See
@@ -249,20 +198,26 @@ async def generate(args, sample: Sample, sampling_params: dict[str, Any]):
                 pre_commands=md["pre_commands"],
                 timeout_sec=SWE_EVAL_TIMEOUT_SEC,
             )
-            reward_result = RewardResult(
+            samples = await state.adapter.finish_session(
+                session_id,
+                base_sample=sample,
                 reward=float(reward),
-                is_solved=bool(is_solved),
-                applied_cleanly=bool(applied_cleanly),
             )
-            segments = await state.adapter.finish_session(session_id)
-            return _merge_samples(
-                sample=sample,
-                state=state,
-                segments=segments,
-                reward_result=reward_result,
-                elapsed_sec=time.time() - t0,
-                instance_id=instance_id,
+            if not samples:
+                return _abort_result(sample, "adapter_session_empty")
+
+            # finish_session has already linearized, reward-weighted and decoded
+            # each chain's .response; here we only log a summary.
+            logger.info(
+                "[coding_agent_rl] %s: reward=%.2f solved=%s applied=%s elapsed=%.1fs segments=%d",
+                instance_id,
+                float(reward),
+                bool(is_solved),
+                bool(applied_cleanly),
+                time.time() - t0,
+                len(samples),
             )
+            return samples
 
     except asyncio.TimeoutError:
         _log_timeout_diagnostic(t0)
@@ -347,7 +302,9 @@ def _coerce_prompt(prompt) -> str:
     return ""
 
 
-def _abort(sample: Sample, reason: str) -> Sample:
+def _abort_result(sample: Sample, reason: str) -> list[Sample]:
+    """Mark ``sample`` aborted in place and return it in the list shape this
+    fan-out generate function always yields."""
     sample.tokens = [0, 0]
     sample.response = ""
     sample.response_length = 1
@@ -356,9 +313,4 @@ def _abort(sample: Sample, reason: str) -> Sample:
     sample.status = Sample.Status.ABORTED
     sample.metadata = {**(sample.metadata or {}), "abort_reason": reason}
     logger.warning("[coding_agent_rl] aborted: %s", reason)
-    return sample
-
-
-def _abort_result(sample: Sample, reason: str):
-    """Return a uniform list shape for this fan-out generate function."""
-    return [_abort(sample, reason)]
+    return [sample]
diff --git a/examples/coding_agent_rl/run_qwen36_35b_a3b_swe_8nodes.sh b/examples/coding_agent_rl/run_qwen36_35b_a3b_swe_8nodes.sh
@@ -1,8 +1,8 @@
 #!/usr/bin/env bash
 # End-to-end SWE coding-agent RL on 8 nodes.
 #
-# Same model and training loop as run_qwen36_35b_a3b_swe_8node.sh, with three
-# extra layers that actively encourage the rollout to dispatch sub-agents.
+# Standard model and training loop, with three extra layers that actively
+# encourage the rollout to dispatch sub-agents.
 # Trajectory trees produced by this script show real `sibling` branches:
 #
 #   (1) An `investigator` sub-agent is registered via claude-code's --agents

diff --git a/examples/coding_agent_rl/sandbox.py b/examples/coding_agent_rl/sandbox.py
@@ -326,8 +326,8 @@ async def evaluate(
 
         if swepro:
             r, s = await _run_swepro(ev, workdir, swepro, timeout_sec)
-            return r, s, True
-        r, s = await _run_eval_cmd(ev, workdir, eval_cmd, timeout_sec)
+        else:
+            r, s = await _run_eval_cmd(ev, workdir, eval_cmd, timeout_sec)
         return r, s, True
 
 
@@ -336,8 +336,7 @@ async def _setup_swepro_assets(ev: Sandbox, swepro: dict) -> None:
     for k, dst in [("run_script_path", "run_script.sh"), ("parser_script_path", "parser.py")]:
         host_p = swepro.get(k)
         if host_p:
-            text = Path(host_p).read_text()
-            await ev.write_file(f"{_SWEPRO_DIR}/{dst}", text, user="root")
+            await ev.write_file(f"{_SWEPRO_DIR}/{dst}", Path(host_p), user="root")
     await ev.exec(f"chmod 755 {_SWEPRO_DIR}/* && chown -R agent:agent {_SWEPRO_DIR}", user="root", check=True)