NVIDIA · bbednarski9 · Jul 1, 2026 · lvojtku · Jul 1, 2026 · lvojtku
@@ -37,11 +37,14 @@ class HeaderPlugin:
             name: str,
             request: nemo_relay.LLMRequest,
             annotated: nemo_relay.AnnotatedLLMRequest | None
-        ) -> tuple[nemo_relay.LLMRequest, nemo_relay.AnnotatedLLMRequest | None]:
+        ) -> nemo_relay.LLMRequestInterceptOutcome:
             # The request object is immutable; however, we can return a new instance with updated headers.
             headers = request.headers.copy()
             headers[plugin_config["header_name"]] = plugin_config["value"]
-            return nemo_relay.LLMRequest(headers=headers, content=request.content), annotated
+            return nemo_relay.LLMRequestInterceptOutcome(
+                nemo_relay.LLMRequest(headers=headers, content=request.content),
+                annotated,
+            )
 
         context.register_llm_request_intercept("inject-header", 100, False, add_header)
 

@@ -51,10 +51,13 @@ class HeaderPlugin:
             name: str,
             request: nemo_relay.LLMRequest,
             annotated: nemo_relay.AnnotatedLLMRequest | None
-        ) -> tuple[nemo_relay.LLMRequest, nemo_relay.AnnotatedLLMRequest | None]:
+        ) -> nemo_relay.LLMRequestInterceptOutcome:
             headers = request.headers.copy()
             headers[plugin_config["header_name"]] = plugin_config["value"]
-            return nemo_relay.LLMRequest(headers=headers, content=request.content), annotated
+            return nemo_relay.LLMRequestInterceptOutcome(
+                nemo_relay.LLMRequest(headers=headers, content=request.content),
+                annotated,
+            )
 
         context.register_llm_request_intercept("inject-header", 100, False, add_header)
 
@@ -99,6 +102,7 @@ plugin.register('header-plugin', headerPlugin);
 
 <Tab title="Rust" language="rust">
 ```rust
+use nemo_relay::api::llm::LlmRequestInterceptOutcome;
 use nemo_relay::plugin::{
     register_plugin, ConfigDiagnostic, DiagnosticLevel, Plugin, PluginRegistrationContext,
     Result as PluginResult,
@@ -166,7 +170,7 @@ impl Plugin for HeaderPlugin {
                     request
                         .headers
                         .insert(header_name.clone(), header_value.clone().into());
-                    Ok((request, annotated))
+                    Ok(LlmRequestInterceptOutcome::new(request, annotated))
                 }),
             )?;
             Ok(())

@@ -260,8 +260,8 @@ tool_args = nemo_relay.tools.request_intercepts("search", {"query": "weather"})
 nemo_relay.tools.conditional_execution("search", tool_args)
 
 llm_request = LLMRequest({}, {"messages": [{"role": "user", "content": "hello"}]})
-llm_request = nemo_relay.llm.request_intercepts("demo-provider", llm_request)
-nemo_relay.llm.conditional_execution(llm_request)
+outcome = nemo_relay.llm.request_intercepts("demo-provider", llm_request)
+nemo_relay.llm.conditional_execution(outcome.request)
 ```
 </Tab>
 
@@ -279,8 +279,8 @@ const toolArgs = await toolRequestIntercepts('search', { query: 'weather' });
 await toolConditionalExecution('search', toolArgs);
 
 const request = new LlmRequest({}, { messages: [{ role: 'user', content: 'hello' }] });
-const rewritten = await llmRequestIntercepts('demo-provider', request);
-await llmConditionalExecution(rewritten);
+const outcome = await llmRequestIntercepts('demo-provider', request);
+await llmConditionalExecution(outcome.request);
 ```
 </Tab>
 
@@ -297,8 +297,8 @@ let request = LlmRequest {
     headers: Default::default(),
     content: json!({"messages": [{"role": "user", "content": "hello"}]}),
 };
-let rewritten = llm_request_intercepts("demo-provider", request)?;
-llm_conditional_execution(&rewritten)?;
+let outcome = llm_request_intercepts("demo-provider", request)?;
+llm_conditional_execution(&outcome.request)?;
 ```
 </Tab>
 

@@ -184,10 +184,11 @@ import nemo_relay
 from nemo_relay import LLMRequest
 
 rewritten_args = nemo_relay.tools.request_intercepts("search", {"query": "weather"})
-rewritten_request = nemo_relay.llm.request_intercepts(
+outcome = nemo_relay.llm.request_intercepts(
     "demo-provider",
     LLMRequest({}, {"messages": []}),
 )
+rewritten_request = outcome.request
 ```
 </Tab>
 
@@ -196,7 +197,8 @@ rewritten_request = nemo_relay.llm.request_intercepts(
 import { LlmRequest, llmRequestIntercepts, toolRequestIntercepts } from 'nemo-relay-node';
 
 const rewrittenArgs = await toolRequestIntercepts('search', { query: 'weather' });
-const rewrittenRequest = await llmRequestIntercepts('demo-provider', new LlmRequest({}, { messages: [] }));
+const outcome = await llmRequestIntercepts('demo-provider', new LlmRequest({}, { messages: [] }));
+const rewrittenRequest = outcome.request;
 ```
 </Tab>
 
@@ -208,7 +210,8 @@ use serde_json::json;
 
 let rewritten_args = tool_request_intercepts("search", json!({"query": "weather"}))?;
 let request = LlmRequest { headers: Default::default(), content: json!({"messages": []}) };
-let rewritten_request = llm_request_intercepts("demo-provider", request)?;
+let outcome = llm_request_intercepts("demo-provider", request)?;
+let rewritten_request = outcome.request;
 ```
 </Tab>
 

@@ -41,10 +41,17 @@ When a managed LLM call has a request codec:
 
 1. NeMo Relay calls `decode` before LLM request intercepts run.
 2. Request intercepts receive both the raw request and the annotated request.
-3. Intercepts may edit the raw request, the annotated request, or both.
+3. Intercepts edit provider-body fields through the annotated request and may
+   edit transport headers through the raw request. Raw `request.content` is
+   read-only while the codec is active.
 4. NeMo Relay calls `encode` to merge the annotated request back into the original raw request.
 5. Execution intercepts and the provider callback receive the encoded provider request.
 
+If a codec-aware intercept changes raw `request.content` or omits the returned
+annotation, Relay rejects the outcome before creating the LLM lifecycle. When
+no request codec is active, the raw request remains fully writable and is the
+provider-visible source of truth.
+
 When a managed LLM call has a response codec, NeMo Relay decodes the raw provider response for observability and attaches the result to the emitted LLM end event. The response codec does not rewrite the value returned to the application. Use [Provider Response Codecs](/integrate-into-frameworks/provider-response-codecs) for response-only behavior and custom response codec examples.
 
 Codec implementations should preserve fields they do not understand. Treat `encode` as a merge operation over the original provider payload, not as a full replacement.
@@ -87,7 +94,7 @@ from nemo_relay.codecs import OpenAIChatCodec
 
 def add_system_message(_name, request, annotated):
     if annotated is None:
-        return request, annotated
+        return nemo_relay.LLMRequestInterceptOutcome(request)
 
     # Attributes of the annotated request can be re-assigned, but cannot be modified in-place.
     # For example `annotated.messages.append(...)` would not work, but re-assigning
@@ -96,7 +103,7 @@ def add_system_message(_name, request, annotated):
         {"role": "system", "content": "Answer with concise technical detail."},
         *annotated.messages,
     ]
-    return request, annotated
+    return nemo_relay.LLMRequestInterceptOutcome(request, annotated)
 
 nemo_relay.intercepts.register_llm_request(
     "framework.add_system_message",

@@ -0,0 +1,97 @@
+---
+title: "LLM Request Intercept Outcomes"
+description: "Canonical request-intercept result and managed lifecycle behavior."
+---
+{/* SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+SPDX-License-Identifier: Apache-2.0 */}
+
+Every LLM request intercept returns one canonical outcome:
+
+```json
+{
+  "request": {"headers": {}, "content": {}},
+  "annotated_request": null,
+  "pending_marks": []
+}
+```
+
+`request` is required. `annotated_request` defaults to `null` when omitted on
+input, and `pending_marks` defaults to an empty list. Canonical serialization
+includes all three fields. A pending mark contains only `name`, optional
-includes all three fields. A pending mark contains only `name`, optional
+includes all three fields. A pending mark only contains `name`, optional
-includes all three fields. A pending mark contains only `name`, optional
+includes all three fields. A pending mark only contains `name`, optional
+`category` and `category_profile`, and optional `data` and `metadata`. Relay
+owns event UUIDs, parent UUIDs, and timestamps.
+
+## Request Authority
+
+The provider-body source of truth depends only on whether a request codec is
-The provider-body source of truth depends only on whether a request codec is
+The provider-body source of truth only depends on whether a request codec is
-The provider-body source of truth depends only on whether a request codec is
+The provider-body source of truth only depends on whether a request codec is
+active:
+
+| Request codec | Provider body source | Header source |
+| --- | --- | --- |
+| No codec | `outcome.request.content` | `outcome.request.headers` |
+| Active codec | `outcome.annotated_request` | `outcome.request.headers` |
+
+With an active codec, `request.content` is read-only context. Every intercept
+must return an annotation and make provider-body changes through that
+annotation, including its flattened `extra` fields for provider-specific data.
+Relay rejects a changed raw body or missing annotation at the offending
+intercept before invoking later middleware or creating an LLM lifecycle.
+
+```mermaid
+flowchart TD
+    INPUT["Original LlmRequest"] --> CODEC{"Request codec active?"}
+
+    CODEC -->|No| RAWCHAIN["Run intercept chain"]
+    RAWCHAIN --> RAWPROVIDER["Provider receives outcome.request"]
+
+    CODEC -->|Yes| DECODE["Decode content into annotated_request"]
+    DECODE --> INTERCEPT["Invoke next intercept"]
+    INTERCEPT --> CHECKANN{"Annotation returned?"}
+    CHECKANN -->|No| FAIL["Fail before lifecycle"]
+    CHECKANN -->|Yes| CHECKRAW{"request.content unchanged?"}
+    CHECKRAW -->|No| FAIL
+    CHECKRAW -->|Yes| MORE{"More intercepts?"}
+    MORE -->|Yes| INTERCEPT
+    MORE -->|No| ENCODE["Encode final annotated_request"]
+    ENCODE --> HEADERS["Apply final request.headers"]
+    HEADERS --> PROVIDER["Provider receives one resolved LlmRequest"]
+```
+
+Python callbacks return `LLMRequestInterceptOutcome`; Rust callbacks return
+`LlmRequestInterceptOutcome`; Go callbacks return
+`LLMRequestInterceptOutcome`; and Node.js and WebAssembly callbacks return
+`{ request, annotated?, pendingMarks? }`, with `categoryProfile` on each
+JavaScript pending-mark DTO. The canonical JSON forms retain `pending_marks`
+and `category_profile`. Public C callbacks write one owned canonical outcome
+JSON string. Native ABI v1 uses one host-owned outcome JSON string. Rust and
+Python `grpc-v1` worker SDKs return their canonical outcome type in a
+`JsonEnvelope` whose schema is
+`nemo.relay.LlmRequestInterceptOutcome@1`.
-Python callbacks return `LLMRequestInterceptOutcome`; Rust callbacks return
-`LlmRequestInterceptOutcome`; Go callbacks return
-`LLMRequestInterceptOutcome`; and Node.js and WebAssembly callbacks return
-`{ request, annotated?, pendingMarks? }`, with `categoryProfile` on each
-JavaScript pending-mark DTO. The canonical JSON forms retain `pending_marks`
-and `category_profile`. Public C callbacks write one owned canonical outcome
-JSON string. Native ABI v1 uses one host-owned outcome JSON string. Rust and
-Python `grpc-v1` worker SDKs return their canonical outcome type in a
-`JsonEnvelope` whose schema is
-`nemo.relay.LlmRequestInterceptOutcome@1`.
+Each language binding's request-intercept callback returns the following:
+- Python callbacks return `LLMRequestInterceptOutcome`
+- Rust callbacks return `LlmRequestInterceptOutcome`
+- Go callbacks return `LLMRequestInterceptOutcome`
+- Node.js and WebAssembly callbacks return `{ request, annotated?, pendingMarks? }`, with `categoryProfile` on each JavaScript pending-mark DTO.
+
+The canonical JSON forms retain `pending_marks` and `category_profile`. Public C callbacks write one owned canonical outcome JSON string. Native ABI v1 uses one host-owned outcome JSON string. Rust and Python `grpc-v1` worker SDKs return their canonical outcome type in a
+`JsonEnvelope` whose schema is `nemo.relay.LlmRequestInterceptOutcome@1`.
-Python callbacks return `LLMRequestInterceptOutcome`; Rust callbacks return
-`LlmRequestInterceptOutcome`; Go callbacks return
-`LLMRequestInterceptOutcome`; and Node.js and WebAssembly callbacks return
-`{ request, annotated?, pendingMarks? }`, with `categoryProfile` on each
-JavaScript pending-mark DTO. The canonical JSON forms retain `pending_marks`
-and `category_profile`. Public C callbacks write one owned canonical outcome
-JSON string. Native ABI v1 uses one host-owned outcome JSON string. Rust and
-Python `grpc-v1` worker SDKs return their canonical outcome type in a
-`JsonEnvelope` whose schema is
-`nemo.relay.LlmRequestInterceptOutcome@1`.
+Each language binding's request-intercept callback returns the following:
+- Python callbacks return `LLMRequestInterceptOutcome`
+- Rust callbacks return `LlmRequestInterceptOutcome`
+- Go callbacks return `LLMRequestInterceptOutcome`
+- Node.js and WebAssembly callbacks return `{ request, annotated?, pendingMarks? }`, with `categoryProfile` on each JavaScript pending-mark DTO.
+
+The canonical JSON forms retain `pending_marks` and `category_profile`. Public C callbacks write one owned canonical outcome JSON string. Native ABI v1 uses one host-owned outcome JSON string. Rust and Python `grpc-v1` worker SDKs return their canonical outcome type in a
+`JsonEnvelope` whose schema is `nemo.relay.LlmRequestInterceptOutcome@1`.
+
+The standalone request-intercept helper returns the complete outcome but does
+not emit its pending marks because it does not own an LLM lifecycle.
+
+## Managed Lifecycle
+
+Managed execution runs all effective global and scope-local intercepts before
+creating the LLM handle. Each accepted request/annotation pair feeds the next
-creating the LLM handle. Each accepted request/annotation pair feeds the next
+creating the LLM handle. Each accepted request-and-annotation pair feeds the next
-creating the LLM handle. Each accepted request/annotation pair feeds the next
+creating the LLM handle. Each accepted request-and-annotation pair feeds the next
+intercept under the authority rules above, while pending marks append in
+middleware order. A breaking
+intercept's marks are retained. If any intercept fails or its boundary result
+is malformed, Relay discards all accumulated marks and creates no LLM
+lifecycle.
+
+After successful interception, Relay creates the handle and captures one
+subscriber snapshot. It emits the LLM start at `T`, every pending mark at
+`T + 1µs` in returned order with the LLM UUID as parent, and the LLM end no
+earlier than `T + 1µs`. Streaming and non-streaming calls use the same rules.
+Pending marks are never added to the provider request, annotated request,
+codec input, sanitizer input, or start payload.
+
+## Migration
+
+This change finalizes the unpublished native ABI v1 and `grpc-v1` contracts. Rebuild all
+development native plugins and workers. Replace tuple results, split C/Go
+outputs, metadata envelopes, and parallel mark-aware registrations with the
+canonical outcome and the existing `register_llm_request_intercept`
+registration name.