Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 5 additions & 2 deletions docs/build-plugins/code-examples.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -37,11 +37,14 @@ class HeaderPlugin:
name: str,
request: nemo_relay.LLMRequest,
annotated: nemo_relay.AnnotatedLLMRequest | None
) -> tuple[nemo_relay.LLMRequest, nemo_relay.AnnotatedLLMRequest | None]:
) -> nemo_relay.LLMRequestInterceptOutcome:
# The request object is immutable, however we can return a new instance with updated headers.
headers = request.headers.copy()
headers[plugin_config["header_name"]] = plugin_config["value"]
return nemo_relay.LLMRequest(headers=headers, content=request.content), annotated
return nemo_relay.LLMRequestInterceptOutcome(
nemo_relay.LLMRequest(headers=headers, content=request.content),
annotated,
)

context.register_llm_request_intercept("inject-header", 100, False, add_header)

Expand Down
10 changes: 7 additions & 3 deletions docs/build-plugins/register-behavior.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -51,10 +51,13 @@ class HeaderPlugin:
name: str,
request: nemo_relay.LLMRequest,
annotated: nemo_relay.AnnotatedLLMRequest | None
) -> tuple[nemo_relay.LLMRequest, nemo_relay.AnnotatedLLMRequest | None]:
) -> nemo_relay.LLMRequestInterceptOutcome:
headers = request.headers.copy()
headers[plugin_config["header_name"]] = plugin_config["value"]
return nemo_relay.LLMRequest(headers=headers, content=request.content), annotated
return nemo_relay.LLMRequestInterceptOutcome(
nemo_relay.LLMRequest(headers=headers, content=request.content),
annotated,
)

context.register_llm_request_intercept("inject-header", 100, False, add_header)

Expand Down Expand Up @@ -99,6 +102,7 @@ plugin.register('header-plugin', headerPlugin);

<Tab title="Rust" language="rust">
```rust
use nemo_relay::api::llm::LlmRequestInterceptOutcome;
use nemo_relay::plugin::{
register_plugin, ConfigDiagnostic, DiagnosticLevel, Plugin, PluginRegistrationContext,
Result as PluginResult,
Expand Down Expand Up @@ -166,7 +170,7 @@ impl Plugin for HeaderPlugin {
request
.headers
.insert(header_name.clone(), header_value.clone().into());
Ok((request, annotated))
Ok(LlmRequestInterceptOutcome::new(request, annotated))
}),
)?;
Ok(())
Expand Down
12 changes: 6 additions & 6 deletions docs/instrument-applications/code-examples.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -260,8 +260,8 @@ tool_args = nemo_relay.tools.request_intercepts("search", {"query": "weather"})
nemo_relay.tools.conditional_execution("search", tool_args)

llm_request = LLMRequest({}, {"messages": [{"role": "user", "content": "hello"}]})
llm_request = nemo_relay.llm.request_intercepts("demo-provider", llm_request)
nemo_relay.llm.conditional_execution(llm_request)
outcome = nemo_relay.llm.request_intercepts("demo-provider", llm_request)
nemo_relay.llm.conditional_execution(outcome.request)
```
</Tab>

Expand All @@ -279,8 +279,8 @@ const toolArgs = await toolRequestIntercepts('search', { query: 'weather' });
await toolConditionalExecution('search', toolArgs);

const request = new LlmRequest({}, { messages: [{ role: 'user', content: 'hello' }] });
const rewritten = await llmRequestIntercepts('demo-provider', request);
await llmConditionalExecution(rewritten);
const outcome = await llmRequestIntercepts('demo-provider', request);
await llmConditionalExecution(outcome.request);
```
</Tab>

Expand All @@ -297,8 +297,8 @@ let request = LlmRequest {
headers: Default::default(),
content: json!({"messages": [{"role": "user", "content": "hello"}]}),
};
let rewritten = llm_request_intercepts("demo-provider", request)?;
llm_conditional_execution(&rewritten)?;
let outcome = llm_request_intercepts("demo-provider", request)?;
llm_conditional_execution(&outcome.request)?;
```
</Tab>

Expand Down
9 changes: 6 additions & 3 deletions docs/integrate-into-frameworks/code-examples.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -184,10 +184,11 @@ import nemo_relay
from nemo_relay import LLMRequest

rewritten_args = nemo_relay.tools.request_intercepts("search", {"query": "weather"})
rewritten_request = nemo_relay.llm.request_intercepts(
outcome = nemo_relay.llm.request_intercepts(
"demo-provider",
LLMRequest({}, {"messages": []}),
)
rewritten_request = outcome.request
```
</Tab>

Expand All @@ -196,7 +197,8 @@ rewritten_request = nemo_relay.llm.request_intercepts(
import { LlmRequest, llmRequestIntercepts, toolRequestIntercepts } from 'nemo-relay-node';

const rewrittenArgs = await toolRequestIntercepts('search', { query: 'weather' });
const rewrittenRequest = await llmRequestIntercepts('demo-provider', new LlmRequest({}, { messages: [] }));
const outcome = await llmRequestIntercepts('demo-provider', new LlmRequest({}, { messages: [] }));
const rewrittenRequest = outcome.request;
```
</Tab>

Expand All @@ -208,7 +210,8 @@ use serde_json::json;

let rewritten_args = tool_request_intercepts("search", json!({"query": "weather"}))?;
let request = LlmRequest { headers: Default::default(), content: json!({"messages": []}) };
let rewritten_request = llm_request_intercepts("demo-provider", request)?;
let outcome = llm_request_intercepts("demo-provider", request)?;
let rewritten_request = outcome.request;
```
</Tab>

Expand Down
13 changes: 10 additions & 3 deletions docs/integrate-into-frameworks/provider-codecs.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -41,10 +41,17 @@ When a managed LLM call has a request codec:

1. NeMo Relay calls `decode` before LLM request intercepts run.
2. Request intercepts receive both the raw request and the annotated request.
3. Intercepts may edit the raw request, the annotated request, or both.
3. Intercepts edit provider-body fields through the annotated request and may
edit transport headers through the raw request. Raw `request.content` is
read-only while the codec is active.
4. NeMo Relay calls `encode` to merge the annotated request back into the original raw request.
5. Execution intercepts and the provider callback receive the encoded provider request.

If a codec-aware intercept changes raw `request.content` or omits the returned
annotation, Relay rejects the outcome before creating the LLM lifecycle. When
no request codec is active, the raw request remains fully writable and is the
provider-visible source of truth.

When a managed LLM call has a response codec, NeMo Relay decodes the raw provider response for observability and attaches the result to the emitted LLM end event. The response codec does not rewrite the value returned to the application. Use [Provider Response Codecs](/integrate-into-frameworks/provider-response-codecs) for response-only behavior and custom response codec examples.

Codec implementations should preserve fields they do not understand. Treat `encode` as a merge operation over the original provider payload, not as a full replacement.
Expand Down Expand Up @@ -87,7 +94,7 @@ from nemo_relay.codecs import OpenAIChatCodec

def add_system_message(_name, request, annotated):
if annotated is None:
return request, annotated
return nemo_relay.LLMRequestInterceptOutcome(request)

# Attributes of the annotated request can be re-assigned, but cannot be modified in-place.
# For example `annotated.messages.append(...)` would not work, but re-assigning
Expand All @@ -96,7 +103,7 @@ def add_system_message(_name, request, annotated):
{"role": "system", "content": "Answer with concise technical detail."},
*annotated.messages,
]
return request, annotated
return nemo_relay.LLMRequestInterceptOutcome(request, annotated)

nemo_relay.intercepts.register_llm_request(
"framework.add_system_message",
Expand Down
97 changes: 97 additions & 0 deletions docs/reference/llm-request-intercept-outcomes.mdx
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
---
title: "LLM Request Intercept Outcomes"
description: "Canonical request-intercept result and managed lifecycle behavior."
---
{/* SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
SPDX-License-Identifier: Apache-2.0 */}

Every LLM request intercept returns one canonical outcome. The outcome
serializes to the following canonical JSON form:

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you provide a description about what this does? Is this the LLM request intercept or the outcome?


```json
{
"request": {"headers": {}, "content": {}},
"annotated_request": null,
"pending_marks": []
}
```

`request` is required. `annotated_request` defaults to `null` when omitted on
input, and `pending_marks` defaults to an empty list. Canonical serialization
includes all three fields. A pending mark contains only `name`, optional

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
includes all three fields. A pending mark contains only `name`, optional
includes all three fields. A pending mark only contains `name`, optional

`category` and `category_profile`, and optional `data` and `metadata`. Relay
owns event UUIDs, parent UUIDs, and timestamps.

## Request Authority

The provider-body source of truth depends only on whether a request codec is

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
The provider-body source of truth depends only on whether a request codec is
The provider-body source of truth only depends on whether a request codec is

active:

| Request codec | Provider body source | Header source |
| --- | --- | --- |
| No codec | `outcome.request.content` | `outcome.request.headers` |
| Active codec | `outcome.annotated_request` | `outcome.request.headers` |

With an active codec, `request.content` is read-only context. Every intercept
must return an annotation and make provider-body changes through that
annotation, including its flattened `extra` fields for provider-specific data.
Relay rejects a changed raw body or missing annotation at the offending
intercept before invoking later middleware or creating an LLM lifecycle.

The following flowchart shows how Relay resolves the request that the provider
receives, both with and without an active request codec:

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What does the following snippet do?

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should begin with "The following example describes/does xyz..."

```mermaid
flowchart TD
INPUT["Original LlmRequest"] --> CODEC{"Request codec active?"}
CODEC -->|No| RAWCHAIN["Run intercept chain"]
RAWCHAIN --> RAWPROVIDER["Provider receives outcome.request"]
CODEC -->|Yes| DECODE["Decode content into annotated_request"]
DECODE --> INTERCEPT["Invoke next intercept"]
INTERCEPT --> CHECKANN{"Annotation returned?"}
CHECKANN -->|No| FAIL["Fail before lifecycle"]
CHECKANN -->|Yes| CHECKRAW{"request.content unchanged?"}
CHECKRAW -->|No| FAIL
CHECKRAW -->|Yes| MORE{"More intercepts?"}
MORE -->|Yes| INTERCEPT
MORE -->|No| ENCODE["Encode final annotated_request"]
ENCODE --> HEADERS["Apply final request.headers"]
HEADERS --> PROVIDER["Provider receives one resolved LlmRequest"]
```

Python callbacks return `LLMRequestInterceptOutcome`; Rust callbacks return
`LlmRequestInterceptOutcome`; Go callbacks return
`LLMRequestInterceptOutcome`; and Node.js and WebAssembly callbacks return
`{ request, annotated?, pendingMarks? }`, with `categoryProfile` on each
JavaScript pending-mark DTO. The canonical JSON forms retain `pending_marks`
and `category_profile`. Public C callbacks write one owned canonical outcome
JSON string. Native ABI v1 uses one host-owned outcome JSON string. Rust and
Python `grpc-v1` worker SDKs return their canonical outcome type in a
`JsonEnvelope` whose schema is
`nemo.relay.LlmRequestInterceptOutcome@1`.
Comment on lines +60 to +69

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
Python callbacks return `LLMRequestInterceptOutcome`; Rust callbacks return
`LlmRequestInterceptOutcome`; Go callbacks return
`LLMRequestInterceptOutcome`; and Node.js and WebAssembly callbacks return
`{ request, annotated?, pendingMarks? }`, with `categoryProfile` on each
JavaScript pending-mark DTO. The canonical JSON forms retain `pending_marks`
and `category_profile`. Public C callbacks write one owned canonical outcome
JSON string. Native ABI v1 uses one host-owned outcome JSON string. Rust and
Python `grpc-v1` worker SDKs return their canonical outcome type in a
`JsonEnvelope` whose schema is
`nemo.relay.LlmRequestInterceptOutcome@1`.
Request-intercept callbacks return the following:
- Python callbacks return `LLMRequestInterceptOutcome`
- Rust callbacks return `LlmRequestInterceptOutcome`
- Go callbacks return `LLMRequestInterceptOutcome`
- Node.js and WebAssembly callbacks return `{ request, annotated?, pendingMarks? }`, with `categoryProfile` on each JavaScript pending-mark DTO.
The canonical JSON forms retain `pending_marks` and `category_profile`. Public C callbacks write one owned canonical outcome JSON string. Native ABI v1 uses one host-owned outcome JSON string. Rust and Python `grpc-v1` worker SDKs return their canonical outcome type in a
`JsonEnvelope` whose schema is `nemo.relay.LlmRequestInterceptOutcome@1`.


The standalone request-intercept helper returns the complete outcome but does
not emit its pending marks because it does not own an LLM lifecycle.

## Managed Lifecycle

Managed execution runs all effective global and scope-local intercepts before
creating the LLM handle. Each accepted request/annotation pair feeds the next

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
creating the LLM handle. Each accepted request/annotation pair feeds the next
creating the LLM handle. Each accepted request-and-annotation pair feeds the next

intercept under the authority rules above, while pending marks append in
middleware order. A breaking
intercept's marks are retained. If any intercept fails or its boundary result
is malformed, Relay discards all accumulated marks and creates no LLM
lifecycle.

After successful interception, Relay creates the handle and captures one
subscriber snapshot. It emits the LLM start at `T`, every pending mark at
`T + 1µs` in returned order with the LLM UUID as parent, and the LLM end no
earlier than `T + 1µs`. Streaming and non-streaming calls use the same rules.
Pending marks are never added to the provider request, annotated request,
codec input, sanitizer input, or start payload.

## Migration

This change finalizes the unpublished native ABI v1 and `grpc-v1` contracts. Rebuild all
development native plugins and workers. Replace tuple results, split C/Go
outputs, metadata envelopes, and parallel mark-aware registrations with the
canonical outcome and the existing `register_llm_request_intercept`
registration name.
Loading