Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions py_src/vllm_router/router.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,8 @@ class Router:
If not specified, uses the main policy. Default: None
decode_policy: Specific load balancing policy for decode nodes (PD mode only).
If not specified, uses the main policy. Default: None
pd_kv_cache_ttl_secs: TTL in seconds for Decode-side KV metadata cached for
bidirectional vLLM P/D transfer. Default: 0
request_id_headers: List of HTTP headers to check for request IDs. If not specified,
uses common defaults: ['x-request-id', 'x-correlation-id', 'x-trace-id', 'request-id'].
Example: ['x-my-request-id', 'x-custom-trace-id']. Default: None
Expand Down
7 changes: 7 additions & 0 deletions py_src/vllm_router/router_args.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ class RouterArgs:
default_factory=list
) # List of (url, bootstrap_port)
decode_urls: List[str] = dataclasses.field(default_factory=list)
pd_kv_cache_ttl_secs: int = 0

# Routing policy
policy: str = "cache_aware"
Expand Down Expand Up @@ -201,6 +202,12 @@ def add_cli_args(
metavar=("URL",),
help="Decode server URL. Can be specified multiple times.",
)
parser.add_argument(
f"--{prefix}pd-kv-cache-ttl-secs",
type=int,
default=RouterArgs.pd_kv_cache_ttl_secs,
help="TTL in seconds for Decode-side KV metadata cached for bidirectional vLLM P/D transfer.",
)
parser.add_argument(
f"--{prefix}worker-startup-timeout-secs",
type=int,
Expand Down
11 changes: 11 additions & 0 deletions src/config/types.rs
Original file line number Diff line number Diff line change
Expand Up @@ -84,12 +84,19 @@ pub struct RouterConfig {
/// Profiling timeout in seconds (for vLLM profiling endpoints)
#[serde(default = "default_profile_timeout_secs")]
pub profile_timeout_secs: u64,
/// TTL in seconds for Decode-side KV metadata cached by vLLM P/D router.
#[serde(default = "default_pd_kv_cache_ttl_secs")]
pub pd_kv_cache_ttl_secs: u64,
}

/// Fallback for `profile_timeout_secs`, in seconds.
///
/// Referenced by name from `#[serde(default = "...")]` on the config field, so
/// it has to stay a zero-argument function returning the field's type.
fn default_profile_timeout_secs() -> u64 {
    const FALLBACK_PROFILE_TIMEOUT_SECS: u64 = 10;
    FALLBACK_PROFILE_TIMEOUT_SECS
}

/// Fallback for `pd_kv_cache_ttl_secs`, in seconds.
///
/// Referenced by name from `#[serde(default = "...")]` on the config field, so
/// it has to stay a zero-argument function returning the field's type.
// NOTE(review): how a TTL of 0 is interpreted by the cache is not visible
// from this file; confirm against the P/D router before documenting it.
fn default_pd_kv_cache_ttl_secs() -> u64 {
    const FALLBACK_PD_KV_CACHE_TTL_SECS: u64 = 0;
    FALLBACK_PD_KV_CACHE_TTL_SECS
}

/// Fallback history backend: the `Memory` variant.
///
/// Used wherever a `RouterConfig` is built without an explicit
/// `history_backend` value.
fn default_history_backend() -> HistoryBackend {
    const FALLBACK_BACKEND: HistoryBackend = HistoryBackend::Memory;
    FALLBACK_BACKEND
}
Expand Down Expand Up @@ -491,6 +498,7 @@ impl Default for RouterConfig {
history_backend: default_history_backend(),
enable_profiling: false,
profile_timeout_secs: default_profile_timeout_secs(),
pd_kv_cache_ttl_secs: default_pd_kv_cache_ttl_secs(),
}
}
}
Expand Down Expand Up @@ -1063,6 +1071,7 @@ mod tests {
history_backend: default_history_backend(),
enable_profiling: false,
profile_timeout_secs: default_profile_timeout_secs(),
pd_kv_cache_ttl_secs: default_pd_kv_cache_ttl_secs(),
};

assert!(config.mode.is_pd_mode());
Expand Down Expand Up @@ -1131,6 +1140,7 @@ mod tests {
history_backend: default_history_backend(),
enable_profiling: false,
profile_timeout_secs: default_profile_timeout_secs(),
pd_kv_cache_ttl_secs: default_pd_kv_cache_ttl_secs(),
};

assert!(!config.mode.is_pd_mode());
Expand Down Expand Up @@ -1195,6 +1205,7 @@ mod tests {
history_backend: default_history_backend(),
enable_profiling: false,
profile_timeout_secs: default_profile_timeout_secs(),
pd_kv_cache_ttl_secs: default_pd_kv_cache_ttl_secs(),
};

assert!(config.has_service_discovery());
Expand Down
7 changes: 7 additions & 0 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,8 @@ struct Router {
// OpenTelemetry tracing
enable_trace: bool,
otlp_traces_endpoint: Option<String>,
// vLLM P/D Decode -> Prefill KV metadata cache
pd_kv_cache_ttl_secs: u64,
}

impl Router {
Expand Down Expand Up @@ -253,6 +255,7 @@ impl Router {
history_backend: config::HistoryBackend::Memory,
enable_profiling: false, // Profiling disabled in Python binding by default
profile_timeout_secs: 10, // Default profiling timeout
pd_kv_cache_ttl_secs: self.pd_kv_cache_ttl_secs,
})
}
}
Expand Down Expand Up @@ -327,6 +330,8 @@ impl Router {
// Tracing defaults
enable_trace = false,
otlp_traces_endpoint = None,
// vLLM P/D defaults
pd_kv_cache_ttl_secs = 0,
))]
#[allow(clippy::too_many_arguments)]
fn new(
Expand Down Expand Up @@ -390,6 +395,7 @@ impl Router {
tokenizer_path: Option<String>,
enable_trace: bool,
otlp_traces_endpoint: Option<String>,
pd_kv_cache_ttl_secs: u64,
) -> PyResult<Self> {
// Determine connection mode from worker URLs
let mut all_urls = worker_urls.clone();
Expand Down Expand Up @@ -470,6 +476,7 @@ impl Router {
tokenizer_path,
enable_trace,
otlp_traces_endpoint,
pd_kv_cache_ttl_secs,
})
}

Expand Down
5 changes: 5 additions & 0 deletions src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,10 @@ struct CliArgs {
#[arg(long, default_value_t = false)]
vllm_pd_disaggregation: bool,

/// TTL in seconds for Decode-side KV metadata cached for bidirectional vLLM P/D transfer
#[arg(long, default_value_t = 0)]
pd_kv_cache_ttl_secs: u64,

/// ZMQ service discovery address for vLLM P2P NCCL coordination (e.g., "0.0.0.0:30001")
/// Required for --vllm-pd-disaggregation mode. Workers register their HTTP and ZMQ addresses here.
#[arg(long)]
Expand Down Expand Up @@ -680,6 +684,7 @@ impl CliArgs {
},
enable_profiling: self.profile,
profile_timeout_secs: 10, // Default profiling timeout
pd_kv_cache_ttl_secs: self.pd_kv_cache_ttl_secs,
})
}

Expand Down
Loading
Loading