PrimeIntellect-ai · S1ro1 · May 16, 2026
diff --git a/py_src/vllm_router/router.py b/py_src/vllm_router/router.py
@@ -76,6 +76,8 @@ class Router:
             If not specified, uses the main policy. Default: None
         decode_policy: Specific load balancing policy for decode nodes (PD mode only).
             If not specified, uses the main policy. Default: None
+        pd_kv_cache_ttl_secs: TTL in seconds for Decode-side KV metadata cached for
+            bidirectional vLLM P/D transfer. Default: 0
         request_id_headers: List of HTTP headers to check for request IDs. If not specified,
             uses common defaults: ['x-request-id', 'x-correlation-id', 'x-trace-id', 'request-id'].
             Example: ['x-my-request-id', 'x-custom-trace-id']. Default: None

diff --git a/py_src/vllm_router/router_args.py b/py_src/vllm_router/router_args.py
@@ -21,6 +21,7 @@ class RouterArgs:
         default_factory=list
     )  # List of (url, bootstrap_port)
     decode_urls: List[str] = dataclasses.field(default_factory=list)
+    pd_kv_cache_ttl_secs: int = 0
 
     # Routing policy
     policy: str = "cache_aware"
@@ -201,6 +202,12 @@ def add_cli_args(
             metavar=("URL",),
             help="Decode server URL. Can be specified multiple times.",
         )
+        parser.add_argument(
+            f"--{prefix}pd-kv-cache-ttl-secs",
+            type=int,
+            default=RouterArgs.pd_kv_cache_ttl_secs,
+            help="TTL in seconds for Decode-side KV metadata cached for bidirectional vLLM P/D transfer.",
+        )
         parser.add_argument(
             f"--{prefix}worker-startup-timeout-secs",
             type=int,

diff --git a/src/config/types.rs b/src/config/types.rs
@@ -84,12 +84,19 @@ pub struct RouterConfig {
     /// Profiling timeout in seconds (for vLLM profiling endpoints)
     #[serde(default = "default_profile_timeout_secs")]
     pub profile_timeout_secs: u64,
+    /// TTL in seconds for Decode-side KV metadata cached by the vLLM P/D router.
+    #[serde(default = "default_pd_kv_cache_ttl_secs")]
+    pub pd_kv_cache_ttl_secs: u64,
 }
 
 fn default_profile_timeout_secs() -> u64 {
     10
 }
 
+fn default_pd_kv_cache_ttl_secs() -> u64 {
+    0 // NOTE(review): meaning of a 0-second TTL is not stated in the visible hunks (cache disabled? immediate expiry?) — confirm and document
+}
+
 fn default_history_backend() -> HistoryBackend {
     HistoryBackend::Memory
 }
@@ -491,6 +498,7 @@ impl Default for RouterConfig {
             history_backend: default_history_backend(),
             enable_profiling: false,
             profile_timeout_secs: default_profile_timeout_secs(),
+            pd_kv_cache_ttl_secs: default_pd_kv_cache_ttl_secs(),
         }
     }
 }
@@ -1063,6 +1071,7 @@ mod tests {
             history_backend: default_history_backend(),
             enable_profiling: false,
             profile_timeout_secs: default_profile_timeout_secs(),
+            pd_kv_cache_ttl_secs: default_pd_kv_cache_ttl_secs(),
         };
 
         assert!(config.mode.is_pd_mode());
@@ -1131,6 +1140,7 @@ mod tests {
             history_backend: default_history_backend(),
             enable_profiling: false,
             profile_timeout_secs: default_profile_timeout_secs(),
+            pd_kv_cache_ttl_secs: default_pd_kv_cache_ttl_secs(),
         };
 
         assert!(!config.mode.is_pd_mode());
@@ -1195,6 +1205,7 @@ mod tests {
             history_backend: default_history_backend(),
             enable_profiling: false,
             profile_timeout_secs: default_profile_timeout_secs(),
+            pd_kv_cache_ttl_secs: default_pd_kv_cache_ttl_secs(),
         };
 
         assert!(config.has_service_discovery());

diff --git a/src/lib.rs b/src/lib.rs
@@ -104,6 +104,8 @@ struct Router {
     // OpenTelemetry tracing
     enable_trace: bool,
     otlp_traces_endpoint: Option<String>,
+    // TTL in seconds for the vLLM P/D Decode -> Prefill KV metadata cache
+    pd_kv_cache_ttl_secs: u64,
 }
 
 impl Router {
@@ -253,6 +255,7 @@ impl Router {
             history_backend: config::HistoryBackend::Memory,
             enable_profiling: false, // Profiling disabled in Python binding by default
             profile_timeout_secs: 10, // Default profiling timeout
+            pd_kv_cache_ttl_secs: self.pd_kv_cache_ttl_secs,
         })
     }
 }
@@ -327,6 +330,8 @@ impl Router {
         // Tracing defaults
         enable_trace = false,
         otlp_traces_endpoint = None,
+        // vLLM P/D defaults
+        pd_kv_cache_ttl_secs = 0,
     ))]
     #[allow(clippy::too_many_arguments)]
     fn new(
@@ -390,6 +395,7 @@ impl Router {
         tokenizer_path: Option<String>,
         enable_trace: bool,
         otlp_traces_endpoint: Option<String>,
+        pd_kv_cache_ttl_secs: u64,
     ) -> PyResult<Self> {
         // Determine connection mode from worker URLs
         let mut all_urls = worker_urls.clone();
@@ -470,6 +476,7 @@ impl Router {
             tokenizer_path,
             enable_trace,
             otlp_traces_endpoint,
+            pd_kv_cache_ttl_secs,
         })
     }
 

diff --git a/src/main.rs b/src/main.rs
@@ -134,6 +134,10 @@ struct CliArgs {
     #[arg(long, default_value_t = false)]
     vllm_pd_disaggregation: bool,
 
+    /// TTL in seconds for Decode-side KV metadata cached for bidirectional vLLM P/D transfer
+    #[arg(long, default_value_t = 0)]
+    pd_kv_cache_ttl_secs: u64,
+
     /// ZMQ service discovery address for vLLM P2P NCCL coordination (e.g., "0.0.0.0:30001")
     /// Required for --vllm-pd-disaggregation mode. Workers register their HTTP and ZMQ addresses here.
     #[arg(long)]
@@ -680,6 +684,7 @@ impl CliArgs {
             },
             enable_profiling: self.profile,
             profile_timeout_secs: 10, // Default profiling timeout
+            pd_kv_cache_ttl_secs: self.pd_kv_cache_ttl_secs,
         })
     }