kvcache-ai · pingzhuu · Jun 18, 2026 · Jun 18, 2026 · Jun 18, 2026 · Jun 23, 2026
@@ -189,10 +189,12 @@ mooncake_master \
 
 ---
 
-### Tiered Storage with SSD Offload — Cost-Effective Capacity
+### Tiered Storage with SSD Offload - Cost-Effective Capacity
 
 Extends the cache pool from DRAM to SSD while keeping normal reads and writes on the distributed memory path. With `--enable_offload=true`, completed memory writes are queued for asynchronous SSD persistence through the master control plane. Set `--offload_on_evict=true` to defer that SSD write until the memory eviction path selects an object for reclamation. When `--promotion_on_hit=true`, SSD-only objects can be promoted back to DRAM after repeated reads; admission is gated by `--promotion_admission_threshold`.
 
+Promotion execution runs on the FileStorage holder client. By default, one background worker drains promotion tasks outside the heartbeat thread. Increase `MOONCAKE_OFFLOAD_PROMOTION_WORKER_THREADS` only when SSD bandwidth, network bandwidth, and DRAM allocation headroom can absorb more concurrent L2-to-L1 copies.
+
 ```bash
 mooncake_master \
   --enable_offload=true \
@@ -483,6 +485,16 @@ Flags for controlling data movement between DRAM and SSD.
 
 Start with `--enable_offload=true` for eager asynchronous SSD persistence after `Put` completion. Add `--offload_on_evict=true` when you want SSD writes to happen only when memory pressure selects an object for eviction. Add `--promotion_on_hit=true` to allow hot SSD-only data to be promoted back to DRAM, and tune `--promotion_admission_threshold` to control how many observed reads are required before promotion is queued.
 
+FileStorage holder clients also accept the following environment variables:
+
+| Env | Default | Description |
+|-----|---------|-------------|
+| `MOONCAKE_OFFLOAD_PROMOTION_WORKER_THREADS` | `1` | Background workers used to execute L2-to-L1 promotion tasks; `0` falls back to synchronous heartbeat execution |
+| `MOONCAKE_OFFLOAD_PROMOTION_QUEUE_CAPACITY` | `1024` | Soft local backlog cap used to limit additional promotion pulls from the master |
+| `MOONCAKE_OFFLOAD_PROMOTION_DRAIN_BATCH_SIZE` | `64` | Max promotion heartbeat pulls per worker per FileStorage heartbeat tick |
+
+Keep the default worker count for latency-sensitive deployments. Raising it can drain bursty HiCache prefix-hit promotion backlogs faster, but it also increases SSD reads, transfer writes, and memory allocation pressure.
+
 ### CXL Memory
 
 | Flag | Default | Description |

@@ -1,9 +1,13 @@
 #pragma once
 
+#include <condition_variable>
+#include <deque>
+#include <mutex>
+
 #include "client_service.h"
 #include "client_buffer.hpp"
-#include "storage_backend.h"
 #include "pinned_buffer_pool.h"
+#include "storage_backend.h"
 
 namespace mooncake {
 
@@ -52,6 +56,16 @@ class FileStorage {
    private:
     friend class FileStorageTest;
     friend class FileStoragePromotionTest;
+
+    struct PromotionExecutionResult {  // Returned by ProcessPromotionTask().
+        bool alloc_attempted = false;  // presumably: allocation was tried
+        bool write_attempted = false;  // presumably: data copy was tried
+        bool notify_success_attempted = false;  // presumably: success sent
+        bool notify_failure_attempted = false;  // presumably: failure sent
+        bool completed = false;  // presumably: task finished; TODO confirm
+        ErrorCode terminal_error = ErrorCode::OK;  // OK until set otherwise
+    };
+
     struct AllocatedBatch {
         uint64_t batch_id;
         std::vector<BufferHandle> handles;
@@ -83,24 +97,39 @@ class FileStorage {
      * client.
      * 2. Receives feedback on which objects should be offloaded.
      * 3. Triggers asynchronous offloading of pending objects.
-     * 4. Pulls and processes any pending L2->L1 promotion tasks queued by the
-     *    master (mirror of step 1+2 in the reverse direction).
+     * 4. Pulls any pending L2->L1 promotion tasks queued by the master and
+     *    dispatches them for execution (mirror of step 1+2 in the reverse
+     *    direction).
      * @return tl::expected<void, ErrorCode> indicating operation status.
      */
     tl::expected<void, ErrorCode> Heartbeat();
 
     /**
      * @brief Drives the L2->L1 promotion pipeline for one heartbeat tick.
-     * Pulls promotion work from the master, stages a MEMORY replica for each
-     * key, copies the bytes from local SSD into that replica, and notifies the
-     * master on success. A failure on any single key is logged and skipped;
-     * the master-side reaper decrements the source replica's refcnt and
-     * erases the task entry on TTL expiry, and any orphaned PROCESSING
-     * MEMORY replica is reaped via the standard discarded-replicas path.
+     *
+     * Pulls promotion work from the master and either processes it
+     * synchronously or enqueues it for background workers. Each task stages a
+     * MEMORY replica, copies the bytes from local SSD into that replica, and
+     * notifies the master on success. FileStorage eagerly reports per-key
+     * failures so the master can release the promotion slot immediately, with
+     * the reaper acting as a backstop.
+     *
      * @return tl::expected<void, ErrorCode> indicating operation status.
      */
     tl::expected<void, ErrorCode> ProcessPromotionTasks();
 
+    PromotionExecutionResult ProcessPromotionTask(
+        const PromotionTaskItem& task,
+        const std::vector<std::string>& preferred_segments);
+
+    bool EnqueuePromotionTask(const PromotionTaskItem& task,
+                              bool allow_over_capacity_for_pulled_task = false);
+
+    void ReleasePromotionTask(const std::string& key,
+                              const std::string& tenant_id);
+
+    void PromotionWorkerThreadFunc();
+
     tl::expected<bool, ErrorCode> IsEnableOffloading();
 
     tl::expected<void, ErrorCode> BatchLoad(
@@ -143,6 +172,11 @@ class FileStorage {
     std::thread heartbeat_thread_;
     std::atomic<bool> client_buffer_gc_running_;
     std::thread client_buffer_gc_thread_;
+    std::atomic<bool> promotion_workers_running_{false};
+    std::vector<std::thread> promotion_worker_threads_;
+    std::mutex promotion_queue_mutex_;
+    std::condition_variable promotion_queue_cv_;
+    std::deque<PromotionTaskItem> promotion_task_queue_;
     std::future<void> rescan_future_;
     std::atomic<bool> metadata_resync_pending_{false};
 };

@@ -226,6 +226,14 @@ struct FileStorageConfig {
     uint32_t client_buffer_gc_interval_seconds = 1;
     uint64_t client_buffer_gc_ttl_ms = 5000;
 
+    // Background worker settings for L2->L1 promotion-on-hit execution.
+    // Set promotion_worker_threads to 0 to disable async workers and fall back
+    // to the synchronous heartbeat path.
+    uint32_t promotion_worker_threads = 1;
+    // Soft local backlog cap used to limit additional master pulls.
+    uint32_t promotion_queue_capacity = 1024;
+    uint32_t promotion_drain_batch_size = 64;
+
     // Use io_uring for file I/O instead of POSIX pread/pwrite
     bool use_uring = false;
 
@@ -1209,4 +1217,4 @@ class OffsetAllocatorStorageBackend : public StorageBackendInterface {
 tl::expected<std::shared_ptr<StorageBackendInterface>, ErrorCode>
 CreateStorageBackend(const FileStorageConfig& config);
 
-}  // namespace mooncake
+}  // namespace mooncake