Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion xllm/core/framework/xtensor/phy_page_pool.h
Original file line number Diff line number Diff line change
Expand Up @@ -120,7 +120,6 @@ class PhyPagePool {

// Track which pages are allocated (for segment management)
std::vector<bool> page_allocated_;

};

} // namespace xllm
4 changes: 2 additions & 2 deletions xllm/core/framework/xtensor/xtensor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -83,8 +83,8 @@ static inline void unmap_pages(
}

for (const auto& entry : mapping) {
VirPtr addr = add_vir_ptr_offset(
vaddr, static_cast<size_t>(entry.first) * page_size);
VirPtr addr =
add_vir_ptr_offset(vaddr, static_cast<size_t>(entry.first) * page_size);
vmm::unmap(addr, page_size);
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -302,6 +302,7 @@ TEST_F(DeepseekV2SparseMoEBlockTest, PrepInDpGatherBuildsLocalSkip) {
auto prep = block->prep_in(attn_out,
residual,
input_params,
block->plan_exec(input_params),
DeepseekV2AttentionImpl::PostAttnLayout::kTpShard);

EXPECT_TRUE(prep.need_dp_gather);
Expand Down Expand Up @@ -349,6 +350,7 @@ TEST_F(DeepseekV2SparseMoEBlockTest, PrepInAll2AllPadsTpShardInput) {
auto prep = block->prep_in(attn_out,
residual,
input_params,
block->plan_exec(input_params),
DeepseekV2AttentionImpl::PostAttnLayout::kTpShard);

EXPECT_FALSE(prep.need_dp_gather);
Expand All @@ -361,6 +363,35 @@ TEST_F(DeepseekV2SparseMoEBlockTest, PrepInAll2AllPadsTpShardInput) {
test::verify_tensor_close(prep.skip_local, prep.ffn_in);
}

// Verifies that prep_in() honors a caller-supplied ExecCfg instead of
// re-planning internally: the forced cfg deliberately contradicts what
// plan_exec() would choose for these inputs, and the output must follow
// the forced cfg.
TEST_F(DeepseekV2SparseMoEBlockTest, PrepInUsesProvidedExecCfg) {
set_tp_dp_ctx(/*world_size=*/4, /*dp_size=*/2, /*tp_size=*/2, /*ep_size=*/4);
auto block = create_block();

// Uneven per-DP-rank token counts (3 vs 1); with both ranks in prefill
// (dp_is_decode all zero) the planner picks dp-gather, not all2all.
ModelInputParams input_params;
input_params.dp_global_token_nums = {3, 1};
input_params.dp_is_decode = {0, 0};
// Sanity-check the planner's choice so the forced cfg below is provably
// the opposite of what prep_in() would have planned on its own.
auto planned_cfg = block->plan_exec(input_params);
EXPECT_FALSE(planned_cfg.enable_all2all);
EXPECT_TRUE(planned_cfg.need_dp_gather);

// Forced cfg inverts both planner decisions.
DeepseekV2SparseMoEBlockImpl::ExecCfg forced_cfg;
forced_cfg.enable_all2all = true;
forced_cfg.need_dp_gather = false;
// 3 tokens x 2 hidden dims for both the attention output and the residual.
auto attn_out = mat(/*rows=*/3, {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f});
auto residual = mat(/*rows=*/3, {10.0f, 20.0f, 30.0f, 40.0f, 50.0f, 60.0f});

auto prep = block->prep_in(attn_out,
residual,
input_params,
forced_cfg,
DeepseekV2AttentionImpl::PostAttnLayout::kTpShard);

// Output reflects the forced cfg (no dp-gather), not the planned one.
EXPECT_FALSE(prep.need_dp_gather);
// NOTE(review): the all2all path with a kTpShard layout appears to require
// TP padding here (pad_info.active) — presumably because 3 tokens don't
// split evenly across tp_size=2; confirm against prep_in's shard logic.
EXPECT_TRUE(prep.need_tp_pad);
EXPECT_TRUE(prep.pad_info.active);
// On this path the local skip connection equals the FFN input tensor.
test::verify_tensor_close(prep.skip_local, prep.ffn_in);
}

TEST_F(DeepseekV2SparseMoEBlockTest, MergeOutTpPadGathersAndUnpads) {
set_tp_ctx(/*world_size=*/2, /*ep_size=*/2);
auto block = create_block();
Expand Down
7 changes: 5 additions & 2 deletions xllm/core/layers/mlu/deepseek_v2_decoder_layer_impl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -181,8 +181,11 @@ DeepseekV2DecoderLayerImpl::prepare_moe_inputs(
}

if (result.exec_cfg->enable_all2all || result.exec_cfg->need_dp_gather) {
result.moe_prep =
sparse_moe_->prep_in(std::move(x), residual, input_params, attn_layout);
result.moe_prep = sparse_moe_->prep_in(std::move(x),
residual,
input_params,
result.exec_cfg.value(),
attn_layout);
result.ffn_in = result.moe_prep->ffn_in;
return result;
}
Expand Down
2 changes: 1 addition & 1 deletion xllm/core/layers/mlu/deepseek_v2_sparse_moe_block.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -69,9 +69,9 @@ DeepseekV2SparseMoEBlockImpl::PrepOut DeepseekV2SparseMoEBlockImpl::prep_in(
torch::Tensor x,
const torch::Tensor& residual,
const ModelInputParams& input_params,
const ExecCfg& exec,
DeepseekV2AttentionImpl::PostAttnLayout attn_layout) const {
PrepOut prep;
const ExecCfg exec = plan_exec(input_params);
if (exec.enable_all2all) {
auto shard =
shard_attn_out(x,
Expand Down
1 change: 1 addition & 0 deletions xllm/core/layers/mlu/deepseek_v2_sparse_moe_block.h
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,7 @@ class DeepseekV2SparseMoEBlockImpl : public torch::nn::Module {
PrepOut prep_in(torch::Tensor x,
const torch::Tensor& residual,
const ModelInputParams& input_params,
const ExecCfg& exec,
DeepseekV2AttentionImpl::PostAttnLayout attn_layout) const;
torch::Tensor gather_in(const PrepOut& prep,
const ModelInputParams& input_params) const;
Expand Down
Loading