From e0224fad5ebc0d1aec19002a0571388816ef7f99 Mon Sep 17 00:00:00 2001
From: pengtao
Date: Thu, 16 Apr 2026 17:03:09 +0800
Subject: [PATCH] feat: reuse preplanned ExecCfg in sparse MoE prep_in.

Signed-off-by: pengtao
---
 xllm/core/framework/xtensor/phy_page_pool.h   |  1 -
 xllm/core/framework/xtensor/xtensor.cpp       |  4 +--
 .../deepseek_v2_sparse_moe_block_tests.cpp    | 31 +++++++++++++++++++
 .../mlu/deepseek_v2_decoder_layer_impl.cpp    |  7 +++--
 .../mlu/deepseek_v2_sparse_moe_block.cpp      |  2 +-
 .../layers/mlu/deepseek_v2_sparse_moe_block.h |  1 +
 6 files changed, 40 insertions(+), 6 deletions(-)

diff --git a/xllm/core/framework/xtensor/phy_page_pool.h b/xllm/core/framework/xtensor/phy_page_pool.h
index 50021cb5f..f2147474f 100644
--- a/xllm/core/framework/xtensor/phy_page_pool.h
+++ b/xllm/core/framework/xtensor/phy_page_pool.h
@@ -120,7 +120,6 @@ class PhyPagePool {
 
   // Track which pages are allocated (for segment management)
   std::vector<bool> page_allocated_;
-
 };
 
 }  // namespace xllm
diff --git a/xllm/core/framework/xtensor/xtensor.cpp b/xllm/core/framework/xtensor/xtensor.cpp
index 70c2e165c..b978928e4 100644
--- a/xllm/core/framework/xtensor/xtensor.cpp
+++ b/xllm/core/framework/xtensor/xtensor.cpp
@@ -83,8 +83,8 @@ static inline void unmap_pages(
   }
 
   for (const auto& entry : mapping) {
-    VirPtr addr = add_vir_ptr_offset(
-        vaddr, static_cast<size_t>(entry.first) * page_size);
+    VirPtr addr =
+        add_vir_ptr_offset(vaddr, static_cast<size_t>(entry.first) * page_size);
     vmm::unmap(addr, page_size);
   }
 }
diff --git a/xllm/core/layers/common/tests/deepseek_v2_sparse_moe_block_tests.cpp b/xllm/core/layers/common/tests/deepseek_v2_sparse_moe_block_tests.cpp
index 31aa43eda..cd0fef4d2 100644
--- a/xllm/core/layers/common/tests/deepseek_v2_sparse_moe_block_tests.cpp
+++ b/xllm/core/layers/common/tests/deepseek_v2_sparse_moe_block_tests.cpp
@@ -302,6 +302,7 @@ TEST_F(DeepseekV2SparseMoEBlockTest, PrepInDpGatherBuildsLocalSkip) {
   auto prep = block->prep_in(attn_out,
                              residual,
                              input_params,
+                             block->plan_exec(input_params),
                              DeepseekV2AttentionImpl::PostAttnLayout::kTpShard);
 
   EXPECT_TRUE(prep.need_dp_gather);
@@ -349,6 +350,7 @@ TEST_F(DeepseekV2SparseMoEBlockTest, PrepInAll2AllPadsTpShardInput) {
   auto prep = block->prep_in(attn_out,
                              residual,
                              input_params,
+                             block->plan_exec(input_params),
                              DeepseekV2AttentionImpl::PostAttnLayout::kTpShard);
 
   EXPECT_FALSE(prep.need_dp_gather);
@@ -361,6 +363,35 @@
   test::verify_tensor_close(prep.skip_local, prep.ffn_in);
 }
 
+TEST_F(DeepseekV2SparseMoEBlockTest, PrepInUsesProvidedExecCfg) {
+  set_tp_dp_ctx(/*world_size=*/4, /*dp_size=*/2, /*tp_size=*/2, /*ep_size=*/4);
+  auto block = create_block();
+
+  ModelInputParams input_params;
+  input_params.dp_global_token_nums = {3, 1};
+  input_params.dp_is_decode = {0, 0};
+  auto planned_cfg = block->plan_exec(input_params);
+  EXPECT_FALSE(planned_cfg.enable_all2all);
+  EXPECT_TRUE(planned_cfg.need_dp_gather);
+
+  DeepseekV2SparseMoEBlockImpl::ExecCfg forced_cfg;
+  forced_cfg.enable_all2all = true;
+  forced_cfg.need_dp_gather = false;
+  auto attn_out = mat(/*rows=*/3, {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f});
+  auto residual = mat(/*rows=*/3, {10.0f, 20.0f, 30.0f, 40.0f, 50.0f, 60.0f});
+
+  auto prep = block->prep_in(attn_out,
+                             residual,
+                             input_params,
+                             forced_cfg,
+                             DeepseekV2AttentionImpl::PostAttnLayout::kTpShard);
+
+  EXPECT_FALSE(prep.need_dp_gather);
+  EXPECT_TRUE(prep.need_tp_pad);
+  EXPECT_TRUE(prep.pad_info.active);
+  test::verify_tensor_close(prep.skip_local, prep.ffn_in);
+}
+
 TEST_F(DeepseekV2SparseMoEBlockTest, MergeOutTpPadGathersAndUnpads) {
   set_tp_ctx(/*world_size=*/2, /*ep_size=*/2);
   auto block = create_block();
diff --git a/xllm/core/layers/mlu/deepseek_v2_decoder_layer_impl.cpp b/xllm/core/layers/mlu/deepseek_v2_decoder_layer_impl.cpp
index 9ef413095..cc11b268b 100644
--- a/xllm/core/layers/mlu/deepseek_v2_decoder_layer_impl.cpp
+++ b/xllm/core/layers/mlu/deepseek_v2_decoder_layer_impl.cpp
@@ -181,8 +181,11 @@ DeepseekV2DecoderLayerImpl::prepare_moe_inputs(
   }
 
   if (result.exec_cfg->enable_all2all || result.exec_cfg->need_dp_gather) {
-    result.moe_prep =
-        sparse_moe_->prep_in(std::move(x), residual, input_params, attn_layout);
+    result.moe_prep = sparse_moe_->prep_in(std::move(x),
+                                           residual,
+                                           input_params,
+                                           result.exec_cfg.value(),
+                                           attn_layout);
     result.ffn_in = result.moe_prep->ffn_in;
     return result;
   }
diff --git a/xllm/core/layers/mlu/deepseek_v2_sparse_moe_block.cpp b/xllm/core/layers/mlu/deepseek_v2_sparse_moe_block.cpp
index ff532840c..ffa3b304a 100644
--- a/xllm/core/layers/mlu/deepseek_v2_sparse_moe_block.cpp
+++ b/xllm/core/layers/mlu/deepseek_v2_sparse_moe_block.cpp
@@ -69,9 +69,9 @@ DeepseekV2SparseMoEBlockImpl::PrepOut DeepseekV2SparseMoEBlockImpl::prep_in(
     torch::Tensor x,
     const torch::Tensor& residual,
     const ModelInputParams& input_params,
+    const ExecCfg& exec,
    DeepseekV2AttentionImpl::PostAttnLayout attn_layout) const {
   PrepOut prep;
-  const ExecCfg exec = plan_exec(input_params);
 
   if (exec.enable_all2all) {
     auto shard = shard_attn_out(x,
diff --git a/xllm/core/layers/mlu/deepseek_v2_sparse_moe_block.h b/xllm/core/layers/mlu/deepseek_v2_sparse_moe_block.h
index 6db9a2bb5..99637536d 100644
--- a/xllm/core/layers/mlu/deepseek_v2_sparse_moe_block.h
+++ b/xllm/core/layers/mlu/deepseek_v2_sparse_moe_block.h
@@ -77,6 +77,7 @@ class DeepseekV2SparseMoEBlockImpl : public torch::nn::Module {
   PrepOut prep_in(torch::Tensor x,
                   const torch::Tensor& residual,
                   const ModelInputParams& input_params,
+                  const ExecCfg& exec,
                   DeepseekV2AttentionImpl::PostAttnLayout attn_layout) const;
   torch::Tensor gather_in(const PrepOut& prep,
                           const ModelInputParams& input_params) const;
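
Usage sketch (illustration only, not part of the patch): after this change the caller owns the planning step, invoking plan_exec() once and threading the resulting ExecCfg into prep_in(), which no longer replans internally. The sketch below assumes DeepseekV2SparseMoEBlock is the module holder for DeepseekV2SparseMoEBlockImpl and that run_moe_prep is a hypothetical caller; plan_exec, prep_in, ExecCfg, and the enable_all2all/need_dp_gather fields are taken from the patch itself.

// Sketch of the post-patch call pattern; run_moe_prep is hypothetical and
// DeepseekV2SparseMoEBlock is assumed to be the torch::nn module holder
// for DeepseekV2SparseMoEBlockImpl.
#include "xllm/core/layers/mlu/deepseek_v2_sparse_moe_block.h"

torch::Tensor run_moe_prep(DeepseekV2SparseMoEBlock& block,
                           torch::Tensor x,
                           const torch::Tensor& residual,
                           const ModelInputParams& input_params) {
  // Plan once per layer step; the same ExecCfg drives both the branch
  // below and the block's input preparation.
  const auto exec_cfg = block->plan_exec(input_params);

  if (exec_cfg.enable_all2all || exec_cfg.need_dp_gather) {
    // Reuse the preplanned config instead of letting prep_in() replan.
    auto prep = block->prep_in(std::move(x),
                               residual,
                               input_params,
                               exec_cfg,
                               DeepseekV2AttentionImpl::PostAttnLayout::kTpShard);
    return prep.ffn_in;
  }
  // Neither all2all dispatch nor DP gather is needed: feed x through as-is.
  return x;
}

Mirroring the decoder layer above, the caller branches on the planned config and only pays the prep_in() cost when all2all dispatch or a DP gather is actually required; the tests exercise the same contract by forcing a config that disagrees with plan_exec().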