bugfix: fix the accuracy error of NPU xattention.

LMX-xin · LMX-xin · commit 4e5a6de4476c · 2026-04-17T15:58:11.000+08:00
diff --git a/third_party/xllm_ops b/third_party/xllm_ops
@@ -1 +1 @@
-Subproject commit d2236de5b4820c3a58e65ab9fe1de63d37c3fedc
+Subproject commit bca7c6191ed309192e3c1bf0f3e0a276f941d1f0
diff --git a/xllm/core/runtime/rec_worker_impl.cpp b/xllm/core/runtime/rec_worker_impl.cpp
@@ -929,17 +929,8 @@ std::optional<ForwardOutput> RecWorkerImpl::LlmRecMultiRoundPipeline::step(
           << ")";
 
 #if defined(USE_NPU)
-      if (round == 0) {
-        // NPU beam_search_rec prefill only accepts top_k == 1. Flatten the
-        // sampler's per-request top-k output into one candidate per beam so
-        // the kernel can seed sequence_group without discarding beam diversity.
-        top_tokens =
-            sample_output.top_tokens.to(torch::kInt32).reshape({-1, 1});
-        top_logprobs = sample_output.top_logprobs.reshape({-1, 1});
-      } else {
-        top_tokens = sample_output.top_tokens.to(torch::kInt32);
-        top_logprobs = sample_output.top_logprobs;
-      }
+      top_tokens = sample_output.top_tokens.to(torch::kInt32);
+      top_logprobs = sample_output.top_logprobs;
 #else
       top_tokens = sample_output.top_tokens.to(torch::kInt32)
                        .reshape({-1, step_meta->beam_width});
@@ -997,18 +988,25 @@ void RecWorkerImpl::LlmRecMultiRoundPipeline::execute_beam_search(
     int32_t round,
     int32_t batch_size) {
 #if defined(USE_NPU)
-  xllm::kernel::npu::beam_search_rec(
-      /*logprobs=*/beam_tensors.acc_logprob,
-      /*top_tokens=*/top_tokens.to(torch::kInt32),
-      /*top_logprobs=*/top_logprobs,
-      /*sequence_group=*/beam_tensors.sequence_group,
-      /*current_step=*/static_cast<int64_t>(round),
-      /*out_token_ids=*/beam_tensors.out_token_ids,
-      /*out_token_index=*/beam_tensors.out_token_index,
-      /*out_log_probs=*/beam_tensors.out_log_probs,
-      /*out_beam_count_prefix_sums=*/
-      beam_tensors.out_beam_count_prefix_sums,
-      /*out_sequence=*/beam_tensors.out_seqgroup);
+  if (round == 0) {
+    beam_tensors.out_token_ids.copy_(top_tokens.reshape({-1, 1}));
+    beam_tensors.out_log_probs.copy_(top_logprobs.reshape({-1, 1}));
+    beam_tensors.out_seqgroup.select(/*dim=*/2, /*index=*/0)
+        .copy_(top_tokens.reshape({-1}));
+  } else {
+    xllm::kernel::npu::beam_search_rec(
+        /*logprobs=*/beam_tensors.acc_logprob,
+        /*top_tokens=*/top_tokens,
+        /*top_logprobs=*/top_logprobs,
+        /*sequence_group=*/beam_tensors.sequence_group,
+        /*current_step=*/static_cast<int64_t>(round),
+        /*out_token_ids=*/beam_tensors.out_token_ids,
+        /*out_token_index=*/beam_tensors.out_token_index,
+        /*out_log_probs=*/beam_tensors.out_log_probs,
+        /*out_beam_count_prefix_sums=*/
+        beam_tensors.out_beam_count_prefix_sums,
+        /*out_sequence=*/beam_tensors.out_seqgroup);
+  }
 #elif defined(USE_CUDA)
   xllm::kernel::cuda::beam_search(beam_tensors.acc_logprob,
                                   beam_tensors.sequence_group,