Glm5 cp benchmark data readme #1287

ltdo111 wants to merge 74 commits into jd-opensource:release/v0.9.0 from …
Conversation
- Signed-off-by: pengtao <pengtao.156@jd.com>
- …use recursive fetching. (jd-opensource#1144) Co-authored-by: Zhang Minchao <zhminchao@163.com>
- …ensor crash. (jd-opensource#1154) Signed-off-by: pengtao <pengtao.156@jd.com>
- Co-authored-by: chenxb002 <chenxb002@whu.edu.cn>
- …urce#1146) Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
- …jd-opensource#1170) Signed-off-by: Super User <panxuanyu1@huawei.com> Signed-off-by: fems14 <panxuanyu1@huawei.com> Co-authored-by: Super User <panxuanyu1@huawei.com> Co-authored-by: ext.wangqingyu17 <ext.wangqingyu17@jd.com>
Code Review
This pull request introduces significant changes to the codebase, including the addition of TileLang Ascend kernel development support, infrastructure for generative recommendation (REC) inference, and various refactorings to improve code quality and maintainability. Key updates include the introduction of ChatJsonParser for robust JSON handling, a new ResidualCache policy for DiT models, and the refactoring of ServiceImplFactory to support multiple serving modes. I have reviewed the changes and provided feedback on critical issues such as exception safety in destructors, code duplication in output target refreshing, and missing validation in communication logic. Please address these issues to ensure system stability and maintainability.
```cpp
// zero_page_ is not owned, don't delete it

unmap_and_release_virtual_mem(vaddr_, size_, page_size_);
cleanup_pages_and_vmem(vaddr_, size_, page_size_, mapping_);
```
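The review summary above flags exception safety in destructors. As a minimal sketch only (the class name, member types, and the locally defined placeholder for `cleanup_pages_and_vmem` are assumptions, not the PR's actual code), destructor-side cleanup can be guarded so a failure is reported instead of escaping:

```cpp
#include <cstddef>
#include <exception>
#include <iostream>
#include <vector>

// Placeholder with the same shape as the PR's helper; illustrative only.
void cleanup_pages_and_vmem(void* /*vaddr*/, size_t /*size*/,
                            size_t /*page_size*/,
                            const std::vector<void*>& /*mapping*/) {}

class PhysicalPageMapperSketch {
 public:
  ~PhysicalPageMapperSketch() noexcept {
    // A destructor must not let exceptions escape; report and swallow instead.
    try {
      cleanup_pages_and_vmem(vaddr_, size_, page_size_, mapping_);
    } catch (const std::exception& e) {
      std::cerr << "cleanup failed in destructor: " << e.what() << '\n';
    } catch (...) {
      std::cerr << "cleanup failed in destructor with unknown error\n";
    }
    // zero_page_ is not owned here, so it is intentionally left alone.
  }

 private:
  void* vaddr_ = nullptr;
  size_t size_ = 0;
  size_t page_size_ = 0;
  std::vector<void*> mapping_;
  void* zero_page_ = nullptr;  // not owned
};
```

Marking the destructor `noexcept` makes the no-throw guarantee explicit; an exception escaping a destructor would otherwise terminate the process.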
```cpp
if (rec_type == RecType::kOneRec) {
  if (!sequence_groups_.empty()) {
    // OneRec REC batches are tracked via sequence_groups_, while output
    // target generation still walks sequences_. Refresh the flattened
    // sequence view on every step so token writeback stays aligned after
    // beam search expands or replaces the group-owned Sequence instances.
    refresh_sequences_from_groups();
  }
  if (FLAGS_enable_rec_prefill_only) {
    refresh_onerec_prefill_output_targets();
  } else {
    refresh_output_targets();
  }
}
```
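The summary also mentions duplication in output-target refreshing. A hedged sketch of pulling the dispatch above into a single helper (the class and member names here are illustrative stand-ins, not the PR's actual types):

```cpp
#include <vector>

// Illustrative only: the engine class, member names, and stub bodies are
// assumptions; the real methods live elsewhere in the PR.
class RecEngineSketch {
 public:
  void refresh_rec_output_targets(bool prefill_only) {
    // Keep the flattened sequence view in sync before regenerating targets.
    if (!sequence_groups_.empty()) {
      refresh_sequences_from_groups();
    }
    if (prefill_only) {
      refresh_onerec_prefill_output_targets();
    } else {
      refresh_output_targets();
    }
  }

 private:
  std::vector<int> sequence_groups_;  // stand-in for the real group type
  void refresh_sequences_from_groups() {}
  void refresh_onerec_prefill_output_targets() {}
  void refresh_output_targets() {}
};
```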
```cpp
CHECK(pg_ != nullptr) << "Process group is not initialized.";
CHECK(output.defined())
    << "Output of all_to_all_single function is not defined";
CHECK(input.defined())
    << "Input of all_to_all_single function is not defined";
if (input.is_complex()) {
  input = torch::view_as_real(input);
}
if (output.is_complex()) {
  output = torch::view_as_real(output);
}

auto opts = c10d::AllToAllOptions();
auto work = pg_->alltoall_base(
    output, input, output_split_sizes, input_split_sizes, opts);
if (async_op) {
  *async_work = work;
} else {
  work->wait();
}
```
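Since the summary calls out missing validation in the communication logic, here is a hedged sketch of a split-size check that could run before `alltoall_base`; the helper name and its placement are assumptions, not part of the PR:

```cpp
#include <cstdint>
#include <numeric>
#include <vector>

#include <glog/logging.h>
#include <torch/torch.h>

// Sketch only: verify that non-empty split-size vectors cover the first
// dimension of the corresponding tensors before launching the collective.
void check_all_to_all_splits(const torch::Tensor& input,
                             const torch::Tensor& output,
                             const std::vector<int64_t>& input_split_sizes,
                             const std::vector<int64_t>& output_split_sizes) {
  if (!input_split_sizes.empty()) {
    const int64_t in_sum = std::accumulate(
        input_split_sizes.begin(), input_split_sizes.end(), int64_t{0});
    CHECK_EQ(in_sum, input.size(0))
        << "input_split_sizes must sum to input.size(0)";
  }
  if (!output_split_sizes.empty()) {
    const int64_t out_sum = std::accumulate(
        output_split_sizes.begin(), output_split_sizes.end(), int64_t{0});
    CHECK_EQ(out_sum, output.size(0))
        << "output_split_sizes must sum to output.size(0)";
  }
}
```

Empty split vectors are skipped because the backend then splits evenly across ranks.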
```cpp
return std::make_unique<ProcessGroupImpl>(global_rank,
                                          local_rank,
                                          group_ranks,
                                          world_size,
                                          rank_size,
                                          port,
                                          host,
                                          group_name,
                                          device);
}
```
```cpp
if (FLAGS_enable_convert_tokens_to_item &&
    output.token_ids.size() == rec_token_size) {
  std::vector<int64_t> item_ids;
  const bool ok = tokenizer.decode(
      Slice<int32_t>{output.token_ids.data(), output.token_ids.size()},
      sequence_params_.skip_special_tokens,
      &item_ids);
  if (ok && !item_ids.empty()) {
    output.item_ids_list = normalize_rec_item_ids(item_ids, index_);
    if (!output.item_ids_list.empty()) {
      output.item_ids = output.item_ids_list.front();
    }
  }
}
```
The token-to-item conversion logic uses tokenizer.decode inside a loop, which can be inefficient if the tokenizer is not optimized for single-item decoding. Additionally, the normalize_rec_item_ids function performs shuffling and resizing, which might be expensive if called frequently. Consider optimizing the token-to-item mapping and normalization process.
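One possible direction, sketched below under the assumption that identical token sequences recur across the loop: memoize decode results so each distinct sequence is decoded only once. The `DecodeFn` signature, type aliases, and cache layout are illustrative, not the project's tokenizer API.

```cpp
#include <cstdint>
#include <functional>
#include <map>
#include <vector>

using TokenIds = std::vector<int32_t>;
using ItemIds = std::vector<int64_t>;
using DecodeFn = std::function<bool(const TokenIds&, bool skip_special, ItemIds*)>;

// Sketch: cache decoded item ids keyed by the exact token sequence.
class DecodeCache {
 public:
  explicit DecodeCache(DecodeFn decode) : decode_(std::move(decode)) {}

  // Returns cached item ids when the same token sequence was decoded before,
  // otherwise decodes once and stores the result; nullptr on failure.
  const ItemIds* lookup_or_decode(const TokenIds& tokens, bool skip_special) {
    auto it = cache_.find(tokens);
    if (it != cache_.end()) {
      return &it->second;
    }
    ItemIds items;
    if (!decode_(tokens, skip_special, &items) || items.empty()) {
      return nullptr;
    }
    return &cache_.emplace(tokens, std::move(items)).first->second;
  }

 private:
  DecodeFn decode_;
  std::map<TokenIds, ItemIds> cache_;
};
```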
```cpp
if (!(params.use_beam_search && params.all_random_sample && params.logprobs &&
      params.max_top_logprobs > 0)) {
  return sampler_.forward(logits, params, filter_mask);
}

if (params.frequency_penalties.defined()) {
  apply_frequency_presence_penalties(logits,
                                     params.unique_token_ids,
                                     params.unique_token_counts,
                                     params.frequency_penalties,
                                     params.presence_penalties);
}

if (params.repetition_penalties.defined()) {
  apply_repetition_penalties(
      logits, params.unique_token_ids, params.repetition_penalties);
}

torch::Tensor sample_logits = logits;
torch::Tensor sample_temperatures = params.temperatures;
torch::Tensor sample_top_k = params.top_k;
torch::Tensor sample_top_p = params.top_p;
const bool use_sample_indices =
    params.selected_token_idxes.numel() != params.sample_idxes.numel();
if (use_sample_indices) {
  sample_logits = logits.index_select(/*dim=*/0, params.sample_idxes);
  if (params.temperatures.defined()) {
    sample_temperatures =
        params.temperatures.index_select(/*dim=*/0, params.sample_idxes);
  }
  if (params.top_k.defined()) {
    sample_top_k = params.top_k.index_select(/*dim=*/0, params.sample_idxes);
  }
  if (params.top_p.defined()) {
    sample_top_p = params.top_p.index_select(/*dim=*/0, params.sample_idxes);
  }
}

if (filter_mask.defined()) {
  CHECK_EQ(filter_mask.dim(), 2)
      << "filter_mask must be 2-D, dim=" << filter_mask.dim();
  CHECK_EQ(filter_mask.size(0), sample_logits.size(0))
      << "filter_mask batch mismatch, filter_mask.size(0)="
      << filter_mask.size(0)
      << ", sample_logits.size(0)=" << sample_logits.size(0);
  CHECK_EQ(filter_mask.size(1), sample_logits.size(1))
      << "filter_mask vocab mismatch, filter_mask.size(1)="
      << filter_mask.size(1)
      << ", sample_logits.size(1)=" << sample_logits.size(1);
  sample_logits = sample_logits + filter_mask;
}

apply_top_k_top_p(
    sample_logits, sample_temperatures, sample_top_k, sample_top_p);
if (use_sample_indices) {
  logits.index_copy_(/*dim=*/0, params.sample_idxes, sample_logits);
}

CHECK(params.do_sample.defined()) << "params.do_sample must be defined";
CHECK_EQ(params.do_sample.dim(), 1)
    << "params.do_sample must be 1D [num_seqs], got "
    << params.do_sample.sizes();
CHECK_EQ(sample_logits.size(0), params.do_sample.size(0));

SampleOutput output;
auto probs =
    torch::softmax(sample_logits, /*dim=*/-1, /*dtype=*/torch::kFloat32);
output.probs = probs.to(logits.dtype());
auto logprobs =
    torch::log_softmax(sample_logits, /*dim=*/-1, /*dtype=*/torch::kFloat32);

const int64_t vocab_size = probs.size(-1);
const int64_t top_count = std::min<int64_t>(params.max_top_logprobs,
                                            static_cast<int64_t>(vocab_size));
sample_top_candidates(
    probs, logprobs, top_count, &output.top_tokens, &output.top_logprobs);
output.next_tokens =
    output.top_tokens.select(/*dim=*/1, /*index=*/0).to(torch::kLong);
output.logprobs =
    output.top_logprobs.select(/*dim=*/1, /*index=*/0).contiguous();
return output;
}
```
The OneRecConstrainedSamplingStrategy::forward method performs extensive tensor operations, including index_select, index_copy_, and softmax on the device. These operations are performed for every step of the sampling process, which can significantly impact latency. Please optimize these operations, for example by minimizing tensor copies or using fused kernels where possible.
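As one concrete, hedged example of trimming work in this path (illustrative only, not the PR's code): the `softmax` and `log_softmax` over the full vocabulary can share a single normalization pass, since the probabilities are just the exponential of the log-probabilities.

```cpp
#include <utility>

#include <torch/torch.h>

// Sketch: compute log_softmax once and derive probs from it, instead of
// running softmax and log_softmax separately over the full vocabulary.
std::pair<torch::Tensor, torch::Tensor> probs_and_logprobs(
    const torch::Tensor& sample_logits) {
  auto logprobs = torch::log_softmax(sample_logits, /*dim=*/-1,
                                     /*dtype=*/torch::kFloat32);
  auto probs = logprobs.exp();  // reuse the normalized values
  return {probs, logprobs};
}
```

Whether this is worthwhile depends on the vocabulary size and how hot this path is per decoding step.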
```cpp
constexpr bool kHasVecType = std::is_same_v<scalar_t, float> ||
                             std::is_same_v<scalar_t, c10::Half> ||
                             std::is_same_v<scalar_t, c10::BFloat16>;

if constexpr (kHasVecType) {
  constexpr int32_t kVecWidth = VecType<scalar_t>::vec_width;
  if (numel_per_block % kVecWidth == 0) {
    const int64_t tiles_per_block =
        ceil_div<int64_t>(numel_per_block / kVecWidth, kThreadsPerBlock);
    const dim3 grid(num_layers, num_dst_blocks, tiles_per_block);
    block_copy_kernel<scalar_t, true>
        <<<grid, kThreadsPerBlock, 0, stream>>>(
            key_cache_ptrs.data_ptr<int64_t>(),
            value_cache_ptrs.data_ptr<int64_t>(),
            src_block_indices.data_ptr<int32_t>(),
            dst_block_indices.data_ptr<int32_t>(),
            cum_sum.data_ptr<int32_t>(),
            num_groups,
            numel_per_block);
    C10_CUDA_KERNEL_LAUNCH_CHECK();
    return;
  }
}

const int64_t tiles_per_block =
    ceil_div<int64_t>(numel_per_block, kThreadsPerBlock);
const dim3 grid(num_layers, num_dst_blocks, tiles_per_block);
block_copy_kernel<scalar_t, false><<<grid, kThreadsPerBlock, 0, stream>>>(
    key_cache_ptrs.data_ptr<int64_t>(),
    value_cache_ptrs.data_ptr<int64_t>(),
    src_block_indices.data_ptr<int32_t>(),
    dst_block_indices.data_ptr<int32_t>(),
    cum_sum.data_ptr<int32_t>(),
    num_groups,
    numel_per_block);
C10_CUDA_KERNEL_LAUNCH_CHECK();
});
```
The block_copy launch goes through DISPATCH_FLOATING_TYPES, but the vectorized path only engages when numel_per_block is an exact multiple of VecType<scalar_t>::vec_width; otherwise the launch silently falls back to the scalar kernel, including for float32 tensors. This might lead to suboptimal performance. Please ensure the vectorized path is actually exercised for all supported types.
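A small, hedged visibility aid (the helper name and its call site are assumptions, not part of the PR) that would make an unexpected scalar fallback observable for float32 and the other vectorized dtypes:

```cpp
#include <cstdint>

#include <glog/logging.h>

// Sketch: returns whether the vectorized copy path applies and warns when a
// dtype that has a VecType still falls back to the scalar kernel because
// numel_per_block is not a multiple of the vector width.
inline bool can_use_vectorized_copy(int64_t numel_per_block, int vec_width) {
  const bool vectorized = (numel_per_block % vec_width == 0);
  LOG_IF(WARNING, !vectorized)
      << "block_copy falling back to scalar path: numel_per_block="
      << numel_per_block << " is not a multiple of vec_width=" << vec_width;
  return vectorized;
}
```

Calling something like this inside the `if constexpr (kHasVecType)` branch would surface misaligned block sizes during bring-up without changing the launch behavior.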
There seem to be many unrelated commits in this PR. Please rebase onto the latest base branch.