-
Notifications
You must be signed in to change notification settings - Fork 19.9k
ggml : process data in smaller chunks in CUDA ggml_top_k() implementation to reduce temporary buffers memory usage #24776
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from 1 commit
c1a79a6
ca88122
4d0e838
8521ec4
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -76,16 +76,30 @@ void ggml_cuda_op_top_k(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { | |
| const size_t shared_mem = ncols_pad * sizeof(int); | ||
| const size_t max_shared_mem = ggml_cuda_info().devices[ggml_cuda_get_device()].smpb; | ||
|
|
||
| ggml_cuda_pool_alloc<int> temp_dst_alloc(pool, ncols * nrows); | ||
| int * tmp_dst = temp_dst_alloc.get(); | ||
| // process input in chunks to avoid excessive temporary buffers memory usage | ||
| const size_t nb01 = src0->nb[1]; | ||
| const size_t chunk_size = 1LL << 26; | ||
| const int64_t nrows_per_chunk = chunk_size > nb01 ? chunk_size / nb01 : 1; | ||
| // make sure chunk_nrows can be safely cast to int below | ||
| GGML_ASSERT(nrows_per_chunk <= std::numeric_limits<int>::max()); | ||
|
|
||
| for (int64_t i = 0; i < nrows; i+= nrows_per_chunk) { | ||
| int64_t chunk_nrows = std::min(nrows_per_chunk, nrows - i); | ||
|
|
||
| ggml_cuda_pool_alloc<int> temp_dst_alloc(pool, ncols * chunk_nrows); | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I'm not sure how the CUDA pool works exactly - just wondering if we actually need to have this allocation inside the loop rather than once before it?
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
@ggerganov Good point, will try it out. |
||
| int * tmp_dst = temp_dst_alloc.get(); | ||
|
|
||
| if (shared_mem > max_shared_mem || ncols > 1024) { | ||
| argsort_f32_i32_cuda_cub(pool, src0_d, tmp_dst, ncols, nrows, GGML_SORT_ORDER_DESC, stream); | ||
| } else { | ||
| argsort_f32_i32_cuda_bitonic(src0_d, tmp_dst, ncols, nrows, GGML_SORT_ORDER_DESC, stream); | ||
| if (shared_mem > max_shared_mem || ncols > 1024) { | ||
| argsort_f32_i32_cuda_cub(pool, src0_d, tmp_dst, ncols, chunk_nrows, GGML_SORT_ORDER_DESC, stream); | ||
| } else { | ||
| argsort_f32_i32_cuda_bitonic(src0_d, tmp_dst, ncols, chunk_nrows, GGML_SORT_ORDER_DESC, stream); | ||
| } | ||
| CUDA_CHECK(cudaMemcpy2DAsync(dst_d, k * sizeof(int), tmp_dst, ncols * sizeof(int), k * sizeof(int), chunk_nrows, | ||
| cudaMemcpyDeviceToDevice, stream)); | ||
|
|
||
| src0_d += ncols * chunk_nrows; | ||
| dst_d += k * chunk_nrows; | ||
| } | ||
| CUDA_CHECK(cudaMemcpy2DAsync(dst_d, k * sizeof(int), tmp_dst, ncols * sizeof(int), k * sizeof(int), nrows, | ||
| cudaMemcpyDeviceToDevice, stream)); | ||
| #else // GGML_CUDA_USE_CUB | ||
| ggml_cuda_pool_alloc<int> temp_dst_alloc(pool, ncols * nrows); | ||
| int * tmp_dst = temp_dst_alloc.get(); | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.