From c1a79a6946ec5080718bfc0bfba4bf8e7666a43e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Stanis=C5=82aw=20Szymczyk?= Date: Thu, 18 Jun 2026 19:38:14 +0200 Subject: [PATCH 1/4] ggml : process data in smaller chunks in CUDA ggml_top_k() implementation to reduce temporary buffers memory usage --- ggml/src/ggml-cuda/top-k.cu | 30 ++++++++++++++++++++++-------- 1 file changed, 22 insertions(+), 8 deletions(-) diff --git a/ggml/src/ggml-cuda/top-k.cu b/ggml/src/ggml-cuda/top-k.cu index db1d39e2dc71..83e896d5c441 100644 --- a/ggml/src/ggml-cuda/top-k.cu +++ b/ggml/src/ggml-cuda/top-k.cu @@ -76,16 +76,30 @@ void ggml_cuda_op_top_k(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { const size_t shared_mem = ncols_pad * sizeof(int); const size_t max_shared_mem = ggml_cuda_info().devices[ggml_cuda_get_device()].smpb; - ggml_cuda_pool_alloc temp_dst_alloc(pool, ncols * nrows); - int * tmp_dst = temp_dst_alloc.get(); + // process input in chunks to avoid excessive temporary buffers memory usage + const size_t nb01 = src0->nb[1]; + const size_t chunk_size = 1LL << 26; + const int64_t nrows_per_chunk = chunk_size > nb01 ? chunk_size / nb01 : 1; + // make sure chunk_nrows can be safely cast to int below + GGML_ASSERT(nrows_per_chunk <= std::numeric_limits::max()); + + for (int64_t i = 0; i < nrows; i+= nrows_per_chunk) { + int64_t chunk_nrows = std::min(nrows_per_chunk, nrows - i); + + ggml_cuda_pool_alloc temp_dst_alloc(pool, ncols * chunk_nrows); + int * tmp_dst = temp_dst_alloc.get(); - if (shared_mem > max_shared_mem || ncols > 1024) { - argsort_f32_i32_cuda_cub(pool, src0_d, tmp_dst, ncols, nrows, GGML_SORT_ORDER_DESC, stream); - } else { - argsort_f32_i32_cuda_bitonic(src0_d, tmp_dst, ncols, nrows, GGML_SORT_ORDER_DESC, stream); + if (shared_mem > max_shared_mem || ncols > 1024) { + argsort_f32_i32_cuda_cub(pool, src0_d, tmp_dst, ncols, chunk_nrows, GGML_SORT_ORDER_DESC, stream); + } else { + argsort_f32_i32_cuda_bitonic(src0_d, tmp_dst, ncols, chunk_nrows, GGML_SORT_ORDER_DESC, stream); + } + CUDA_CHECK(cudaMemcpy2DAsync(dst_d, k * sizeof(int), tmp_dst, ncols * sizeof(int), k * sizeof(int), chunk_nrows, + cudaMemcpyDeviceToDevice, stream)); + + src0_d += ncols * chunk_nrows; + dst_d += k * chunk_nrows; } - CUDA_CHECK(cudaMemcpy2DAsync(dst_d, k * sizeof(int), tmp_dst, ncols * sizeof(int), k * sizeof(int), nrows, - cudaMemcpyDeviceToDevice, stream)); #else // GGML_CUDA_USE_CUB ggml_cuda_pool_alloc temp_dst_alloc(pool, ncols * nrows); int * tmp_dst = temp_dst_alloc.get(); From ca88122e13dc8f6a90f3ebcfccc3b340966bcf40 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Stanis=C5=82aw=20Szymczyk?= Date: Fri, 19 Jun 2026 10:27:27 +0200 Subject: [PATCH 2/4] ggml : allocate tmp_dst only only once before the loop --- ggml/src/ggml-cuda/top-k.cu | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/ggml/src/ggml-cuda/top-k.cu b/ggml/src/ggml-cuda/top-k.cu index 83e896d5c441..7b6ad13220f0 100644 --- a/ggml/src/ggml-cuda/top-k.cu +++ b/ggml/src/ggml-cuda/top-k.cu @@ -83,12 +83,13 @@ void ggml_cuda_op_top_k(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { // make sure chunk_nrows can be safely cast to int below GGML_ASSERT(nrows_per_chunk <= std::numeric_limits::max()); + int64_t tmp_dst_nrows = std::min(nrows_per_chunk, nrows); + ggml_cuda_pool_alloc temp_dst_alloc(pool, ncols * tmp_dst_nrows); + int * tmp_dst = temp_dst_alloc.get(); + for (int64_t i = 0; i < nrows; i+= nrows_per_chunk) { int64_t chunk_nrows = std::min(nrows_per_chunk, nrows - i); - ggml_cuda_pool_alloc temp_dst_alloc(pool, ncols * chunk_nrows); - int * tmp_dst = temp_dst_alloc.get(); - if (shared_mem > max_shared_mem || ncols > 1024) { argsort_f32_i32_cuda_cub(pool, src0_d, tmp_dst, ncols, chunk_nrows, GGML_SORT_ORDER_DESC, stream); } else { From 4d0e8387e1170b575c454eadcb9e55ba332a41df Mon Sep 17 00:00:00 2001 From: fairydreaming <166155368+fairydreaming@users.noreply.github.com> Date: Fri, 19 Jun 2026 10:58:10 +0200 Subject: [PATCH 3/4] chore : whitespaces Co-authored-by: Georgi Gerganov --- ggml/src/ggml-cuda/top-k.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml/src/ggml-cuda/top-k.cu b/ggml/src/ggml-cuda/top-k.cu index 7b6ad13220f0..d65c258ff383 100644 --- a/ggml/src/ggml-cuda/top-k.cu +++ b/ggml/src/ggml-cuda/top-k.cu @@ -87,7 +87,7 @@ void ggml_cuda_op_top_k(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { ggml_cuda_pool_alloc temp_dst_alloc(pool, ncols * tmp_dst_nrows); int * tmp_dst = temp_dst_alloc.get(); - for (int64_t i = 0; i < nrows; i+= nrows_per_chunk) { + for (int64_t i = 0; i < nrows; i += nrows_per_chunk) { int64_t chunk_nrows = std::min(nrows_per_chunk, nrows - i); if (shared_mem > max_shared_mem || ncols > 1024) { From 8521ec42510f40ab4bdab59bcc143444086d9d68 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Stanis=C5=82aw=20Szymczyk?= Date: Thu, 25 Jun 2026 21:12:51 +0200 Subject: [PATCH 4/4] ggml : use chunked processing in both CUDA CUB top-k and argsort implementations --- ggml/src/ggml-cuda/argsort.cu | 35 +++++++++++++++++++++++++++++----- ggml/src/ggml-cuda/argsort.cuh | 1 + ggml/src/ggml-cuda/top-k.cu | 28 +++++++++++---------------- 3 files changed, 42 insertions(+), 22 deletions(-) diff --git a/ggml/src/ggml-cuda/argsort.cu b/ggml/src/ggml-cuda/argsort.cu index c4f08091e79a..33a38c23e87e 100644 --- a/ggml/src/ggml-cuda/argsort.cu +++ b/ggml/src/ggml-cuda/argsort.cu @@ -28,6 +28,20 @@ static __global__ void init_offsets(int * offsets, const int ncols, const int nr #endif // STRIDED_ITERATOR_AVAILABLE #ifdef GGML_CUDA_USE_CUB + +// returns the suggested maximum number of rows to process during one argsort_f32_i32_cuda_cub() call +int argsort_f32_i32_cuda_cub_chunk_nrows(const size_t nb01, const int64_t nrows) { + // perform argsort in chunks up to approximately this size (currently 64MB) + // to avoid excessive temporary buffers memory usage + const int chunk_bytes = 1 << 26; + + // calculate how many rows will fit in one chunk (must be at least one) + const int chunk_nrows = chunk_bytes > nb01 ? chunk_bytes / nb01 : 1; + + // limit the resulting amount to total nrows + return nrows < chunk_nrows ? nrows : chunk_nrows; +} + void argsort_f32_i32_cuda_cub(ggml_cuda_pool & pool, const float * x, int * dst, @@ -254,11 +268,22 @@ void ggml_cuda_op_argsort(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { const size_t shared_mem = ncols_pad * sizeof(int); const size_t max_shared_mem = ggml_cuda_info().devices[ggml_cuda_get_device()].smpb; - if (shared_mem > max_shared_mem || ncols > 1024) { - ggml_cuda_pool & pool = ctx.pool(); - argsort_f32_i32_cuda_cub(pool, src0_d, (int *) dst_d, ncols, nrows, order, stream); - } else { - argsort_f32_i32_cuda_bitonic(src0_d, (int *) dst_d, ncols, nrows, order, stream); + // early return if we can use bitonic argsort + if (shared_mem <= max_shared_mem && ncols <= 1024) { + return argsort_f32_i32_cuda_bitonic(src0_d, (int *) dst_d, ncols, nrows, order, stream); + } + + const int chunk_nrows = argsort_f32_i32_cuda_cub_chunk_nrows(src0->nb[1], nrows); + + ggml_cuda_pool & pool = ctx.pool(); + + for (int64_t i = 0; i < nrows; i += chunk_nrows) { + int iter_nrows = chunk_nrows < nrows - i ? chunk_nrows : nrows - i; + + argsort_f32_i32_cuda_cub(pool, src0_d, (int *) dst_d, ncols, iter_nrows, order, stream); + + src0_d += ncols * iter_nrows; + dst_d += ncols * iter_nrows; } #else argsort_f32_i32_cuda_bitonic(src0_d, (int *) dst_d, ncols, nrows, order, stream); diff --git a/ggml/src/ggml-cuda/argsort.cuh b/ggml/src/ggml-cuda/argsort.cuh index 22b7306f2020..3abb6448a057 100644 --- a/ggml/src/ggml-cuda/argsort.cuh +++ b/ggml/src/ggml-cuda/argsort.cuh @@ -3,6 +3,7 @@ void ggml_cuda_op_argsort(ggml_backend_cuda_context & ctx, ggml_tensor * dst); #ifdef GGML_CUDA_USE_CUB +int argsort_f32_i32_cuda_cub_chunk_nrows(const size_t nb01, const int64_t nrows); void argsort_f32_i32_cuda_cub(ggml_cuda_pool & pool, const float * x, int * dst, diff --git a/ggml/src/ggml-cuda/top-k.cu b/ggml/src/ggml-cuda/top-k.cu index d65c258ff383..5e708e6c5ed4 100644 --- a/ggml/src/ggml-cuda/top-k.cu +++ b/ggml/src/ggml-cuda/top-k.cu @@ -75,31 +75,25 @@ void ggml_cuda_op_top_k(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { const int ncols_pad = next_power_of_2(ncols); const size_t shared_mem = ncols_pad * sizeof(int); const size_t max_shared_mem = ggml_cuda_info().devices[ggml_cuda_get_device()].smpb; + const bool use_bitonic = shared_mem <= max_shared_mem && ncols <= 1024; + const int chunk_nrows = argsort_f32_i32_cuda_cub_chunk_nrows(src0->nb[1], nrows); - // process input in chunks to avoid excessive temporary buffers memory usage - const size_t nb01 = src0->nb[1]; - const size_t chunk_size = 1LL << 26; - const int64_t nrows_per_chunk = chunk_size > nb01 ? chunk_size / nb01 : 1; - // make sure chunk_nrows can be safely cast to int below - GGML_ASSERT(nrows_per_chunk <= std::numeric_limits::max()); - - int64_t tmp_dst_nrows = std::min(nrows_per_chunk, nrows); - ggml_cuda_pool_alloc temp_dst_alloc(pool, ncols * tmp_dst_nrows); + ggml_cuda_pool_alloc temp_dst_alloc(pool, ncols * chunk_nrows); int * tmp_dst = temp_dst_alloc.get(); - for (int64_t i = 0; i < nrows; i += nrows_per_chunk) { - int64_t chunk_nrows = std::min(nrows_per_chunk, nrows - i); + for (int64_t i = 0; i < nrows; i += chunk_nrows) { + int iter_nrows = chunk_nrows < nrows - i ? chunk_nrows : nrows - i; - if (shared_mem > max_shared_mem || ncols > 1024) { - argsort_f32_i32_cuda_cub(pool, src0_d, tmp_dst, ncols, chunk_nrows, GGML_SORT_ORDER_DESC, stream); + if (use_bitonic) { + argsort_f32_i32_cuda_bitonic(src0_d, tmp_dst, ncols, iter_nrows, GGML_SORT_ORDER_DESC, stream); } else { - argsort_f32_i32_cuda_bitonic(src0_d, tmp_dst, ncols, chunk_nrows, GGML_SORT_ORDER_DESC, stream); + argsort_f32_i32_cuda_cub(pool, src0_d, tmp_dst, ncols, iter_nrows, GGML_SORT_ORDER_DESC, stream); } - CUDA_CHECK(cudaMemcpy2DAsync(dst_d, k * sizeof(int), tmp_dst, ncols * sizeof(int), k * sizeof(int), chunk_nrows, + CUDA_CHECK(cudaMemcpy2DAsync(dst_d, k * sizeof(int), tmp_dst, ncols * sizeof(int), k * sizeof(int), iter_nrows, cudaMemcpyDeviceToDevice, stream)); - src0_d += ncols * chunk_nrows; - dst_d += k * chunk_nrows; + src0_d += ncols * iter_nrows; + dst_d += k * iter_nrows; } #else // GGML_CUDA_USE_CUB ggml_cuda_pool_alloc temp_dst_alloc(pool, ncols * nrows);