From c1a79a6946ec5080718bfc0bfba4bf8e7666a43e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Stanis=C5=82aw=20Szymczyk?= <sszymczy@gmail.com>
Date: Thu, 18 Jun 2026 19:38:14 +0200
Subject: [PATCH 1/4] ggml : process data in smaller chunks in CUDA
 ggml_top_k() implementation to reduce temporary buffers memory usage

---
 ggml/src/ggml-cuda/top-k.cu | 30 ++++++++++++++++++++++--------
 1 file changed, 22 insertions(+), 8 deletions(-)
diff --git a/ggml/src/ggml-cuda/top-k.cu b/ggml/src/ggml-cuda/top-k.cu
index db1d39e2dc71..83e896d5c441 100644
--- a/ggml/src/ggml-cuda/top-k.cu
+++ b/ggml/src/ggml-cuda/top-k.cu
@@ -76,16 +76,30 @@ void ggml_cuda_op_top_k(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
     const size_t shared_mem     = ncols_pad * sizeof(int);
     const size_t max_shared_mem = ggml_cuda_info().devices[ggml_cuda_get_device()].smpb;
 
-    ggml_cuda_pool_alloc<int> temp_dst_alloc(pool, ncols * nrows);
-    int *                     tmp_dst = temp_dst_alloc.get();
+    // process input in chunks to avoid excessive temporary buffers memory usage
+    const size_t nb01 = src0->nb[1];
+    const size_t chunk_size = 1LL << 26;
+    const int64_t nrows_per_chunk = chunk_size > nb01 ? chunk_size / nb01 : 1;
+    // make sure chunk_nrows can be safely cast to int below
+    GGML_ASSERT(nrows_per_chunk <= std::numeric_limits<int>::max());
+
+    for (int64_t i = 0; i < nrows; i+= nrows_per_chunk) {
+        int64_t chunk_nrows = std::min(nrows_per_chunk, nrows - i);
+
+        ggml_cuda_pool_alloc<int> temp_dst_alloc(pool, ncols * chunk_nrows);
+        int *                     tmp_dst = temp_dst_alloc.get();
 
-    if (shared_mem > max_shared_mem || ncols > 1024) {
-        argsort_f32_i32_cuda_cub(pool, src0_d, tmp_dst, ncols, nrows, GGML_SORT_ORDER_DESC, stream);
-    } else {
-        argsort_f32_i32_cuda_bitonic(src0_d, tmp_dst, ncols, nrows, GGML_SORT_ORDER_DESC, stream);
+        if (shared_mem > max_shared_mem || ncols > 1024) {
+            argsort_f32_i32_cuda_cub(pool, src0_d, tmp_dst, ncols, chunk_nrows, GGML_SORT_ORDER_DESC, stream);
+        } else {
+            argsort_f32_i32_cuda_bitonic(src0_d, tmp_dst, ncols, chunk_nrows, GGML_SORT_ORDER_DESC, stream);
+        }
+        CUDA_CHECK(cudaMemcpy2DAsync(dst_d, k * sizeof(int), tmp_dst, ncols * sizeof(int), k * sizeof(int), chunk_nrows,
+                                     cudaMemcpyDeviceToDevice, stream));
+
+        src0_d += ncols * chunk_nrows;
+        dst_d  += k     * chunk_nrows;
     }
-    CUDA_CHECK(cudaMemcpy2DAsync(dst_d, k * sizeof(int), tmp_dst, ncols * sizeof(int), k * sizeof(int), nrows,
-                                 cudaMemcpyDeviceToDevice, stream));
 #else                             // GGML_CUDA_USE_CUB
     ggml_cuda_pool_alloc<int> temp_dst_alloc(pool, ncols * nrows);
     int *                     tmp_dst = temp_dst_alloc.get();

From ca88122e13dc8f6a90f3ebcfccc3b340966bcf40 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Stanis=C5=82aw=20Szymczyk?= <sszymczy@gmail.com>
Date: Fri, 19 Jun 2026 10:27:27 +0200
Subject: [PATCH 2/4] ggml : allocate tmp_dst only only once before the loop

---
 ggml/src/ggml-cuda/top-k.cu | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/ggml/src/ggml-cuda/top-k.cu b/ggml/src/ggml-cuda/top-k.cu
index 83e896d5c441..7b6ad13220f0 100644
--- a/ggml/src/ggml-cuda/top-k.cu
+++ b/ggml/src/ggml-cuda/top-k.cu
@@ -83,12 +83,13 @@ void ggml_cuda_op_top_k(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
     // make sure chunk_nrows can be safely cast to int below
     GGML_ASSERT(nrows_per_chunk <= std::numeric_limits<int>::max());
 
+    int64_t tmp_dst_nrows = std::min(nrows_per_chunk, nrows);
+    ggml_cuda_pool_alloc<int> temp_dst_alloc(pool, ncols * tmp_dst_nrows);
+    int *                     tmp_dst = temp_dst_alloc.get();
+
     for (int64_t i = 0; i < nrows; i+= nrows_per_chunk) {
         int64_t chunk_nrows = std::min(nrows_per_chunk, nrows - i);
 
-        ggml_cuda_pool_alloc<int> temp_dst_alloc(pool, ncols * chunk_nrows);
-        int *                     tmp_dst = temp_dst_alloc.get();
-
         if (shared_mem > max_shared_mem || ncols > 1024) {
             argsort_f32_i32_cuda_cub(pool, src0_d, tmp_dst, ncols, chunk_nrows, GGML_SORT_ORDER_DESC, stream);
         } else {

From 4d0e8387e1170b575c454eadcb9e55ba332a41df Mon Sep 17 00:00:00 2001
From: fairydreaming <166155368+fairydreaming@users.noreply.github.com>
Date: Fri, 19 Jun 2026 10:58:10 +0200
Subject: [PATCH 3/4] chore : whitespaces

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
---
 ggml/src/ggml-cuda/top-k.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ggml/src/ggml-cuda/top-k.cu b/ggml/src/ggml-cuda/top-k.cu
index 7b6ad13220f0..d65c258ff383 100644
--- a/ggml/src/ggml-cuda/top-k.cu
+++ b/ggml/src/ggml-cuda/top-k.cu
@@ -87,7 +87,7 @@ void ggml_cuda_op_top_k(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
     ggml_cuda_pool_alloc<int> temp_dst_alloc(pool, ncols * tmp_dst_nrows);
     int *                     tmp_dst = temp_dst_alloc.get();
 
-    for (int64_t i = 0; i < nrows; i+= nrows_per_chunk) {
+    for (int64_t i = 0; i < nrows; i += nrows_per_chunk) {
         int64_t chunk_nrows = std::min(nrows_per_chunk, nrows - i);
 
         if (shared_mem > max_shared_mem || ncols > 1024) {

From 8521ec42510f40ab4bdab59bcc143444086d9d68 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Stanis=C5=82aw=20Szymczyk?= <sszymczy@gmail.com>
Date: Thu, 25 Jun 2026 21:12:51 +0200
Subject: [PATCH 4/4] ggml : use chunked processing in both CUDA CUB top-k and
 argsort implementations

---
 ggml/src/ggml-cuda/argsort.cu  | 35 +++++++++++++++++++++++++++++-----
 ggml/src/ggml-cuda/argsort.cuh |  1 +
 ggml/src/ggml-cuda/top-k.cu    | 28 +++++++++++----------------
 3 files changed, 42 insertions(+), 22 deletions(-)

diff --git a/ggml/src/ggml-cuda/argsort.cu b/ggml/src/ggml-cuda/argsort.cu
index c4f08091e79a..33a38c23e87e 100644
--- a/ggml/src/ggml-cuda/argsort.cu
+++ b/ggml/src/ggml-cuda/argsort.cu
@@ -28,6 +28,20 @@ static __global__ void init_offsets(int * offsets, const int ncols, const int nr
 #endif  // STRIDED_ITERATOR_AVAILABLE
 
 #ifdef GGML_CUDA_USE_CUB
+
+// returns the suggested maximum number of rows to process during one argsort_f32_i32_cuda_cub() call
+int argsort_f32_i32_cuda_cub_chunk_nrows(const size_t nb01, const int64_t nrows) {
+    // perform argsort in chunks up to approximately this size (currently 64MB)
+    // to avoid excessive temporary buffers memory usage
+    const int chunk_bytes = 1 << 26;
+
+    // calculate how many rows will fit in one chunk (must be at least one)
+    const int chunk_nrows = chunk_bytes > nb01 ? chunk_bytes / nb01 : 1;
+
+    // limit the resulting amount to total nrows
+    return nrows < chunk_nrows ? nrows : chunk_nrows;
+}
+
 void argsort_f32_i32_cuda_cub(ggml_cuda_pool & pool,
                               const float *    x,
                               int *            dst,
@@ -254,11 +268,22 @@ void ggml_cuda_op_argsort(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
     const size_t shared_mem     = ncols_pad * sizeof(int);
     const size_t max_shared_mem = ggml_cuda_info().devices[ggml_cuda_get_device()].smpb;
 
-    if (shared_mem > max_shared_mem || ncols > 1024) {
-        ggml_cuda_pool & pool = ctx.pool();
-        argsort_f32_i32_cuda_cub(pool, src0_d, (int *) dst_d, ncols, nrows, order, stream);
-    } else {
-        argsort_f32_i32_cuda_bitonic(src0_d, (int *) dst_d, ncols, nrows, order, stream);
+    // early return if we can use bitonic argsort
+    if (shared_mem <= max_shared_mem && ncols <= 1024) {
+        return argsort_f32_i32_cuda_bitonic(src0_d, (int *) dst_d, ncols, nrows, order, stream);
+    }
+
+    const int chunk_nrows = argsort_f32_i32_cuda_cub_chunk_nrows(src0->nb[1], nrows);
+
+    ggml_cuda_pool & pool = ctx.pool();
+
+    for (int64_t i = 0; i < nrows; i += chunk_nrows) {
+        int iter_nrows = chunk_nrows < nrows - i ? chunk_nrows : nrows - i;
+
+        argsort_f32_i32_cuda_cub(pool, src0_d, (int *) dst_d, ncols, iter_nrows, order, stream);
+
+        src0_d += ncols * iter_nrows;
+        dst_d  += ncols * iter_nrows;
     }
 #else
     argsort_f32_i32_cuda_bitonic(src0_d, (int *) dst_d, ncols, nrows, order, stream);
diff --git a/ggml/src/ggml-cuda/argsort.cuh b/ggml/src/ggml-cuda/argsort.cuh
index 22b7306f2020..3abb6448a057 100644
--- a/ggml/src/ggml-cuda/argsort.cuh
+++ b/ggml/src/ggml-cuda/argsort.cuh
@@ -3,6 +3,7 @@
 void ggml_cuda_op_argsort(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
 
 #ifdef GGML_CUDA_USE_CUB
+int argsort_f32_i32_cuda_cub_chunk_nrows(const size_t nb01, const int64_t nrows);
 void argsort_f32_i32_cuda_cub(ggml_cuda_pool & pool,
                               const float *    x,
                               int *            dst,
diff --git a/ggml/src/ggml-cuda/top-k.cu b/ggml/src/ggml-cuda/top-k.cu
index d65c258ff383..5e708e6c5ed4 100644
--- a/ggml/src/ggml-cuda/top-k.cu
+++ b/ggml/src/ggml-cuda/top-k.cu
@@ -75,31 +75,25 @@ void ggml_cuda_op_top_k(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
     const int    ncols_pad      = next_power_of_2(ncols);
     const size_t shared_mem     = ncols_pad * sizeof(int);
     const size_t max_shared_mem = ggml_cuda_info().devices[ggml_cuda_get_device()].smpb;
+    const bool   use_bitonic    = shared_mem <= max_shared_mem && ncols <= 1024;
+    const int    chunk_nrows    = argsort_f32_i32_cuda_cub_chunk_nrows(src0->nb[1], nrows);
 
-    // process input in chunks to avoid excessive temporary buffers memory usage
-    const size_t nb01 = src0->nb[1];
-    const size_t chunk_size = 1LL << 26;
-    const int64_t nrows_per_chunk = chunk_size > nb01 ? chunk_size / nb01 : 1;
-    // make sure chunk_nrows can be safely cast to int below
-    GGML_ASSERT(nrows_per_chunk <= std::numeric_limits<int>::max());
-
-    int64_t tmp_dst_nrows = std::min(nrows_per_chunk, nrows);
-    ggml_cuda_pool_alloc<int> temp_dst_alloc(pool, ncols * tmp_dst_nrows);
+    ggml_cuda_pool_alloc<int> temp_dst_alloc(pool, ncols * chunk_nrows);
     int *                     tmp_dst = temp_dst_alloc.get();
 
-    for (int64_t i = 0; i < nrows; i += nrows_per_chunk) {
-        int64_t chunk_nrows = std::min(nrows_per_chunk, nrows - i);
+    for (int64_t i = 0; i < nrows; i += chunk_nrows) {
+        int iter_nrows = chunk_nrows < nrows - i ? chunk_nrows : nrows - i;
 
-        if (shared_mem > max_shared_mem || ncols > 1024) {
-            argsort_f32_i32_cuda_cub(pool, src0_d, tmp_dst, ncols, chunk_nrows, GGML_SORT_ORDER_DESC, stream);
+        if (use_bitonic) {
+            argsort_f32_i32_cuda_bitonic(src0_d, tmp_dst, ncols, iter_nrows, GGML_SORT_ORDER_DESC, stream);
         } else {
-            argsort_f32_i32_cuda_bitonic(src0_d, tmp_dst, ncols, chunk_nrows, GGML_SORT_ORDER_DESC, stream);
+            argsort_f32_i32_cuda_cub(pool, src0_d, tmp_dst, ncols, iter_nrows, GGML_SORT_ORDER_DESC, stream);
         }
-        CUDA_CHECK(cudaMemcpy2DAsync(dst_d, k * sizeof(int), tmp_dst, ncols * sizeof(int), k * sizeof(int), chunk_nrows,
+        CUDA_CHECK(cudaMemcpy2DAsync(dst_d, k * sizeof(int), tmp_dst, ncols * sizeof(int), k * sizeof(int), iter_nrows,
                                      cudaMemcpyDeviceToDevice, stream));
 
-        src0_d += ncols * chunk_nrows;
-        dst_d  += k     * chunk_nrows;
+        src0_d += ncols * iter_nrows;
+        dst_d  += k     * iter_nrows;
     }
 #else                             // GGML_CUDA_USE_CUB
     ggml_cuda_pool_alloc<int> temp_dst_alloc(pool, ncols * nrows);