ggml-org · fairydreaming · Jun 18, 2026 · Jun 19, 2026 · Jun 19, 2026 · Jun 25, 2026
@@ -76,16 +76,30 @@ void ggml_cuda_op_top_k(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
     const size_t shared_mem     = ncols_pad * sizeof(int);
     const size_t max_shared_mem = ggml_cuda_info().devices[ggml_cuda_get_device()].smpb;
 
-    ggml_cuda_pool_alloc<int> temp_dst_alloc(pool, ncols * nrows);
-    int *                     tmp_dst = temp_dst_alloc.get();
+    // process the input in chunks of rows to limit the memory used by temporary buffers
+    const size_t nb01 = src0->nb[1];
+    const size_t chunk_size = 1LL << 26;
+    const int64_t nrows_per_chunk = chunk_size > nb01 ? chunk_size / nb01 : 1;
+    // chunk_nrows never exceeds nrows_per_chunk, so this ensures it can be safely cast to int below
+    GGML_ASSERT(nrows_per_chunk <= std::numeric_limits<int>::max());
+
+    for (int64_t i = 0; i < nrows; i+= nrows_per_chunk) {
+        int64_t chunk_nrows = std::min(nrows_per_chunk, nrows - i);
+
+        ggml_cuda_pool_alloc<int> temp_dst_alloc(pool, ncols * chunk_nrows);
+        int *                     tmp_dst = temp_dst_alloc.get();
 
-    if (shared_mem > max_shared_mem || ncols > 1024) {
-        argsort_f32_i32_cuda_cub(pool, src0_d, tmp_dst, ncols, nrows, GGML_SORT_ORDER_DESC, stream);
-    } else {
-        argsort_f32_i32_cuda_bitonic(src0_d, tmp_dst, ncols, nrows, GGML_SORT_ORDER_DESC, stream);
+        if (shared_mem > max_shared_mem || ncols > 1024) {
+            argsort_f32_i32_cuda_cub(pool, src0_d, tmp_dst, ncols, chunk_nrows, GGML_SORT_ORDER_DESC, stream);
+        } else {
+            argsort_f32_i32_cuda_bitonic(src0_d, tmp_dst, ncols, chunk_nrows, GGML_SORT_ORDER_DESC, stream);
+        }
+        CUDA_CHECK(cudaMemcpy2DAsync(dst_d, k * sizeof(int), tmp_dst, ncols * sizeof(int), k * sizeof(int), chunk_nrows,
+                                     cudaMemcpyDeviceToDevice, stream));
+
+        src0_d += ncols * chunk_nrows;
+        dst_d  += k     * chunk_nrows;
     }
-    CUDA_CHECK(cudaMemcpy2DAsync(dst_d, k * sizeof(int), tmp_dst, ncols * sizeof(int), k * sizeof(int), nrows,
-                                 cudaMemcpyDeviceToDevice, stream));
 #else                             // GGML_CUDA_USE_CUB
     ggml_cuda_pool_alloc<int> temp_dst_alloc(pool, ncols * nrows);
     int *                     tmp_dst = temp_dst_alloc.get();