diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index 5fbebc6d7519..26e6737be840 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -1898,6 +1898,38 @@ static bool vk_enable_sync_logger = false; static uint32_t vk_perf_logger_frequency = 1; static std::string vk_pipeline_stats_filter; +static uint64_t ggml_vk_get_node_flops(const ggml_tensor * node) { + if (node->op == GGML_OP_MUL_MAT || node->op == GGML_OP_MUL_MAT_ID) { + const uint64_t m = node->ne[0]; + const uint64_t n = node->ne[1]; + const uint64_t k = node->src[1]->ne[0]; + const uint64_t batch = node->ne[2] * node->ne[3]; + return m * n * (k + (k - 1)) * batch; + } + if (node->op == GGML_OP_CONV_2D || node->op == GGML_OP_CONV_TRANSPOSE_2D) { + const ggml_tensor * knl = node->src[0]; + const uint64_t Cout = node->ne[2]; + const uint64_t size_K = node->src[1]->ne[2] * knl->ne[0] * knl->ne[1]; + const uint64_t size_N = node->ne[3] * node->ne[0] * node->ne[1]; + return Cout * size_N * (size_K + (size_K - 1)); + } + if (node->op == GGML_OP_CONV_3D) { + const ggml_tensor * knl = node->src[0]; + const uint64_t OC = ggml_get_op_params_i32(node, 11); + const uint64_t IC = ggml_get_op_params_i32(node, 9); + const uint64_t size_K = IC * knl->ne[0] * knl->ne[1] * knl->ne[2]; + const uint64_t size_N = node->ne[3] / OC * node->ne[0] * node->ne[1] * node->ne[2]; + return OC * size_N * (size_K + (size_K - 1)); + } + if (node->op == GGML_OP_FLASH_ATTN_EXT) { + const ggml_tensor * q = node->src[0]; + const ggml_tensor * k = node->src[1]; + const ggml_tensor * v = node->src[2]; + return 2ull * q->ne[1] * q->ne[2] * (k->ne[0] + v->ne[0]) * k->ne[1] * q->ne[3]; + } + return 0; +} + class vk_perf_logger { public: void print_timings(bool force = false) { @@ -1946,7 +1978,7 @@ class vk_perf_logger { } std::string get_node_fusion_name(const ggml_tensor * node, const char *fusion_name, uint64_t *n_flops) { - *n_flops = 0; + *n_flops = ggml_vk_get_node_flops(node); std::string fusion_str; if (fusion_name) { fusion_str = fusion_name + std::string(" "); @@ -1973,35 +2005,22 @@ class vk_perf_logger { if (batch > 1) { name += " batch=" + std::to_string(batch); } - name = fusion_str + name; - *n_flops = m * n * (k + (k - 1)) * batch; - return name; + return fusion_str + name; } if (node->op == GGML_OP_CONV_2D || node->op == GGML_OP_CONV_TRANSPOSE_2D) { std::string name = ggml_op_name(node->op); - ggml_tensor * knl = node->src[0]; - uint64_t OW = node->ne[0]; - uint64_t OH = node->ne[1]; - uint64_t N = node->ne[3]; + const ggml_tensor * knl = node->src[0]; uint64_t Cout = node->ne[2]; - uint64_t KW = knl->ne[0]; - uint64_t KH = knl->ne[1]; - uint64_t Cin = node->src[1]->ne[2]; - // KxCRS @ CRSxNPQ = KxNPQ -> M=K, K=CRS, N=NPQ - uint64_t size_M = Cout; - uint64_t size_K = Cin * KW * KH; - uint64_t size_N = N * OW * OH; - *n_flops = size_M * size_N * (size_K + (size_K - 1)); - name += " M=Cout=" + std::to_string(size_M) + ", K=Cin*KW*KH=" + std::to_string(size_K) + + uint64_t size_K = node->src[1]->ne[2] * knl->ne[0] * knl->ne[1]; + uint64_t size_N = node->ne[3] * node->ne[0] * node->ne[1]; + name += " M=Cout=" + std::to_string(Cout) + ", K=Cin*KW*KH=" + std::to_string(size_K) + ", N=N*OW*OH=" + std::to_string(size_N); - name = fusion_str + name; - return name; + return fusion_str + name; } if (node->op == GGML_OP_RMS_NORM) { std::string name = ggml_op_name(node->op); name += "(" + std::to_string(node->ne[0]) + "," + std::to_string(node->ne[1]) + "," + std::to_string(node->ne[2]) + "," + std::to_string(node->ne[3]) + ")"; - name = fusion_str + name; - return name; + return fusion_str + name; } if (node->op == GGML_OP_FLASH_ATTN_EXT) { const ggml_tensor * dst = node; @@ -2017,7 +2036,6 @@ class vk_perf_logger { " k(" << k->ne[0] << "," << k->ne[1] << "," << k->ne[2] << "," << k->ne[3] << "), " << " v(" << v->ne[0] << "," << v->ne[1] << "," << v->ne[2] << "," << v->ne[3] << "), " << " m(" << (m?m->ne[0]:0) << "," << (m?m->ne[1]:0) << "," << (m?m->ne[2]:0) << "," << (m?m->ne[3]:0) << ")"; - *n_flops = 2ull * q->ne[1] * q->ne[2] * (k->ne[0] + v->ne[0]) * k->ne[1] * q->ne[3]; return name.str(); } if (node->op == GGML_OP_TOP_K) { @@ -2081,7 +2099,7 @@ struct ggml_backend_vk_context { bool do_add_rms_partials_offset_calculation; bool do_add_rms_partials; - uint64_t last_total_mul_mat_bytes {}; + uint64_t last_total_flops {UINT64_MAX}; // Cache most recent tensor that was converted into prealloc_y, and what pipeline it used to convert. vk_pipeline_struct * prealloc_y_last_pipeline_used {}; @@ -16180,22 +16198,23 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg } // Submit after enough work has accumulated, to overlap CPU cmdbuffer generation with GPU execution. - // Estimate the amount of matmul work by looking at the weight matrix size, and submit every 100MB - // (and scaled down based on model size, so smaller models submit earlier). - int submitted_nodes = 0; - int submit_count = 0; - uint64_t mul_mat_bytes = 0; - uint64_t total_mul_mat_bytes = 0; - uint64_t mul_mat_bytes_per_submit = std::min(uint64_t(100*1000*1000), ctx->last_total_mul_mat_bytes / 40u); + // Estimate the amount of compute work using flops, and submit every 200 GFLOP + // (and scaled down based on total graph flops, so smaller models submit earlier). + // Also submit at least every 100 nodes, in case there are workloads without heavy compute. + uint32_t submitted_nodes = 0; + uint32_t submit_count = 0; + uint64_t batch_flops = 0; + uint64_t total_flops = 0; + uint64_t flops_per_submit = std::min(uint64_t(200'000'000'000), ctx->last_total_flops / 40u); for (int i = 0; i < cgraph->n_nodes; i++) { if (first_node_in_batch) { submit_node_idx = i; } - if (cgraph->nodes[i]->op == GGML_OP_MUL_MAT || cgraph->nodes[i]->op == GGML_OP_MUL_MAT_ID) { - auto bytes = ggml_nbytes(cgraph->nodes[i]->src[0]); - mul_mat_bytes += bytes; - total_mul_mat_bytes += bytes; + { + auto node_flops = ggml_vk_get_node_flops(cgraph->nodes[i]); + batch_flops += node_flops; + total_flops += node_flops; } // op_srcs_fused_elementwise indicates whether an op's srcs all contribute to @@ -16407,8 +16426,8 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg // Signal the almost_ready fence when the graph is mostly complete (< 20% remaining) bool almost_ready = (cgraph->n_nodes - i) < cgraph->n_nodes / 5; - bool submit = ((uint32_t)submitted_nodes >= ctx->device->max_nodes_per_submit) || - (mul_mat_bytes_per_submit != 0 && mul_mat_bytes >= mul_mat_bytes_per_submit) || + bool submit = (submitted_nodes >= ctx->device->max_nodes_per_submit) || + (flops_per_submit != 0 && batch_flops >= flops_per_submit) || (i + ctx->num_additional_fused_ops >= last_node) || (almost_ready && !ctx->almost_ready_fence_pending); @@ -16442,9 +16461,9 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg if (submit && enqueued) { first_node_in_batch = true; submitted_nodes = 0; - mul_mat_bytes = 0; + batch_flops = 0; if (submit_count < 3) { - mul_mat_bytes_per_submit *= 2; + flops_per_submit *= 2; } submit_count++; } @@ -16453,7 +16472,7 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg ctx->fused_ops_write_mask = 0; } - ctx->last_total_mul_mat_bytes = total_mul_mat_bytes; + ctx->last_total_flops = total_flops; if (vk_perf_logger_enabled) { // End the command buffer and submit/wait