Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
97 changes: 58 additions & 39 deletions ggml/src/ggml-vulkan/ggml-vulkan.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1898,6 +1898,38 @@ static bool vk_enable_sync_logger = false;
static uint32_t vk_perf_logger_frequency = 1;
static std::string vk_pipeline_stats_filter;

// Estimates the number of floating-point operations performed by a graph node.
//
// Matmul-like ops are modelled as an MxK @ KxN product costing M * N * (2K - 1)
// flops: each output element needs K multiplies and K - 1 adds. Ops that are not
// modelled here report 0.
//
// All arithmetic is done in uint64_t and is not checked for overflow.
static uint64_t ggml_vk_get_node_flops(const ggml_tensor * node) {
    // Flops of a single dot product of length `len`. A zero-length reduction does
    // no work; without this guard `len + (len - 1)` would wrap to UINT64_MAX and
    // produce a huge bogus estimate.
    const auto dot_flops = [](uint64_t len) -> uint64_t {
        return len > 0 ? len + (len - 1) : 0;
    };

    if (node->op == GGML_OP_MUL_MAT || node->op == GGML_OP_MUL_MAT_ID) {
        const uint64_t m     = node->ne[0];
        const uint64_t n     = node->ne[1];
        const uint64_t k     = node->src[1]->ne[0];
        const uint64_t batch = node->ne[2] * node->ne[3];
        return m * n * dot_flops(k) * batch;
    }

    if (node->op == GGML_OP_CONV_2D || node->op == GGML_OP_CONV_TRANSPOSE_2D) {
        // Treated as a matmul with M = Cout, K = Cin*KW*KH, N = N*OW*OH.
        const ggml_tensor * knl = node->src[0];
        const uint64_t Cout   = node->ne[2];
        const uint64_t size_K = node->src[1]->ne[2] * knl->ne[0] * knl->ne[1];
        const uint64_t size_N = node->ne[3] * node->ne[0] * node->ne[1];
        return Cout * size_N * dot_flops(size_K);
    }

    if (node->op == GGML_OP_CONV_3D) {
        const ggml_tensor * knl = node->src[0];
        // Channel counts are read from the op params (index 11 -> OC, index 9 -> IC).
        const uint64_t OC = ggml_get_op_params_i32(node, 11);
        const uint64_t IC = ggml_get_op_params_i32(node, 9);
        if (OC == 0) {
            // No output channels means no work; also avoids dividing by zero below.
            return 0;
        }
        const uint64_t size_K = IC * knl->ne[0] * knl->ne[1] * knl->ne[2];
        // NOTE(review): ne[3] / OC is presumably the batch size (ne[3] packing OC * N) - confirm.
        const uint64_t size_N = node->ne[3] / OC * node->ne[0] * node->ne[1] * node->ne[2];
        return OC * size_N * dot_flops(size_K);
    }

    if (node->op == GGML_OP_FLASH_ATTN_EXT) {
        const ggml_tensor * q = node->src[0];
        const ggml_tensor * k = node->src[1];
        const ggml_tensor * v = node->src[2];
        // NOTE(review): presumably one multiply-add per (q row, k row) pair over the
        // k and v row lengths, for the Q*K^T and P*V products - confirm.
        return 2ull * q->ne[1] * q->ne[2] * (k->ne[0] + v->ne[0]) * k->ne[1] * q->ne[3];
    }

    return 0;
}

class vk_perf_logger {
public:
void print_timings(bool force = false) {
Expand Down Expand Up @@ -1946,7 +1978,7 @@ class vk_perf_logger {
}

std::string get_node_fusion_name(const ggml_tensor * node, const char *fusion_name, uint64_t *n_flops) {
*n_flops = 0;
*n_flops = ggml_vk_get_node_flops(node);
std::string fusion_str;
if (fusion_name) {
fusion_str = fusion_name + std::string(" ");
Expand All @@ -1973,35 +2005,22 @@ class vk_perf_logger {
if (batch > 1) {
name += " batch=" + std::to_string(batch);
}
name = fusion_str + name;
*n_flops = m * n * (k + (k - 1)) * batch;
return name;
return fusion_str + name;
}
if (node->op == GGML_OP_CONV_2D || node->op == GGML_OP_CONV_TRANSPOSE_2D) {
std::string name = ggml_op_name(node->op);
ggml_tensor * knl = node->src[0];
uint64_t OW = node->ne[0];
uint64_t OH = node->ne[1];
uint64_t N = node->ne[3];
const ggml_tensor * knl = node->src[0];
uint64_t Cout = node->ne[2];
uint64_t KW = knl->ne[0];
uint64_t KH = knl->ne[1];
uint64_t Cin = node->src[1]->ne[2];
// KxCRS @ CRSxNPQ = KxNPQ -> M=K, K=CRS, N=NPQ
uint64_t size_M = Cout;
uint64_t size_K = Cin * KW * KH;
uint64_t size_N = N * OW * OH;
*n_flops = size_M * size_N * (size_K + (size_K - 1));
name += " M=Cout=" + std::to_string(size_M) + ", K=Cin*KW*KH=" + std::to_string(size_K) +
uint64_t size_K = node->src[1]->ne[2] * knl->ne[0] * knl->ne[1];
uint64_t size_N = node->ne[3] * node->ne[0] * node->ne[1];
name += " M=Cout=" + std::to_string(Cout) + ", K=Cin*KW*KH=" + std::to_string(size_K) +
", N=N*OW*OH=" + std::to_string(size_N);
name = fusion_str + name;
return name;
return fusion_str + name;
}
if (node->op == GGML_OP_RMS_NORM) {
std::string name = ggml_op_name(node->op);
name += "(" + std::to_string(node->ne[0]) + "," + std::to_string(node->ne[1]) + "," + std::to_string(node->ne[2]) + "," + std::to_string(node->ne[3]) + ")";
name = fusion_str + name;
return name;
return fusion_str + name;
}
if (node->op == GGML_OP_FLASH_ATTN_EXT) {
const ggml_tensor * dst = node;
Expand All @@ -2017,7 +2036,6 @@ class vk_perf_logger {
" k(" << k->ne[0] << "," << k->ne[1] << "," << k->ne[2] << "," << k->ne[3] << "), " <<
" v(" << v->ne[0] << "," << v->ne[1] << "," << v->ne[2] << "," << v->ne[3] << "), " <<
" m(" << (m?m->ne[0]:0) << "," << (m?m->ne[1]:0) << "," << (m?m->ne[2]:0) << "," << (m?m->ne[3]:0) << ")";
*n_flops = 2ull * q->ne[1] * q->ne[2] * (k->ne[0] + v->ne[0]) * k->ne[1] * q->ne[3];
return name.str();
}
if (node->op == GGML_OP_TOP_K) {
Expand Down Expand Up @@ -2081,7 +2099,7 @@ struct ggml_backend_vk_context {
bool do_add_rms_partials_offset_calculation;
bool do_add_rms_partials;

uint64_t last_total_mul_mat_bytes {};
uint64_t last_total_flops {UINT64_MAX};

// Cache most recent tensor that was converted into prealloc_y, and what pipeline it used to convert.
vk_pipeline_struct * prealloc_y_last_pipeline_used {};
Expand Down Expand Up @@ -16180,22 +16198,23 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
}

// Submit after enough work has accumulated, to overlap CPU cmdbuffer generation with GPU execution.
// Estimate the amount of matmul work by looking at the weight matrix size, and submit every 100MB
// (and scaled down based on model size, so smaller models submit earlier).
int submitted_nodes = 0;
int submit_count = 0;
uint64_t mul_mat_bytes = 0;
uint64_t total_mul_mat_bytes = 0;
uint64_t mul_mat_bytes_per_submit = std::min(uint64_t(100*1000*1000), ctx->last_total_mul_mat_bytes / 40u);
// Estimate the amount of compute work using flops, and submit every 200 GFLOP
// (and scaled down based on total graph flops, so smaller models submit earlier).
// Also submit at least every 100 nodes, in case there are workloads without heavy compute.
uint32_t submitted_nodes = 0;
uint32_t submit_count = 0;
uint64_t batch_flops = 0;
uint64_t total_flops = 0;
uint64_t flops_per_submit = std::min(uint64_t(200'000'000'000), ctx->last_total_flops / 40u);
for (int i = 0; i < cgraph->n_nodes; i++) {
if (first_node_in_batch) {
submit_node_idx = i;
}

if (cgraph->nodes[i]->op == GGML_OP_MUL_MAT || cgraph->nodes[i]->op == GGML_OP_MUL_MAT_ID) {
auto bytes = ggml_nbytes(cgraph->nodes[i]->src[0]);
mul_mat_bytes += bytes;
total_mul_mat_bytes += bytes;
{
auto node_flops = ggml_vk_get_node_flops(cgraph->nodes[i]);
batch_flops += node_flops;
total_flops += node_flops;
}

// op_srcs_fused_elementwise indicates whether an op's srcs all contribute to
Expand Down Expand Up @@ -16407,8 +16426,8 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg

// Signal the almost_ready fence when the graph is mostly complete (< 20% remaining)
bool almost_ready = (cgraph->n_nodes - i) < cgraph->n_nodes / 5;
bool submit = ((uint32_t)submitted_nodes >= ctx->device->max_nodes_per_submit) ||
(mul_mat_bytes_per_submit != 0 && mul_mat_bytes >= mul_mat_bytes_per_submit) ||
bool submit = (submitted_nodes >= ctx->device->max_nodes_per_submit) ||
(flops_per_submit != 0 && batch_flops >= flops_per_submit) ||
(i + ctx->num_additional_fused_ops >= last_node) ||
(almost_ready && !ctx->almost_ready_fence_pending);

Expand Down Expand Up @@ -16442,9 +16461,9 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
if (submit && enqueued) {
first_node_in_batch = true;
submitted_nodes = 0;
mul_mat_bytes = 0;
batch_flops = 0;
if (submit_count < 3) {
mul_mat_bytes_per_submit *= 2;
flops_per_submit *= 2;
}
submit_count++;
}
Expand All @@ -16453,7 +16472,7 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
ctx->fused_ops_write_mask = 0;
}

ctx->last_total_mul_mat_bytes = total_mul_mat_bytes;
ctx->last_total_flops = total_flops;

if (vk_perf_logger_enabled) {
// End the command buffer and submit/wait
Expand Down
Loading