Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
128 changes: 116 additions & 12 deletions ggml/src/ggml-openvino/ggml-decoder.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -98,16 +98,31 @@ GgmlOvDecoder::GgmlOvDecoder(ggml_cgraph * cgraph, std::map<std::string, std::sh
}
}

namespace {
// Tells whether `node` is one of the ops handled as an in-place update here:
// SET_ROWS, CPY, or a SCALE whose tensor has a non-null view_src.
bool is_inplace_op(const ggml_tensor * node) {
    switch (node->op) {
        case GGML_OP_SET_ROWS:
        case GGML_OP_CPY:
            return true;
        case GGML_OP_SCALE:
            // SCALE only counts when it operates on a view of another tensor.
            return node->view_src != nullptr;
        default:
            return false;
    }
}

// Returns true when `a` and `b` have identical extents in all GGML_MAX_DIMS
// dimensions (only `ne` is compared; type and strides are not inspected).
bool is_same_shape(const ggml_tensor * a, const ggml_tensor * b) {
    int dim = 0;
    // Advance while the extents agree; stopping early means a mismatch was found.
    while (dim < GGML_MAX_DIMS && a->ne[dim] == b->ne[dim]) {
        ++dim;
    }
    return dim == GGML_MAX_DIMS;
}
}  // namespace

void GgmlOvDecoder::set_input_output() {
for (int node_n = 0; node_n < m_cgraph->n_nodes; node_n++) {
auto node = m_cgraph->nodes[node_n];
auto * node = m_cgraph->nodes[node_n];

NodeInfo current_node_info;
auto node_name = std::string(node->name);
auto node_output_name = node_name;
auto * node_output = node;
if (node->op == GGML_OP_SET_ROWS) {
// SET_ROWS updates the tensor in place. For later ov op that uses the
if (::is_inplace_op(node)) {
// In-place ops update the tensor in place. For later ov ops that use the
// view_src of the in-place op (e.g. SET_ROWS), we need to make sure they get the updated tensor
// by putting the view_src name in the tensor_map in
// <openvino>/src/frontends/ggml/src/translate_session.cpp
Expand Down Expand Up @@ -167,6 +182,10 @@ int GgmlOvDecoder::compute_op_case(const ggml_tensor * node) const {
switch (node->op) {
case GGML_OP_RESHAPE: {
auto * src = node->src[0];
if (is_same_shape(src, node)) {
op_case = 7;
break;
}
if (src->op == GGML_OP_RESHAPE && src->src[0]->ne[0] == node->ne[0] && src->src[0]->ne[1] == node->ne[1]) {
op_case = 4;
} else if (node->ne[0] * node->ne[1] == src->ne[0]) {
Expand Down Expand Up @@ -295,6 +314,33 @@ int GgmlOvDecoder::compute_op_case(const ggml_tensor * node) const {
}
break;
}
case GGML_OP_RMS_NORM: {
if (node->src[0]->op == GGML_OP_VIEW) {
if (is_same_shape(node->src[0]->src[0], node->src[0])) {
op_case = 1;
} else if (node->src[0]->src[0]->op == GGML_OP_GATED_DELTA_NET) {
op_case = 2;
}
}
break;
}
case GGML_OP_CPY: {
if (node->src[0]->op == GGML_OP_VIEW) {
if (node->src[0]->src[0]->op == GGML_OP_GATED_DELTA_NET) {
op_case = 1;
} else if (std::string(node->src[0]->name).find("conv_state_last") == 0) {
op_case = 2;
break;
}
}
break;
}
case GGML_OP_SCALE: {
if (is_kvcache(node->view_src, nullptr)) {
op_case = 1;
}
break;
}
default:
break;
}
Expand Down Expand Up @@ -476,6 +522,13 @@ std::pair<ModelParams, ComputeParams> GgmlOvDecoder::compute_llm_params(ggml_cgr
model_params.mixed_rope_params = true;
}
}
if (node->op == GGML_OP_GATED_DELTA_NET) {
model_params.state_size = node->src[0]->ne[0];
}
if (node->op == GGML_OP_SCALE && is_kvcache(node->view_src, nullptr)) {
compute_params.cache_rs_reset_len = ggml_nelements(node) / node->view_src->ne[0];
compute_params.cache_rs_reset_idx = node->src[0]->view_offs / node->view_src->ne[0];
}
}
auto * output_tensor = cgraph->nodes[cgraph->n_nodes - 1];
compute_params.output_len = output_tensor->ne[1];
Expand Down Expand Up @@ -595,6 +648,11 @@ void GgmlOvDecoder::add_extra_inputs() {
create_1d_input("token_len_per_seq", m_compute_params.token_len_per_seq);
}
// create_1d_input("token_len", m_compute_params.token_len_per_seq * m_compute_params.n_seq_active);

if (m_compute_params.cache_rs_reset_idx != -1) {
create_1d_input("cache_rs_reset_idx", m_compute_params.cache_rs_reset_idx);
create_1d_input("cache_rs_reset_len", m_compute_params.cache_rs_reset_len);
}
}

bool GgmlOvDecoder::node_is_used_as_src(const int node_idx) {
Expand Down Expand Up @@ -691,8 +749,8 @@ void GgmlOvDecoder::compute_model_outputs() {
}
auto cur_node_use_count = m_cgraph->use_counts[ggml_hash_find(&m_cgraph->visited_hash_set, cur_node)];
if (cur_node_use_count == 0) {
// The output of SET_ROWS is the view_src tensor, which is updated in place. We should use the view_src name as the output name to make sure it can be correctly matched with the later ops that use the view_src.
if (cur_node != nullptr && cur_node->op == GGML_OP_SET_ROWS) {
// The output of in-place ops is the view_src tensor, which is updated in place. We should use the view_src name as the output name to make sure it can be correctly matched with the later ops that use the view_src.
if (cur_node != nullptr && ::is_inplace_op(cur_node)) {
cur_node = cur_node->view_src;
}
} else {
Expand All @@ -712,7 +770,7 @@ void GgmlOvDecoder::compute_model_outputs() {
if (cur_node != nullptr) {
std::string node_output_name(cur_node->name);
m_model_outputs[node_output_name] = cur_node;
m_model_output_names.push_back(node_output_name);
m_model_output_names.insert(node_output_name);
}
}
}
Expand Down Expand Up @@ -1231,6 +1289,26 @@ std::vector<std::string> GgmlOvDecoder::get_output_names(int node_idx) const {
return {m_node_info_list[node_idx].node_output_name};
}

// Applies the file-local ::is_inplace_op predicate to the ggml tensor recorded
// at m_node_info_list[node_idx]. No bounds check is performed on node_idx.
bool GgmlOvDecoder::is_inplace_op(int node_idx) const {
    const auto & info = m_node_info_list[node_idx];
    return ::is_inplace_op(info.node);
}

// Returns the name of the view_src tensor of the node at node_idx, or an empty
// string when that node has no view_src.
std::string GgmlOvDecoder::get_view_src_name(int node_idx) const {
    const auto * view_src = m_node_info_list[node_idx].node->view_src;
    return view_src != nullptr ? std::string(view_src->name) : std::string();
}

// Returns true when the node at node_idx is a RESHAPE or VIEW whose view_src is
// named exactly `view_src_name`; false for any other op, for a different
// view_src name, or when the node has no view_src at all.
bool GgmlOvDecoder::is_view_like_alias_of(int node_idx, const std::string & view_src_name) const {
    const auto * tensor    = m_node_info_list[node_idx].node;
    const auto * alias_src = tensor->view_src;

    if (alias_src == nullptr) {
        return false;
    }
    if (view_src_name != alias_src->name) {
        return false;
    }

    switch (tensor->op) {
        case GGML_OP_RESHAPE:
        case GGML_OP_VIEW:
            return true;
        default:
            return false;
    }
}

const std::string & GgmlOvDecoder::get_op_name() const {
static const std::string unknown_name = "UNKNOWN_OP_NAME";
return unknown_name;
Expand Down Expand Up @@ -1404,14 +1482,18 @@ void GgmlOvDecoder::compute_node_dynamic_dims() {
}
if (m_node_dynamic_dims[node] != -1 && dynamic_dim_value != node->ne[m_node_dynamic_dims[node]]) {
m_node_dynamic_dims[node] = -1;
// std::cout << "Warning: Dynamic dim value mismatch for node: " << node->name
// << " and its src[0]: " << node->src[0]->name << std::endl;
GGML_LOG_WARN("ggml-openvino: dynamic dim value mismatch for VIEW node '%s', src[0]: '%s'\n",
node->name, node->src[0]->name);
}
}
break;
}
case GGML_OP_TRANSPOSE:
case GGML_OP_RESHAPE: {
if (is_same_shape(node->src[0], node)) {
m_node_dynamic_dims[node] = m_node_dynamic_dims[node->src[0]];
break;
}
// RESHAPE requires src[0] to be contiguous, so both src and result
// have standard compact strides: nb[i] = type_size * prod(ne[0..i-1]).
// Match src->nb[dynamic_dim] against result->nb[i] to find the output
Expand All @@ -1429,7 +1511,7 @@ void GgmlOvDecoder::compute_node_dynamic_dims() {
}
}
if (m_node_dynamic_dims[node] == -1) {
// std::cout << "Cannot determine dynamic dim for RESHAPE node: " << node->name << std::endl;
GGML_LOG_WARN("ggml-openvino: cannot determine dynamic dim for RESHAPE node '%s'\n", node->name);
}
}
break;
Expand Down Expand Up @@ -1480,25 +1562,46 @@ void GgmlOvDecoder::compute_node_dynamic_dims() {
}
if (matched_dim_count != 1) {
m_node_dynamic_dims[node] = -1;
// std::cout << "Warning: Cannot determine dynamic dim for CONT node: " << node->name
// << " and its src[0]: " << node->src[0]->name << std::endl;
GGML_LOG_WARN("ggml-openvino: cannot determine dynamic dim for CONT node '%s', src[0]: '%s'\n",
node->name, node->src[0]->name);
}
}
}
break;
case GGML_OP_CONCAT:
for (int i = 0; i < GGML_MAX_DIMS; i++) {
if (node->src[0]->ne[i] != node->ne[i]) {
m_node_dynamic_dims[node] = i;
break;
}
}
break;
case GGML_OP_SSM_CONV:
case GGML_OP_GATED_DELTA_NET:
m_node_dynamic_dims[node] = 1;
break;
case GGML_OP_RMS_NORM:
case GGML_OP_L2_NORM:
case GGML_OP_NORM:
case GGML_OP_ADD:
case GGML_OP_SUB:
case GGML_OP_GLU:
case GGML_OP_ROPE:
case GGML_OP_SCALE:
case GGML_OP_SOFT_MAX:
case GGML_OP_ARGSORT:
case GGML_OP_ADD_ID:
case GGML_OP_UNARY:
case GGML_OP_CUMSUM:
case GGML_OP_FILL:
case GGML_OP_SET:
case GGML_OP_DIAG:
case GGML_OP_TRI:
case GGML_OP_REPEAT:
m_node_dynamic_dims[node] = m_node_dynamic_dims[node->src[0]];
break;
case GGML_OP_MUL_MAT_ID:
case GGML_OP_SOLVE_TRI:
m_node_dynamic_dims[node] = m_node_dynamic_dims[node->src[1]];
break;
case GGML_OP_CPY:
Expand Down Expand Up @@ -1534,7 +1637,8 @@ void GgmlOvDecoder::compute_node_dynamic_dims() {
break;
}
default:
// std::cout << "Doesn't handle node name: " << node->name << " op: " << ggml_op_name(node->op) << std::endl;
GGML_LOG_DEBUG("ggml-openvino: compute_node_dynamic_dims: unhandled op %s for node '%s'\n",
ggml_op_name(node->op), node->name);
break;
}
};
Expand Down
24 changes: 22 additions & 2 deletions ggml/src/ggml-openvino/ggml-decoder.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ struct ModelParams {
int n_seq = 1;
int n_heads_kv = -1;
int head_size = -1;
int state_size = -1; // for SSM models, e.g. qwen35
int32_t rope_params[15];
bool mixed_rope_params = false;
std::vector<int> swa_layers;
Expand Down Expand Up @@ -48,6 +49,16 @@ struct ComputeParams {
int token_len_per_seq = -1;
int past_kv_len = -1;
int output_len = 1;

int cache_rs_reset_idx = -1;
int cache_rs_reset_len = -1;
// SSM/DeltaNet models optionally clear cache_r and cache_s of certain slots in the cgraph
// 3: [ 18432, 4, 1, 1] RESHAPE cache_r_l0 (reshaped)
// [ 18432, 4, 1, 1] 0: NONE cache_r_l0
// 4: [ 18432, 1, 1, 1] VIEW cache_r_l0 (reshaped) (view)
// [ 18432, 4, 1, 1] 0: RESHAPE cache_r_l0 (reshaped)
// 5: [ 18432, 1, 1, 1] SCALE cache_r_l0 (reshaped) (view) (view)
// [ 18432, 1, 1, 1] 0: VIEW cache_r_l0 (reshaped) (view)
};

class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder {
Expand Down Expand Up @@ -156,6 +167,12 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder {

virtual std::vector<std::string> get_output_names(int node_idx) const override;

virtual bool is_inplace_op(int node_idx) const override;

virtual std::string get_view_src_name(int node_idx) const override;

virtual bool is_view_like_alias_of(int node_idx, const std::string & view_src_name) const override;

virtual const std::string & get_op_type() const override;

virtual const std::string & get_op_type(int node_idx) const override;
Expand Down Expand Up @@ -189,7 +206,7 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder {
return m_model_weights;
}

virtual std::vector<std::string> get_model_output_names() const override { return m_model_output_names; }
virtual std::set<std::string> get_model_output_names() const override { return m_model_output_names; }

// Read-only access to the map from output name to ggml tensor (m_model_outputs).
const std::map<std::string, ggml_tensor *> & get_model_outputs() const {
    return m_model_outputs;
}

Expand All @@ -214,6 +231,8 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder {

// Exposes the mixed_rope_params flag stored in m_model_params.
virtual bool has_mixed_rope_params() const override {
    return m_model_params.mixed_rope_params;
}

// Exposes the state_size value stored in m_model_params.
virtual int get_ssm_state_size() const override {
    return m_model_params.state_size;
}

virtual std::map<std::string, std::string> get_kv_param_res_names() const override;

// Exposes the m_is_static flag of this decoder.
virtual bool is_static() const override {
    return m_is_static;
}
Expand Down Expand Up @@ -287,6 +306,7 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder {
return op->op == GGML_OP_ROPE && tensor == op->src[2];
}

// also returns true for cache_s and cache_r in SSM/DeltaNet models
inline static bool is_kvcache(const ggml_tensor * tensor, const ggml_tensor * op) {
return tensor->buffer->usage == GGML_BACKEND_BUFFER_USAGE_ANY ||
(op != nullptr && op->op == GGML_OP_SET_ROWS && op->src[2] == tensor);
Expand Down Expand Up @@ -334,7 +354,7 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder {
std::map<std::string, std::shared_ptr<ov::Tensor>> m_model_extra_input_values;
std::map<std::string, std::shared_ptr<ov::Node>> m_model_weights;
std::map<std::string, ggml_tensor *> m_model_outputs;
std::vector<std::string> m_model_output_names;
std::set<std::string> m_model_output_names;
std::vector<NodeInfo> m_node_info_list;
std::map<ggml_tensor *, int> m_node_dynamic_dims;

Expand Down
Loading
Loading