diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index 0cd69eeaaa4e..bd51e60d6735 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -98,9 +98,24 @@ GgmlOvDecoder::GgmlOvDecoder(ggml_cgraph * cgraph, std::mapop == GGML_OP_SET_ROWS || node->op == GGML_OP_CPY || (node->op == GGML_OP_SCALE && node->view_src); +} + +bool is_same_shape(const ggml_tensor * a, const ggml_tensor * b) { + for (int i = 0; i < GGML_MAX_DIMS; i++) { + if (a->ne[i] != b->ne[i]) { + return false; + } + } + return true; +} +} // namespace + void GgmlOvDecoder::set_input_output() { for (int node_n = 0; node_n < m_cgraph->n_nodes; node_n++) { - auto node = m_cgraph->nodes[node_n]; + auto * node = m_cgraph->nodes[node_n]; NodeInfo current_node_info; auto node_name = std::string(node->name); @@ -158,6 +173,7 @@ int GgmlOvDecoder::compute_op_case(const ggml_tensor * node) const { int op_case = 0; switch (node->op) { case GGML_OP_RESHAPE: { + auto name = std::string(node->name); auto * src = node->src[0]; if (src->op == GGML_OP_RESHAPE && src->src[0]->ne[0] == node->ne[0] && src->src[0]->ne[1] == node->ne[1]) { op_case = 4; @@ -170,11 +186,12 @@ int GgmlOvDecoder::compute_op_case(const ggml_tensor * node) const { } } else if (src->ne[0] * src->ne[1] * src->ne[2] == node->ne[1]) { op_case = 3; - } else if (src->ne[1] * src->ne[2] == node->ne[1]) { - op_case = 6; - } - if (op_case == 0 && ggml_nelements(node) == ggml_nelements(src)) { + } else if (name.find("linear_attn_qkv_mixed") == 0 || name.find("alpha") == 0) { op_case = 6; + } else if (name.find("linear_attn_out") == 0) { + op_case = 7; + } else if (name.find("state_predelta") == 0) { + op_case = 8; } break; } @@ -224,7 +241,14 @@ int GgmlOvDecoder::compute_op_case(const ggml_tensor * node) const { } case GGML_OP_GET_ROWS: { if (node->src[1]->op == GGML_OP_VIEW) { - op_case = 2; + // GET_ROWS gathering recurrent state cache rows via the inp->s_copy 
index list: + // src[0] is a reshape of cache_r/cache_s, src[1] is a view of the s_copy leaf. + // op_case 1: main view (active sequences, view offset 0) + // op_case 2: extra view (defrag remainder, nonzero view offset) + if (node->src[0]->op == GGML_OP_RESHAPE && node->src[0]->src[0] != nullptr && + is_kvcache(node->src[0]->src[0], nullptr)) { + op_case = node->src[1]->view_offs == 0 ? 1 : 2; + } } break; } @@ -287,6 +311,44 @@ int GgmlOvDecoder::compute_op_case(const ggml_tensor * node) const { } break; } + case GGML_OP_RMS_NORM: { + if (node->src[0]->op == GGML_OP_VIEW) { + if (is_same_shape(node->src[0]->src[0], node->src[0])) { + op_case = 1; + } else if (node->src[0]->src[0]->op == GGML_OP_GATED_DELTA_NET) { + op_case = 2; + } + } + break; + } + case GGML_OP_CPY: { + if (node->src[0]->op == GGML_OP_VIEW) { + if (node->src[0]->src[0]->op == GGML_OP_GATED_DELTA_NET) { + op_case = 1; + } else if (std::string(node->src[0]->name).find("conv_state_last") == 0) { + op_case = 2; + break; + } + } else if (node->src[0]->op == GGML_OP_GET_ROWS && node->src[1] != nullptr && + node->src[1]->op == GGML_OP_VIEW && node->src[1]->view_src != nullptr && + is_kvcache(node->src[1]->view_src, nullptr)) { + // s_copy defrag remainder writeback: gathered extra state rows copied back into the cache + op_case = 3; + } + break; + } + case GGML_OP_SCALE: { + if (node->view_src && node->buffer->usage == GGML_BACKEND_BUFFER_USAGE_ANY) { + op_case = 1; + } + break; + } + case GGML_OP_L2_NORM: { + if (std::string(node->name).find("predelta") != std::string::npos) { + op_case = 1; + } + break; + } default: break; } @@ -468,6 +530,30 @@ std::pair GgmlOvDecoder::compute_llm_params(ggml_cgr model_params.mixed_rope_params = true; } } + if (node->op == GGML_OP_GATED_DELTA_NET) { + model_params.state_size = node->src[0]->ne[0]; + } + if (node->op == GGML_OP_SCALE && node->view_src != nullptr && is_kvcache(node->view_src, nullptr)) { // SCALE may have no view_src; is_kvcache dereferences its argument + compute_params.cache_rs_reset_len = ggml_nelements(node) / node->view_src->ne[0]; + 
compute_params.cache_rs_reset_idx = node->src[0]->view_offs / (node->view_src->ne[0] * ggml_type_size(node->view_src->type)); // view_offs is treated as a byte offset (as in the row_bytes computation below): divide by row bytes to get a slot index + } + // Capture the active-slot block of the recurrent state reorder (inp->s_copy). The active + // sequences occupy a contiguous slot block [idx, idx+len) of the state cache; read both from + // the active conv/gdn state writeback destination view (idx = head, len = n_seqs). + if (node->op == GGML_OP_CPY && node->view_src != nullptr && is_kvcache(node->view_src, nullptr) && + node->src[0]->op == GGML_OP_VIEW && node->src[1] != nullptr) { + const bool is_conv = std::string(node->src[0]->name).find("conv_state_last") == 0; + const bool is_gdn = node->src[0]->src[0] != nullptr && node->src[0]->src[0]->op == GGML_OP_GATED_DELTA_NET; + if (is_conv || is_gdn) { + const ggml_tensor * dest_view = node->src[1]; + const ggml_tensor * cache = node->view_src; + const size_t row_bytes = cache->ne[0] * ggml_type_size(cache->type); + if (row_bytes > 0) { + compute_params.s_copy_active_slot_idx = (int) (dest_view->view_offs / row_bytes); + compute_params.s_copy_active_slot_len = (int) dest_view->ne[1]; + } + } + } } auto * output_tensor = cgraph->nodes[cgraph->n_nodes - 1]; compute_params.output_len = output_tensor->ne[1]; @@ -535,6 +621,9 @@ ov::PartialShape GgmlOvDecoder::get_graph_input_shape(const ggml_tensor * op, int len = m_is_static ? (m_is_prefill ? 
m_prefill_chunk_size : 1) : -1; input_shape = ov::PartialShape{1, 1, 1, len}; + } else if (is_inp_s_copy(input, op) || is_s_copy_leaf(input)) { + input_shape = ov::PartialShape{1, 1, 1, -1}; + } else { input_shape = ov::PartialShape{get_shape(input)}; } @@ -550,6 +639,35 @@ ov::PartialShape GgmlOvDecoder::get_graph_input_shape(const ggml_tensor * op, return input_shape; } +bool GgmlOvDecoder::is_s_copy_leaf(const ggml_tensor * tensor) const { + if (tensor == nullptr || tensor->op != GGML_OP_NONE || m_cgraph == nullptr) { + return false; + } + for (int i = 0; i < m_cgraph->n_nodes; i++) { + const ggml_tensor * node = m_cgraph->nodes[i]; + if (node->op != GGML_OP_GET_ROWS || node->src[0] == nullptr || node->src[1] == nullptr) { + continue; + } + // The index list may reach the s_copy leaf through one or more VIEWs. + const ggml_tensor * idx = node->src[1]; + while (idx != nullptr && idx->op == GGML_OP_VIEW) { + idx = idx->src[0]; + } + if (idx != tensor) { + continue; + } + // The gathered data must be a recurrent state cache (cache_r/cache_s). + const ggml_tensor * data = node->src[0]; + while (data != nullptr && (data->op == GGML_OP_VIEW || data->op == GGML_OP_RESHAPE)) { + data = data->src[0]; + } + if (data != nullptr && is_kvcache(data, nullptr)) { + return true; + } + } + return false; +} + void GgmlOvDecoder::add_extra_inputs() { // Extra inputs: // 1. 
`attention_size`, used in FLASH_ATTN where the shape of the matmul's are 256 aligned, @@ -587,6 +705,16 @@ void GgmlOvDecoder::add_extra_inputs() { create_1d_input("token_len_per_seq", m_compute_params.token_len_per_seq); } // create_1d_input("token_len", m_compute_params.token_len_per_seq * m_compute_params.n_seq_active); + + if (m_compute_params.cache_rs_reset_idx != -1) { + create_1d_input("cache_rs_reset_idx", m_compute_params.cache_rs_reset_idx); + create_1d_input("cache_rs_reset_len", m_compute_params.cache_rs_reset_len); + } + + if (m_compute_params.s_copy_active_slot_len != -1) { + create_1d_input("s_copy_active_slot_idx", m_compute_params.s_copy_active_slot_idx); + create_1d_input("s_copy_active_slot_len", m_compute_params.s_copy_active_slot_len); + } } bool GgmlOvDecoder::node_is_used_as_src(const int node_idx) { @@ -683,8 +811,8 @@ void GgmlOvDecoder::compute_model_outputs() { } auto cur_node_use_count = m_cgraph->use_counts[ggml_hash_find(&m_cgraph->visited_hash_set, cur_node)]; if (cur_node_use_count == 0) { - // The output of SET_ROWS is the view_src tensor, which is updated in place. We should use the view_src name as the output name to make sure it can be correctly matched with the later ops that use the view_src. - if (cur_node != nullptr && cur_node->op == GGML_OP_SET_ROWS) { + // The output of in-place ops is the view_src tensor, which is updated in place. We should use the view_src name as the output name to make sure it can be correctly matched with the later ops that use the view_src. 
+ if (cur_node != nullptr && ::is_inplace_op(cur_node)) { cur_node = cur_node->view_src; } } else { @@ -704,7 +832,7 @@ void GgmlOvDecoder::compute_model_outputs() { if (cur_node != nullptr) { std::string node_output_name(cur_node->name); m_model_outputs[node_output_name] = cur_node; - m_model_output_names.push_back(node_output_name); + m_model_output_names.insert(node_output_name); } } } @@ -1223,12 +1351,20 @@ std::vector GgmlOvDecoder::get_output_names(int node_idx) const { return {m_node_info_list[node_idx].node_output_name}; } -std::vector GgmlOvDecoder::get_output_aliases(int node_idx) const { - const auto * node = m_node_info_list[node_idx].node; - if (node != nullptr && node->op == GGML_OP_SET_ROWS && node->view_src != nullptr) { - return {std::string(node->view_src->name)}; +std::string GgmlOvDecoder::get_inplace_op_src(int node_idx) const { + auto * node = m_node_info_list[node_idx].node; + if (!::is_inplace_op(node) || node->view_src == nullptr) { + return ""; } - return {}; + return node->view_src->name; +} + +bool GgmlOvDecoder::is_view_like_alias_of(int node_idx, const std::string & view_src_name) const { + auto * node = m_node_info_list[node_idx].node; + if (node->view_src == nullptr || std::string(node->view_src->name) != view_src_name) { + return false; + } + return node->op == GGML_OP_RESHAPE || node->op == GGML_OP_VIEW; } const std::string & GgmlOvDecoder::get_op_name() const { @@ -1404,14 +1540,18 @@ void GgmlOvDecoder::compute_node_dynamic_dims() { } if (m_node_dynamic_dims[node] != -1 && dynamic_dim_value != node->ne[m_node_dynamic_dims[node]]) { m_node_dynamic_dims[node] = -1; - // std::cout << "Warning: Dynamic dim value mismatch for node: " << node->name - // << " and its src[0]: " << node->src[0]->name << std::endl; + GGML_LOG_WARN("ggml-openvino: dynamic dim value mismatch for VIEW node '%s', src[0]: '%s'\n", + node->name, node->src[0]->name); } } break; } case GGML_OP_TRANSPOSE: case GGML_OP_RESHAPE: { + if (is_same_shape(node->src[0], 
node)) { + m_node_dynamic_dims[node] = m_node_dynamic_dims[node->src[0]]; + break; + } // RESHAPE requires src[0] to be contiguous, so both src and result // have standard compact strides: nb[i] = type_size * prod(ne[0..i-1]). // Match src->nb[dynamic_dim] against result->nb[i] to find the output @@ -1429,7 +1569,7 @@ void GgmlOvDecoder::compute_node_dynamic_dims() { } } if (m_node_dynamic_dims[node] == -1) { - // std::cout << "Cannot determine dynamic dim for RESHAPE node: " << node->name << std::endl; + GGML_LOG_WARN("ggml-openvino: cannot determine dynamic dim for RESHAPE node '%s'\n", node->name); } } break; @@ -1480,15 +1620,29 @@ void GgmlOvDecoder::compute_node_dynamic_dims() { } if (matched_dim_count != 1) { m_node_dynamic_dims[node] = -1; - // std::cout << "Warning: Cannot determine dynamic dim for CONT node: " << node->name - // << " and its src[0]: " << node->src[0]->name << std::endl; + GGML_LOG_WARN("ggml-openvino: cannot determine dynamic dim for CONT node '%s', src[0]: '%s'\n", + node->name, node->src[0]->name); } } } break; + case GGML_OP_CONCAT: + for (int i = 0; i < GGML_MAX_DIMS; i++) { + if (node->src[0]->ne[i] != node->ne[i]) { + m_node_dynamic_dims[node] = i; + break; + } + } + break; + case GGML_OP_SSM_CONV: + case GGML_OP_GATED_DELTA_NET: + m_node_dynamic_dims[node] = 1; + break; case GGML_OP_RMS_NORM: + case GGML_OP_L2_NORM: case GGML_OP_NORM: case GGML_OP_ADD: + case GGML_OP_SUB: case GGML_OP_GLU: case GGML_OP_ROPE: case GGML_OP_SCALE: @@ -1496,9 +1650,16 @@ void GgmlOvDecoder::compute_node_dynamic_dims() { case GGML_OP_ARGSORT: case GGML_OP_ADD_ID: case GGML_OP_UNARY: + case GGML_OP_CUMSUM: + case GGML_OP_FILL: + case GGML_OP_SET: + case GGML_OP_DIAG: + case GGML_OP_TRI: + case GGML_OP_REPEAT: m_node_dynamic_dims[node] = m_node_dynamic_dims[node->src[0]]; break; case GGML_OP_MUL_MAT_ID: + case GGML_OP_SOLVE_TRI: m_node_dynamic_dims[node] = m_node_dynamic_dims[node->src[1]]; break; case GGML_OP_CPY: @@ -1534,7 +1695,8 @@ void 
GgmlOvDecoder::compute_node_dynamic_dims() { break; } default: - // std::cout << "Doesn't handle node name: " << node->name << " op: " << ggml_op_name(node->op) << std::endl; + GGML_LOG_DEBUG("ggml-openvino: compute_node_dynamic_dims: unhandled op %s for node '%s'\n", + ggml_op_name(node->op), node->name); break; } }; diff --git a/ggml/src/ggml-openvino/ggml-decoder.h b/ggml/src/ggml-openvino/ggml-decoder.h index 695676acd6ba..fe5dfa09ba51 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.h +++ b/ggml/src/ggml-openvino/ggml-decoder.h @@ -20,6 +20,7 @@ struct ModelParams { int n_seq = 1; int n_heads_kv = -1; int head_size = -1; + int state_size = -1; // for SSM models, eg qwen35 int32_t rope_params[15]; bool mixed_rope_params = false; std::vector swa_layers; @@ -48,6 +49,37 @@ struct ComputeParams { int token_len_per_seq = -1; int past_kv_len = -1; int output_len = 1; + + int cache_rs_reset_idx = -1; + int cache_rs_reset_len = -1; + // SSM/DeltaNet models optionally clear cache_r and cache_s of certain slots in the cgraph + // 3: [ 18432, 4, 1, 1] RESHAPE cache_r_l0 (reshaped) + // [ 18432, 4, 1, 1] 0: NONE cache_r_l0 + // 4: [ 18432, 1, 1, 1] VIEW cache_r_l0 (reshaped) (view) + // [ 18432, 4, 1, 1] 0: RESHAPE cache_r_l0 (reshaped) + // 5: [ 18432, 1, 1, 1] SCALE cache_r_l0 (reshaped) (view) (view) + // [ 18432, 1, 1, 1] 0: VIEW cache_r_l0 (reshaped) (view) + + int s_copy_active_slot_idx = -1; + int s_copy_active_slot_len = -1; + // SSM/DeltaNet models optionally reorder slots of state cache, to make the active slots contiguous + // leaf_5 is the inp->s_copy in llama-graph.cpp, eg if there are 8 slots in total and slot 3 and 7 + // are active in the current batch, leaf_5 will be [3, 7, 5, 6, 4] + // 6: [ 2, 1, 1, 1] VIEW (view) + // [ 2, 1, 1, 1] 0: NONE leaf_5 + // 7: [ 18432, 2, 1, 1] GET_ROWS conv_states-0 + // [ 18432, 4, 1, 1] 0: RESHAPE cache_r_l0 (reshaped) + // [ 2, 1, 1, 1] 1: VIEW (view) + // 8: [ 0, 1, 1, 1] VIEW (view) + // [ 2, 1, 1, 1] 0: NONE leaf_5 + // 
9: [ 18432, 0, 1, 1] GET_ROWS node_9 + // [ 18432, 4, 1, 1] 0: RESHAPE cache_r_l0 (reshaped) + // [ 0, 1, 1, 1] 1: VIEW (view) + // 10: [ 18432, 0, 1, 1] VIEW cache_r_l0 (view) + // [ 18432, 4, 1, 1] 0: NONE cache_r_l0 + // 11: [ 18432, 0, 1, 1] CPY cache_r_l0 (view) (copy of ) + // [ 18432, 0, 1, 1] 0: GET_ROWS node_9 + // [ 18432, 0, 1, 1] 1: VIEW cache_r_l0 (view) }; class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder { @@ -156,7 +188,9 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder { virtual std::vector get_output_names(int node_idx) const override; - virtual std::vector get_output_aliases(int node_idx) const override; + virtual std::string get_inplace_op_src(int node_idx) const override; + + virtual bool is_view_like_alias_of(int node_idx, const std::string & view_src_name) const override; virtual const std::string & get_op_type() const override; @@ -191,7 +225,7 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder { return m_model_weights; } - virtual std::vector get_model_output_names() const override { return m_model_output_names; } + virtual std::set get_model_output_names() const override { return m_model_output_names; } const std::map & get_model_outputs() const { return m_model_outputs; } @@ -216,6 +250,8 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder { virtual bool has_mixed_rope_params() const override { return m_model_params.mixed_rope_params; } + virtual int get_ssm_state_size() const override { return m_model_params.state_size; } + virtual std::map get_kv_param_res_names() const override; virtual bool is_static() const override { return m_is_static; } @@ -289,6 +325,7 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder { return op->op == GGML_OP_ROPE && tensor == op->src[2]; } + // also returns true for cache_s and cache_r in SSM/DeltaNet models inline static bool is_kvcache(const ggml_tensor * tensor, const ggml_tensor * op) { return tensor->buffer->usage == 
GGML_BACKEND_BUFFER_USAGE_ANY || (op != nullptr && op->op == GGML_OP_SET_ROWS && op->src[2] == tensor); @@ -303,6 +340,12 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder { op->src[1]->op == GGML_OP_NONE; } + // the state permutation index input used in SSM/DeltaNet models (inp->s_copy in llama-graph.cpp) + inline static bool is_inp_s_copy(const ggml_tensor * tensor, const ggml_tensor * op) { + return op->op == GGML_OP_GET_ROWS && tensor == op->src[1] && + op->src[0]->buffer->usage == GGML_BACKEND_BUFFER_USAGE_ANY; + } + std::string get_graph_input_ov_name(const ggml_tensor * tensor, const ggml_tensor * op) { if (is_inp_pos(tensor, op)) { return "inp_pos"; @@ -323,6 +366,10 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder { void compute_model_inputs(); void compute_model_outputs(); + // True if tensor is the inp->s_copy index leaf gathered by a recurrent state cache GET_ROWS + // (possibly through a VIEW), so it gets a dynamic [1,1,1,-1] graph-input shape. + bool is_s_copy_leaf(const ggml_tensor * tensor) const; + // Infer and propagate dynamic-dimension indices for all tensors in the GGML graph. 
void compute_node_dynamic_dims(); @@ -336,7 +383,7 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder { std::map> m_model_extra_input_values; std::map> m_model_weights; std::map m_model_outputs; - std::vector m_model_output_names; + std::set m_model_output_names; std::vector m_node_info_list; std::map m_node_dynamic_dims; diff --git a/ggml/src/ggml-openvino/ggml-openvino.cpp b/ggml/src/ggml-openvino/ggml-openvino.cpp index 6c88a7405cf4..1d66269243e7 100644 --- a/ggml/src/ggml-openvino/ggml-openvino.cpp +++ b/ggml/src/ggml-openvino/ggml-openvino.cpp @@ -892,6 +892,23 @@ static bool is_op_unsupported_case(const ggml_tensor * op) { } break; } + case GGML_OP_SET: { + const auto nb1 = static_cast(op->op_params[0]); + const auto nb2 = static_cast(op->op_params[1]); + const auto nb3 = static_cast(op->op_params[2]); + + // OpenVINO SET translation currently supports dst layouts that match src0 strides. + if (op->src[0] == nullptr || nb1 != op->src[0]->nb[1] || nb2 != op->src[0]->nb[2] || nb3 != op->src[0]->nb[3]) { + // std::cout << "Unsupported SET op with dst nb1=" << nb1 << ", nb2=" << nb2 << ", nb3=" << nb3 + // << " that does not match src0 strides nb[1]=" + // << (op->src[0] != nullptr ? std::to_string(op->src[0]->nb[1]) : "null") + // << ", nb[2]=" << (op->src[0] != nullptr ? std::to_string(op->src[0]->nb[2]) : "null") + // << ", nb[3]=" << (op->src[0] != nullptr ? 
std::to_string(op->src[0]->nb[3]) : "null") + // << std::endl; + return true; + } + break; + } case GGML_OP_GET_ROWS: case GGML_OP_SET_ROWS: { if (op->ne[3] != 1) { @@ -994,6 +1011,9 @@ static bool is_op_unsupported_case(const ggml_tensor * op) { // GGML_LOG_WARN("OpenVINO backend does not support CPY with non-contiguous data or bf16 types\n"); return true; } + if (ggml_nelements(op->src[0]) != ggml_nelements(op->src[1])) { + return true; + } // op test case with non-contiguous src or dst if ((op->ne[0] == 3 && op->ne[1] == 4 && op->ne[2] == 3 && op->ne[3] == 2) || (op->ne[0] == 1 && op->ne[1] == 4 && op->ne[2] == 3 && op->ne[3] == 2) || @@ -1001,7 +1021,7 @@ static bool is_op_unsupported_case(const ggml_tensor * op) { return true; } // CPY into a strided view of a larger buffer (recurrent-state snapshots) not supported - if (op->view_src && ggml_nbytes(op) != ggml_nbytes(op->view_src)) { + if (op->src[0]->op == GGML_OP_NONE && op->view_src && ggml_nbytes(op) != ggml_nbytes(op->view_src)) { return true; } break; @@ -1030,8 +1050,10 @@ static bool is_op_unsupported_case(const ggml_tensor * op) { // GGML_LOG_WARN("OpenVINO backend does not support ROPE with mode %d\n", mode); return true; } - if (n_dims != 0.0f && n_dims != op->src[0]->ne[0]) { - // GGML_LOG_WARN("OpenVINO backend does not support ROPE with n_dims %d != src[0]->ne[0] %ld\n", n_dims, + const int64_t head_dim = op->src[0]->ne[0]; + const int64_t rope_dims = n_dims == 0 ? 
head_dim : n_dims; + if (rope_dims <= 0 || rope_dims > head_dim || (rope_dims % 2) != 0) { + // GGML_LOG_WARN("OpenVINO backend does not support ROPE with n_dims %d and src[0]->ne[0] %ld\n", n_dims, // op->src[0]->ne[0]); return true; } @@ -1066,7 +1088,7 @@ static bool is_op_unsupported_case(const ggml_tensor * op) { } case GGML_OP_GATED_DELTA_NET: { // enable after https://github.com/openvinotoolkit/openvino/pull/35917 is included in OV release - return true; + // return true; // if (ggml_openvino_get_device_name() == "GPU" && op->src[0]->ne[2] > 1) { // // CVS-186471 // return true; @@ -1078,13 +1100,8 @@ static bool is_op_unsupported_case(const ggml_tensor * op) { if (op->src[3]->ne[0] != 1) { return true; } - // v_repeat > 1 (GQA): ggml uses modulo head mapping (h_q = h_v % H_k) - // but the fused op uses consecutive mapping (h_q = h_v / group_size) - if (op->src[2]->ne[1] != op->src[0]->ne[1]) { - return true; - } // K > 1 (multiple state snapshots) not supported by fused op - if (op->src[5]->ne[1] > 1) { + if (((const int32_t *) op->op_params)[0] > 1) { return true; } break; @@ -1092,7 +1109,8 @@ static bool is_op_unsupported_case(const ggml_tensor * op) { case GGML_OP_SSM_CONV: { // qwen3next is numerically unstable with OpenVINO SSM_CONV. // Keep this op on CPU until the OpenVINO implementation is fixed. - return true; + // return true; + break; } case GGML_OP_VIEW: { // Skip TOPK_MOE fused tests until it is fully supported. 
@@ -1161,6 +1179,9 @@ static bool ggml_backend_openvino_device_supports_op(ggml_backend_dev_t dev, con // GGML_LOG_WARN("OpenVINO backend does not support unary op %s\n", ggml_unary_op_name(ggml_get_unary_op(op))); return false; } + if (ggml_get_unary_op(op) == GGML_UNARY_OP_EXP && op->type == GGML_TYPE_F32) { + return false; + } break; } case GGML_OP_GLU: { @@ -1186,16 +1207,11 @@ static bool ggml_backend_openvino_device_supports_op(ggml_backend_dev_t dev, con // GGML_LOG_WARN("OpenVINO backend does not support op %s\n", ggml_op_name(op->op)); return false; } - static std::set ops_not_support_view_input{ - GGML_OP_L2_NORM, - }; + static std::set ops_not_support_view_input{}; if (ops_not_support_view_input.find(op->op) != ops_not_support_view_input.end() && has_view_op_input(op)) { // GGML_LOG_WARN("OpenVINO backend does not support op %s with view input\n", ggml_op_name(op->op)); return false; } - if (op->op == GGML_OP_RMS_NORM && has_non_contiguous_view_input(op)) { - return false; - } } } diff --git a/ggml/src/ggml-openvino/openvino/decoder.h b/ggml/src/ggml-openvino/openvino/decoder.h index 3b429078c343..7ec6bdd8b40e 100644 --- a/ggml/src/ggml-openvino/openvino/decoder.h +++ b/ggml/src/ggml-openvino/openvino/decoder.h @@ -75,7 +75,9 @@ class GgmlDecoder : public DecoderBase { virtual std::vector get_output_names(int node_idx) const = 0; - virtual std::vector get_output_aliases(int node_idx) const = 0; + virtual std::string get_inplace_op_src(int node_idx) const = 0; + + virtual bool is_view_like_alias_of(int node_idx, const std::string & view_src_name) const = 0; virtual const std::string & get_op_type() const = 0; @@ -89,15 +91,17 @@ class GgmlDecoder : public DecoderBase { virtual int get_op_case(int node_idx) const = 0; - virtual const std::map> & get_model_inputs() const = 0; - virtual const std::map> & get_model_extra_inputs() const = 0; - virtual const std::map> & get_model_weights() const = 0; - virtual std::vector get_model_output_names() const = 0; + 
virtual const std::map>& get_model_inputs() const = 0; + virtual const std::map>& get_model_extra_inputs() const = 0; + virtual const std::map>& get_model_weights() const = 0; + virtual std::set get_model_output_names() const = 0; virtual int32_t * get_rope_params() const = 0; virtual bool has_mixed_rope_params() const = 0; + virtual int get_ssm_state_size() const = 0; + virtual std::map get_kv_param_res_names() const = 0; virtual bool is_static() const = 0; diff --git a/ggml/src/ggml-openvino/openvino/node_context.h b/ggml/src/ggml-openvino/openvino/node_context.h index 9769c30096e9..2e2756037703 100644 --- a/ggml/src/ggml-openvino/openvino/node_context.h +++ b/ggml/src/ggml-openvino/openvino/node_context.h @@ -153,6 +153,8 @@ class NodeContext : public frontend::NodeContext { bool is_stateful() const { return m_decoder->is_stateful(); } + int get_ssm_state_size() const { return m_decoder->get_ssm_state_size(); } + private: std::shared_ptr m_decoder; std::shared_ptr & m_tensor_map; diff --git a/ggml/src/ggml-openvino/openvino/op/cpy.cpp b/ggml/src/ggml-openvino/openvino/op/cpy.cpp index 3a4355021d98..06de5e052c60 100644 --- a/ggml/src/ggml-openvino/openvino/op/cpy.cpp +++ b/ggml/src/ggml-openvino/openvino/op/cpy.cpp @@ -2,10 +2,18 @@ #include "../op_table.h" #include "../utils.h" +#include #include +#include +#include #include #include +#include +#include +#include #include +#include +#include namespace ov { namespace frontend { @@ -13,18 +21,90 @@ namespace ggml { namespace op { OutputVector translate_cpy(const NodeContext & context) { - auto input = process_view_input_new(context, 0); + auto op_case = context.get_op_case(); auto input_shape = context.get_input_shape(0); - auto output_shape = context.get_output_shape(); + auto output_shape = context.get_input_shape(1); + + // Recurrent state cache writeback with a dynamic active-slot block (inp->s_copy reorder). 
+ // The active sequences occupy a contiguous slot block [idx, idx+len) of the state cache; write + // the new rows into that block while preserving the rest, so the result is the full updated + // cache. op_case 1: gated-delta-net state, op_case 2: conv state, op_case 3: defrag remainder. + const bool slice_assign = context.has_input("s_copy_active_slot_len") && !context.is_stateful() && + (op_case == 1 || op_case == 2 || op_case == 3); + if (slice_assign) { + const int64_t slot_axis = 2; + auto slot_idx = context.get_input("s_copy_active_slot_idx"); + auto slot_len = context.get_input("s_copy_active_slot_len"); + auto zero = ov::op::v0::Constant::create(ov::element::i64, {1}, {0}); + auto one = ov::op::v0::Constant::create(ov::element::i64, {1}, {1}); + auto int_max = ov::op::v0::Constant::create(ov::element::i64, {1}, {INT_MAX}); + auto axis = ov::op::v0::Constant::create(ov::element::i64, {1}, {slot_axis}); + + ov::Output src; + ov::Output begin; + if (op_case == 1) { + // GDN packs [attn | new_state]; the state is the last ssm_state_size * n_seqs rows. 
+ int ssm_state_size = context.get_ssm_state_size(); + auto state_rows = std::make_shared( + ov::op::v0::Constant::create(ov::element::i64, {1}, {ssm_state_size}), slot_len); + auto state_begin = std::make_shared(state_rows); + auto state_part = + std::make_shared(context.get_input(0), state_begin, int_max, one, axis); + auto feature = (int64_t) output_shape[3].get_length(); + src = std::make_shared( + state_part, + ov::op::v0::Constant::create(ov::element::i64, {4}, std::vector{1, 1, -1, feature}), false); + begin = slot_idx; + } else if (op_case == 2) { + auto cache_r_size = (int64_t) input_shape[3].get_length(); + auto conv_state_last = std::make_shared( + context.get_input(0), ov::op::v0::Constant::create(ov::element::i64, {1}, {-cache_r_size}), int_max, + one, ov::op::v0::Constant::create(ov::element::i64, {1}, {3})); + auto feature = (int64_t) output_shape[3].get_length(); + src = std::make_shared( + conv_state_last, + ov::op::v0::Constant::create(ov::element::i64, {4}, std::vector{1, 1, -1, feature}), false); + begin = slot_idx; + } else { + // op_case 3: gathered remainder rows already have the cache slot layout [1, 1, extra, feature] + src = context.get_input(0); + begin = std::make_shared(slot_idx, slot_len); + } + + if (src.get_element_type() != context.get_output_type()) { + src = std::make_shared(src, context.get_output_type()); + } + + auto base = context.get_input(1); + auto src_len = + std::make_shared(std::make_shared(src, ov::element::i64), axis, + ov::op::v0::Constant::create(ov::element::i64, {}, {0})); + auto end = std::make_shared(begin, src_len); + auto head_part = std::make_shared(base, zero, begin, one, axis); + auto tail_part = std::make_shared(base, end, int_max, one, axis); + auto res = std::make_shared(ov::OutputVector{head_part, src, tail_part}, slot_axis); + return rename_outputs_with_suffix({res}, context.get_name()); + } + + auto input = process_view_input_new(context, 0); - // Non-cast CPY may need a reshape (e.g. 
[3,192,1,1] -> [576,1,1,1]) if (input_shape != output_shape) { auto new_shape = ov::op::v0::Constant::create( ov::element::i64, {static_cast(output_shape.rank().get_length())}, output_shape.to_shape()); input = std::make_shared(input, new_shape, false); } - auto res = std::make_shared(input, context.get_output_type()); + ov::Output res; + if (context.get_input_type(0) != context.get_output_type()) { + res = std::make_shared(input, context.get_output_type()); + } else { + res = input; + } + + if (res.get_node_shared_ptr() == context.get_input(0).get_node_shared_ptr()) { + return {res}; + } + return rename_outputs_with_suffix({res}, context.get_name()); } diff --git a/ggml/src/ggml-openvino/openvino/op/cumsum.cpp b/ggml/src/ggml-openvino/openvino/op/cumsum.cpp new file mode 100644 index 000000000000..0a414b24f6f5 --- /dev/null +++ b/ggml/src/ggml-openvino/openvino/op/cumsum.cpp @@ -0,0 +1,29 @@ +#include "../node_context.h" +#include "../op_table.h" +#include "../utils.h" + +#include +#include + +namespace ov { +namespace frontend { +namespace ggml { +namespace op { + +// GGML cumsum computes prefix sum along dim 0 (the innermost/fastest dimension). +// In OV layout the dims are reversed: ggml [ne0, ne1, ne2, ne3] → OV [ne3, ne2, ne1, ne0], +// so ggml dim 0 maps to OV axis 3 (last axis). 
+OutputVector translate_cumsum(const NodeContext & context) { + num_inputs_check(context, 1, 1); + + auto x = context.get_input(0); + auto axis = ov::op::v0::Constant::create(ov::element::i64, {}, {3}); + auto res = std::make_shared(x, axis); + + return rename_outputs_with_suffix({res}, context.get_name()); +} + +} // namespace op +} // namespace ggml +} // namespace frontend +} // namespace ov diff --git a/ggml/src/ggml-openvino/openvino/op/diag.cpp b/ggml/src/ggml-openvino/openvino/op/diag.cpp new file mode 100644 index 000000000000..dacea2f05b4a --- /dev/null +++ b/ggml/src/ggml-openvino/openvino/op/diag.cpp @@ -0,0 +1,58 @@ +#include "../node_context.h" +#include "../op_table.h" +#include "../utils.h" + +#include +#include +#include +#include +#include +#include + +namespace ov { +namespace frontend { +namespace ggml { +namespace op { + +// GGML DIAG takes a 1D vector (ne0, 1, ne2, ne3) and produces a diagonal matrix +// of shape (ne0, ne0, ne2, ne3). +// In OV layout (ggml [ne0, ne1, ne2, ne3] → OV [ne3, ne2, ne1, ne0]): +// input: [ne3, ne2, 1, ne0] +// output: [ne3, ne2, ne0, ne0] +// The diagonal: output[..., i, j] = input[..., 0, j] if i == j, else 0. 
+OutputVector translate_diag(const NodeContext & context) { + num_inputs_check(context, 1, 1); + + auto x = context.get_input(0); // OV shape: [ne3, ne2, 1, ne0] + + auto out_shape = context.get_output_shape().to_shape(); + int64_t n = static_cast(out_shape[3]); // ne0 + + // Build index range [0, 1, ..., n-1] + auto start = ov::op::v0::Constant::create(ov::element::i64, {}, {int64_t(0)}); + auto stop = ov::op::v0::Constant::create(ov::element::i64, {}, {n}); + auto step = ov::op::v0::Constant::create(ov::element::i64, {}, {int64_t(1)}); + auto range = std::make_shared(start, stop, step, ov::element::i64); + + // col_idx shape [1, 1, 1, n] + auto col_shape = ov::op::v0::Constant::create(ov::element::i64, {4}, std::vector{1, 1, 1, n}); + auto col_idx = std::make_shared(range, col_shape, false); + + // row_idx shape [1, 1, n, 1] + auto row_shape = ov::op::v0::Constant::create(ov::element::i64, {4}, std::vector{1, 1, n, 1}); + auto row_idx = std::make_shared(range, row_shape, false); + + // mask: true where col == row (diagonal) + auto mask = std::make_shared(col_idx, row_idx); + + // Broadcast input from [ne3, ne2, 1, ne0] to [ne3, ne2, ne0, ne0] via select + auto zero = ov::op::v0::Constant::create(ov::element::f32, {}, {0.0f}); + auto res = std::make_shared(mask, x, zero); + + return rename_outputs_with_suffix({res}, context.get_name()); +} + +} // namespace op +} // namespace ggml +} // namespace frontend +} // namespace ov diff --git a/ggml/src/ggml-openvino/openvino/op/fill.cpp b/ggml/src/ggml-openvino/openvino/op/fill.cpp index 1450b70be23d..db2fecb53caf 100644 --- a/ggml/src/ggml-openvino/openvino/op/fill.cpp +++ b/ggml/src/ggml-openvino/openvino/op/fill.cpp @@ -2,46 +2,33 @@ #include "../op_table.h" #include "../utils.h" -#include -#include -#include #include #include -#include -#include namespace ov { namespace frontend { namespace ggml { namespace op { +// GGML FILL sets all elements of a tensor to a constant value. 
+// The constant is stored as a float in op_params[0]. OutputVector translate_fill(const NodeContext & context) { num_inputs_check(context, 1, 1); - const int32_t * op_params = context.get_output_op_params(); - FRONT_END_CHECK_IMPLEMENTED(op_params != nullptr, "FILL requires output op params"); + float c; + memcpy(&c, context.get_output_op_params(), sizeof(float)); - float value; - std::memcpy(&value, op_params, sizeof(float)); + auto shape = context.get_input_shape(0).to_shape(); - auto scalar = ov::op::v0::Constant::create(context.get_output_type(), ov::Shape{}, {value}); + auto val = ov::op::v0::Constant::create(ov::element::f32, {}, {c}); + auto target_shape = ov::op::v0::Constant::create(ov::element::i64, {shape.size()}, + std::vector(shape.begin(), shape.end())); + auto res = std::make_shared(val, target_shape); - ov::Output target_shape; - const auto output_shape = context.get_output_shape(); - if (output_shape.rank().is_static() && output_shape.is_static()) { - const auto static_shape = output_shape.to_shape(); - std::vector shape_values(static_shape.begin(), static_shape.end()); - target_shape = ov::op::v0::Constant::create(ov::element::i64, {shape_values.size()}, shape_values); - } else { - auto input = process_view_input_new(context, 0); - target_shape = std::make_shared(input, ov::element::i64); - } - - auto res = std::make_shared(scalar, target_shape); return rename_outputs_with_suffix({res}, context.get_name()); } } // namespace op } // namespace ggml } // namespace frontend -} // namespace ov \ No newline at end of file +} // namespace ov diff --git a/ggml/src/ggml-openvino/openvino/op/gated_delta_net.cpp b/ggml/src/ggml-openvino/openvino/op/gated_delta_net.cpp index 26c4bbfa9850..66c748283311 100644 --- a/ggml/src/ggml-openvino/openvino/op/gated_delta_net.cpp +++ b/ggml/src/ggml-openvino/openvino/op/gated_delta_net.cpp @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -31,57 +32,76 @@ namespace op { static 
OutputVector translate_gated_delta_net_ref(const NodeContext & context); OutputVector translate_gated_delta_net(const NodeContext & context) { - // auto v_shape = context.get_input_shape(2).to_shape(); // [B, T, H_v, S_v] - // auto q_shape = context.get_input_shape(0).to_shape(); // [B, T, H_k, S_k] - - // // Fused GatedDeltaNet op only supports scalar gate (kda=0). - // // Fall back to reference implementation for per-key-dimension gating. - // // if (kda) { - // // return translate_gated_delta_net_ref(context); - // // } - - // auto q = context.get_input(0); - // auto k = context.get_input(1); - // auto v = context.get_input(2); - // auto g = context.get_input(3); - // auto beta = context.get_input(4); - // auto state = context.get_input(5); + auto v_shape = context.get_input_shape(2).to_shape(); // [B, T, H_v, S_v] + auto q_shape = context.get_input_shape(0).to_shape(); // [B, T, H_k, S_k] + + // Fused GatedDeltaNet op only supports scalar gate (kda=0). + // Fall back to reference implementation for per-key-dimension gating. 
+ // if (kda) { + // return translate_gated_delta_net_ref(context); + // } // const int64_t B = v_shape[0]; // const int64_t T = v_shape[1]; - // const int64_t H_v = v_shape[2]; - // const int64_t S_v = v_shape[3]; + const int64_t H_v = v_shape[2]; + const int64_t S_v = v_shape[3]; + const int64_t H_k = q_shape[2]; // const int64_t S_k = q_shape[3]; - // // ggml state layout (OV notation): [B, H_v, value_dim, key_dim] - // // GatedDeltaNet op expects: [B, H_v, key_dim, value_dim] - // auto state_reshape_shape = - // ov::op::v0::Constant::create(ov::element::i64, {4}, std::vector{B, H_v, S_v, S_k}); - // state = std::make_shared(state, state_reshape_shape, false); - // auto state_perm = ov::op::v0::Constant::create(ov::element::i64, {4}, std::vector{0, 1, 3, 2}); - // state = std::make_shared(state, state_perm); - - // g = std::make_shared(g, ov::op::v0::Constant::create(ov::element::i64, {1}, {3})); - // beta = std::make_shared(beta, ov::op::v0::Constant::create(ov::element::i64, {1}, {3})); - - // auto gdn = std::make_shared(q, k, v, state, g, beta); - - // auto attn_4d = gdn->output(0); - // auto state_4d = gdn->output(1); // [B, H_v, key_dim, value_dim] - // // Transpose output state back to ggml layout [B, H_v, value_dim, key_dim] - // auto state_transposed = std::make_shared(state_4d, state_perm); - // auto flat_shape_1d = ov::op::v0::Constant::create(ov::element::i64, {1}, {-1}); - // auto attn = std::make_shared(attn_4d, flat_shape_1d, false); - // auto new_state = std::make_shared(state_transposed, flat_shape_1d, false); - // auto packed = std::make_shared(ov::OutputVector{attn, new_state}, 0); - // auto out_shape = - // ov::op::v0::Constant::create(ov::element::i64, {4}, std::vector{1, 1, T * B + S_v * B, S_v * H_v}); - // auto res = std::make_shared(packed, out_shape, false); - - // return rename_outputs_with_suffix({res}, context.get_name()); - - // The OV version in CI does not have the GatedDeltaNet op, so use reference implementation for now. 
- return translate_gated_delta_net_ref(context); + auto q = context.get_input(0); + auto k = context.get_input(1); + auto v = process_view_input(context, 2, H_v * S_v); + auto g = context.get_input(3); + auto beta = context.get_input(4); + auto state = context.get_input(5); + + // ggml maps GQA heads in tiled order, while OV GDN maps repeated heads in grouped order. + if (H_v != H_k) { + const int64_t repeat = H_v / H_k; + auto repeats = ov::op::v0::Constant::create(ov::element::i64, {4}, std::vector{1, 1, repeat, 1}); + q = std::make_shared(q, repeats); + k = std::make_shared(k, repeats); + } + + if (context.get_view_input_size(2)) { + // Same as l2_norm case 1 + v = std::make_shared(v, ov::op::v0::Constant::create(ov::element::i64, {1}, {0})); + auto v_shape = context.get_input_shape(2).to_shape(); + std::vector reshape_pattern = {0, 0, (int64_t) v_shape[2], (int64_t) v_shape[3]}; + v = std::make_shared( + v, ov::op::v0::Constant::create(ov::element::i64, {4}, reshape_pattern), true); + } + + // ggml state layout (OV notation): [B, H_v, value_dim, key_dim] + // GatedDeltaNet op expects: [B, H_v, key_dim, value_dim] + auto state_perm = ov::op::v0::Constant::create(ov::element::i64, {4}, std::vector{0, 1, 3, 2}); + state = std::make_shared(state, state_perm); + + g = std::make_shared(g, ov::op::v0::Constant::create(ov::element::i64, {1}, {3})); + beta = std::make_shared(beta, ov::op::v0::Constant::create(ov::element::i64, {1}, {3})); + + // std::cout << "GatedDeltaNet input shapes: q=" << q.get_partial_shape() << ", k=" << k.get_partial_shape() + // << ", v=" << v.get_partial_shape() << ", g=" << g.get_partial_shape() + // << ", beta=" << beta.get_partial_shape() << ", state=" << state.get_partial_shape() << std::endl; + + auto gdn = std::make_shared(q, k, v, state, g, beta); + auto attn_4d = gdn->output(0); + auto state_4d = gdn->output(1); // [B, H_v, key_dim, value_dim] + + // std::cout << "GatedDeltaNet output shapes: attn=" << 
gdn->output(0).get_partial_shape() + // << ", new_state=" << gdn->output(1).get_partial_shape() << std::endl; + + // Transpose output state back to ggml layout [B, H_v, value_dim, key_dim] + auto state_transposed = std::make_shared(state_4d, state_perm); + auto flat_shape_1d = ov::op::v0::Constant::create(ov::element::i64, {1}, {-1}); + auto attn = std::make_shared(attn_4d, flat_shape_1d, false); + auto new_state = std::make_shared(state_transposed, flat_shape_1d, false); + auto packed = std::make_shared(ov::OutputVector{attn, new_state}, 0); + auto out_shape = ov::op::v0::Constant::create(ov::element::i64, {4}, + std::vector{1, 1, -1 /*T * B + S_v * B*/, S_v * H_v}); + auto res = std::make_shared(packed, out_shape, false); + + return rename_outputs_with_suffix({res}, context.get_name()); } static OutputVector translate_gated_delta_net_ref(const NodeContext & context) { diff --git a/ggml/src/ggml-openvino/openvino/op/get_rows.cpp b/ggml/src/ggml-openvino/openvino/op/get_rows.cpp index 380e70a72e07..0a9de23de76b 100644 --- a/ggml/src/ggml-openvino/openvino/op/get_rows.cpp +++ b/ggml/src/ggml-openvino/openvino/op/get_rows.cpp @@ -2,11 +2,13 @@ #include "../op_table.h" #include "../utils.h" +#include #include #include #include #include #include +#include #include #include @@ -20,7 +22,27 @@ OutputVector translate_get_rows(const NodeContext & context) { Output res; auto data = process_view_input_new(context, 0); - auto indices = process_view_input_new(context, 1); + + auto op_case = context.get_op_case(); + ov::Output indices; + if (op_case == 1 || op_case == 2) { + // Recurrent state reorder (inp->s_copy): slice the active (op_case 1) or extra (op_case 2) + // segment from the s_copy index list at runtime, instead of baking the static view offset, + // so the cached IR works for any number of active sequences. 
+ auto s_copy = context.get_input(1); + auto len = context.get_input("s_copy_active_slot_len"); + auto step = ov::op::v0::Constant::create(ov::element::i64, {1}, {1}); + auto axis = ov::op::v0::Constant::create(ov::element::i64, {1}, {3}); + if (op_case == 1) { + auto begin = ov::op::v0::Constant::create(ov::element::i64, {1}, {0}); + indices = std::make_shared(s_copy, begin, len, step, axis); + } else { + auto end = ov::op::v0::Constant::create(ov::element::i64, {1}, {INT_MAX}); + indices = std::make_shared(s_copy, len, end, step, axis); + } + } else { + indices = process_view_input_new(context, 1); + } // data[1,b,x,y] ind[1,1,b,x'] test-backend-ops case // data[x,y] ind[1,1,1,x'] normal case diff --git a/ggml/src/ggml-openvino/openvino/op/l2_norm.cpp b/ggml/src/ggml-openvino/openvino/op/l2_norm.cpp index 4b8ed3b6c4a2..4c9bc06c965b 100644 --- a/ggml/src/ggml-openvino/openvino/op/l2_norm.cpp +++ b/ggml/src/ggml-openvino/openvino/op/l2_norm.cpp @@ -8,7 +8,9 @@ #include #include #include +#include #include +#include namespace ov { namespace frontend { @@ -20,6 +22,21 @@ OutputVector translate_l2_norm(const NodeContext & context) { auto input_node = process_view_input_new(context, 0); + if (context.get_op_case() == 1) { + // 92: [ 128, 16, 1, 2] VIEW q_conv-1 + // [ 6144, 1, 2, 1] 0: UNARY conv_output_silu-1 + // 93: [ 128, 16, 1, 2] L2_NORM q_conv_predelta-1 + // [ 128, 16, 1, 2] 0: VIEW q_conv-1 + auto output_shape = context.get_output_shape().to_shape(); + input_node = process_view_input(context, 0, output_shape[2] * output_shape[3]); + input_node = + std::make_shared(input_node, ov::op::v0::Constant::create(ov::element::i64, {1}, {0})); + + std::vector reshape_pattern = {0, 0, (int64_t) output_shape[2], (int64_t) output_shape[3]}; + input_node = std::make_shared( + input_node, ov::op::v0::Constant::create(ov::element::i64, {4}, reshape_pattern), true); + } + auto squared = std::make_shared(input_node, input_node); auto sum_squared = std::make_shared( diff --git 
a/ggml/src/ggml-openvino/openvino/op/repeat.cpp b/ggml/src/ggml-openvino/openvino/op/repeat.cpp index 4b742134b0cf..d58b59e4e309 100644 --- a/ggml/src/ggml-openvino/openvino/op/repeat.cpp +++ b/ggml/src/ggml-openvino/openvino/op/repeat.cpp @@ -23,47 +23,21 @@ OutputVector translate_repeat(const NodeContext & context) { auto input = process_view_input_new(context, 0); - const auto input_shape = context.get_input_shape(0); - const auto output_shape = context.get_output_shape(); + const auto input_shape = context.get_input_shape(0).to_shape(); + const auto output_shape = context.get_output_shape().to_shape(); - if (input_shape.rank().is_static() && output_shape.rank().is_static() && - input_shape.rank() == output_shape.rank()) { - const auto rank = static_cast(input_shape.rank().get_length()); - std::vector repeats(rank, 1); - bool all_static = true; + std::vector repeats(4, 1); + for (size_t axis = 0; axis < 4; ++axis) { + const int64_t input_dim = input_shape[axis]; + const int64_t output_dim = output_shape[axis]; - for (size_t axis = 0; axis < rank; ++axis) { - if (!input_shape[axis].is_static() || !output_shape[axis].is_static()) { - all_static = false; - break; - } + FRONT_END_OP_CONVERSION_CHECK(input_dim > 0 && output_dim > 0 && output_dim % input_dim == 0, + "REPEAT input shape ", input_shape, " cannot tile to match ", output_shape); - const int64_t input_dim = input_shape[axis].get_length(); - const int64_t output_dim = output_shape[axis].get_length(); - - FRONT_END_OP_CONVERSION_CHECK(input_dim > 0 && output_dim > 0 && output_dim % input_dim == 0, - "REPEAT input shape ", input_shape, " cannot tile to match ", output_shape); - - repeats[axis] = output_dim / input_dim; - } - - if (all_static) { - auto repeats_node = ov::op::v0::Constant::create(ov::element::i64, {repeats.size()}, repeats); - ov::Output res = std::make_shared(input, repeats_node); - return rename_outputs_with_suffix({res}, context.get_name()); - } + repeats[axis] = output_dim / input_dim; } - 
// Dynamic fallback: tile by the ratio of output to input shape. - auto input_shape_node = std::make_shared(input, ov::element::i64); - std::shared_ptr target_shape_node; - if (output_shape.rank().is_static() && output_shape.is_static()) { - target_shape_node = - ov::op::v0::Constant::create(ov::element::i64, {output_shape.to_shape().size()}, output_shape.to_shape()); - } else { - target_shape_node = std::make_shared(context.get_input(1), ov::element::i64); - } - auto repeats_node = std::make_shared(target_shape_node, input_shape_node); + auto repeats_node = ov::op::v0::Constant::create(ov::element::i64, {repeats.size()}, repeats); ov::Output res = std::make_shared(input, repeats_node); return rename_outputs_with_suffix({res}, context.get_name()); } diff --git a/ggml/src/ggml-openvino/openvino/op/reshape.cpp b/ggml/src/ggml-openvino/openvino/op/reshape.cpp index 602d3387c9f9..eb2a62991421 100644 --- a/ggml/src/ggml-openvino/openvino/op/reshape.cpp +++ b/ggml/src/ggml-openvino/openvino/op/reshape.cpp @@ -25,13 +25,12 @@ OutputVector translate_reshape(const NodeContext & context) { } int op_case = context.get_op_case(); - FRONT_END_CHECK_IMPLEMENTED( - op_case == 1 || op_case == 2 || op_case == 3 || op_case == 4 || op_case == 5 || op_case == 6, - "Unsupported RESHAPE case"); auto output_shape = context.get_output_shape().to_shape(); std::shared_ptr new_shape_node; - if (op_case == 1) { + if (op_case == 0) { + new_shape_node = ov::op::v0::Constant::create(ov::element::i64, {4}, context.get_output_shape().to_shape()); + } else if (op_case == 1) { if (context.is_stateful()) { new_shape_node = ov::op::v0::Constant::create( ov::element::i64, {3}, std::vector{-1, (int64_t) output_shape[2], (int64_t) output_shape[3]}); @@ -76,9 +75,29 @@ OutputVector translate_reshape(const NodeContext & context) { // ov::op::v0::Constant::create(ov::element::i64, {1}, {(int64_t) context.get_output_shape().to_shape()[3]}); // auto one = ov::op::v0::Constant::create(ov::element::i64, {1}, 
{1}); // new_shape_node = std::make_shared(ov::OutputVector{one, one, token_len, emb_size}, 0); - } else if (op_case == 6) { - new_shape_node = ov::op::v0::Constant::create(ov::element::i64, {4}, context.get_output_shape().to_shape()); + // 14: [ 6144, 1, 2, 1] RESHAPE linear_attn_qkv_mixed-0 + // [ 6144, 2, 1, 1] 0: MUL_MAT node_13 + // reshape to [1, n_slot_active_len, -1, 6144] + auto n_slot_active_len = context.get_input("s_copy_active_slot_len"); + auto emb_size = + ov::op::v0::Constant::create(ov::element::i64, {1}, {(int64_t) context.get_output_shape().to_shape()[3]}); + auto one = ov::op::v0::Constant::create(ov::element::i64, {1}, {1}); + auto neg_one = ov::op::v0::Constant::create(ov::element::i64, {1}, {-1}); + new_shape_node = + std::make_shared(ov::OutputVector{one, n_slot_active_len, neg_one, emb_size}, 0); + } else if (op_case == 7) { + // 57: [ 2048, 2, 1, 1] RESHAPE linear_attn_out-0 (reshaped) + // [ 2048, 1, 2, 1] 0: MUL_MAT linear_attn_out-0 + std::vector shape_vec = {1, 1, -1, (int64_t) context.get_output_shape().to_shape()[3]}; + new_shape_node = ov::op::v0::Constant::create(ov::element::i64, {4}, shape_vec); + } else if (op_case == 8) { + // 106: [ 128, 128, 16, 2] RESHAPE state_predelta-1 + // [ 262144, 2, 1, 1] 0: GET_ROWS node_86 + auto output_shape = context.get_output_shape().to_shape(); + std::vector shape_vec = {-1, (int64_t) output_shape[1], (int64_t) output_shape[2], + (int64_t) output_shape[3]}; + new_shape_node = ov::op::v0::Constant::create(ov::element::i64, {4}, shape_vec); } auto res = std::make_shared(context.get_input(0), new_shape_node, false); return rename_outputs_with_suffix({res}, context.get_name()); diff --git a/ggml/src/ggml-openvino/openvino/op/rms_norm.cpp b/ggml/src/ggml-openvino/openvino/op/rms_norm.cpp index e76ec55b8aab..f1ca56954d7b 100644 --- a/ggml/src/ggml-openvino/openvino/op/rms_norm.cpp +++ b/ggml/src/ggml-openvino/openvino/op/rms_norm.cpp @@ -7,8 +7,11 @@ #include #include #include +#include #include 
#include +#include +#include #include namespace ov { @@ -19,7 +22,40 @@ namespace op { OutputVector translate_rms_norm(const NodeContext & context) { num_inputs_check(context, 1, 1); - auto input_node = process_view_input_new(context, 0); + auto op_case = context.get_op_case(); + + ov::Output input_node; + if (op_case == 1) { + input_node = context.get_input(0); + } else if (op_case == 2) { + auto ssm_state_size = context.get_ssm_state_size(); + // The GDN op packs [attn | new_state] along the row axis; the state occupies the last + // ssm_state_size * n_seqs rows. Slice it off (scaling by the active sequence count) to keep + // just the attention output. + ov::Output state_end; + if (context.has_input("s_copy_active_slot_len")) { + auto len = context.get_input("s_copy_active_slot_len"); + auto state_rows = std::make_shared( + ov::op::v0::Constant::create(ov::element::i64, {1}, {ssm_state_size}), len); + state_end = std::make_shared(state_rows); + } else { + state_end = ov::op::v0::Constant::create(ov::element::i64, {1}, {-ssm_state_size}); + } + auto gdn_attn_output = std::make_shared( + context.get_input(0), ov::op::v0::Constant::create(ov::element::i64, {1}, {0}), state_end, + ov::op::v0::Constant::create(ov::element::i64, {1}, {1}), + ov::op::v0::Constant::create(ov::element::i64, {1}, {2})); + + auto input_shape = context.get_input_shape(0).to_shape(); + input_node = std::make_shared( + gdn_attn_output, + ov::op::v0::Constant::create( + ov::element::i64, {4}, std::vector{1, -1, (int64_t) input_shape[2], (int64_t) input_shape[3]}), + false); + + } else { + input_node = process_view_input_new(context, 0); + } auto square = std::make_shared( input_node, ov::op::v0::Constant::create(ov::element::f32, ov::Shape{1}, {2.0f})); diff --git a/ggml/src/ggml-openvino/openvino/op/rope.cpp b/ggml/src/ggml-openvino/openvino/op/rope.cpp index 9bb2d75d0a4c..94918513dd04 100644 --- a/ggml/src/ggml-openvino/openvino/op/rope.cpp +++ b/ggml/src/ggml-openvino/openvino/op/rope.cpp 
@@ -22,6 +22,7 @@ #include #include #include +#include #include namespace ov { @@ -40,6 +41,9 @@ OutputVector translate_rope(const NodeContext & context) { auto output_shape = context.get_output_shape().to_shape(); int32_t * op_params = context.get_output_op_params(); const int mode = op_case; + const int64_t head_dim = static_cast(output_shape[3]); + const int64_t configured_n_dims = static_cast(op_params[1]); + const int64_t n_dims = configured_n_dims == 0 ? head_dim : configured_n_dims; constexpr int TYPE_NORMAL = 0; constexpr int TYPE_NEOX = 1; @@ -80,6 +84,9 @@ OutputVector translate_rope(const NodeContext & context) { data_node = std::make_shared(data_node, ov::element::f32); } + FRONT_END_OP_CONVERSION_CHECK(n_dims > 0 && n_dims <= head_dim && (n_dims % 2 == 0), + "ROPE expects even n_dims in [1, head_dim]"); + // TODO(openvino-gpu-rope-fusion): TEMPORARY WORKAROUND - do NOT revert until the // OpenVINO GPU plugin is updated. // @@ -94,13 +101,18 @@ OutputVector translate_rope(const NodeContext & context) { // be restored to the captured even/odd translation. Until then, keep both paths: // the active Flux rewrite here and the previous translation preserved below. 
if (mode == TYPE_NORMAL) { + auto axis_last = ov::op::v0::Constant::create(ov::element::i64, {1}, {-1}); + auto zero = ov::op::v0::Constant::create(ov::element::i64, {1}, {0}); + auto step_one = ov::op::v0::Constant::create(ov::element::i64, {1}, {1}); + // Emit the Flux-style interleaved-RoPE pattern so the GPU plugin's // RoPEFusionFlux matcher folds this subgraph into ov::op::internal::RoPE: - // x_paired = Reshape(x, [1, S, n_heads, head_size/2, 2]) + // x_paired = Reshape(x_rot, [1, S, n_heads, n_dims/2, 2]) // x0, x1 = Split(x_paired, axis=-1, num_splits=2) // x1_neg = x1 * -1 - // x_rotated = Reshape(Concat([x1_neg, x0], axis=-1), [1, S, n_heads, head_size]) - // y = x * t_cos + x_rotated * t_sin + // x_rotated = Reshape(Concat([x1_neg, x0], axis=-1), [1, S, n_heads, n_dims]) + // y_rot = x_rot * t_cos + x_rotated * t_sin + // y = Concat([y_rot, x_tail], axis=-1) if n_dims < head_dim // Mathematically equivalent to the even/odd Slice form below. // // RoPEFusionFlux requires rank_equals(4) on x, t_cos and t_sin. 
The cos/sin @@ -114,15 +126,16 @@ OutputVector translate_rope(const NodeContext & context) { std::vector{1, -1, (int64_t) output_shape[2], (int64_t) output_shape[3]}); data_node = std::make_shared(data_node, r4_shape, false); } - const int64_t head_size = static_cast(output_shape[3]); const int64_t n_heads = static_cast(output_shape[2]); - const int64_t half = head_size / 2; + const int64_t half = n_dims / 2; + auto rot_end = ov::op::v0::Constant::create(ov::element::i64, {1}, {n_dims}); + auto rot_data = std::make_shared(data_node, zero, rot_end, step_one, axis_last); auto neg_one_f = ov::op::v0::Constant::create(data_node->get_element_type(), ov::Shape{}, {-1.0f}); - auto paired_shape = - ov::op::v0::Constant::create(ov::element::i64, {5}, std::vector{1, -1, n_heads, half, 2}); - auto x_paired = std::make_shared(data_node, paired_shape, false); + auto paired_shape = ov::op::v0::Constant::create( + ov::element::i64, {5}, std::vector{1, -1, n_heads, half, 2}); + auto x_paired = std::make_shared(rot_data, paired_shape, false); auto split_axis = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{}, {-1}); auto data_split = std::make_shared(x_paired, split_axis, 2); @@ -133,28 +146,38 @@ OutputVector translate_rope(const NodeContext & context) { auto x_rotated_paired = std::make_shared(ov::OutputVector{x1_neg, x0}, -1); auto flat_shape = - ov::op::v0::Constant::create(ov::element::i64, {4}, std::vector{1, -1, n_heads, head_size}); - auto x_rotated = std::make_shared(x_rotated_paired, flat_shape, false); + ov::op::v0::Constant::create(ov::element::i64, {4}, std::vector{1, -1, n_heads, n_dims}); + auto x_rotated = + std::make_shared(x_rotated_paired, flat_shape, false); - // Expand cos/sin from [..., head_size/2] to [..., head_size] by repeating each + // Expand cos/sin from [..., n_dims/2] to [..., n_dims] by repeating each // entry twice. Use special_zero on the final Reshape so the seq dim passes // through dynamically. 
Final rank is 4 to satisfy the matcher's predicate. auto expand_cos_sin = [&](Output cs) { - auto cs_unsq = - std::make_shared(cs, ov::op::v0::Constant::create(ov::element::i64, {1}, {-1})); - auto bcast_target = - ov::op::v0::Constant::create(ov::element::i64, {5}, std::vector{1, 1, 1, half, 2}); - auto bcast = - std::make_shared(cs_unsq, bcast_target, ov::op::BroadcastType::BIDIRECTIONAL); - auto flat = ov::op::v0::Constant::create(ov::element::i64, {4}, std::vector{0, 0, 0, head_size}); + auto cs_unsq = std::make_shared( + cs, ov::op::v0::Constant::create(ov::element::i64, {1}, {-1})); + auto bcast_target = ov::op::v0::Constant::create( + ov::element::i64, {5}, std::vector{1, 1, 1, half, 2}); + auto bcast = std::make_shared( + cs_unsq, bcast_target, ov::op::BroadcastType::BIDIRECTIONAL); + auto flat = ov::op::v0::Constant::create(ov::element::i64, {4}, std::vector{0, 0, 0, n_dims}); return std::make_shared(bcast, flat, true); }; Output cos_full = expand_cos_sin(cos_theta_node); Output sin_full = expand_cos_sin(sin_theta_node); - auto y1 = std::make_shared(data_node, cos_full); + auto y1 = std::make_shared(rot_data, cos_full); auto y2 = std::make_shared(x_rotated, sin_full); - res = std::make_shared(y1, y2); + auto rotated = std::make_shared(y1, y2); + + if (n_dims < head_dim) { + auto tail_start = ov::op::v0::Constant::create(ov::element::i64, {1}, {n_dims}); + auto tail_end = ov::op::v0::Constant::create(ov::element::i64, {1}, {head_dim}); + auto tail = std::make_shared(data_node, tail_start, tail_end, step_one, axis_last); + res = std::make_shared(ov::OutputVector{rotated, tail}, -1); + } else { + res = rotated; + } } // PRESERVED PREVIOUS TRANSLATION - Re-enable this branch (and remove the Flux branch above) once // the GPU plugin's RoPE fusion is updated to recognize the even/odd Slice form; @@ -196,8 +219,15 @@ OutputVector translate_rope(const NodeContext & context) { // ov::element::i64, {4}, std::vector{1, -1, (int64_t) output_shape[2], (int64_t) 
output_shape[3]}); // res = std::make_shared(stack, data_shape, false); else if (mode == TYPE_NEOX) { - auto data_split = std::make_shared( - data_node, ov::op::v0::Constant::create(ov::element::i64, ov::Shape{}, {-1}), 2); + auto axis_last = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{}, {-1}); + std::vector split_lengths = {n_dims / 2, n_dims / 2}; + if (n_dims < head_dim) { + split_lengths.push_back(head_dim - n_dims); + } + + auto data_split = std::make_shared( + data_node, axis_last, + ov::op::v0::Constant::create(ov::element::i64, {split_lengths.size()}, split_lengths)); Output slice_data_node_0 = data_split->outputs()[0]; Output slice_data_node_1 = data_split->outputs()[1]; @@ -209,16 +239,27 @@ OutputVector translate_rope(const NodeContext & context) { std::make_shared(slice_data_node_0, sin_theta_node), std::make_shared(slice_data_node_1, cos_theta_node)); - res = std::make_shared(ov::OutputVector{first_half_node, second_half_node}, -1); + if (n_dims < head_dim) { + Output tail = data_split->outputs()[2]; + res = std::make_shared(ov::OutputVector{first_half_node, second_half_node, tail}, -1); + } else { + res = std::make_shared(ov::OutputVector{first_half_node, second_half_node}, -1); + } } else if (mode == TYPE_IMROPE) { - int64_t n_dims = data_node->get_output_partial_shape(0)[3].get_length(); auto cos_sin_shape = std::make_shared(ov::element::i64, ov::Shape{4}, std::vector{1, -1, 1, (n_dims >> 1)}); auto cos_reshaped = std::make_shared(cos_theta_node, cos_sin_shape, true); auto sin_reshaped = std::make_shared(sin_theta_node, cos_sin_shape, true); auto split_axis = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{}, {3}); - auto split_a = std::make_shared(data_node, split_axis, 2); + std::vector split_lengths = {n_dims / 2, n_dims / 2}; + if (n_dims < head_dim) { + split_lengths.push_back(head_dim - n_dims); + } + + auto split_a = std::make_shared( + data_node, split_axis, + ov::op::v0::Constant::create(ov::element::i64, 
{split_lengths.size()}, split_lengths)); auto x0 = split_a->output(0); auto x1 = split_a->output(1); auto mul_a = std::make_shared(x0, cos_reshaped); @@ -229,7 +270,12 @@ OutputVector translate_rope(const NodeContext & context) { auto mul_d = std::make_shared(x1, cos_reshaped); auto add = std::make_shared(mul_c, mul_d); - res = std::make_shared(ov::OutputVector{sub, add}, 3); + if (n_dims < head_dim) { + auto tail = split_a->output(2); + res = std::make_shared(ov::OutputVector{sub, add, tail}, 3); + } else { + res = std::make_shared(ov::OutputVector{sub, add}, 3); + } } if (res.get_element_type() != output_type) { diff --git a/ggml/src/ggml-openvino/openvino/op/scale.cpp b/ggml/src/ggml-openvino/openvino/op/scale.cpp index 0f3d800c1990..1d5ef4ffa4ac 100644 --- a/ggml/src/ggml-openvino/openvino/op/scale.cpp +++ b/ggml/src/ggml-openvino/openvino/op/scale.cpp @@ -2,9 +2,24 @@ #include "../op_table.h" #include "../utils.h" +#include #include +#include #include +#include +#include +#include +#include +#include +#include +#include #include +#include +#include +#include +#include +#include +#include #include namespace ov { @@ -21,6 +36,36 @@ OutputVector translate_scale(const NodeContext & context) { memcpy(&bias, (float *) context.get_output_op_params() + 1, sizeof(float)); auto scale_node = std::make_shared(ov::element::f32, ov::Shape{}, std::vector{scale}); + + if (context.get_op_case() == 1 && context.has_input("cache_rs_reset_len")) { + auto cache_rs_reset_idx = context.get_input("cache_rs_reset_idx"); + auto cache_rs_reset_len = context.get_input("cache_rs_reset_len"); + + auto cache_rs = context.get_input(0); + + auto cache_shape = std::make_shared(cache_rs, ov::element::i64); + auto n_slots_1d = std::make_shared( + cache_shape, ov::op::v0::Constant::create(ov::element::i64, ov::Shape{1}, {2}), + ov::op::v0::Constant::create(ov::element::i64, ov::Shape{}, {0})); + auto n_slots = std::make_shared(n_slots_1d); + + auto iota = std::make_shared( + 
ov::op::v0::Constant::create(ov::element::i64, ov::Shape{}, {0}), n_slots, + ov::op::v0::Constant::create(ov::element::i64, ov::Shape{}, {1}), ov::element::i64); + + auto idx_plus_len = std::make_shared(cache_rs_reset_idx, cache_rs_reset_len); + auto less_than_idx = std::make_shared(iota, cache_rs_reset_idx); + auto greater_equal_idx_plus_len = std::make_shared(iota, idx_plus_len); + auto keep_mask = std::make_shared(less_than_idx, greater_equal_idx_plus_len); + + auto keep_mask_f32 = std::make_shared(keep_mask, ov::element::f32); + auto keep_mask_reshape = std::make_shared( + keep_mask_f32, ov::op::v0::Constant::create(ov::element::i64, ov::Shape{1}, {1})); + + auto cleared_cache_rs = std::make_shared(cache_rs, keep_mask_reshape); + return rename_outputs_with_suffix({cleared_cache_rs}, context.get_name()); + } + auto scaled = std::make_shared(context.get_input(0), scale_node); std::shared_ptr res; diff --git a/ggml/src/ggml-openvino/openvino/op/set.cpp b/ggml/src/ggml-openvino/openvino/op/set.cpp new file mode 100644 index 000000000000..50199916ca21 --- /dev/null +++ b/ggml/src/ggml-openvino/openvino/op/set.cpp @@ -0,0 +1,83 @@ +#include "../node_context.h" +#include "../op_table.h" +#include "../utils.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace ov { +namespace frontend { +namespace ggml { +namespace op { + +// GGML SET writes src1 into a view of src0 and returns the updated tensor. +// This translation supports the contiguous destination-layout form used in llama.cpp. 
+OutputVector translate_set(const NodeContext & context) { + num_inputs_check(context, 2, 2); + + auto dst = process_view_input_new(context, 0); + auto src = process_view_input_new(context, 1); + + src = std::make_shared(src, context.get_output_type()); + + const auto dst_stride = context.get_input_stride(0); + FRONT_END_OP_CONVERSION_CHECK(dst_stride.size() >= 4, "SET requires 4D destination strides"); + + const auto * op_params = context.get_output_op_params(); + const size_t nb1 = static_cast(op_params[0]); + const size_t nb2 = static_cast(op_params[1]); + const size_t nb3 = static_cast(op_params[2]); + const size_t offset = static_cast(op_params[3]); + + FRONT_END_OP_CONVERSION_CHECK(nb1 == dst_stride[1] && nb2 == dst_stride[2] && nb3 == dst_stride[3], + "SET requires destination strides that match src0 layout"); + + const size_t elem_size = dst_stride[0]; + FRONT_END_OP_CONVERSION_CHECK(elem_size != 0 && offset % elem_size == 0, + "SET offset must be aligned to destination element size"); + + const int64_t offset_elems = static_cast(offset / elem_size); + + auto dst_flat = std::make_shared( + dst, + ov::op::v0::Constant::create(ov::element::i64, {1}, {-1}), + false); + + auto src_flat = std::make_shared( + src, + ov::op::v0::Constant::create(ov::element::i64, {1}, {-1}), + false); + + auto src_shape = std::make_shared(src_flat, ov::element::i64); + auto src_len = std::make_shared( + src_shape, + ov::op::v0::Constant::create(ov::element::i64, {1}, {0}), + false); + + auto start = ov::op::v0::Constant::create(ov::element::i64, {}, {offset_elems}); + auto stop = std::make_shared(start, src_len); + auto step = ov::op::v0::Constant::create(ov::element::i64, {}, {1}); + + auto indices = std::make_shared(start, stop, step, ov::element::i64); + auto axis = ov::op::v0::Constant::create(ov::element::i64, {}, {0}); + + auto updated_flat = std::make_shared(dst_flat, indices, src_flat, axis); + + auto dst_shape = std::make_shared(dst, ov::element::i64); + auto res = 
std::make_shared(updated_flat, dst_shape, false); + + return rename_outputs_with_suffix({res}, context.get_name()); +} + +} // namespace op +} // namespace ggml +} // namespace frontend +} // namespace ov diff --git a/ggml/src/ggml-openvino/openvino/op/solve_tri.cpp b/ggml/src/ggml-openvino/openvino/op/solve_tri.cpp new file mode 100644 index 000000000000..840233f85440 --- /dev/null +++ b/ggml/src/ggml-openvino/openvino/op/solve_tri.cpp @@ -0,0 +1,108 @@ +#include "../node_context.h" +#include "../op_table.h" +#include "../utils.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace ov { +namespace frontend { +namespace ggml { +namespace op { + +// GGML SOLVE_TRI: solve Ax = B for lower-triangular A via forward substitution. +// Currently only lower, right, non-unitriangular variant is implemented. +// +// ggml layout: A [n, n, B1, B2], B [k, n, B1, B2] → X [k, n, B1, B2] +// OV layout: A [B2, B1, n, n], B [B2, B1, n, k] → X [B2, B1, n, k] +// +// Forward substitution row i: +// x[i] = (b[i] - sum_{t(A_shape[2]); + + // Initial X: zeros with shape of B + auto B_shape_node = std::make_shared(B, ov::element::i64); + auto zero_f32 = ov::op::v0::Constant::create(ov::element::f32, {}, {0.0f}); + auto X_init = std::make_shared(zero_f32, B_shape_node); + + // --- Loop body parameters --- + // body_iter: iteration counter injected by the Loop op (i64, shape {1}) + auto body_iter = std::make_shared(ov::element::i64, ov::Shape{1}); + auto body_X = std::make_shared(ov::element::f32, ov::PartialShape::dynamic(4)); + auto body_A = std::make_shared(ov::element::f32, ov::PartialShape::dynamic(4)); + auto body_B_p = std::make_shared(ov::element::f32, ov::PartialShape::dynamic(4)); + + auto c_axis2 = ov::op::v0::Constant::create(ov::element::i64, {1}, {int64_t(2)}); + auto c_axis3 = ov::op::v0::Constant::create(ov::element::i64, {1}, {int64_t(3)}); + auto c_axis2_scalar = ov::op::v0::Constant::create(ov::element::i64, {}, 
{int64_t(2)}); + + // b_i = B[..., i, :] [B2, B1, 1, k] + auto b_i = std::make_shared(body_B_p, body_iter, c_axis2); + + // A_row_i = A[..., i, :] [B2, B1, 1, n] + auto A_row_i = std::make_shared(body_A, body_iter, c_axis2); + + // sum_i = A_row_i @ X [B2, B1, 1, k] + // (lower-tri zeros + unfilled-X zeros make this equal to the partial sum) + auto sum_i = std::make_shared(A_row_i, body_X, false, false); + + // diag_i = A[..., i, i] [B2, B1, 1, 1] + auto diag_i = std::make_shared(A_row_i, body_iter, c_axis3); + + // x_i = (b_i - sum_i) / diag_i [B2, B1, 1, k] + auto x_i = std::make_shared( + std::make_shared(b_i, sum_i), diag_i); + + // X_updated: scatter x_i into body_X at row i along axis 2 + auto X_updated = std::make_shared(body_X, body_iter, x_i, c_axis2_scalar); + + auto body_cond = ov::op::v0::Constant::create(ov::element::boolean, ov::Shape{1}, {true}); + + auto body = std::make_shared( + ov::OutputVector{body_cond, X_updated}, + ov::ParameterVector{body_iter, body_X, body_A, body_B_p}); + + // --- Assemble Loop --- + auto trip_count = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{1}, std::vector{n}); + auto exec_cond = ov::op::v0::Constant::create(ov::element::boolean, ov::Shape{1}, {true}); + + auto loop = std::make_shared(trip_count, exec_cond); + loop->set_function(body); + // iter_counter_body_param_idx=0 (body_iter), exec_condition_body_result_idx=0 (body_cond) + loop->set_special_body_ports(ov::op::v5::Loop::SpecialBodyPorts{0, 0}); + + // Carried state: X feeds back from X_updated each iteration + loop->set_merged_input(body_X, X_init, X_updated); + // Invariant inputs passed through unchanged + loop->set_invariant_input(body_A, A); + loop->set_invariant_input(body_B_p, B); + + // Final output: value of X_updated after the last iteration + auto X_final = loop->get_iter_value(X_updated, -1); + + return rename_outputs_with_suffix({X_final}, context.get_name()); +} + +} // namespace op +} // namespace ggml +} // namespace frontend +} // 
namespace ov diff --git a/ggml/src/ggml-openvino/openvino/op/ssm_conv.cpp b/ggml/src/ggml-openvino/openvino/op/ssm_conv.cpp index 522308726a8d..352fd90560f2 100644 --- a/ggml/src/ggml-openvino/openvino/op/ssm_conv.cpp +++ b/ggml/src/ggml-openvino/openvino/op/ssm_conv.cpp @@ -5,7 +5,9 @@ #include #include #include +#include #include +#include namespace ov { namespace frontend { @@ -21,15 +23,15 @@ OutputVector translate_ssm_conv(const NodeContext & context) { auto sx_shape = context.get_input_shape(0).to_shape(); // [1, n_s, d_inner, ncs] auto c_shape = context.get_input_shape(1).to_shape(); // [1, 1, d_inner, d_conv] - int64_t n_s = sx_shape[1]; + // int64_t n_s = sx_shape[1]; int64_t d_inner = sx_shape[2]; - int64_t ncs = sx_shape[3]; // d_conv - 1 + n_t - int64_t d_conv = c_shape[3]; - int64_t n_t = ncs - d_conv + 1; + // int64_t ncs = sx_shape[3]; // d_conv - 1 + n_t + int64_t d_conv = c_shape[3]; + // int64_t n_t = ncs - d_conv + 1; // Reshape sx from [1, n_s, d_inner, ncs] to [n_s, d_inner, ncs] for 1D GroupConvolution - auto sx_new_shape = ov::op::v0::Constant::create(ov::element::i64, {3}, std::vector{n_s, d_inner, ncs}); - auto sx_reshaped = std::make_shared(sx, sx_new_shape, false); + auto sx_reshaped = + std::make_shared(sx, ov::op::v0::Constant::create(ov::element::i64, {1}, {0})); // Reshape c from [1, 1, d_inner, d_conv] to [d_inner, 1, 1, d_conv] // GroupConvolution filter: [groups, out_channels/groups, in_channels/groups, kernel_size] @@ -47,8 +49,8 @@ OutputVector translate_ssm_conv(const NodeContext & context) { auto transposed = std::make_shared(conv, perm); // Reshape to output shape [1, n_s, n_t, d_inner] - auto out_shape = ov::op::v0::Constant::create(ov::element::i64, {4}, std::vector{1, n_s, n_t, d_inner}); - auto res = std::make_shared(transposed, out_shape, false); + auto res = + std::make_shared(transposed, ov::op::v0::Constant::create(ov::element::i64, {1}, {0})); return rename_outputs_with_suffix({res}, context.get_name()); } diff --git 
a/ggml/src/ggml-openvino/openvino/op/tri.cpp b/ggml/src/ggml-openvino/openvino/op/tri.cpp new file mode 100644 index 000000000000..9b7774a383ec --- /dev/null +++ b/ggml/src/ggml-openvino/openvino/op/tri.cpp @@ -0,0 +1,82 @@ +#include "../node_context.h" +#include "../op_table.h" +#include "../utils.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace ov { +namespace frontend { +namespace ggml { +namespace op { + +// GGML TRI zeroes out elements outside a triangular region of a square matrix. +// The type param (stored in op_params[0]) maps to ggml_tri_type: +// 0 = UPPER_DIAG : keep where col >= row +// 1 = UPPER : keep where col > row +// 2 = LOWER_DIAG : keep where col <= row +// 3 = LOWER : keep where col < row +// +// In OV layout (ggml [ne0, ne1, ne2, ne3] → OV [ne3, ne2, ne1, ne0]): +// ggml dim 0 (ne0, cols) → OV axis 3 +// ggml dim 1 (ne1, rows) → OV axis 2 +// The matrix is square so ne0 == ne1. +OutputVector translate_tri(const NodeContext & context) { + num_inputs_check(context, 1, 1); + + auto x = context.get_input(0); // OV shape: [ne3, ne2, ne1, ne0] + + int32_t tri_type = context.get_output_op_params()[0]; + + auto shape = context.get_input_shape(0).to_shape(); + int64_t n = static_cast(shape[3]); // ne0 == ne1 + + // Build index range [0, 1, ..., n-1] + auto start = ov::op::v0::Constant::create(ov::element::i64, {}, {int64_t(0)}); + auto stop = ov::op::v0::Constant::create(ov::element::i64, {}, {n}); + auto step = ov::op::v0::Constant::create(ov::element::i64, {}, {int64_t(1)}); + auto range = std::make_shared(start, stop, step, ov::element::i64); + + // col_idx shape [1, 1, 1, n] — broadcasts over batch and row dims + auto col_shape = ov::op::v0::Constant::create(ov::element::i64, {4}, std::vector{1, 1, 1, n}); + auto col_idx = std::make_shared(range, col_shape, false); + + // row_idx shape [1, 1, n, 1] — broadcasts over batch and col dims + auto row_shape = ov::op::v0::Constant::create(ov::element::i64, 
{4}, std::vector{1, 1, n, 1}); + auto row_idx = std::make_shared(range, row_shape, false); + + // Build boolean mask: true where element should be kept + std::shared_ptr mask; + switch (tri_type) { + case 0: // UPPER_DIAG: col >= row + mask = std::make_shared(col_idx, row_idx); + break; + case 1: // UPPER: col > row + mask = std::make_shared(col_idx, row_idx); + break; + case 2: // LOWER_DIAG: col <= row + mask = std::make_shared(col_idx, row_idx); + break; + case 3: // LOWER: col < row + mask = std::make_shared(col_idx, row_idx); + break; + default: + throw std::runtime_error("translate_tri: invalid tri_type " + std::to_string(tri_type)); + } + + auto zero = ov::op::v0::Constant::create(ov::element::f32, {}, {0.0f}); + auto res = std::make_shared(mask, x, zero); + + return rename_outputs_with_suffix({res}, context.get_name()); +} + +} // namespace op +} // namespace ggml +} // namespace frontend +} // namespace ov diff --git a/ggml/src/ggml-openvino/openvino/op_table.cpp b/ggml/src/ggml-openvino/openvino/op_table.cpp index cca448a7cec1..1e9a0ede7e20 100644 --- a/ggml/src/ggml-openvino/openvino/op_table.cpp +++ b/ggml/src/ggml-openvino/openvino/op_table.cpp @@ -4,10 +4,12 @@ #include #include +#include #include #include #include #include +#include #include #include #include @@ -50,6 +52,9 @@ std::unordered_map get_supported_ops() { {"GGML_UNARY_OP_SILU", op::translate_unary_silu }, {"GGML_UNARY_OP_SOFTPLUS", op::translate_unary_softplus }, {"GGML_UNARY_OP_TANH", op::translate_1to1_match_1_input }, + {"GGML_UNARY_OP_SIGMOID", op::translate_1to1_match_1_input }, + {"GGML_UNARY_OP_EXP", op::translate_1to1_match_1_input }, + {"GGML_UNARY_OP_NEG", op::translate_1to1_match_1_input }, {"GGML_OP_VIEW", op::translate_view }, {"GGML_GLU_OP_SWIGLU", op::translate_glu_swiglu }, {"GGML_GLU_OP_SWIGLU_OAI", op::translate_glu_swiglu_oai }, @@ -62,6 +67,12 @@ std::unordered_map get_supported_ops() { {"GGML_OP_SSM_CONV", op::translate_ssm_conv }, {"GGML_OP_GATED_DELTA_NET", 
op::translate_gated_delta_net }, {"GGML_OP_REPEAT", op::translate_repeat }, + {"GGML_OP_CUMSUM", op::translate_cumsum }, + {"GGML_OP_FILL", op::translate_fill }, + {"GGML_OP_DIAG", op::translate_diag }, + {"GGML_OP_TRI", op::translate_tri }, + {"GGML_OP_SOLVE_TRI", op::translate_solve_tri }, + {"GGML_OP_SET", op::translate_set }, }; } diff --git a/ggml/src/ggml-openvino/openvino/op_table.h b/ggml/src/ggml-openvino/openvino/op_table.h index cd35e1429ec7..c184a4319927 100644 --- a/ggml/src/ggml-openvino/openvino/op_table.h +++ b/ggml/src/ggml-openvino/openvino/op_table.h @@ -46,6 +46,12 @@ GGML_OP_CONVERTER(translate_pad); GGML_OP_CONVERTER(translate_ssm_conv); GGML_OP_CONVERTER(translate_gated_delta_net); GGML_OP_CONVERTER(translate_repeat); +GGML_OP_CONVERTER(translate_cumsum); +GGML_OP_CONVERTER(translate_fill); +GGML_OP_CONVERTER(translate_set); +GGML_OP_CONVERTER(translate_diag); +GGML_OP_CONVERTER(translate_tri); +GGML_OP_CONVERTER(translate_solve_tri); } // namespace op diff --git a/ggml/src/ggml-openvino/openvino/translate_session.cpp b/ggml/src/ggml-openvino/openvino/translate_session.cpp index 4e981c5cff7d..1bf9135742e1 100644 --- a/ggml/src/ggml-openvino/openvino/translate_session.cpp +++ b/ggml/src/ggml-openvino/openvino/translate_session.cpp @@ -192,18 +192,17 @@ std::shared_ptr TranslateSession::translate_graph(const frontend::InputMo (*tensor_map)[it.first] = it.second; } - auto node_visitor = [&](std::shared_ptr decoder, int node_idx) { + auto translate_node = [&](const std::shared_ptr & decoder, int node_idx) { auto operation_type = decoder->get_op_type(node_idx); if (operation_type == "GGML_OP_NONE") { - return; + return ov::OutputVector{}; } - ov::OutputVector converted_outputs; auto it = m_translator_map.find(operation_type); FRONT_END_OP_CONVERSION_CHECK(it != m_translator_map.end(), "Translation for operation type ", operation_type, " is not implemented."); NodeContext node_context(decoder, tensor_map, node_idx, this); - converted_outputs = 
it->second(node_context); + ov::OutputVector converted_outputs = it->second(node_context); const auto & node_output_names = decoder->get_output_names(node_idx); FRONT_END_OP_CONVERSION_CHECK(node_output_names.size() == converted_outputs.size(), "Number of ", @@ -216,15 +215,48 @@ std::shared_ptr TranslateSession::translate_graph(const frontend::InputMo (*tensor_map)[output_name] = converted_outputs[i]; } } + return converted_outputs; + }; - const auto & node_output_aliases = decoder->get_output_aliases(node_idx); - for (const auto & output_alias : node_output_aliases) { - if (!converted_outputs.empty() && converted_outputs[0].get_node_shared_ptr() != nullptr) { - (*tensor_map)[output_alias] = converted_outputs[0]; + // To handle cases like this + // 3: [ 18432, 1, 1, 1] RESHAPE cache_r_l0 (reshaped)#3 + // [ 18432, 1, 1, 1] 0: NONE cache_r_l0 + // 4: [ 0, 1, 1, 1] VIEW cache_r_l0 (reshaped) (view)#4 + // [ 18432, 1, 1, 1] 0: RESHAPE cache_r_l0 (reshaped)#3 + // 5: [ 0, 1, 1, 1] SCALE cache_r_l0 (reshaped) (view) (view)#5 + // [ 0, 1, 1, 1] 0: VIEW cache_r_l0 (reshaped) (view)#4 + // 6: [ 1, 1, 1, 1] VIEW (view)#6 + // [ 1, 1, 1, 1] 0: NONE leaf_5 + // 7: [ 18432, 1, 1, 1] GET_ROWS conv_states-0#7 + // [ 18432, 1, 1, 1] 0: RESHAPE cache_r_l0 (reshaped)#3 + // [ 1, 1, 1, 1] 1: VIEW (view)#6 + // The scale is in-place which modifies cache_r_l0 (reshaped)#3 + // The translation of scale overwrites cache_r in the tensor_map, + // but we also need to overwrite the old cache_r_l0 (reshaped)#3 + auto refresh_inplace_aliases = [&](const std::shared_ptr & decoder, int inplace_node_idx, + const std::string & view_src_name) { + for (int node_idx = 0; node_idx < inplace_node_idx; node_idx++) { + if (decoder->is_view_like_alias_of(node_idx, view_src_name)) { + translate_node(decoder, node_idx); } } }; + auto node_visitor = [&](std::shared_ptr decoder, int node_idx) { + auto converted_outputs = translate_node(decoder, node_idx); + if (converted_outputs.empty()) { + return; + } + 
const auto inplace_src = decoder->get_inplace_op_src(node_idx); + if (inplace_src.empty()) { + return; + } + if (converted_outputs[0].get_node_shared_ptr() != nullptr) { + (*tensor_map)[inplace_src] = converted_outputs[0]; + } + refresh_inplace_aliases(decoder, node_idx, inplace_src); + }; + if (!m_naive) { preprocess(*tensor_map, *ggml_model_decoder); } @@ -296,21 +328,11 @@ std::shared_ptr TranslateSession::apply_transformations(std::shared_ptris_stateful()) { - auto output_names = ggml_model_decoder->get_model_output_names(); - std::map model_output_indexes; - for (size_t i = 0; i < output_names.size(); i++) { - model_output_indexes.insert(std::make_pair(output_names[i], i)); - } ov::preprocess::PrePostProcessor ppp(model); for (size_t i = 0; i < model->get_output_size(); i++) { - auto output_friendly_name = model->output(i).get_node_shared_ptr()->get_friendly_name(); - auto output_id = model_output_indexes[output_friendly_name]; auto model_output_shape = model->output(i).get_partial_shape(); - auto decoder_output_shape = ggml_model_decoder->get_output_shape(output_id); - if (model_output_shape.rank().is_static() && decoder_output_shape.rank().is_static() && - model_output_shape.rank().get_length() + 1 == decoder_output_shape.rank().get_length() && - decoder_output_shape[0].is_static() && decoder_output_shape[0].get_length() == 1) { - ppp.output(i).postprocess().custom([](const ov::Output & node) { + if (model_output_shape.rank().is_static() && model_output_shape.rank().get_length() == 3) { + ppp.output(i).postprocess().custom([](const ov::Output& node) { auto axes = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{1}, {0}); return std::make_shared(node, axes); }); diff --git a/ggml/src/ggml-openvino/openvino/utils.cpp b/ggml/src/ggml-openvino/openvino/utils.cpp index 4e4f5dd0492e..f8e65db46487 100644 --- a/ggml/src/ggml-openvino/openvino/utils.cpp +++ b/ggml/src/ggml-openvino/openvino/utils.cpp @@ -234,23 +234,30 @@ std::pair, ov::Output> 
make_sin_cos(int32_t * rope_params return std::make_pair(sin_theta, cos_theta); } -ov::Output process_view_input(const NodeContext & context, int input_index, int slice_len) { - // Only works for VIEW operations that slice at the lowest dimension - // If the VIEW also reshape the result, `slice_len` should be provided +ov::Output process_view_input(const NodeContext & context, int input_index, int slice_len, int axis) { + // Only works for VIEW operations that do a non-strided slice with optional reshape on the slice result. + // The function only does the slice part, the reshape (if any) should be handled by the caller. + // Default axis is -1, which means slicing the last dimension. + // If the VIEW reshapes the result, `slice_len` should be provided auto input = context.get_input(input_index); auto * op_params = (size_t *) context.get_input_op_params(input_index); - auto src1_stride = context.get_input_stride(input_index); + auto src_stride = context.get_input_stride(input_index); - int64_t split_addr = op_params[0] / src1_stride[3]; + int64_t slice_start = op_params[0] / src_stride[3]; if (slice_len == 0) { slice_len = context.get_input_shape(input_index)[3].get_length(); } - int64_t slice_end = split_addr + slice_len; + int64_t slice_end = slice_start + slice_len; - auto begin = ov::op::v0::Constant::create(ov::element::i64, {1}, {split_addr}); + auto begin = ov::op::v0::Constant::create(ov::element::i64, {1}, {slice_start}); auto end = ov::op::v0::Constant::create(ov::element::i64, {1}, {slice_end}); auto stride = ov::op::v0::Constant::create(ov::element::i64, {1}, {1}); - auto axes = ov::op::v0::Constant::create(ov::element::i64, {1}, {context.is_stateful() ? 2 : 3}); + ov::Output axes; + if (axis == -1) { + axes = ov::op::v0::Constant::create(ov::element::i64, {1}, {context.is_stateful() ? 
2 : 3}); + } else { + axes = ov::op::v0::Constant::create(ov::element::i64, {1}, {axis}); + } auto sliced = std::make_shared(input, begin, end, stride, axes); return sliced; } diff --git a/ggml/src/ggml-openvino/openvino/utils.h b/ggml/src/ggml-openvino/openvino/utils.h index 8dc3e8765e82..5d4c3538664a 100644 --- a/ggml/src/ggml-openvino/openvino/utils.h +++ b/ggml/src/ggml-openvino/openvino/utils.h @@ -62,7 +62,7 @@ std::pair, ov::Output> make_sin_cos(int32_t * rope_params bool imrope = false, bool stateful = false); -ov::Output process_view_input(const NodeContext & context, int input_index, int slice_len = 0); +ov::Output process_view_input(const NodeContext & context, int input_index, int slice_len = 0, int axis = -1); ov::Output process_view_input_new(const NodeContext & context, int input_index); diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp index 70af08bdf182..14d4ac7760c2 100644 --- a/ggml/src/ggml-openvino/utils.cpp +++ b/ggml/src/ggml-openvino/utils.cpp @@ -642,6 +642,13 @@ enum ggml_status ov_graph_compute_static(ggml_cgraph * cgraph, std::shared_ptrsrc. // Step 2 verifies that node inputs come from model nodes/weights/leafs; external sources imply split. bool is_model_splitted(ggml_cgraph * cgraph) { + // Backend op tests execute each node through ggml_graph_view(), which preserves the original + // graph use_counts while exposing only one node. Treat those single-node views as regular + // naive graphs so intermediate ops do not look like split-model fragments. + if (cgraph->n_nodes <= 1 && cgraph->n_leafs == 0) { + return false; + } + // check the nodes of the model are used by the following nodes, through compare the node's use count and the count of nodes that use it as input. If does not match, return true, else return false. for (int i = 0; i < cgraph->n_nodes; i++) { ggml_tensor * node = cgraph->nodes[i];