ravi9 · zhaixuejun1993 · Jun 23, 2026 · Jun 24, 2026
@@ -834,7 +834,7 @@ std::shared_ptr<ov::Node> GgmlOvDecoder::create_weight_node(ggml_tensor * tensor
     // GGML_LOG_DEBUG("%s: creating new weight node for %s\n", __func__, tensor->name);
     static const std::set<ggml_type> weight_types = {GGML_TYPE_F32,  GGML_TYPE_F16,  GGML_TYPE_BF16, GGML_TYPE_Q8_0,
                                                      GGML_TYPE_Q4_0, GGML_TYPE_Q4_1, GGML_TYPE_Q5_1, GGML_TYPE_Q4_K,
-                                                     GGML_TYPE_Q5_K, GGML_TYPE_Q6_K};
+                                                     GGML_TYPE_Q5_K, GGML_TYPE_Q6_K, GGML_TYPE_MXFP4};
     if (weight_types.find(tensor->type) == weight_types.end()) {
         throw std::runtime_error("Unexpected weight tensor type: " + std::string(tensor->name) + " with type " +
                                  ggml_type_name(tensor->type));

@@ -252,14 +252,24 @@ ggml_openvino_extracted_layout ggml_openvino_get_extracted_layout(const ggml_ten
         return layout;
     }
 
-    // Only handle 2D weight tensors
-    if (tensor->ne[2] != 1 || tensor->ne[3] != 1) {
+    // Most quantized weights use the existing 2D extraction path. MXFP4 also
+    // appears as 3D expert weights for MUL_MAT_ID, so allow that type through.
+    if (tensor->type != GGML_TYPE_MXFP4 && (tensor->ne[2] != 1 || tensor->ne[3] != 1)) {
         return layout;
     }
 
     int64_t n_elements = ggml_nelements(tensor);
     const size_t alignment = 64;  // Good for SIMD
 
+    if (tensor->type == GGML_TYPE_MXFP4 && (tensor->ne[2] > 1 || tensor->ne[3] > 1)) {
+        layout.weights_per_block = 32;
+        layout.is_symmetric = true;
+        layout.weights_size = ggml_nbytes(tensor);
+        layout.weights_offset = 0;
+        layout.total_size = layout.weights_size;
+        return layout;
+    }
+
     // Check if requantization is needed (NPU-specific)
     auto requant_type = ggml_openvino_get_requant_type(tensor, use_bias);
     if (requant_type.has_value()) {
@@ -334,6 +344,11 @@ ggml_openvino_extracted_layout ggml_openvino_get_extracted_layout(const ggml_ten
     layout.is_symmetric = false;
 
     switch (tensor->type) {
+    case GGML_TYPE_MXFP4:
+        layout.is_u4 = true;
+        layout.is_symmetric = true;
+        break;
+
     case GGML_TYPE_Q4_0:
         layout.is_u4 = true;
         layout.is_symmetric = true;
@@ -369,9 +384,9 @@ ggml_openvino_extracted_layout ggml_openvino_get_extracted_layout(const ggml_ten
     // Weights: U4 = n_elements/2 bytes, U8 = n_elements bytes
     layout.weights_size = layout.is_u4 ? (n_elements / 2) : n_elements;
 
-    // Scales: F16 per block
+    // Scales: F16 per block, except MXFP4 which stores one E8M0 byte per block.
     int64_t n_blocks = n_elements / layout.weights_per_block;
-    layout.scales_size = n_blocks * sizeof(uint16_t);  // F16 = 2 bytes
+    layout.scales_size = n_blocks * (tensor->type == GGML_TYPE_MXFP4 ? sizeof(uint8_t) : sizeof(uint16_t));
     // For symmetric quantization, no zp needed (weights stored as signed)
     if (layout.is_symmetric) {
         layout.zp_size = 0;

@@ -237,8 +237,9 @@ static void ggml_backend_openvino_buffer_set_tensor(ggml_backend_buffer_t buffer
     bool is_full_tensor_set = (offset == 0 && size == ggml_nbytes(tensor) && tensor->view_src == nullptr);
     // 2D tensor (typical weight shape)
     bool is_2d = (tensor->ne[2] == 1 && tensor->ne[3] == 1);
+    bool is_supported_weight_shape = is_2d || tensor->type == GGML_TYPE_MXFP4;
 
-    if (is_weight_buffer && is_full_tensor_set && is_2d) {
+    if (is_weight_buffer && is_full_tensor_set && is_supported_weight_shape) {
         try {
             auto result = process_weight_tensor(tensor, data, tensor->data);
             result.weight_node->set_friendly_name(tensor->name);
@@ -458,8 +459,9 @@ static size_t ggml_backend_openvino_buffer_type_get_alloc_size(ggml_backend_buff
                                                                const ggml_tensor * tensor) {
     GGML_UNUSED(buft);
 
-    // For quantized 2D tensors (weights), we need extra space for extracted data
-    if (ggml_is_quantized(tensor->type) && tensor->ne[2] == 1 && tensor->ne[3] == 1) {
+    // For quantized weight tensors, we need extra space for extracted data.
+    if (ggml_is_quantized(tensor->type) &&
+        ((tensor->ne[2] == 1 && tensor->ne[3] == 1) || tensor->type == GGML_TYPE_MXFP4)) {
         ggml_openvino_extracted_layout layout = ggml_openvino_get_extracted_layout(tensor);
         if (layout.total_size > 0) {
             // GGML_LOG_DEBUG("%s: tensor %s needs %zu bytes (original %zu, extracted: weights=%zu scales=%zu zp=%zu)\n",
@@ -901,17 +903,10 @@ static bool is_op_unsupported_case(const ggml_tensor * op) {
             return true;
         }
 
-        // Keep the MoE routing weights gather on CPU for GPU runs. Splitting
-        // only at the later SUM/CLAMP/DIV nodes still leaves this routing path
-        // numerically unstable for arctic-style MoE graphs.
-        if (strncmp(op->name, "ffn_moe_weights", sizeof("ffn_moe_weights") - 1) == 0) {
-            return true;
-        }
         break;
     }
     case GGML_OP_RESHAPE: {
-        if (strncmp(op->name, "ffn_moe_weights", sizeof("ffn_moe_weights") - 1) == 0 ||
-            strncmp(op->name, "ffn_norm_exps", sizeof("ffn_norm_exps") - 1) == 0) {
+        if (strncmp(op->name, "ffn_norm_exps", sizeof("ffn_norm_exps") - 1) == 0) {
             return true;
         }
         break;
@@ -958,49 +953,15 @@ static bool is_op_unsupported_case(const ggml_tensor * op) {
             return true;
         }
 
-        // qwen3next MoE weight normalization is numerically sensitive on the GPU
-        // path. Keep the normalization divide on CPU to match the reference.
-        if (strncmp(op->name, "ffn_moe_weights_norm", sizeof("ffn_moe_weights_norm") - 1) == 0) {
-            return true;
-        }
-        break;
-    }
-    case GGML_OP_SOFT_MAX: {
-        if (op->src[2] != nullptr) {
-            // GGML_LOG_WARN("OpenVINO backend does not support SOFT_MAX with sinks\n");
-            return true;
-        }
-
-        if (strncmp(op->name, "ffn_moe_probs", sizeof("ffn_moe_probs") - 1) == 0) {
-            return true;
-        }
-
-        // GPU execution of the MoE routing weights softmax is numerically unstable
-        // when fused with the surrounding GET_ROWS/reshape path. Keep this softmax
-        // on CPU so the scheduler splits at the same boundary that restores parity.
-        if (op->src[0] != nullptr && op->src[0]->op == GGML_OP_RESHAPE && op->src[0]->src[0] != nullptr &&
-            strncmp(op->src[0]->src[0]->name, "ffn_moe_weights", sizeof("ffn_moe_weights") - 1) == 0) {
-            return true;
-        }
         break;
     }
     case GGML_OP_SUM_ROWS: {
-        if (strncmp(op->name, "ffn_moe_weights_sum", sizeof("ffn_moe_weights_sum") - 1) == 0) {
-            return true;
-        }
-
         // if the input is PERMUTE skip
         if (op->src[0]->op == GGML_OP_PERMUTE) {
             return true;
         }
         break;
     }
-    case GGML_OP_CLAMP: {
-        if (strncmp(op->name, "ffn_moe_weights_sum_clamped", sizeof("ffn_moe_weights_sum_clamped") - 1) == 0) {
-            return true;
-        }
-        break;
-    }
     case GGML_OP_FLASH_ATTN_EXT: {
         float scale = 1.0f;
         float max_bias = 0.0f;
@@ -1056,12 +1017,6 @@ static bool is_op_unsupported_case(const ggml_tensor * op) {
         break;
     }
     case GGML_OP_MUL_MAT: {
-        if (ggml_openvino_get_device_name() == "GPU" && op->src[1]->op == GGML_OP_SOFT_MAX &&
-            op->src[0]->op == GGML_OP_CONT && op->src[0]->src[0] != nullptr &&
-            op->src[0]->src[0]->op == GGML_OP_TRANSPOSE && op->src[0]->src[0]->src[0] != nullptr &&
-            op->src[0]->src[0]->src[0]->op == GGML_OP_PERMUTE) {
-            return true;
-        }
         if (op->src[0]->ne[3] != op->src[1]->ne[3] && op->src[0]->ne[3] != 1 && op->src[1]->ne[3] != 1) {
             return true;
         }
@@ -1071,12 +1026,8 @@ static bool is_op_unsupported_case(const ggml_tensor * op) {
         break;
     }
     case GGML_OP_MUL_MAT_ID: {
-        if (strncmp(op->name, "ffn_moe_gate_up", sizeof("ffn_moe_gate_up") - 1) == 0 ||
-            strncmp(op->name, "ffn_moe_down", sizeof("ffn_moe_down") - 1) == 0) {
-            return true;
-        }
-
-        if (mul_mat_id_requires_large_tmp(op)) {
+        if (mul_mat_id_requires_large_tmp(op) &&
+            !(op->src[0] != nullptr && op->src[0]->type == GGML_TYPE_MXFP4)) {
             return true;
         }
         break;
@@ -1154,8 +1105,8 @@ static bool is_op_unsupported_case(const ggml_tensor * op) {
         return true;
     }
     case GGML_OP_VIEW: {
-        // Skip TOPK_MOE fused tests until it is fully supported
-        // the argsort_top_k VIEW wrapping ARGSORT is named "selected_experts" in test_topk_moe
+        // Skip TOPK_MOE fused tests until it is fully supported.
+        // The argsort_top_k VIEW wrapping ARGSORT is named "selected_experts" in test_topk_moe.
         if (strcmp(op->name, "selected_experts") == 0) {
             return true;
         }
@@ -1172,7 +1123,8 @@ static bool ggml_backend_openvino_device_supports_op(ggml_backend_dev_t dev, con
 
     static std::unordered_set<ggml_type> supported_types{
         GGML_TYPE_F32,  GGML_TYPE_F16,  GGML_TYPE_BF16, GGML_TYPE_I64,  GGML_TYPE_I32,  GGML_TYPE_Q4_0,
-        GGML_TYPE_Q4_1, GGML_TYPE_Q4_K, GGML_TYPE_Q5_1, GGML_TYPE_Q5_K, GGML_TYPE_Q8_0, GGML_TYPE_Q6_K};
+        GGML_TYPE_Q4_1, GGML_TYPE_Q4_K, GGML_TYPE_Q5_1, GGML_TYPE_Q5_K, GGML_TYPE_Q8_0, GGML_TYPE_Q6_K,
+        GGML_TYPE_MXFP4};
 
     // derive supported op sets from the op_table map, keys in
     // the map use the full macro name (e.g. "GGML_OP_ADD"), while
@@ -1270,7 +1222,9 @@ static bool ggml_backend_openvino_device_supports_op(ggml_backend_dev_t dev, con
             // GGML_LOG_WARN("OpenVINO backend does not support tensor type %s\n", ggml_type_name(src->type));
             return false;
         }
-        if (ggml_is_quantized(src->type) && src->ne[2] != 1) {
+        const bool is_supported_3d_mxfp4_moe = op->op == GGML_OP_MUL_MAT_ID && i == 0 &&
+                                               src->type == GGML_TYPE_MXFP4;
+        if (ggml_is_quantized(src->type) && src->ne[2] != 1 && !is_supported_3d_mxfp4_moe) {
             // GGML_LOG_WARN("OpenVINO backend does not support 3D quantized tensors\n");
             return false;
         }

@@ -18,7 +18,9 @@
 #include <openvino/core/shape.hpp>
 #include <openvino/core/type/element_type.hpp>
 #include <openvino/core/type/element_type_traits.hpp>
+#include <openvino/core/type/float4_e2m1.hpp>
 #include <openvino/core/type/float16.hpp>
+#include <openvino/core/type/float8_e8m0.hpp>
 #include <openvino/op/add.hpp>
 #include <openvino/op/constant.hpp>
 #include <openvino/op/convert.hpp>
@@ -44,6 +46,38 @@ void unpack_32_4(const uint8_t * data, uint8_t * dst) {
     }
 }
 
+static constexpr size_t MXFP4_BLOCK_SIZE = 32;  // 4-bit weights per MXFP4 block
+static constexpr size_t MXFP4_BLOCK_QS_SIZE = MXFP4_BLOCK_SIZE / 2;  // bytes of packed nibbles per block (two per byte)
+static constexpr size_t MXFP4_BLOCK_BYTES = sizeof(uint8_t) + MXFP4_BLOCK_QS_SIZE;  // leading scale byte + nibble bytes
+
+// Regroups one block's 16 nibble bytes: low nibbles of each input byte pair fill dst[0..7], high nibbles dst[8..15].
+static void pack_32_mxfp4_for_openvino(const uint8_t * data, uint8_t * dst) {
+    uint8_t * const upper_half = dst + MXFP4_BLOCK_SIZE / 4;
+    for (size_t pair = 0; pair < MXFP4_BLOCK_QS_SIZE / 2; ++pair) {
+        const uint8_t first = data[2 * pair];
+        const uint8_t second = data[2 * pair + 1];
+        dst[pair] = static_cast<uint8_t>((first & 0x0F) | ((second & 0x0F) << 4));
+        upper_half[pair] = static_cast<uint8_t>((first >> 4) | (second & 0xF0));
+    }
+}
+
+// Splits the MXFP4 blocks in tensor->data: nibble bytes are repacked into weights_arr, leading bytes go to scales_arr.
+void extract_mxfp4_data(const ggml_tensor * tensor, ov::Tensor & weights_arr, ov::Tensor & scales_arr) {
+    GGML_ASSERT(tensor->type == GGML_TYPE_MXFP4);
+    GGML_ASSERT(weights_arr.get_element_type() == ov::element::f4e2m1);
+    GGML_ASSERT(scales_arr.get_element_type() == ov::element::f8e8m0);
+    using scale_value_t = ov::element_type_traits<ov::element::f8e8m0>::value_type;
+
+    const uint8_t * const src_bytes = static_cast<const uint8_t *>(tensor->data);
+    uint8_t * const packed_out = static_cast<uint8_t *>(weights_arr.data());
+    scale_value_t * const scale_out = scales_arr.data<scale_value_t>();
+    ov::parallel_for(scales_arr.get_size(), [&](size_t block_idx) {  // one iteration per scale element
+        const uint8_t * const block = src_bytes + block_idx * MXFP4_BLOCK_BYTES;
+        pack_32_mxfp4_for_openvino(block + 1, packed_out + block_idx * MXFP4_BLOCK_QS_SIZE);
+        scale_out[block_idx] = ov::float8_e8m0::from_bits(block[0]);
+    });
+}
+
 // Extracts (weight, scales, zp) from Q4_0 tensors.
 // Data layout is: |16 bit scale|32 x 4bit weights|.
 // When zp_arr is empty (symmetric), weights are stored as signed i4 (value - 8).
@@ -617,6 +651,42 @@ ov::Output<ov::Node> make_int4_weights(ov::Tensor & weight,
     return std::make_shared<ov::op::v0::Convert>(result, ov::element::f32);
 }
 
+// Builds the MXFP4 dequantization subgraph: Convert(weights) * Convert(scales), reshaped back to weight's own shape.
+ov::Output<ov::Node> make_mxfp4_weights(ov::Tensor & weight, ov::Tensor & scales) {
+    const ov::Shape out_dims = weight.get_shape();
+    GGML_ASSERT(!out_dims.empty());
+    const size_t row_len = out_dims.back();
+    GGML_ASSERT(row_len % MXFP4_BLOCK_SIZE == 0);
+
+    // Split the innermost dimension into [blocks, MXFP4_BLOCK_SIZE]; scales get a size-1 last axis for broadcasting.
+    ov::Shape blocked_dims = out_dims;
+    blocked_dims.back() = row_len / MXFP4_BLOCK_SIZE;
+    ov::Shape scale_dims = blocked_dims;
+    blocked_dims.push_back(MXFP4_BLOCK_SIZE);
+    scale_dims.push_back(1);
+    scales.set_shape(scale_dims);
+
+    // The tensor is also stored in the node's rt_info; presumably this keeps the buffer behind the constant alive.
+    auto weight_const = std::make_shared<ov::op::v0::Constant>(ov::element::f4e2m1, blocked_dims,
+                                                               static_cast<uint8_t *>(weight.data()), nullptr);
+    weight_const->get_rt_info()["__gguf_tensor_holder"] = weight;
+    auto weight_f32 = std::make_shared<ov::op::v0::Convert>(weight_const, ov::element::f32);
+    auto scale_const = std::make_shared<ov::op::v0::Constant>(scales);
+    auto scale_f32 = std::make_shared<ov::op::v0::Convert>(scale_const, ov::element::f32);
+    ov::Output<ov::Node> scaled =
+        std::make_shared<ov::op::v1::Multiply>(weight_f32, scale_f32, ov::op::AutoBroadcastType::NUMPY);
+    auto target_const = std::make_shared<ov::op::v0::Constant>(ov::element::i64, ov::Shape{out_dims.size()}, out_dims);
+    return std::make_shared<ov::op::v1::Reshape>(scaled, target_const, false);
+}
+
+ov::Output<ov::Node> make_mxfp4_moe_packed_weights(ov::Tensor & weight) {  // tensor bytes -> tagged u8 Constant
+    auto * const raw_bytes = static_cast<uint8_t *>(weight.data());
+    auto packed = std::make_shared<ov::op::v0::Constant>(ov::element::u8, weight.get_shape(), raw_bytes, nullptr);
+    packed->get_rt_info()["__gguf_tensor_holder"] = weight;
+    packed->get_rt_info()["__ggml_openvino_mxfp4_moe_packed"] = true;  // marker flag stored on the node's rt_info
+    return packed;
+}
+
 // Extract quantized weights from tensor and create weight subgraph
 std::shared_ptr<ov::Node> extract_quantized_weights(const ggml_tensor * tensor,
                                                     const void * data,
@@ -628,6 +698,13 @@ std::shared_ptr<ov::Node> extract_quantized_weights(const ggml_tensor * tensor,
     ggml_tensor temp_tensor = *tensor;
     temp_tensor.data = const_cast<void *>(data);
 
+    if (tensor->type == GGML_TYPE_MXFP4) {
+        extract_mxfp4_data(&temp_tensor, weights, scales);
+        auto result = make_mxfp4_weights(weights, scales).get_node_shared_ptr();
+        result->set_friendly_name(tensor->name);
+        return result;
+    }
+
     // Determine block size based on tensor type
     int64_t weights_per_block;
     bool is_u4;
@@ -788,6 +865,27 @@ OvWeight process_weight_tensor(const ggml_tensor * tensor, const void * data, vo
         OPENVINO_THROW("Unsupported quantized type: ", ggml_type_name(tensor->type));
     }
 
+    const bool is_3d_mxfp4_moe = tensor->type == GGML_TYPE_MXFP4 && (tensor->ne[2] > 1 || tensor->ne[3] > 1);
+    if (is_3d_mxfp4_moe) {
+        ov::Shape packed_shape = {static_cast<size_t>(tensor->ne[3]),
+                                  static_cast<size_t>(tensor->ne[2]),
+                                  static_cast<size_t>(tensor->ne[1]),
+                                  static_cast<size_t>(tensor->ne[0] / MXFP4_BLOCK_SIZE),
+                                  MXFP4_BLOCK_BYTES};
+        const size_t tensor_bytes = ggml_nbytes(tensor);
+        if (output_base_ptr) {
+            auto * buf_base = static_cast<uint8_t *>(output_base_ptr);
+            memcpy(buf_base + layout.weights_offset, data, tensor_bytes);
+            result.weights = ov::Tensor(ov::element::u8, packed_shape, buf_base + layout.weights_offset);
+        } else {
+            result.weights = ov::Tensor(ov::element::u8, packed_shape);
+            memcpy(result.weights.data(), data, tensor_bytes);
+        }
+        result.weight_node = make_mxfp4_moe_packed_weights(result.weights).get_node_shared_ptr();
+        result.weight_node->set_friendly_name(tensor->name);
+        return result;
+    }
+
     if (use_bias) {
         OPENVINO_ASSERT(!layout.is_requant,
                         "use_bias is only used for test-backend-ops, which should not have requantization");
@@ -812,22 +910,40 @@ OvWeight process_weight_tensor(const ggml_tensor * tensor, const void * data, vo
     // Quantized path (normal extraction or quantized requant)
     // Create weight/scale/zp tensors - shared between both paths
     // For symmetric quantization, use signed types (i4/i8) and no ZP tensor
-    ov::element::Type weight_type = layout.is_symmetric ? (layout.is_u4 ? ov::element::i4 : ov::element::i8) :
-                                                          (layout.is_u4 ? ov::element::u4 : ov::element::u8);
+    ov::element::Type weight_type = tensor->type == GGML_TYPE_MXFP4 ?
+                                        ov::element::f4e2m1 :
+                                        (layout.is_symmetric ? (layout.is_u4 ? ov::element::i4 : ov::element::i8) :
+                                                               (layout.is_u4 ? ov::element::u4 : ov::element::u8));
     ov::Shape scale_shape = {node_shape[0], node_shape[1] / layout.weights_per_block};
 
+    if (tensor->type == GGML_TYPE_MXFP4) {
+        if (tensor->ne[2] == 1 && tensor->ne[3] == 1) {
+            node_shape = {static_cast<size_t>(tensor->ne[1]), static_cast<size_t>(tensor->ne[0])};
+        } else {
+            node_shape.clear();
+            for (int i = GGML_MAX_DIMS - 1; i >= 0; --i) {
+                node_shape.push_back(static_cast<size_t>(tensor->ne[i]));
+            }
+        }
+
+        scale_shape = node_shape;
+        scale_shape.back() /= layout.weights_per_block;
+    }
+
     if (output_base_ptr) {
         uint8_t * buf_base = static_cast<uint8_t *>(output_base_ptr);
         result.weights = ov::Tensor(weight_type, node_shape, buf_base + layout.weights_offset);
-        result.scales = ov::Tensor(ov::element::f16, scale_shape, buf_base + layout.scales_offset);
+        const ov::element::Type scale_type = tensor->type == GGML_TYPE_MXFP4 ? ov::element::f8e8m0 : ov::element::f16;
+        result.scales = ov::Tensor(scale_type, scale_shape, buf_base + layout.scales_offset);
         if (!layout.is_symmetric) {
             ov::element::Type zp_type = layout.is_u4 ? ov::element::u4 : ov::element::u8;
             result.zp = ov::Tensor(zp_type, scale_shape, buf_base + layout.zp_offset);
         }
         // else: result.zp remains default-constructed (empty) for symmetric
     } else {
         result.weights = ov::Tensor(weight_type, node_shape);
-        result.scales = ov::Tensor(ov::element::f16, scale_shape);
+        const ov::element::Type scale_type = tensor->type == GGML_TYPE_MXFP4 ? ov::element::f8e8m0 : ov::element::f16;
+        result.scales = ov::Tensor(scale_type, scale_shape);
         if (!layout.is_symmetric) {
             if (use_bias) {
                 result.zp = ov::Tensor(ov::element::f16, scale_shape);