Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion ggml/src/ggml-openvino/ggml-decoder.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -834,7 +834,7 @@ std::shared_ptr<ov::Node> GgmlOvDecoder::create_weight_node(ggml_tensor * tensor
// GGML_LOG_DEBUG("%s: creating new weight node for %s\n", __func__, tensor->name);
static const std::set<ggml_type> weight_types = {GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_BF16, GGML_TYPE_Q8_0,
GGML_TYPE_Q4_0, GGML_TYPE_Q4_1, GGML_TYPE_Q5_1, GGML_TYPE_Q4_K,
GGML_TYPE_Q5_K, GGML_TYPE_Q6_K};
GGML_TYPE_Q5_K, GGML_TYPE_Q6_K, GGML_TYPE_MXFP4};
if (weight_types.find(tensor->type) == weight_types.end()) {
throw std::runtime_error("Unexpected weight tensor type: " + std::string(tensor->name) + " with type " +
ggml_type_name(tensor->type));
Expand Down
23 changes: 19 additions & 4 deletions ggml/src/ggml-openvino/ggml-openvino-extra.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -252,14 +252,24 @@ ggml_openvino_extracted_layout ggml_openvino_get_extracted_layout(const ggml_ten
return layout;
}

// Only handle 2D weight tensors
if (tensor->ne[2] != 1 || tensor->ne[3] != 1) {
// Most quantized weights use the existing 2D extraction path. MXFP4 also
// appears as 3D expert weights for MUL_MAT_ID, so allow that type through.
if (tensor->type != GGML_TYPE_MXFP4 && (tensor->ne[2] != 1 || tensor->ne[3] != 1)) {
return layout;
}

int64_t n_elements = ggml_nelements(tensor);
const size_t alignment = 64; // Good for SIMD

if (tensor->type == GGML_TYPE_MXFP4 && (tensor->ne[2] > 1 || tensor->ne[3] > 1)) {
layout.weights_per_block = 32;
layout.is_symmetric = true;
layout.weights_size = ggml_nbytes(tensor);
layout.weights_offset = 0;
layout.total_size = layout.weights_size;
return layout;
}

// Check if requantization is needed (NPU-specific)
auto requant_type = ggml_openvino_get_requant_type(tensor, use_bias);
if (requant_type.has_value()) {
Expand Down Expand Up @@ -334,6 +344,11 @@ ggml_openvino_extracted_layout ggml_openvino_get_extracted_layout(const ggml_ten
layout.is_symmetric = false;

switch (tensor->type) {
case GGML_TYPE_MXFP4:
layout.is_u4 = true;
layout.is_symmetric = true;
break;

case GGML_TYPE_Q4_0:
layout.is_u4 = true;
layout.is_symmetric = true;
Expand Down Expand Up @@ -369,9 +384,9 @@ ggml_openvino_extracted_layout ggml_openvino_get_extracted_layout(const ggml_ten
// Weights: U4 = n_elements/2 bytes, U8 = n_elements bytes
layout.weights_size = layout.is_u4 ? (n_elements / 2) : n_elements;

// Scales: F16 per block
// Scales: F16 per block, except MXFP4 which stores one E8M0 byte per block.
int64_t n_blocks = n_elements / layout.weights_per_block;
layout.scales_size = n_blocks * sizeof(uint16_t); // F16 = 2 bytes
layout.scales_size = n_blocks * (tensor->type == GGML_TYPE_MXFP4 ? sizeof(uint8_t) : sizeof(uint16_t));
// For symmetric quantization, no zp needed (weights stored as signed)
if (layout.is_symmetric) {
layout.zp_size = 0;
Expand Down
76 changes: 15 additions & 61 deletions ggml/src/ggml-openvino/ggml-openvino.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -237,8 +237,9 @@ static void ggml_backend_openvino_buffer_set_tensor(ggml_backend_buffer_t buffer
bool is_full_tensor_set = (offset == 0 && size == ggml_nbytes(tensor) && tensor->view_src == nullptr);
// 2D tensor (typical weight shape)
bool is_2d = (tensor->ne[2] == 1 && tensor->ne[3] == 1);
bool is_supported_weight_shape = is_2d || tensor->type == GGML_TYPE_MXFP4;

if (is_weight_buffer && is_full_tensor_set && is_2d) {
if (is_weight_buffer && is_full_tensor_set && is_supported_weight_shape) {
try {
auto result = process_weight_tensor(tensor, data, tensor->data);
result.weight_node->set_friendly_name(tensor->name);
Expand Down Expand Up @@ -458,8 +459,9 @@ static size_t ggml_backend_openvino_buffer_type_get_alloc_size(ggml_backend_buff
const ggml_tensor * tensor) {
GGML_UNUSED(buft);

// For quantized 2D tensors (weights), we need extra space for extracted data
if (ggml_is_quantized(tensor->type) && tensor->ne[2] == 1 && tensor->ne[3] == 1) {
// For quantized weight tensors, we need extra space for extracted data.
if (ggml_is_quantized(tensor->type) &&
((tensor->ne[2] == 1 && tensor->ne[3] == 1) || tensor->type == GGML_TYPE_MXFP4)) {
ggml_openvino_extracted_layout layout = ggml_openvino_get_extracted_layout(tensor);
if (layout.total_size > 0) {
// GGML_LOG_DEBUG("%s: tensor %s needs %zu bytes (original %zu, extracted: weights=%zu scales=%zu zp=%zu)\n",
Expand Down Expand Up @@ -901,17 +903,10 @@ static bool is_op_unsupported_case(const ggml_tensor * op) {
return true;
}

// Keep the MoE routing weights gather on CPU for GPU runs. Splitting
// only at the later SUM/CLAMP/DIV nodes still leaves this routing path
// numerically unstable for arctic-style MoE graphs.
if (strncmp(op->name, "ffn_moe_weights", sizeof("ffn_moe_weights") - 1) == 0) {
return true;
}
break;
}
case GGML_OP_RESHAPE: {
if (strncmp(op->name, "ffn_moe_weights", sizeof("ffn_moe_weights") - 1) == 0 ||
strncmp(op->name, "ffn_norm_exps", sizeof("ffn_norm_exps") - 1) == 0) {
if (strncmp(op->name, "ffn_norm_exps", sizeof("ffn_norm_exps") - 1) == 0) {
return true;
}
break;
Expand Down Expand Up @@ -958,49 +953,15 @@ static bool is_op_unsupported_case(const ggml_tensor * op) {
return true;
}

// qwen3next MoE weight normalization is numerically sensitive on the GPU
// path. Keep the normalization divide on CPU to match the reference.
if (strncmp(op->name, "ffn_moe_weights_norm", sizeof("ffn_moe_weights_norm") - 1) == 0) {
return true;
}
break;
}
case GGML_OP_SOFT_MAX: {
if (op->src[2] != nullptr) {
// GGML_LOG_WARN("OpenVINO backend does not support SOFT_MAX with sinks\n");
return true;
}

if (strncmp(op->name, "ffn_moe_probs", sizeof("ffn_moe_probs") - 1) == 0) {
return true;
}

// GPU execution of the MoE routing weights softmax is numerically unstable
// when fused with the surrounding GET_ROWS/reshape path. Keep this softmax
// on CPU so the scheduler splits at the same boundary that restores parity.
if (op->src[0] != nullptr && op->src[0]->op == GGML_OP_RESHAPE && op->src[0]->src[0] != nullptr &&
strncmp(op->src[0]->src[0]->name, "ffn_moe_weights", sizeof("ffn_moe_weights") - 1) == 0) {
return true;
}
break;
}
case GGML_OP_SUM_ROWS: {
if (strncmp(op->name, "ffn_moe_weights_sum", sizeof("ffn_moe_weights_sum") - 1) == 0) {
return true;
}

// if the input is PERMUTE skip
if (op->src[0]->op == GGML_OP_PERMUTE) {
return true;
}
break;
}
case GGML_OP_CLAMP: {
if (strncmp(op->name, "ffn_moe_weights_sum_clamped", sizeof("ffn_moe_weights_sum_clamped") - 1) == 0) {
return true;
}
break;
}
case GGML_OP_FLASH_ATTN_EXT: {
float scale = 1.0f;
float max_bias = 0.0f;
Expand Down Expand Up @@ -1056,12 +1017,6 @@ static bool is_op_unsupported_case(const ggml_tensor * op) {
break;
}
case GGML_OP_MUL_MAT: {
if (ggml_openvino_get_device_name() == "GPU" && op->src[1]->op == GGML_OP_SOFT_MAX &&
op->src[0]->op == GGML_OP_CONT && op->src[0]->src[0] != nullptr &&
op->src[0]->src[0]->op == GGML_OP_TRANSPOSE && op->src[0]->src[0]->src[0] != nullptr &&
op->src[0]->src[0]->src[0]->op == GGML_OP_PERMUTE) {
return true;
}
if (op->src[0]->ne[3] != op->src[1]->ne[3] && op->src[0]->ne[3] != 1 && op->src[1]->ne[3] != 1) {
return true;
}
Expand All @@ -1071,12 +1026,8 @@ static bool is_op_unsupported_case(const ggml_tensor * op) {
break;
}
case GGML_OP_MUL_MAT_ID: {
if (strncmp(op->name, "ffn_moe_gate_up", sizeof("ffn_moe_gate_up") - 1) == 0 ||
strncmp(op->name, "ffn_moe_down", sizeof("ffn_moe_down") - 1) == 0) {
return true;
}

if (mul_mat_id_requires_large_tmp(op)) {
if (mul_mat_id_requires_large_tmp(op) &&
!(op->src[0] != nullptr && op->src[0]->type == GGML_TYPE_MXFP4)) {
return true;
}
break;
Expand Down Expand Up @@ -1154,8 +1105,8 @@ static bool is_op_unsupported_case(const ggml_tensor * op) {
return true;
}
case GGML_OP_VIEW: {
// Skip TOPK_MOE fused tests until it is fully supported
// the argsort_top_k VIEW wrapping ARGSORT is named "selected_experts" in test_topk_moe
// Skip TOPK_MOE fused tests until it is fully supported.
// The argsort_top_k VIEW wrapping ARGSORT is named "selected_experts" in test_topk_moe.
if (strcmp(op->name, "selected_experts") == 0) {
return true;
}
Expand All @@ -1172,7 +1123,8 @@ static bool ggml_backend_openvino_device_supports_op(ggml_backend_dev_t dev, con

static std::unordered_set<ggml_type> supported_types{
GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_BF16, GGML_TYPE_I64, GGML_TYPE_I32, GGML_TYPE_Q4_0,
GGML_TYPE_Q4_1, GGML_TYPE_Q4_K, GGML_TYPE_Q5_1, GGML_TYPE_Q5_K, GGML_TYPE_Q8_0, GGML_TYPE_Q6_K};
GGML_TYPE_Q4_1, GGML_TYPE_Q4_K, GGML_TYPE_Q5_1, GGML_TYPE_Q5_K, GGML_TYPE_Q8_0, GGML_TYPE_Q6_K,
GGML_TYPE_MXFP4};

// derive supported op sets from the op_table map, keys in
// the map use the full macro name (e.g. "GGML_OP_ADD"), while
Expand Down Expand Up @@ -1270,7 +1222,9 @@ static bool ggml_backend_openvino_device_supports_op(ggml_backend_dev_t dev, con
// GGML_LOG_WARN("OpenVINO backend does not support tensor type %s\n", ggml_type_name(src->type));
return false;
}
if (ggml_is_quantized(src->type) && src->ne[2] != 1) {
const bool is_supported_3d_mxfp4_moe = op->op == GGML_OP_MUL_MAT_ID && i == 0 &&
src->type == GGML_TYPE_MXFP4;
if (ggml_is_quantized(src->type) && src->ne[2] != 1 && !is_supported_3d_mxfp4_moe) {
// GGML_LOG_WARN("OpenVINO backend does not support 3D quantized tensors\n");
return false;
}
Expand Down
124 changes: 120 additions & 4 deletions ggml/src/ggml-openvino/ggml-quants.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,9 @@
#include <openvino/core/shape.hpp>
#include <openvino/core/type/element_type.hpp>
#include <openvino/core/type/element_type_traits.hpp>
#include <openvino/core/type/float4_e2m1.hpp>
#include <openvino/core/type/float16.hpp>
#include <openvino/core/type/float8_e8m0.hpp>
#include <openvino/op/add.hpp>
#include <openvino/op/constant.hpp>
#include <openvino/op/convert.hpp>
Expand All @@ -44,6 +46,38 @@ void unpack_32_4(const uint8_t * data, uint8_t * dst) {
}
}

// Geometry of one MXFP4 block as it is read in this file: a single leading
// byte followed by the 4-bit values, packed two per byte.
static constexpr size_t MXFP4_BLOCK_SIZE    = 32;                                     // values per block
static constexpr size_t MXFP4_BLOCK_QS_SIZE = MXFP4_BLOCK_SIZE / 2;                   // packed value bytes per block
static constexpr size_t MXFP4_BLOCK_BYTES   = sizeof(uint8_t) + MXFP4_BLOCK_QS_SIZE;  // leading byte + packed values

// Re-orders the nibbles of one block's packed values.
//
// `data` points at MXFP4_BLOCK_QS_SIZE input bytes and `dst` receives the same
// number of output bytes. For every input byte pair (2k, 2k + 1):
//   - dst[k]     = low nibble of data[2k]  | low nibble of data[2k + 1] << 4
//   - dst[8 + k] = high nibble of data[2k] | high nibble of data[2k + 1] (kept in place)
// so all low nibbles land in the first half of `dst` and all high nibbles in
// the second half, each half preserving the input byte order.
static void pack_32_mxfp4_for_openvino(const uint8_t * data, uint8_t * dst) {
    constexpr size_t half_bytes = MXFP4_BLOCK_SIZE / 4;  // output bytes per half (8)

    for (size_t k = 0; k < half_bytes; ++k) {
        const uint8_t even_byte = data[2 * k];
        const uint8_t odd_byte  = data[2 * k + 1];

        const uint8_t low_pair  = static_cast<uint8_t>((even_byte & 0x0F) | ((odd_byte & 0x0F) << 4));
        const uint8_t high_pair = static_cast<uint8_t>((even_byte >> 4) | (odd_byte & 0xF0));

        dst[k]              = low_pair;
        dst[half_bytes + k] = high_pair;
    }
}

void extract_mxfp4_data(const ggml_tensor * tensor, ov::Tensor & weights_arr, ov::Tensor & scales_arr) {
GGML_ASSERT(tensor->type == GGML_TYPE_MXFP4);
GGML_ASSERT(weights_arr.get_element_type() == ov::element::f4e2m1);
GGML_ASSERT(scales_arr.get_element_type() == ov::element::f8e8m0);

const auto * data = static_cast<const uint8_t *>(tensor->data);
auto * weights = static_cast<uint8_t *>(weights_arr.data());
auto * scales = scales_arr.data<ov::element_type_traits<ov::element::f8e8m0>::value_type>();
const size_t n_blocks = scales_arr.get_size();

ov::parallel_for(n_blocks, [&](size_t i) {
const uint8_t * block = data + i * MXFP4_BLOCK_BYTES;
pack_32_mxfp4_for_openvino(block + sizeof(uint8_t), weights + i * MXFP4_BLOCK_QS_SIZE);
scales[i] = ov::float8_e8m0::from_bits(block[0]);
});
}

// Extracts (weight, scales, zp) from Q4_0 tensors.
// Data layout is: |16 bit scale|32 x 4bit weights|.
// When zp_arr is empty (symmetric), weights are stored as signed i4 (value - 8).
Expand Down Expand Up @@ -617,6 +651,42 @@ ov::Output<ov::Node> make_int4_weights(ov::Tensor & weight,
return std::make_shared<ov::op::v0::Convert>(result, ov::element::f32);
}

// Builds the subgraph that turns packed MXFP4 weights plus per-block scales
// into an f32 tensor with the weight's original shape:
//
//   Constant(f4e2m1, [..., K/32, 32]) -> Convert(f32) --\
//                                                        Multiply -> Reshape([..., K])
//   Constant(scales, [..., K/32, 1])  -> Convert(f32) --/
//
// weight - packed weight tensor; its last dimension must be a multiple of
//          MXFP4_BLOCK_SIZE. The tensor is also stored in the constant's
//          rt_info under "__gguf_tensor_holder".
// scales - per-block scales; NOTE: its shape is overwritten in place so it
//          broadcasts against the blocked weight shape.
ov::Output<ov::Node> make_mxfp4_weights(ov::Tensor & weight, ov::Tensor & scales) {
    const ov::Shape logical_shape = weight.get_shape();
    GGML_ASSERT(!logical_shape.empty());
    GGML_ASSERT(logical_shape.back() % MXFP4_BLOCK_SIZE == 0);

    // Split the innermost dimension into (blocks, values-per-block).
    ov::Shape blocked_shape(logical_shape);
    blocked_shape.back() = blocked_shape.back() / MXFP4_BLOCK_SIZE;
    blocked_shape.push_back(MXFP4_BLOCK_SIZE);

    // One scale per block, with a trailing 1 so NUMPY broadcasting applies it
    // to every value in that block.
    ov::Shape broadcast_scale_shape(blocked_shape);
    broadcast_scale_shape.back() = 1;
    scales.set_shape(broadcast_scale_shape);

    auto * weight_bytes = static_cast<uint8_t *>(weight.data());
    auto   packed_const =
        std::make_shared<ov::op::v0::Constant>(ov::element::f4e2m1, blocked_shape, weight_bytes, nullptr);
    packed_const->get_rt_info()["__gguf_tensor_holder"] = weight;
    auto packed_as_f32 = std::make_shared<ov::op::v0::Convert>(packed_const, ov::element::f32);

    auto scale_const  = std::make_shared<ov::op::v0::Constant>(scales);
    auto scale_as_f32 = std::make_shared<ov::op::v0::Convert>(scale_const, ov::element::f32);

    ov::Output<ov::Node> dequantized =
        std::make_shared<ov::op::v1::Multiply>(packed_as_f32, scale_as_f32, ov::op::AutoBroadcastType::NUMPY);

    // Collapse the (blocks, values-per-block) pair back into the original last dimension.
    auto target_shape_const =
        std::make_shared<ov::op::v0::Constant>(ov::element::i64, ov::Shape{logical_shape.size()}, logical_shape);
    return std::make_shared<ov::op::v1::Reshape>(dequantized, target_shape_const, false);
}

// Wraps an already-packed MXFP4 expert-weight buffer in a u8 Constant without
// building any dequantization subgraph.
//
// The constant takes the tensor's shape as-is and is tagged in rt_info with:
//   "__gguf_tensor_holder"             - the source ov::Tensor
//   "__ggml_openvino_mxfp4_moe_packed" - true, marking the payload as raw packed MXFP4
ov::Output<ov::Node> make_mxfp4_moe_packed_weights(ov::Tensor & weight) {
    auto * raw_bytes = static_cast<uint8_t *>(weight.data());
    auto   packed_const =
        std::make_shared<ov::op::v0::Constant>(ov::element::u8, weight.get_shape(), raw_bytes, nullptr);

    auto & rt_info = packed_const->get_rt_info();
    rt_info["__gguf_tensor_holder"]             = weight;
    rt_info["__ggml_openvino_mxfp4_moe_packed"] = true;

    return packed_const;
}

// Extract quantized weights from tensor and create weight subgraph
std::shared_ptr<ov::Node> extract_quantized_weights(const ggml_tensor * tensor,
const void * data,
Expand All @@ -628,6 +698,13 @@ std::shared_ptr<ov::Node> extract_quantized_weights(const ggml_tensor * tensor,
ggml_tensor temp_tensor = *tensor;
temp_tensor.data = const_cast<void *>(data);

if (tensor->type == GGML_TYPE_MXFP4) {
extract_mxfp4_data(&temp_tensor, weights, scales);
auto result = make_mxfp4_weights(weights, scales).get_node_shared_ptr();
result->set_friendly_name(tensor->name);
return result;
}

// Determine block size based on tensor type
int64_t weights_per_block;
bool is_u4;
Expand Down Expand Up @@ -788,6 +865,27 @@ OvWeight process_weight_tensor(const ggml_tensor * tensor, const void * data, vo
OPENVINO_THROW("Unsupported quantized type: ", ggml_type_name(tensor->type));
}

const bool is_3d_mxfp4_moe = tensor->type == GGML_TYPE_MXFP4 && (tensor->ne[2] > 1 || tensor->ne[3] > 1);
if (is_3d_mxfp4_moe) {
ov::Shape packed_shape = {static_cast<size_t>(tensor->ne[3]),
static_cast<size_t>(tensor->ne[2]),
static_cast<size_t>(tensor->ne[1]),
static_cast<size_t>(tensor->ne[0] / MXFP4_BLOCK_SIZE),
MXFP4_BLOCK_BYTES};
const size_t tensor_bytes = ggml_nbytes(tensor);
if (output_base_ptr) {
auto * buf_base = static_cast<uint8_t *>(output_base_ptr);
memcpy(buf_base + layout.weights_offset, data, tensor_bytes);
result.weights = ov::Tensor(ov::element::u8, packed_shape, buf_base + layout.weights_offset);
} else {
result.weights = ov::Tensor(ov::element::u8, packed_shape);
memcpy(result.weights.data(), data, tensor_bytes);
}
result.weight_node = make_mxfp4_moe_packed_weights(result.weights).get_node_shared_ptr();
result.weight_node->set_friendly_name(tensor->name);
return result;
}

if (use_bias) {
OPENVINO_ASSERT(!layout.is_requant,
"use_bias is only used for test-backend-ops, which should not have requantization");
Expand All @@ -812,22 +910,40 @@ OvWeight process_weight_tensor(const ggml_tensor * tensor, const void * data, vo
// Quantized path (normal extraction or quantized requant)
// Create weight/scale/zp tensors - shared between both paths
// For symmetric quantization, use signed types (i4/i8) and no ZP tensor
ov::element::Type weight_type = layout.is_symmetric ? (layout.is_u4 ? ov::element::i4 : ov::element::i8) :
(layout.is_u4 ? ov::element::u4 : ov::element::u8);
ov::element::Type weight_type = tensor->type == GGML_TYPE_MXFP4 ?
ov::element::f4e2m1 :
(layout.is_symmetric ? (layout.is_u4 ? ov::element::i4 : ov::element::i8) :
(layout.is_u4 ? ov::element::u4 : ov::element::u8));
ov::Shape scale_shape = {node_shape[0], node_shape[1] / layout.weights_per_block};

if (tensor->type == GGML_TYPE_MXFP4) {
if (tensor->ne[2] == 1 && tensor->ne[3] == 1) {
node_shape = {static_cast<size_t>(tensor->ne[1]), static_cast<size_t>(tensor->ne[0])};
} else {
node_shape.clear();
for (int i = GGML_MAX_DIMS - 1; i >= 0; --i) {
node_shape.push_back(static_cast<size_t>(tensor->ne[i]));
}
}

scale_shape = node_shape;
scale_shape.back() /= layout.weights_per_block;
}

if (output_base_ptr) {
uint8_t * buf_base = static_cast<uint8_t *>(output_base_ptr);
result.weights = ov::Tensor(weight_type, node_shape, buf_base + layout.weights_offset);
result.scales = ov::Tensor(ov::element::f16, scale_shape, buf_base + layout.scales_offset);
const ov::element::Type scale_type = tensor->type == GGML_TYPE_MXFP4 ? ov::element::f8e8m0 : ov::element::f16;
result.scales = ov::Tensor(scale_type, scale_shape, buf_base + layout.scales_offset);
if (!layout.is_symmetric) {
ov::element::Type zp_type = layout.is_u4 ? ov::element::u4 : ov::element::u8;
result.zp = ov::Tensor(zp_type, scale_shape, buf_base + layout.zp_offset);
}
// else: result.zp remains default-constructed (empty) for symmetric
} else {
result.weights = ov::Tensor(weight_type, node_shape);
result.scales = ov::Tensor(ov::element::f16, scale_shape);
const ov::element::Type scale_type = tensor->type == GGML_TYPE_MXFP4 ? ov::element::f8e8m0 : ov::element::f16;
result.scales = ov::Tensor(scale_type, scale_shape);
if (!layout.is_symmetric) {
if (use_bias) {
result.zp = ov::Tensor(ov::element::f16, scale_shape);
Expand Down
Loading
Loading