From f7636a78fa81fee0afc9676de9830372d5174322 Mon Sep 17 00:00:00 2001 From: winstonma Date: Wed, 24 Jun 2026 14:12:39 +0800 Subject: [PATCH] vulkan: disable MMVQ on AMD UMA devices MMVQ's quantization overhead outweighs bandwidth savings on UMA where CPU and GPU share the same memory pool. --- ggml/src/ggml-vulkan/ggml-vulkan.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index f4a578b893d7..eb84625add5c 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -8777,6 +8777,11 @@ static bool ggml_vk_should_use_mmvq(const vk_device& device, uint32_t m, uint32_ if (k < 2048) { return false; } + // On UMA, MMVQ's quantization overhead outweighs bandwidth savings + // because CPU and GPU share the same memory pool. + if (device->uma) { + return false; + } switch (src0_type) { case GGML_TYPE_Q8_0: