diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index f4a578b893d7..eb84625add5c 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -8777,6 +8777,11 @@ static bool ggml_vk_should_use_mmvq(const vk_device& device, uint32_t m, uint32_ if (k < 2048) { return false; } + // On UMA, MMVQ's quantization overhead outweighs bandwidth savings + // because CPU and GPU share the same memory pool. + if (device->uma) { + return false; + } switch (src0_type) { case GGML_TYPE_Q8_0: