From f7636a78fa81fee0afc9676de9830372d5174322 Mon Sep 17 00:00:00 2001
From: winstonma <winstonma@ymail.com>
Date: Wed, 24 Jun 2026 14:12:39 +0800
Subject: [PATCH] vulkan: disable MMVQ on AMD UMA devices

On UMA devices the CPU and GPU share the same memory pool, so MMVQ's
quantization overhead outweighs its bandwidth savings. Make
ggml_vk_should_use_mmvq() return false when device->uma is set.
---
 ggml/src/ggml-vulkan/ggml-vulkan.cpp | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
index f4a578b893d7..eb84625add5c 100644
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -8777,6 +8777,11 @@ static bool ggml_vk_should_use_mmvq(const vk_device& device, uint32_t m, uint32_
         if (k < 2048) {
             return false;
         }
+        // On UMA, MMVQ's quantization overhead outweighs bandwidth savings
+        // because CPU and GPU share the same memory pool.
+        if (device->uma) {
+            return false;
+        }
 
         switch (src0_type) {
         case GGML_TYPE_Q8_0: