diff --git a/.devops/openvino.Dockerfile b/.devops/openvino.Dockerfile
index ab14288ce171..152d56bcc8aa 100644
--- a/.devops/openvino.Dockerfile
+++ b/.devops/openvino.Dockerfile
@@ -1,17 +1,17 @@
-ARG OPENVINO_VERSION_MAJOR=2026.0
-ARG OPENVINO_VERSION_FULL=2026.0.0.20965.c6d6a13a886
+ARG OPENVINO_VERSION_MAJOR=2026.2
+ARG OPENVINO_VERSION_FULL=2026.2.0.21903.52ddc073857
ARG UBUNTU_VERSION=24.04
# Intel GPU driver versions. https://github.com/intel/compute-runtime/releases
-ARG IGC_VERSION=v2.30.1
-ARG IGC_VERSION_FULL=2_2.30.1+20950
-ARG COMPUTE_RUNTIME_VERSION=26.09.37435.1
-ARG COMPUTE_RUNTIME_VERSION_FULL=26.09.37435.1-0
-ARG IGDGMM_VERSION=22.9.0
+ARG IGC_VERSION=v2.34.4
+ARG IGC_VERSION_FULL=2_2.34.4+21428
+ARG COMPUTE_RUNTIME_VERSION=26.18.38308.1
+ARG COMPUTE_RUNTIME_VERSION_FULL=26.18.38308.1-0
+ARG IGDGMM_VERSION=22.10.0
# Intel NPU driver versions. https://github.com/intel/linux-npu-driver/releases
-ARG NPU_DRIVER_VERSION=v1.32.0
-ARG NPU_DRIVER_FULL=v1.32.0.20260402-23905121947
+ARG NPU_DRIVER_VERSION=v1.33.0
+ARG NPU_DRIVER_FULL=v1.33.0.20260529-26625960453
ARG LIBZE1_VERSION=1.27.0-1~24.04~ppa2
# Optional proxy build arguments
@@ -46,13 +46,18 @@ RUN apt-get update && \
intel-opencl-icd && \
rm -rf /var/lib/apt/lists/*
-# Install OpenVINO for Ubuntu 24.04
+# OpenVINO toolkit and GPU/NPU drivers are cached via BuildKit cache mounts to avoid re-downloading on rebuilds.
+# Install OpenVINO for Ubuntu 24.04.
ARG OPENVINO_VERSION_MAJOR
ARG OPENVINO_VERSION_FULL
-RUN mkdir -p /opt/intel && \
- wget https://storage.openvinotoolkit.org/repositories/openvino/packages/${OPENVINO_VERSION_MAJOR}/linux/openvino_toolkit_ubuntu24_${OPENVINO_VERSION_FULL}_x86_64.tgz && \
- tar -xf openvino_toolkit_ubuntu24_${OPENVINO_VERSION_FULL}_x86_64.tgz && \
- mv openvino_toolkit_ubuntu24_${OPENVINO_VERSION_FULL}_x86_64 /opt/intel/openvino_${OPENVINO_VERSION_MAJOR} && \
+RUN --mount=type=cache,target=/var/cache/openvino,sharing=locked \
+    # Download to "$TGZ.part" and rename on success, so a failed wget never leaves a truncated archive that later builds would reuse from the cache.
+    mkdir -p /opt/intel && \
+    TGZ=/var/cache/openvino/openvino_toolkit_ubuntu24_${OPENVINO_VERSION_FULL}_x86_64.tgz && \
+    URL=https://storage.openvinotoolkit.org/repositories/openvino/packages/${OPENVINO_VERSION_MAJOR}/linux/openvino_toolkit_ubuntu24_${OPENVINO_VERSION_FULL}_x86_64.tgz && \
+    { [ -f "$TGZ" ] || { wget -O "$TGZ.part" "$URL" && mv "$TGZ.part" "$TGZ"; }; } && \
+    tar -xf "$TGZ" -C /opt/intel/ && \
+    mv /opt/intel/openvino_toolkit_ubuntu24_${OPENVINO_VERSION_FULL}_x86_64 /opt/intel/openvino_${OPENVINO_VERSION_MAJOR} && \
cd /opt/intel/openvino_${OPENVINO_VERSION_MAJOR} && \
echo "Y" | ./install_dependencies/install_openvino_dependencies.sh && \
cd - && \
@@ -68,14 +73,14 @@ COPY . .
RUN bash -c "source ${OpenVINO_DIR}/setupvars.sh && \
cmake -B build/ReleaseOV -G Ninja \
-DCMAKE_BUILD_TYPE=Release \
+ -DLLAMA_BUILD_TESTS=OFF \
-DGGML_OPENVINO=ON && \
- cmake --build build/ReleaseOV -j$(nproc)"
+ cmake --build build/ReleaseOV --parallel "
-# Copy all necessary libraries
+# Copy all necessary libraries (build outputs + OpenVINO runtime libs); cp -P copies symlinks as links instead of following them.
RUN mkdir -p /app/lib && \
- find build/ReleaseOV -name '*.so*' -exec cp {} /app/lib \; && \
- find ${OpenVINO_DIR}/runtime/lib/intel64 -name '*.so*' -exec cp -P {} /app/lib \; 2>/dev/null || \
- find ${OpenVINO_DIR}/lib/intel64 -name '*.so*' -exec cp -P {} /app/lib \;
+ find build/ReleaseOV -name '*.so*' -exec cp -P {} /app/lib \; && \
+ find "${OpenVINO_DIR}/runtime/lib/intel64" -name '*.so*' -exec cp -P {} /app/lib \;
# Create runtime directories and copy binaries
RUN mkdir -p /app/full \
@@ -120,33 +125,41 @@ ARG IGC_VERSION_FULL
ARG COMPUTE_RUNTIME_VERSION
ARG COMPUTE_RUNTIME_VERSION_FULL
ARG IGDGMM_VERSION
-RUN mkdir /tmp/neo/ && cd /tmp/neo/ \
- && wget https://github.com/intel/intel-graphics-compiler/releases/download/${IGC_VERSION}/intel-igc-core-${IGC_VERSION_FULL}_amd64.deb \
- && wget https://github.com/intel/intel-graphics-compiler/releases/download/${IGC_VERSION}/intel-igc-opencl-${IGC_VERSION_FULL}_amd64.deb \
- && wget https://github.com/intel/compute-runtime/releases/download/${COMPUTE_RUNTIME_VERSION}/intel-ocloc-dbgsym_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.ddeb \
- && wget https://github.com/intel/compute-runtime/releases/download/${COMPUTE_RUNTIME_VERSION}/intel-ocloc_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.deb \
- && wget https://github.com/intel/compute-runtime/releases/download/${COMPUTE_RUNTIME_VERSION}/intel-opencl-icd-dbgsym_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.ddeb \
- && wget https://github.com/intel/compute-runtime/releases/download/${COMPUTE_RUNTIME_VERSION}/intel-opencl-icd_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.deb \
- && wget https://github.com/intel/compute-runtime/releases/download/${COMPUTE_RUNTIME_VERSION}/libigdgmm12_${IGDGMM_VERSION}_amd64.deb \
- && wget https://github.com/intel/compute-runtime/releases/download/${COMPUTE_RUNTIME_VERSION}/libze-intel-gpu1-dbgsym_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.ddeb \
- && wget https://github.com/intel/compute-runtime/releases/download/${COMPUTE_RUNTIME_VERSION}/libze-intel-gpu1_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.deb \
- && dpkg --install *.deb \
- && rm -rf /tmp/neo/
+RUN --mount=type=cache,target=/var/cache/intel-gpu,sharing=locked \
+    # The cache dir may still hold packages from earlier versions, so install only the files listed below; ".part" + mv keeps failed downloads out of the cache.
+    set -eux; cd /var/cache/intel-gpu; debs=""; \
+    for url in \
+        https://github.com/intel/intel-graphics-compiler/releases/download/${IGC_VERSION}/intel-igc-core-${IGC_VERSION_FULL}_amd64.deb \
+        https://github.com/intel/intel-graphics-compiler/releases/download/${IGC_VERSION}/intel-igc-opencl-${IGC_VERSION_FULL}_amd64.deb \
+        https://github.com/intel/compute-runtime/releases/download/${COMPUTE_RUNTIME_VERSION}/intel-ocloc_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.deb \
+        https://github.com/intel/compute-runtime/releases/download/${COMPUTE_RUNTIME_VERSION}/intel-opencl-icd_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.deb \
+        https://github.com/intel/compute-runtime/releases/download/${COMPUTE_RUNTIME_VERSION}/libigdgmm12_${IGDGMM_VERSION}_amd64.deb \
+        https://github.com/intel/compute-runtime/releases/download/${COMPUTE_RUNTIME_VERSION}/libze-intel-gpu1_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.deb ; do \
+        f=$(basename "$url"); debs="$debs ./$f"; \
+        if [ ! -f "$f" ]; then wget -q -O "$f.part" "$url"; mv "$f.part" "$f"; fi; \
+    done; \
+    apt-get update; \
+    apt-get install -y --no-install-recommends $debs; \
+    rm -rf /var/lib/apt/lists/*
# Install NPU drivers
ARG NPU_DRIVER_VERSION
ARG NPU_DRIVER_FULL
ARG LIBZE1_VERSION
-RUN mkdir /tmp/npu/ && cd /tmp/npu/ \
- && wget https://github.com/intel/linux-npu-driver/releases/download/${NPU_DRIVER_VERSION}/linux-npu-driver-${NPU_DRIVER_FULL}-ubuntu2404.tar.gz \
- && tar -xf linux-npu-driver-${NPU_DRIVER_FULL}-ubuntu2404.tar.gz \
- && dpkg --install *.deb \
- && rm -rf /tmp/npu/
-
-RUN cd /tmp \
- && wget https://snapshot.ppa.launchpadcontent.net/kobuk-team/intel-graphics/ubuntu/20260324T100000Z/pool/main/l/level-zero-loader/libze1_${LIBZE1_VERSION}_amd64.deb \
- && dpkg --install libze1_${LIBZE1_VERSION}_amd64.deb \
- && rm libze1_${LIBZE1_VERSION}_amd64.deb
+RUN --mount=type=cache,target=/var/cache/intel-npu,sharing=locked \
+    # Downloads go to "*.part" and are renamed on success so a failed wget is retried next build; steps are ';'-separated so `set -e` stops at the first failure.
+    set -eux; \
+    TGZ=/var/cache/intel-npu/linux-npu-driver-${NPU_DRIVER_FULL}-ubuntu2404.tar.gz; \
+    if [ ! -f "$TGZ" ]; then \
+        wget -q -O "$TGZ.part" https://github.com/intel/linux-npu-driver/releases/download/${NPU_DRIVER_VERSION}/linux-npu-driver-${NPU_DRIVER_FULL}-ubuntu2404.tar.gz; mv "$TGZ.part" "$TGZ"; \
+    fi; \
+    DEB=/var/cache/intel-npu/libze1_${LIBZE1_VERSION}_amd64.deb; \
+    if [ ! -f "$DEB" ]; then \
+        wget -q -O "$DEB.part" https://snapshot.ppa.launchpadcontent.net/kobuk-team/intel-graphics/ubuntu/20260324T100000Z/pool/main/l/level-zero-loader/libze1_${LIBZE1_VERSION}_amd64.deb; mv "$DEB.part" "$DEB"; \
+    fi; \
+    mkdir /tmp/npu/; cd /tmp/npu/; tar -xf "$TGZ"; cp "$DEB" .; \
+    apt-get update; apt-get install -y --no-install-recommends ./*.deb; \
+    rm -rf /tmp/npu/ /var/lib/apt/lists/*
COPY --from=build /app/lib/ /app/
@@ -166,22 +179,26 @@ RUN apt-get update && \
python3 \
python3-venv \
python3-pip && \
- python3 -m venv /ov-venv && \
- /ov-venv/bin/pip install --no-cache-dir --upgrade pip setuptools wheel && \
- /ov-venv/bin/pip install --no-cache-dir -r requirements.txt && \
+ python3 -m venv /openvino-venv && \
+ /openvino-venv/bin/pip install --no-cache-dir --upgrade pip setuptools wheel && \
+ /openvino-venv/bin/pip install --no-cache-dir -r requirements.txt && \
apt-get autoremove -y && \
apt-get clean && \
rm -rf /tmp/* /var/tmp/* && \
find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete && \
find /var/cache -type f -delete
-ENTRYPOINT ["/bin/bash", "-c", "source /ov-venv/bin/activate && exec /app/tools.sh \"$@\"", "--"]
+# Activate the venv through the environment (its bin/ goes first on PATH) rather than by sourcing bin/activate.
+ENV VIRTUAL_ENV=/openvino-venv \
+ PATH=/openvino-venv/bin:$PATH
+
+ENTRYPOINT ["/app/tools.sh"]
### Light, CLI only
FROM base AS light
-COPY --from=build /app/full/llama-cli /app/
+COPY --from=build /app/full/llama-cli /app/full/llama-completion /app/
WORKDIR /app
diff --git a/.github/actions/windows-setup-openvino/action.yml b/.github/actions/windows-setup-openvino/action.yml
new file mode 100644
index 000000000000..f983df56025b
--- /dev/null
+++ b/.github/actions/windows-setup-openvino/action.yml
@@ -0,0 +1,24 @@
+name: "Windows - Setup OpenVINO Toolkit"
+description: "Setup OpenVINO Toolkit for Windows"
+inputs:
+ path:
+ description: "Installation path"
+ required: true
+ version_major:
+ description: "OpenVINO major version (e.g., 2026.2)"
+ required: true
+ version_full:
+ description: "OpenVINO full version"
+ required: true
+
+runs:
+ using: "composite"
+ steps:
+    - name: Download and extract OpenVINO Runtime
+      shell: powershell
+      run: |
+        $ProgressPreference = 'SilentlyContinue'  # progress rendering slows Invoke-WebRequest badly in Windows PowerShell
+        $url = "https://storage.openvinotoolkit.org/repositories/openvino/packages/${{ inputs.version_major }}/windows/openvino_toolkit_windows_${{ inputs.version_full }}_x86_64.zip"
+        Invoke-WebRequest -Uri $url -OutFile openvino.zip
+        Expand-Archive -Path openvino.zip -DestinationPath "${{ inputs.path }}" -Force
+        Remove-Item openvino.zip
diff --git a/.github/workflows/build-cache.yml b/.github/workflows/build-cache.yml
index 53d65f3768b4..b36c6e1ea89b 100644
--- a/.github/workflows/build-cache.yml
+++ b/.github/workflows/build-cache.yml
@@ -68,8 +68,8 @@ jobs:
env:
# Sync versions in build.yml, build-self-hosted.yml, release.yml, build-cache.yml, .devops/openvino.Dockerfile
- OPENVINO_VERSION_MAJOR: "2026.0"
- OPENVINO_VERSION_FULL: "2026.0.0.20965.c6d6a13a886"
+ OPENVINO_VERSION_MAJOR: "2026.2"
+ OPENVINO_VERSION_FULL: "2026.2.0.21903.52ddc073857"
steps:
- name: Clone
@@ -91,6 +91,34 @@ jobs:
version_major: ${{ env.OPENVINO_VERSION_MAJOR }}
version_full: ${{ env.OPENVINO_VERSION_FULL }}
+ windows-2022-openvino-cache:
+ runs-on: windows-2022
+
+ env:
+      # Sync versions in build-openvino.yml, build-self-hosted.yml, release.yml, build-cache.yml, .devops/openvino.Dockerfile
+ OPENVINO_VERSION_MAJOR: "2026.2"
+ OPENVINO_VERSION_FULL: "2026.2.0.21903.52ddc073857"
+
+ steps:
+ - name: Clone
+ id: checkout
+ uses: actions/checkout@v6
+
+ - name: Setup Cache
+ uses: actions/cache@v5
+ id: cache-openvino
+ with:
+ path: ./openvino_toolkit
+ key: cache-gha-openvino-toolkit-v${{ env.OPENVINO_VERSION_FULL }}-${{ runner.os }}
+
+ - name: Setup OpenVINO Toolkit
+ if: steps.cache-openvino.outputs.cache-hit != 'true'
+ uses: ./.github/actions/windows-setup-openvino
+ with:
+ path: ./openvino_toolkit
+ version_major: ${{ env.OPENVINO_VERSION_MAJOR }}
+ version_full: ${{ env.OPENVINO_VERSION_FULL }}
+
windows-2022-rocm-cache:
runs-on: windows-2022
diff --git a/.github/workflows/build-openvino.yml b/.github/workflows/build-openvino.yml
index ddcbc6697455..49ab13695cbf 100644
--- a/.github/workflows/build-openvino.yml
+++ b/.github/workflows/build-openvino.yml
@@ -37,14 +37,10 @@ jobs:
ubuntu-24-openvino:
runs-on: [self-hosted, Linux, Intel, OpenVINO]
- concurrency:
- group: openvino-gpu-${{ github.head_ref || github.ref }}
- cancel-in-progress: false
-
env:
# Sync versions in build-openvino.yml, build-self-hosted.yml, release.yml, build-cache.yml, .devops/openvino.Dockerfile
- OPENVINO_VERSION_MAJOR: "2026.0"
- OPENVINO_VERSION_FULL: "2026.0.0.20965.c6d6a13a886"
+ OPENVINO_VERSION_MAJOR: "2026.2"
+ OPENVINO_VERSION_FULL: "2026.2.0.21903.52ddc073857"
steps:
- name: Clone
@@ -78,7 +74,7 @@ jobs:
cmake -B build/ReleaseOV -G Ninja \
-DCMAKE_BUILD_TYPE=Release \
-DGGML_OPENVINO=ON
- time cmake --build build/ReleaseOV --config Release -j $(nproc)
+ time cmake --build build/ReleaseOV --config Release --parallel
- name: Test (CPU)
id: cmake_test_cpu
@@ -93,4 +89,81 @@ jobs:
run: |
cd ${{ github.workspace }}
export GGML_OPENVINO_DEVICE=GPU
- ctest --test-dir build/ReleaseOV -L main -E "test-llama-archs" --verbose --timeout 2000
+ ctest --test-dir build/ReleaseOV -L main -E "test-llama-archs" --verbose --timeout 3000
+
+ openvino-windows-2022:
+ runs-on: windows-2022
+
+ env:
+ # Sync versions in build-openvino.yml, build-self-hosted.yml, release.yml, build-cache.yml, .devops/openvino.Dockerfile
+ OPENVINO_VERSION_MAJOR: "2026.2"
+ OPENVINO_VERSION_FULL: "2026.2.0.21903.52ddc073857"
+
+ steps:
+ - name: Clone
+ id: checkout
+ uses: actions/checkout@v6
+
+ - name: ccache
+ uses: ggml-org/ccache-action@v1.2.21
+ with:
+ key: openvino-windows-2022
+ variant: ccache
+ evict-old-files: 1d
+ save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
+
+ - name: Setup Cache
+ uses: actions/cache@v5
+ id: cache-openvino
+ with:
+ path: ./openvino_toolkit
+ key: cache-gha-openvino-toolkit-v${{ env.OPENVINO_VERSION_FULL }}-${{ runner.os }}
+
+ - name: Setup OpenVINO Toolkit
+ if: steps.cache-openvino.outputs.cache-hit != 'true'
+ uses: ./.github/actions/windows-setup-openvino
+ with:
+ path: ./openvino_toolkit
+ version_major: ${{ env.OPENVINO_VERSION_MAJOR }}
+ version_full: ${{ env.OPENVINO_VERSION_FULL }}
+
+      - name: Install OpenCL using vcpkg
+        shell: powershell
+        run: |
+          if (-not (Test-Path C:\vcpkg\bootstrap-vcpkg.bat)) { git clone https://github.com/microsoft/vcpkg C:\vcpkg }  # reuse an existing checkout (e.g. one preinstalled on the runner); cloning over it fails
+          C:\vcpkg\bootstrap-vcpkg.bat
+          C:\vcpkg\vcpkg install opencl
+
+ - name: Build
+ id: cmake_build
+ shell: cmd
+ run: |
+ REM Find extracted OpenVINO folder dynamically
+ for /d %%i in (openvino_toolkit\*) do set OPENVINO_ROOT=%%i
+
+ if not exist "%OPENVINO_ROOT%\runtime\cmake\OpenVINOConfig.cmake" (
+ echo ERROR: OpenVINOConfig.cmake not found
+ exit /b 1
+ )
+
+ call "%OPENVINO_ROOT%\setupvars.bat"
+
+ cmake -B build\ReleaseOV -G "Visual Studio 17 2022" ^
+ -A x64 ^
+ -DCMAKE_BUILD_TYPE=Release ^
+ -DGGML_OPENVINO=ON ^
+ -DCMAKE_TOOLCHAIN_FILE=C:\vcpkg\scripts\buildsystems\vcpkg.cmake
+
+ cmake --build build\ReleaseOV --config Release -- /m
+
+ - name: Test (CPU)
+ id: cmake_test_cpu
+ shell: cmd
+ # TODO: fix and re-enable the `test-llama-archs` test below
+ run: |
+ REM Find extracted OpenVINO folder dynamically
+ for /d %%i in (openvino_toolkit\*) do set OPENVINO_ROOT=%%i
+ call "%OPENVINO_ROOT%\setupvars.bat"
+
+ cd build
+ ctest --test-dir ReleaseOV -L main -E "test-llama-archs" -C Release --verbose --timeout 3000
diff --git a/.github/workflows/build-self-hosted.yml b/.github/workflows/build-self-hosted.yml
index 436100c8a4cd..c4366ece3e59 100644
--- a/.github/workflows/build-self-hosted.yml
+++ b/.github/workflows/build-self-hosted.yml
@@ -264,14 +264,10 @@ jobs:
gpu-openvino-low-perf:
runs-on: [self-hosted, Linux, Intel, OpenVINO]
- concurrency:
- group: openvino-gpu-${{ github.head_ref || github.ref }}
- cancel-in-progress: false
-
env:
# Sync versions in build.yml, build-self-hosted.yml, release.yml, build-cache.yml, .devops/openvino.Dockerfile
- OPENVINO_VERSION_MAJOR: "2026.0"
- OPENVINO_VERSION_FULL: "2026.0.0.20965.c6d6a13a886"
+ OPENVINO_VERSION_MAJOR: "2026.2"
+ OPENVINO_VERSION_FULL: "2026.2.0.21903.52ddc073857"
steps:
- name: Clone
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
index 2763557bb112..7b394201fbbd 100644
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -443,9 +443,9 @@ jobs:
openvino_version: ${{ steps.openvino_version.outputs.value }}
env:
- # Sync versions in build.yml, build-self-hosted.yml, release.yml, build-cache.yml, .devops/openvino.Dockerfile
- OPENVINO_VERSION_MAJOR: "2026.0"
- OPENVINO_VERSION_FULL: "2026.0.0.20965.c6d6a13a886"
+ # Sync versions in build-openvino.yml, build-self-hosted.yml, release.yml, build-cache.yml, .devops/openvino.Dockerfile
+ OPENVINO_VERSION_MAJOR: "2026.2"
+ OPENVINO_VERSION_FULL: "2026.2.0.21903.52ddc073857"
steps:
- name: Set OpenVINO version output
@@ -528,6 +528,110 @@ jobs:
path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-openvino-${{ env.OPENVINO_VERSION_MAJOR }}-x64.tar.gz
name: llama-bin-ubuntu-openvino-${{ env.OPENVINO_VERSION_MAJOR }}-x64.tar.gz
+  windows-openvino:
+    # Gate on check-release like the neighbouring windows-cpu job, so nothing is built when no release is due.
+    needs: [check-release]
+    if: ${{ needs.check-release.outputs.should_release == 'true' }}
+    runs-on: windows-2022
+
+ outputs:
+ openvino_version: ${{ steps.openvino_version.outputs.value }}
+
+ env:
+ # Sync versions in build-openvino.yml, build-self-hosted.yml, release.yml, build-cache.yml, .devops/openvino.Dockerfile
+ OPENVINO_VERSION_MAJOR: "2026.2"
+ OPENVINO_VERSION_FULL: "2026.2.0.21903.52ddc073857"
+
+ steps:
+      - name: Set OpenVINO version output
+        id: openvino_version
+        shell: bash  # Windows runners default to PowerShell, where $GITHUB_OUTPUT is not a defined variable
+        run: echo "value=${{ env.OPENVINO_VERSION_MAJOR }}" >> "$GITHUB_OUTPUT"
+ - name: Clone
+ id: checkout
+ uses: actions/checkout@v6
+ with:
+ fetch-depth: 0
+
+ - name: Setup Node.js
+ uses: actions/setup-node@v6
+ with:
+ node-version: "24"
+ cache: "npm"
+ cache-dependency-path: "tools/ui/package-lock.json"
+
+ - name: ccache
+ uses: ggml-org/ccache-action@v1.2.21
+ with:
+ key: release-windows-2022-openvino
+ variant: ccache
+ evict-old-files: 1d
+
+ - name: Setup Cache
+ uses: actions/cache@v5
+ id: cache-openvino
+ with:
+ path: ./openvino_toolkit
+ key: cache-gha-openvino-toolkit-v${{ env.OPENVINO_VERSION_FULL }}-${{ runner.os }}
+
+ - name: Setup OpenVINO Toolkit
+ if: steps.cache-openvino.outputs.cache-hit != 'true'
+ uses: ./.github/actions/windows-setup-openvino
+ with:
+ path: ./openvino_toolkit
+ version_major: ${{ env.OPENVINO_VERSION_MAJOR }}
+ version_full: ${{ env.OPENVINO_VERSION_FULL }}
+
+      - name: Install OpenCL using vcpkg
+        shell: powershell
+        run: |
+          if (-not (Test-Path C:\vcpkg\bootstrap-vcpkg.bat)) { git clone https://github.com/microsoft/vcpkg C:\vcpkg }  # reuse an existing checkout (e.g. one preinstalled on the runner); cloning over it fails
+          C:\vcpkg\bootstrap-vcpkg.bat
+          C:\vcpkg\vcpkg install opencl
+
+ - name: Build
+ id: cmake_build
+ shell: cmd
+ run: |
+ REM Find extracted OpenVINO folder dynamically
+ for /d %%i in (openvino_toolkit\*) do set OPENVINO_ROOT=%%i
+
+ if not exist "%OPENVINO_ROOT%\runtime\cmake\OpenVINOConfig.cmake" (
+ echo ERROR: OpenVINOConfig.cmake not found
+ exit /b 1
+ )
+
+ call "%OPENVINO_ROOT%\setupvars.bat"
+
+ cmake -B build\ReleaseOV -G "Visual Studio 17 2022" ^
+ -A x64 ^
+ -DCMAKE_BUILD_TYPE=Release ^
+ -DGGML_OPENVINO=ON ^
+ -DCMAKE_TOOLCHAIN_FILE=C:\vcpkg\scripts\buildsystems\vcpkg.cmake
+
+ cmake --build build\ReleaseOV --config Release -- /m
+
+ - name: ccache-clear
+ uses: ./.github/actions/ccache-clear
+ with:
+ key: release-windows-2022-openvino
+
+ - name: Determine tag name
+ id: tag
+ uses: ./.github/actions/get-tag-name
+
+ - name: Pack artifacts
+ id: pack_artifacts
+ shell: powershell
+ run: |
+ Copy-Item LICENSE .\build\ReleaseOV\bin\
+ 7z a -snl llama-${{ steps.tag.outputs.name }}-bin-win-openvino-${{ env.OPENVINO_VERSION_MAJOR }}-x64.zip .\build\ReleaseOV\bin\*
+
+ - name: Upload artifacts
+ uses: actions/upload-artifact@v6
+ with:
+ path: llama-${{ steps.tag.outputs.name }}-bin-win-openvino-${{ env.OPENVINO_VERSION_MAJOR }}-x64.zip
+ name: llama-bin-win-openvino-${{ env.OPENVINO_VERSION_MAJOR }}-x64.zip
+
windows-cpu:
needs: [check-release]
if: ${{ needs.check-release.outputs.should_release == 'true' }}
@@ -1399,6 +1501,7 @@ jobs:
- windows-cuda
#- windows-sycl
- windows-hip
+ - windows-openvino
- ubuntu-22-rocm
- ubuntu-cpu
- ubuntu-vulkan
@@ -1520,6 +1623,7 @@ jobs:
- [Windows x64 (CUDA 12)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-cuda-12.4-x64.zip) - [CUDA 12.4 DLLs](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/cudart-llama-bin-win-cuda-12.4-x64.zip)
- [Windows x64 (CUDA 13)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-cuda-13.3-x64.zip) - [CUDA 13.3 DLLs](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/cudart-llama-bin-win-cuda-13.3-x64.zip)
- [Windows x64 (Vulkan)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-vulkan-x64.zip)
+ - [Windows x64 (OpenVINO)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-openvino-${{ needs.windows-openvino.outputs.openvino_version }}-x64.zip)
- [Windows x64 (SYCL)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip)
- [Windows x64 (HIP)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-hip-radeon-x64.zip)
diff --git a/docs/backend/OPENVINO.md b/docs/backend/OPENVINO.md
index b0e19abb0901..631d4bc3bf78 100644
--- a/docs/backend/OPENVINO.md
+++ b/docs/backend/OPENVINO.md
@@ -12,6 +12,25 @@ The OpenVINO backend is implemented in `ggml/src/ggml-openvino` and provides a t
- Compiles and caches the model for the target device.
- Binds GGML tensor memory to OpenVINO inference tensors and runs inference.
+## Contents
+
+- [Supported Devices](#supported-devices)
+- [Supported Model Precisions](#supported-model-precisions)
+- [Supported Llama.cpp Tools](#supported-llamacpp-tools)
+- [Validated Models](#validated-models)
+- [Build Instructions](#build-instructions)
+ - [0. Prerequisites](#0-prerequisites)
+ - [1. Install OpenVINO Runtime](#1-install-openvino-runtime)
+ - [2. Build llama.cpp with OpenVINO Backend](#2-build-llamacpp-with-openvino-backend)
+ - [Automated Ubuntu Build Script](#automated-ubuntu-build-script)
+ - [Automated Windows Build Script](#automated-windows-build-script)
+ - [3. Download Sample Model](#3-download-sample-model)
+ - [4. Run Inference with OpenVINO Backend](#4-run-inference-with-openvino-backend)
+ - [5. Docker Build](#5-docker-build)
+- [GGML OpenVINO Backend Runtime Configurations](#ggml-openvino-backend-runtime-configurations)
+- [Known Limitations](#known-limitations)
+- [Work in Progress](#work-in-progress)
+
## Supported Devices
OpenVINO backend supports the following hardware:
@@ -31,55 +50,102 @@ Although OpenVINO supports a wide range of [Intel hardware](https://docs.openvin
- `Q4_1`
- `Q4_K`
- `Q4_K_M`
-- `Q5_K` (converted to Q8_0_C at runtime)
-- `Q6_K` (converted to Q8_0_C at runtime)
+- `Q5_K` (converted to `Q8_0_C` at runtime)
+- `Q6_K` (converted to `Q8_0_C` at runtime)
> [!NOTE]
> Accuracy validation and performance optimizations for quantized models are a work in progress.
-## Quantization Support Details
-
-### CPU and GPU
-
-- **`Q4_0`, `Q4_1`, `Q4_K_M`, `Q6_K` models are supported**
+**CPU and GPU Quantization Details:**
- `Q5_K` and `Q6_K` tensors are converted to `Q8_0_C`
-### NPU
-
-- **Primary supported quantization scheme is `Q4_0`**
+**NPU Quantization Details:**
+- Primary supported quantization scheme is `Q4_0`
- `Q6_K` tensors are requantized to `Q4_0_128` in general. For embedding weights, `Q6_K` tensors are requantized to `Q8_0_C` except for the token embedding matrix which is dequantized to fp16
-### Additional Notes
-
+**Additional Notes:**
- Both `Q4_0` and `Q4_1` models use `Q6_K` for the token embedding tensor and the final matmul weight tensor (often the same tensor)
- `Q4_0` models may produce some `Q4_1` tensors if an imatrix is provided during quantization using `llama-quantize`
- `Q4_K_M` models may include both `Q6_K` and `Q5_K` tensors (observed in Phi-3)
+- `Q5_1` tensors are dequantized natively (weights, scales, and zero-points extracted directly)
+
+## Supported Llama.cpp Tools
+
+The OpenVINO backend integrates with the standard llama.cpp tools listed below.
+However, tool coverage is not uniform across all devices, and exhaustive validation is a work in progress.
+
+- llama-bench
+- llama-cli
+- llama-completion
+- llama-embedding
+- llama-perplexity
+- llama-run
+- llama-server
+- llama-simple
## Validated Models
-The following models were validated on Intel® Core™ Ultra Series 2. While our testing was limited, the OpenVINO backend is expected to work across a broad range of [Intel hardware](https://docs.openvino.ai/2026/about-openvino/release-notes-openvino/system-requirements.html).
-- Use `GGML_OPENVINO_STATEFUL_EXECUTION=1` when using GPU device.
-- `-fa 1` is required when running llama-bench with the OpenVINO backend.
-- Additional model support, quantization formats and validations are work in progress.
-
-| Model | Validated | Known Issues |
-| :------| :---------- | :-------------|
-| [Llama-3.2-1B-Instruct](https://huggingface.co/unsloth/Llama-3.2-1B-Instruct-GGUF/) | `FP16`, `Q8_0`, `Q4_0`, `Q4_1`, `Q4_K_M` on CPU/GPU/NPU | — |
-| [Meta-Llama-3.1-8B-Instruct](https://huggingface.co/bartowski/Meta-Llama-3.1-8B-Instruct-GGUF) | `Q8_0`, `Q4_K_M` on CPU/GPU/NPU | `Q4_0_8_8`, `Q4_0_4_8`, `Q4_0_4_4` fail |
-| [Phi-3-mini-4k-instruct](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct-gguf) | `FP16`, `Q4` on CPU/NPU | GPU unsupported for `FP16` and `Q4` (`llama-cli`, `llama-bench`) |
-| [Qwen2.5-1.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct-GGUF) | `FP16`, `Q8_0`, `Q4_0`, `Q4_1`, `Q4_K_M` on CPU/GPU/NPU | — |
-| [Qwen3-8B-Instruct](https://huggingface.co/Qwen/Qwen3-8B-GGUF) | `FP16`, `Q8_0`, `Q4_0`, `Q4_1`, `Q4_K_M` on CPU/NPU; GPU works via `llama-bench` | GPU `llama-cli` unsupported for all quantizations |
-| [MiniCPM-V-2_6-GGUF](https://huggingface.co/openbmb/MiniCPM-V-2_6-gguf) | `Q4_0` on CPU/GPU/NPU | — |
-| [DeepSeek-R1-Distill-Llama-8B](https://huggingface.co/bartowski/DeepSeek-R1-Distill-Llama-8B-GGUF) | `Q8_0`, `Q4_0`, `Q4_1`, `Q4_K_M` on CPU/GPU/NPU | — |
-| [Hunyuan-7B-Instruct](https://huggingface.co/bartowski/tencent_Hunyuan-7B-Instruct-GGUF) | CPU: `Q8_0`, `Q4_0`, `Q4_1`, `Q4_K_M`; GPU: `Q8_0`, `Q4_0`, `Q4_1`; NPU (`llama-bench` only): `Q4_0`, `Q4_1`, `Q4_K_M` | GPU `Q4_K_M` unsupported; NPU `llama-cli` unsupported |
-| [Mistral-7B-Instruct-v0.3](https://huggingface.co/bartowski/Mistral-7B-Instruct-v0.3-GGUF/) | CPU/GPU: `Q8_0`, `Q4_K_M`; NPU: `Q8_0`, `Q4_K_M` (via `llama-bench`) | NPU `llama-cli` unsupported for `Q8_0`, `Q4_K_M` |
+Although the validated models below were tested with `llama-cli` using the `Q4_K_M` quantization format on Intel® Core™ Ultra Series 2 (Lunar Lake), the OpenVINO backend is expected to work across a broader range of [Intel hardware](https://docs.openvino.ai/2026/about-openvino/release-notes-openvino/system-requirements.html), [supported model precisions](#supported-model-precisions), [supported llama.cpp tools](#supported-llamacpp-tools), and additional model architectures.
+
+> [!NOTE]
+> Extensive accuracy validation, performance optimizations, and broader architecture coverage are work in progress.
+
+**Legend & Test Configuration:**
+- **Status:** ✓ = Passed | ✗ = Failed or Unsupported
+- **Execution Modes:**
+ - **SL** = Stateless (`GGML_OPENVINO_STATEFUL_EXECUTION=0`)
+ - **SF** = Stateful (`GGML_OPENVINO_STATEFUL_EXECUTION=1`)
+ - Note: The NPU operates in stateless mode only.
+- **Validation system:** Intel® Core™ Ultra 5 238V (Lunar Lake) | 32 GB RAM | Ubuntu 24.04 | Intel OpenCL GPU Driver 26.18.38308.1 | Intel NPU Driver 1.33.0.
+- See [Known Limitations](#known-limitations) for context on observed failures.
+
+| Model | CPU (SL / SF) | GPU (SL / SF) | NPU (SL) |
+| :--- | :---: | :---: | :---: |
+| [bartowski/Llama-3.2-1B-Instruct-Q4_K_M](https://huggingface.co/bartowski/Llama-3.2-1B-Instruct-GGUF) | ✓ / ✓ | ✓ / ✓ | ✓ |
+| [bartowski/Llama-3.2-3B-Instruct-Q4_K_M](https://huggingface.co/bartowski/Llama-3.2-3B-Instruct-GGUF) | ✓ / ✓ | ✓ / ✓ | ✓ |
+| [bartowski/Meta-Llama-3.1-8B-Instruct-Q4_K_M](https://huggingface.co/bartowski/Meta-Llama-3.1-8B-Instruct-GGUF) | ✓ / ✓ | ✓ / ✓ | ✓ |
+| | | | |
+| [Qwen/qwen2.5-1.5b-instruct-q4_k_m](https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct-GGUF) | ✓ / ✓ | ✓ / ✗ | ✓ |
+| [Qwen/qwen2.5-coder-7b-instruct-q4_k_m](https://huggingface.co/Qwen/Qwen2.5-Coder-7B-Instruct-GGUF) | ✓ / ✓ | ✓ / ✗ | ✓ |
+| [bartowski/Qwen_Qwen3-0.6B-Q4_K_M](https://huggingface.co/bartowski/Qwen_Qwen3-0.6B-GGUF) | ✓ / ✓ | ✓ / ✗ | ✓ |
+| [bartowski/Qwen_Qwen3-1.7B-Q4_K_M](https://huggingface.co/bartowski/Qwen_Qwen3-1.7B-GGUF) | ✓ / ✓ | ✓ / ✗ | ✓ |
+| [Qwen/Qwen3-4B-Q4_K_M](https://huggingface.co/Qwen/Qwen3-4B-GGUF) | ✓ / ✓ | ✓ / ✗ | ✓ |
+| [lm-kit/Qwen3-8B-Q4_K_M](https://huggingface.co/lm-kit/qwen-3-8b-instruct-gguf) | ✓ / ✓ | ✓ / ✗ | ✓ |
+| | | | |
+| [unsloth/gemma-3-4b-it-Q4_K_M](https://huggingface.co/unsloth/gemma-3-4b-it-GGUF) | ✓ / ✓ | ✓ / ✗ | ✓ |
+| [bartowski/google_gemma-4-E2B-it-Q4_K_M](https://huggingface.co/bartowski/google_gemma-4-E2B-it-GGUF) | ✓ / ✗ | ✓ / ✗ | ✓ |
+| [bartowski/google_gemma-4-E4B-it-Q4_K_M](https://huggingface.co/bartowski/google_gemma-4-E4B-it-GGUF) | ✓ / ✗ | ✓ / ✗ | ✓ |
+| [bartowski/gemma-4-12B-it-Q4_K_M](https://huggingface.co/bartowski/gemma-4-12B-it-GGUF) | ✓ / ✗ | ✓ / ✗ | ✗ |
+| | | | |
+| [bartowski/Phi-3-mini-4k-instruct-Q4_K_M](https://huggingface.co/bartowski/Phi-3-mini-4k-instruct-GGUF) | ✓ / ✓ | ✓ / ✗ | ✓ |
+| [bartowski/Phi-3.5-mini-instruct-Q4_K_M](https://huggingface.co/bartowski/Phi-3.5-mini-instruct-GGUF) | ✓ / ✓ | ✓ / ✗ | ✓ |
+| | | | |
+| [bartowski/Mistral-7B-Instruct-v0.3-Q4_K_M](https://huggingface.co/bartowski/Mistral-7B-Instruct-v0.3-GGUF) | ✓ / ✓ | ✓ / ✓ | ✓ |
+| [QuantFactory/Ministral-3b-instruct.Q4_K_M](https://huggingface.co/QuantFactory/Ministral-3b-instruct-GGUF) | ✓ / ✓ | ✓ / ✓ | ✓ |
+| [bartowski/Ministral-8B-Instruct-2410-Q4_K_M](https://huggingface.co/bartowski/Ministral-8B-Instruct-2410-GGUF) | ✓ / ✓ | ✓ / ✓ | ✓ |
+| | | | |
+| [bartowski/DeepSeek-R1-Distill-Llama-8B-Q4_K_M](https://huggingface.co/bartowski/DeepSeek-R1-Distill-Llama-8B-GGUF) | ✓ / ✓ | ✓ / ✓ | ✓ |
+| [bartowski/DeepSeek-R1-Distill-Qwen-7B-Q4_K_M](https://huggingface.co/bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF) | ✓ / ✓ | ✓ / ✗ | ✓ |
+| | | | |
+| [ibm-granite/granite-4.0-350m-Q4_K_M](https://huggingface.co/ibm-granite/granite-4.0-350m-GGUF) | ✓ / ✓ | ✗ / ✗ | ✓ |
+| [ibm-granite/granite-4.0-micro-Q4_K_M](https://huggingface.co/ibm-granite/granite-4.0-micro-GGUF) | ✓ / ✓ | ✓ / ✓ | ✓ |
+| [ibm-granite/granite-4.0-1b-Q4_K_M](https://huggingface.co/ibm-granite/granite-4.0-1b-GGUF) | ✓ / ✓ | ✗ / ✗ | ✗ |
+| [ibm-research/granite-3.2-8b-instruct-Q4_K_M](https://huggingface.co/ibm-research/granite-3.2-8b-instruct-GGUF) | ✓ / ✓ | ✓ / ✓ | ✓ |
+| | | | |
+| [HuggingFaceTB/smollm2-1.7b-instruct-q4_k_m](https://huggingface.co/HuggingFaceTB/SmolLM2-1.7B-Instruct-GGUF) | ✓ / ✓ | ✓ / ✓ | ✓ |
+| [openbmb/MiniCPM-V-2_6-Q4_K_M](https://huggingface.co/openbmb/MiniCPM-V-2_6-gguf) | ✓ / ✓ | ✓ / ✗ | ✓ |
+| [bartowski/tencent_Hunyuan-7B-Instruct-Q4_K_M](https://huggingface.co/bartowski/tencent_Hunyuan-7B-Instruct-GGUF) | ✓ / ✓ | ✓ / ✗ | ✓ |
+| [LGAI-EXAONE/EXAONE-3.5-7.8B-Instruct-Q4_K_M](https://huggingface.co/LGAI-EXAONE/EXAONE-3.5-7.8B-Instruct-GGUF) | ✓ / ✓ | ✓ / ✗ | ✓ |
+| [bartowski/prism-ml_Bonsai-8B-unpacked-Q4_K_M](https://huggingface.co/bartowski/prism-ml_Bonsai-8B-unpacked-GGUF) | ✓ / ✓ | ✓ / ✗ | ✓ |
+| | | | |
+| [gpustack/bge-m3-Q4_K_M.gguf](https://huggingface.co/gpustack/bge-m3-GGUF) | ✓ | ✗ | ✗ |
## Build Instructions
-### Prerequisites
+### 0. Prerequisites
- Linux or Windows system with Intel hardware (CPU, GPU, or NPU)
-- **For Intel GPU or NPU Usage**: Install the appropriate hardware drivers for your Intel GPU or NPU. For detailed instructions, see: [Additional Configurations for Hardware Acceleration](https://docs.openvino.ai/2025/get-started/install-openvino/configurations.html).
+- **For Intel GPU or NPU Usage**: Install the appropriate hardware drivers for your Intel GPU or NPU. For detailed instructions, see: [Additional Configurations for Hardware Acceleration](https://docs.openvino.ai/2026/get-started/install-openvino/configurations.html).
- **Linux:**
- Git, CMake, and Ninja software tools are needed for building.
@@ -119,68 +185,390 @@ The following models were validated on Intel® Core™ Ultra Series 2. While our
- Follow the guide to install OpenVINO Runtime from an archive file: [Linux](https://docs.openvino.ai/2026/get-started/install-openvino/install-openvino-archive-linux.html) | [Windows](https://docs.openvino.ai/2026/get-started/install-openvino/install-openvino-archive-windows.html)
+- Verify OpenVINO is initialized properly:
+ ```bash
+ echo $OpenVINO_DIR
+ ```
+
+### 2. Build llama.cpp with OpenVINO Backend
+
+Clone the llama.cpp repository and build it:
+
+```bash
+git clone https://github.com/ggml-org/llama.cpp
+cd llama.cpp
+```
+
- **Linux:**
+```bash
+source /opt/intel/openvino/setupvars.sh
+cmake -B build/ReleaseOV -G Ninja -DCMAKE_BUILD_TYPE=Release -DGGML_OPENVINO=ON
+cmake --build build/ReleaseOV --parallel
+```
-
- 📦 Click to expand OpenVINO installation from an archive file on Ubuntu
-
+- **Windows:** Open a **Developer Command Prompt for VS 2022** (so the MSVC toolchain is on `PATH`), then run:
- ```bash
- wget https://raw.githubusercontent.com/ravi9/misc-scripts/main/openvino/ov-archive-install/install-openvino-from-archive.sh
- chmod +x install-openvino-from-archive.sh
- ./install-openvino-from-archive.sh
- ```
+```cmd
+C:\Intel\openvino\setupvars.bat
+cmake -B build\ReleaseOV -G Ninja -DCMAKE_BUILD_TYPE=Release -DGGML_OPENVINO=ON -DCMAKE_TOOLCHAIN_FILE=C:\vcpkg\scripts\buildsystems\vcpkg.cmake
+cmake --build build\ReleaseOV --parallel
+```
- Verify OpenVINO is initialized properly:
- ```bash
- echo $OpenVINO_DIR
- ```
-
+> [!NOTE]
+> The Windows install path is `C:\Intel\openvino` (no spaces) to avoid quoting problems some CMake/Ninja toolchains have with `C:\Program Files (x86)\...`. Adjust to wherever you installed OpenVINO Runtime. From `cmd`, run `C:\Intel\openvino\setupvars.bat`; from PowerShell, run `& "C:\Intel\openvino\setupvars.ps1"` instead. Once the build is finished you can launch the binaries from any `cmd` or `PowerShell` window after sourcing the matching `setupvars` script for that shell.
+#### Automated Ubuntu Build Script
-### 2. Build llama.cpp with OpenVINO Backend
+For Ubuntu 24.04 users, the following shell script automates the prerequisite installs (build tools, OpenCL ICD), the OpenVINO Runtime download/extract/setup, and the Ninja-based llama.cpp build.
+Save the following as `ubuntu-llamacpp-ov-install.sh` next to where you want the `llama.cpp` folder to land, then run it:
-Clone the OpenVINO-enabled llama.cpp fork and build it:
+```bash
+chmod +x ubuntu-llamacpp-ov-install.sh
+./ubuntu-llamacpp-ov-install.sh
+```
+
+
+Click to expand ubuntu-llamacpp-ov-install.sh
```bash
-git clone https://github.com/ggml-org/llama.cpp
-cd llama.cpp
+#!/usr/bin/env bash
+# ============================================
+# llama.cpp OpenVINO Build Script (Ninja)
+# ============================================
+set -euo pipefail
+
+OPENVINO_VERSION_MAJOR="2026.2"
+OPENVINO_VERSION_FULL="2026.2.0.21903.52ddc073857"
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+OPENVINO_INSTALL_DIR="/opt/intel/openvino_${OPENVINO_VERSION_MAJOR}"
+OPENVINO_LINK_DIR="/opt/intel/openvino"
+OPENVINO_TGZ="${SCRIPT_DIR}/openvino.tgz"
+OPENVINO_URL="https://storage.openvinotoolkit.org/repositories/openvino/packages/${OPENVINO_VERSION_MAJOR}/linux/openvino_toolkit_ubuntu24_${OPENVINO_VERSION_FULL}_x86_64.tgz"
+
+echo "============================================"
+echo "Installing prerequisites (apt)..."
+echo "============================================"
+sudo apt-get update
+sudo apt-get install -y \
+ build-essential libcurl4-openssl-dev libtbb12 \
+ cmake ninja-build python3-pip \
+ curl wget tar git
+
+echo "============================================"
+echo "Installing OpenCL runtime + headers..."
+echo "============================================"
+sudo apt-get install -y \
+ ocl-icd-opencl-dev opencl-headers opencl-clhpp-headers intel-opencl-icd
+
+cd "${SCRIPT_DIR}"
+
+# ============================================
+# Clone llama.cpp if missing
+# ============================================
+if [[ ! -f "llama.cpp/CMakeLists.txt" ]]; then
+ echo "Cloning llama.cpp..."
+ git clone https://github.com/ggml-org/llama.cpp
+fi
+
+# ============================================
+# Setup OpenVINO: download & extract to /opt/intel/openvino_${OPENVINO_VERSION_MAJOR},
+# then point /opt/intel/openvino at it via symlink so the active version is swappable.
+# ============================================
+if [[ -f "${OPENVINO_INSTALL_DIR}/setupvars.sh" ]]; then
+ echo "OpenVINO ${OPENVINO_VERSION_MAJOR} already installed at ${OPENVINO_INSTALL_DIR}. Skipping download."
+else
+ echo "OpenVINO not found at ${OPENVINO_INSTALL_DIR}. Starting download..."
+ curl -fL -o "${OPENVINO_TGZ}" "${OPENVINO_URL}"  # -f: abort on HTTP errors (e.g. 404) instead of saving the error page as the archive
+
+ echo "Extracting OpenVINO to ${OPENVINO_INSTALL_DIR}..."
+ sudo mkdir -p "${OPENVINO_INSTALL_DIR}"
+ sudo tar -xzf "${OPENVINO_TGZ}" -C "${OPENVINO_INSTALL_DIR}" --strip-components=1
+ rm -f "${OPENVINO_TGZ}"
+fi
+
+# Refresh symlink: /opt/intel/openvino -> /opt/intel/openvino_${OPENVINO_VERSION_MAJOR}
+sudo ln -sfn "${OPENVINO_INSTALL_DIR}" "${OPENVINO_LINK_DIR}"
+
+OPENVINO_ROOT="${OPENVINO_LINK_DIR}"
+echo "OpenVINO Ready: ${OPENVINO_ROOT} -> ${OPENVINO_INSTALL_DIR}"
+
+# Install OpenVINO's own runtime dependencies (one-time per system).
+if [[ -x "${OPENVINO_ROOT}/install_dependencies/install_openvino_dependencies.sh" ]]; then
+ echo "============================================"
+ echo "Installing OpenVINO runtime dependencies..."
+ echo "============================================"
+ echo "Y" | sudo -E "${OPENVINO_ROOT}/install_dependencies/install_openvino_dependencies.sh"
+fi
+
+# ============================================
+# Clean old build cache
+# ============================================
+cd "${SCRIPT_DIR}/llama.cpp"
+if [[ -d "build/ReleaseOV" ]]; then
+ echo "Removing old build directory..."
+ rm -rf "build/ReleaseOV"
+fi
+
+echo "============================================"
+echo "Configuring with CMake..."
+echo "============================================"
+# shellcheck disable=SC1091
+source "${OPENVINO_ROOT}/setupvars.sh"
+
+cmake -B build/ReleaseOV -G Ninja \
+ -DCMAKE_BUILD_TYPE=Release \
+ -DGGML_OPENVINO=ON
+
+cmake --build build/ReleaseOV --parallel
+
+echo "============================================"
+echo "Build completed successfully!"
+echo "============================================"
+echo "Binaries: $(pwd)/build/ReleaseOV/bin"
+echo
+echo "NOTE: To run, source setupvars.sh and pick a device:"
+echo " source /opt/intel/openvino/setupvars.sh"
+echo " export GGML_OPENVINO_DEVICE=CPU # or GPU / NPU"
+echo " ./build/ReleaseOV/bin/llama-cli -m model.gguf"
```
-- **Linux:**
- ```bash
- source /opt/intel/openvino/setupvars.sh
- cmake -B build/ReleaseOV -G Ninja -DCMAKE_BUILD_TYPE=Release -DGGML_OPENVINO=ON
- cmake --build build/ReleaseOV --parallel
- ```
+> [!NOTE]
+> The script pins OpenVINO `2026.2` via the `OPENVINO_VERSION_MAJOR` / `OPENVINO_VERSION_FULL` variables at the top — edit them to track a different release.
+
+
+
+#### Automated Windows Build Script
+
+For Windows users, the following `.bat` script automates the prerequisite installs (Git, Ninja, CMake, Visual Studio 2022 Build Tools, vcpkg + OpenCL), the OpenVINO Runtime download/extract, and the Ninja-based llama.cpp build.
+Save the following as `windows-llamacpp-ov-install.bat` next to where you want the `llama.cpp` folder to land, then run it from either **Command Prompt** or **PowerShell**:
+
+```cmd
+:: Command Prompt
+windows-llamacpp-ov-install.bat
+```
+
+```powershell
+# PowerShell
+.\windows-llamacpp-ov-install.bat
+```
+
+
+Click to expand windows-llamacpp-ov-install.bat
+
+```bat
+@echo off
+setlocal enabledelayedexpansion
+
+REM ============================================
+REM llama.cpp OpenVINO Build Script (Ninja)
+REM ============================================
+
+set "OPENVINO_VERSION_MAJOR=2026.2"
+set "OPENVINO_VERSION_FULL=2026.2.0.21903.52ddc073857"
+
+set "SCRIPT_DIR=%~dp0"
+set "VCPKG_DIR=C:\vcpkg"
+set "OPENVINO_INSTALL_DIR=C:\Intel\openvino_%OPENVINO_VERSION_MAJOR%"
+set "OPENVINO_LINK_DIR=C:\Intel\openvino"
+set "OPENVINO_ZIP=%SCRIPT_DIR%openvino.zip"
+set "OPENVINO_EXTRACT_TMP=%SCRIPT_DIR%openvino_extract_tmp"
+set "OPENVINO_URL=https://storage.openvinotoolkit.org/repositories/openvino/packages/%OPENVINO_VERSION_MAJOR%/windows/openvino_toolkit_windows_%OPENVINO_VERSION_FULL%_x86_64.zip"
+
+echo ============================================
+echo Installing prerequisites...
+echo ============================================
+winget install --id Git.Git -e --accept-source-agreements --accept-package-agreements 2>nul
+winget install --id Ninja-build.Ninja -e --accept-source-agreements --accept-package-agreements 2>nul
+winget install --id Kitware.CMake -e --accept-source-agreements --accept-package-agreements 2>nul
+
+REM Ensure Visual Studio Build Tools are installed.
+echo Checking for Visual Studio Build Tools...
+set "VSWHERE=%ProgramFiles(x86)%\Microsoft Visual Studio\Installer\vswhere.exe"
+set "VS_INSTALLED="
+if exist "%VSWHERE%" (
+ for /f "usebackq tokens=*" %%i in (`"%VSWHERE%" -latest -products * -requires Microsoft.VisualStudio.Component.VC.Tools.x86.x64 -property installationPath 2^>nul`) do (
+ set "VS_INSTALLED=%%i"
+ )
+)
+if defined VS_INSTALLED (
+ echo Visual Studio with VC++ x86/x64 tools already present at "!VS_INSTALLED!". Skipping winget install.
+) else (
+ winget install --id Microsoft.VisualStudio.2022.BuildTools -e --override "--wait --passive --add Microsoft.VisualStudio.Workload.VCTools --includeRecommended" --accept-source-agreements --accept-package-agreements
+ if errorlevel 1 (
+ echo WARNING: winget could not install Visual Studio Build Tools automatically.
+ echo Install manually from https://aka.ms/vs/17/release/vs_BuildTools.exe ^(select the "Desktop development with C++" workload^)
+ echo and re-run this script from a "Developer Command Prompt for VS 2022".
+ )
+)
+
+echo ============================================
+echo Installing OpenCL via vcpkg...
+echo ============================================
+if not exist "%VCPKG_DIR%" (
+ git clone https://github.com/microsoft/vcpkg "%VCPKG_DIR%"
+ cd /d "%VCPKG_DIR%"
+ call bootstrap-vcpkg.bat
+ call vcpkg integrate install
+)
+cd /d "%VCPKG_DIR%"
+call vcpkg install opencl
+
+cd /d "%SCRIPT_DIR%"
+
+REM ============================================
+REM Clone llama.cpp if missing
+REM ============================================
+if not exist "llama.cpp\CMakeLists.txt" (
+ echo Cloning llama.cpp...
+ git clone https://github.com/ggml-org/llama.cpp
+)
+
+cd /d "llama.cpp"
+set "SCRIPT_DIR=%CD%"
+
+REM ============================================
+REM Setup OpenVINO: download & extract to C:\Intel\openvino_%OPENVINO_VERSION_MAJOR%,
+REM then point C:\Intel\openvino at it via a directory junction (mklink /J).
+REM ============================================
+
+if exist "%OPENVINO_INSTALL_DIR%\setupvars.bat" (
+ echo OpenVINO %OPENVINO_VERSION_MAJOR% already installed at "%OPENVINO_INSTALL_DIR%". Skipping download.
+) else (
+ echo OpenVINO not found at "%OPENVINO_INSTALL_DIR%". Starting download...
+
+ curl -L -o "%OPENVINO_ZIP%" "%OPENVINO_URL%"
+ if errorlevel 1 (
+ echo ERROR: Download failed.
+ exit /b 1
+ )
+
+ echo Extracting OpenVINO...
+ if exist "%OPENVINO_EXTRACT_TMP%" rmdir /s /q "%OPENVINO_EXTRACT_TMP%"
+ mkdir "%OPENVINO_EXTRACT_TMP%"
+ tar -xf "%OPENVINO_ZIP%" -C "%OPENVINO_EXTRACT_TMP%"
+ if errorlevel 1 (
+ echo ERROR: Extraction failed.
+ exit /b 1
+ )
+
+ REM Move the single top-level folder contents into the versioned install dir.
+ REM NOTE: delayed expansion (!VAR!) is required because the surrounding else( ... )
+ REM block is parsed once up-front, so %OPENVINO_EXTRACTED% would expand to "" here
+ REM and xcopy would then treat "\*" as C:\* and fail with "Cannot perform a cyclic copy".
+ set "OPENVINO_EXTRACTED="
+ for /d %%i in ("%OPENVINO_EXTRACT_TMP%\*") do set "OPENVINO_EXTRACTED=%%i"
+ if not defined OPENVINO_EXTRACTED (
+ echo ERROR: Could not locate extracted OpenVINO folder under "%OPENVINO_EXTRACT_TMP%".
+ exit /b 1
+ )
+ if not exist "%OPENVINO_INSTALL_DIR%" mkdir "%OPENVINO_INSTALL_DIR%"
+ xcopy /e /i /y /q "!OPENVINO_EXTRACTED!\*" "%OPENVINO_INSTALL_DIR%\" >nul
+ if errorlevel 1 (
+ echo ERROR: Failed to copy OpenVINO from "!OPENVINO_EXTRACTED!" to "%OPENVINO_INSTALL_DIR%".
+ echo Re-run this script from an elevated Command Prompt ^(Run as administrator^) if access is denied.
+ exit /b 1
+ )
+
+ rmdir /s /q "%OPENVINO_EXTRACT_TMP%"
+ del "%OPENVINO_ZIP%"
+)
+
+REM Refresh junction: C:\Intel\openvino -> C:\Intel\openvino_%OPENVINO_VERSION_MAJOR%.
+REM `mklink /J` creates a directory junction (no admin / Developer Mode required).
+if exist "%OPENVINO_LINK_DIR%" rmdir "%OPENVINO_LINK_DIR%"
+mklink /J "%OPENVINO_LINK_DIR%" "%OPENVINO_INSTALL_DIR%" >nul
+if errorlevel 1 (
+ echo ERROR: Failed to create junction "%OPENVINO_LINK_DIR%" -^> "%OPENVINO_INSTALL_DIR%".
+ echo If "%OPENVINO_LINK_DIR%" already exists as a regular non-empty folder, remove it manually and re-run.
+ exit /b 1
+)
+
+set "OPENVINO_ROOT=%OPENVINO_LINK_DIR%"
+echo OpenVINO Ready: %OPENVINO_ROOT% -^> %OPENVINO_INSTALL_DIR%
+
+
+echo ============================================
+echo Setting up compiler environment...
+echo ============================================
+REM Locate vcvars64.bat in any Visual Studio product that has the VC++ x86/x64 tools (same filter as the install check).
+set "VSWHERE=%ProgramFiles(x86)%\Microsoft Visual Studio\Installer\vswhere.exe"
+if exist "%VSWHERE%" (
+ for /f "usebackq tokens=*" %%i in (`"%VSWHERE%" -latest -products * -requires Microsoft.VisualStudio.Component.VC.Tools.x86.x64 -property installationPath`) do (
+ set "VS_PATH=%%i"
+ )
+)
+if defined VS_PATH (
+ call "%VS_PATH%\VC\Auxiliary\Build\vcvars64.bat" >nul
+) else (
+ echo WARNING: Visual Studio Build Tools not found. Compiler may be missing.
+)
+
+REM ============================================
+REM Clean old build cache
+REM ============================================
+if exist "build\ReleaseOV" (
+ echo Removing old build directory ...
+ rmdir /s /q "build\ReleaseOV"
+)
+
+echo ============================================
+echo Configuring with CMake...
+echo ============================================
+call "%OPENVINO_ROOT%\setupvars.bat" >nul 2>nul
+
+cmake -B build\ReleaseOV -G Ninja ^
+ -DCMAKE_BUILD_TYPE=Release ^
+ -DGGML_OPENVINO=ON ^
+ -DCMAKE_TOOLCHAIN_FILE="%VCPKG_DIR%\scripts\buildsystems\vcpkg.cmake"
+
+if errorlevel 1 (
+ echo If you continue to face CMAKE errors, make sure to install:
+ echo winget install Microsoft.VisualStudio.2022.BuildTools
+ echo Then run the "Developer Command Prompt for VS 2022" and launch this script from there.
+ exit /b 1
+)
+
+cmake --build build\ReleaseOV --config Release
+if errorlevel 1 exit /b 1
+
+echo ============================================
+echo Build completed successfully!
+echo ============================================
+echo Binaries: %CD%\build\ReleaseOV\bin
+echo.
+echo NOTE: To run, source setupvars.bat and pick a device:
+echo call "C:\Intel\openvino\setupvars.bat"
+echo set "GGML_OPENVINO_DEVICE=CPU" ^&^& REM or GPU / NPU
+echo build\ReleaseOV\bin\llama-cli.exe -m model.gguf
+echo.
+
+endlocal
+```
-- **Windows:**
- ```cmd
- # x64 Native Tools Command Prompt for VS 2022
- "C:\Program Files (x86)\Intel\openvino_2026.0\setupvars.bat"
- cmake -B build\ReleaseOV -G Ninja -DCMAKE_BUILD_TYPE=Release -DGGML_OPENVINO=ON -DLLAMA_CURL=OFF -DCMAKE_TOOLCHAIN_FILE=C:\vcpkg\scripts\buildsystems\vcpkg.cmake
- cmake --build build\ReleaseOV --parallel
- ```
> [!NOTE]
-> Use `x64 Native Tools Command Prompt` for Windows build. After building, you could use either `cmd` or `PowerShell` to run the OpenVINO backend.
+> The script pins OpenVINO `2026.2` via the `OPENVINO_VERSION_MAJOR` / `OPENVINO_VERSION_FULL` variables at the top — edit them to track a different release. From any new shell, source the matching `setupvars` script via the junction — `call "C:\Intel\openvino\setupvars.bat"` from `cmd`, or `& "C:\Intel\openvino\setupvars.ps1"` from PowerShell. If `winget` cannot register Visual Studio Build Tools on first run, install them once manually and re-run the script from an elevated **Developer Command Prompt for VS 2022**.
+
+
+
### 3. Download Sample Model
-Download models for testing:
+Download a sample model for testing.
```bash
# Linux
mkdir -p ~/models/
-wget https://huggingface.co/unsloth/Llama-3.2-1B-Instruct-GGUF/resolve/main/Llama-3.2-1B-Instruct-Q4_0.gguf \
- -O ~/models/Llama-3.2-1B-Instruct-Q4_0.gguf
+wget https://huggingface.co/bartowski/Llama-3.2-1B-Instruct-GGUF/resolve/main/Llama-3.2-1B-Instruct-Q4_K_M.gguf \
+ -O ~/models/Llama-3.2-1B-Instruct-Q4_K_M.gguf
# Windows PowerShell
mkdir C:\models
-Invoke-WebRequest -Uri https://huggingface.co/unsloth/Llama-3.2-1B-Instruct-GGUF/resolve/main/Llama-3.2-1B-Instruct-Q4_0.gguf -OutFile C:\models\Llama-3.2-1B-Instruct-Q4_0.gguf
+Invoke-WebRequest -Uri https://huggingface.co/bartowski/Llama-3.2-1B-Instruct-GGUF/resolve/main/Llama-3.2-1B-Instruct-Q4_K_M.gguf -OutFile C:\models\Llama-3.2-1B-Instruct-Q4_K_M.gguf
# Windows Command Line
mkdir C:\models
-curl -L https://huggingface.co/unsloth/Llama-3.2-1B-Instruct-GGUF/resolve/main/Llama-3.2-1B-Instruct-Q4_0.gguf -o C:\models\Llama-3.2-1B-Instruct-Q4_0.gguf
+curl -L https://huggingface.co/bartowski/Llama-3.2-1B-Instruct-GGUF/resolve/main/Llama-3.2-1B-Instruct-Q4_K_M.gguf -o C:\models\Llama-3.2-1B-Instruct-Q4_K_M.gguf
```
### 4. Run Inference with OpenVINO Backend
@@ -196,65 +584,45 @@ When using the OpenVINO backend, the first inference token may have slightly hig
# Linux
export GGML_OPENVINO_DEVICE=GPU
-# Enable stateful execution with GPU device to avoid known stateless execution failures.
+# Optional: enable stateful execution for improved GPU performance (recommended).
export GGML_OPENVINO_STATEFUL_EXECUTION=1
# To run llama-simple:
-./build/ReleaseOV/bin/llama-simple -m ~/models/Llama-3.2-1B-Instruct-Q4_0.gguf -n 50 "The story of AI is "
+./build/ReleaseOV/bin/llama-simple -m ~/models/Llama-3.2-1B-Instruct-Q4_K_M.gguf -n 50 "The story of AI is "
# To run in chat mode:
-./build/ReleaseOV/bin/llama-cli -m ~/models/Llama-3.2-1B-Instruct-Q4_0.gguf -c 1024
+./build/ReleaseOV/bin/llama-cli -m ~/models/Llama-3.2-1B-Instruct-Q4_K_M.gguf -c 1024
# To run llama-bench, -fa 1 is needed
-GGML_OPENVINO_STATEFUL_EXECUTION=1 GGML_OPENVINO_DEVICE=GPU ./build/ReleaseOV/bin/llama-bench -m ~/models/Llama-3.2-1B-Instruct-Q4_0.gguf -fa 1
+GGML_OPENVINO_STATEFUL_EXECUTION=1 GGML_OPENVINO_DEVICE=GPU ./build/ReleaseOV/bin/llama-bench -m ~/models/Llama-3.2-1B-Instruct-Q4_K_M.gguf -fa 1
# NPU: keep context small to avoid failures from very large model context windows.
export GGML_OPENVINO_DEVICE=NPU
-./build/ReleaseOV/bin/llama-cli -m ~/models/Llama-3.2-1B-Instruct-Q4_0.gguf -c 512
+./build/ReleaseOV/bin/llama-cli -m ~/models/Llama-3.2-1B-Instruct-Q4_K_M.gguf -c 512
# Windows Command Line
set GGML_OPENVINO_DEVICE=GPU
-# Enable stateful execution with GPU device to avoid known stateless execution failures.
+# Optional: enable stateful execution for improved GPU performance (recommended).
set GGML_OPENVINO_STATEFUL_EXECUTION=1
# Windows PowerShell
$env:GGML_OPENVINO_DEVICE = "GPU"
$env:GGML_OPENVINO_STATEFUL_EXECUTION = "1"
# To run llama-simple
-build\ReleaseOV\bin\llama-simple.exe -m "C:\models\Llama-3.2-1B-Instruct-Q4_0.gguf" -n 50 "The story of AI is "
+build\ReleaseOV\bin\llama-simple.exe -m "C:\models\Llama-3.2-1B-Instruct-Q4_K_M.gguf" -n 50 "The story of AI is "
# To run in chat mode:
-build\ReleaseOV\bin\llama-cli.exe -m "C:\models\Llama-3.2-1B-Instruct-Q4_0.gguf" -c 1024
+build\ReleaseOV\bin\llama-cli.exe -m "C:\models\Llama-3.2-1B-Instruct-Q4_K_M.gguf" -c 1024
# To run llama-bench, -fa 1 is needed
-build\ReleaseOV\bin\llama-bench.exe -m "C:\models\Llama-3.2-1B-Instruct-Q4_0.gguf" -fa 1
+build\ReleaseOV\bin\llama-bench.exe -m "C:\models\Llama-3.2-1B-Instruct-Q4_K_M.gguf" -fa 1
# NPU: keep context small to avoid failures from very large model context windows.
# Windows Command Line
set GGML_OPENVINO_DEVICE=NPU
# Windows PowerShell
$env:GGML_OPENVINO_DEVICE = "NPU"
-build\ReleaseOV\bin\llama-cli.exe -m "C:\models\Llama-3.2-1B-Instruct-Q4_0.gguf" -c 512
+build\ReleaseOV\bin\llama-cli.exe -m "C:\models\Llama-3.2-1B-Instruct-Q4_K_M.gguf" -c 512
```
> [!NOTE]
> On systems with multiple GPUs, use `GPU.0` or `GPU.1` to explicitly target specific GPU. See [OpenVINO GPU Device](https://docs.openvino.ai/2026/openvino-workflow/running-inference/inference-devices-and-modes/gpu-device.html) for more details.
-### Known Issues and Current Workarounds
-
-- GPU stateless execution is currently affected by a known issue.
- - Workaround: set `GGML_OPENVINO_STATEFUL_EXECUTION=1` when using GPU device.
-- NPU failures can happen when context size is too large. Recent llama.cpp behavior may resolve context size to the model training context (for example, 131072 for Llama 3.2 1B), which is too large for current NPU usage and can also stress laptop CPU/GPU on larger models. To inspect the selected context size, run `llama-cli` or `llama-server` with `-lv 3`.
- - Workaround: explicitly set context size, for ex. `-c 1024` for NPU runs. Performance will be better with lower context size.
-- Additional NPU limitations:
- - Model caching is not yet supported.
- - `llama-server -np > 1` (multiple parallel sequences) is not supported.
- - `llama-perplexity` is only supported with `-b 512` or smaller.
-- `--context-shift` with `llama-cli` is currently not supported with OpenVINO backend across CPU, GPU, and NPU devices.
-- Encoder models (embedding, reranking) are not supported with the current OpenVINO backend implementation.
-- `-fa 1` is required when running llama-bench with the OpenVINO backend.
- - `GGML_OPENVINO_STATEFUL_EXECUTION=1 GGML_OPENVINO_DEVICE=GPU ./llama-bench -fa 1`
-- `llama-server` with OpenVINO backend supports only one chat session/thread, when `GGML_OPENVINO_STATEFUL_EXECUTION=1` is enabled.
-
-> [!NOTE]
-> The OpenVINO backend is actively under development. Fixes are underway, and this document will continue to be updated as issues are resolved.
-
-
-### Docker Build
+### 5. Docker Build
You can build and run llama.cpp with OpenVINO backend using Docker.
@@ -272,7 +640,7 @@ docker build --target=light -t llama-openvino:light -f .devops/openvino.Dockerfi
docker build --target=server -t llama-openvino:server -f .devops/openvino.Dockerfile .
# If you are behind a proxy:
-docker build --build-arg http_proxy=$http_proxy --build-arg https_proxy=$https_proxy --target=light -t llama-openvino:light -f .devops/openvino.Dockerfile .
+docker build --build-arg http_proxy=$http_proxy --build-arg https_proxy=$https_proxy --target=server -t llama-openvino:server -f .devops/openvino.Dockerfile .
```
Run llama.cpp with OpenVINO backend Docker container.
@@ -281,19 +649,19 @@ Save sample models in `~/models` as [shown above](#3-download-sample-model). It
```bash
# Run Docker container
-docker run --rm -it -v ~/models:/models llama-openvino:light --no-warmup -c 1024 -m /models/Llama-3.2-1B-Instruct-Q4_0.gguf
+docker run --rm -it -v ~/models:/models llama-openvino:light --no-warmup -c 1024 -m /models/Llama-3.2-1B-Instruct-Q4_K_M.gguf
# With Intel GPU access (iGPU or dGPU)
docker run --rm -it -v ~/models:/models \
--device=/dev/dri --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) -u $(id -u):$(id -g) \
--env=GGML_OPENVINO_DEVICE=GPU --env=GGML_OPENVINO_STATEFUL_EXECUTION=1 \
-llama-openvino:light --no-warmup -c 1024 -m /models/Llama-3.2-1B-Instruct-Q4_0.gguf
+llama-openvino:light --no-warmup -c 1024 -m /models/Llama-3.2-1B-Instruct-Q4_K_M.gguf
# With Intel NPU access
docker run --rm -it -v ~/models:/models \
--device=/dev/accel --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) -u $(id -u):$(id -g) \
--env=GGML_OPENVINO_DEVICE=NPU \
-llama-openvino:light --no-warmup -c 1024 -m /models/Llama-3.2-1B-Instruct-Q4_0.gguf
+llama-openvino:light --no-warmup -c 1024 -m /models/Llama-3.2-1B-Instruct-Q4_K_M.gguf
```
Run Llama.cpp Server with OpenVINO Backend.
@@ -301,17 +669,30 @@ Run Llama.cpp Server with OpenVINO Backend.
> `llama-server` with OpenVINO backend supports only one chat session/thread, when `GGML_OPENVINO_STATEFUL_EXECUTION=1` is enabled.
```bash
-# Run the Server Docker container
-docker run --rm -it -p 8080:8080 -v ~/models:/models llama-openvino:server --no-warmup -m /models/Llama-3.2-1B-Instruct-Q4_0.gguf -c 1024
-# Or Using llama-server executable
-./build/ReleaseOV/bin/llama-server -m ~/models/Llama-3.2-1B-Instruct-Q4_0.gguf --port 8080 -c 1024
+# Run the llama-openvino:server Docker container (CPU)
+docker run --rm -it -p 8080:8080 -v ~/models:/models llama-openvino:server --no-warmup -m /models/Llama-3.2-1B-Instruct-Q4_K_M.gguf -c 1024 --host 0.0.0.0
-# If you are behind a proxy, make sure to set NO_PROXY to avoid proxy for localhost
-export NO_PROXY=localhost,127.0.0.1
+# Run the llama-openvino:server Docker container with Intel GPU access (iGPU or dGPU)
+docker run --rm -it -v ~/models:/models \
+--device=/dev/dri --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) -u $(id -u):$(id -g) \
+-p 8080:8080 --env=GGML_OPENVINO_DEVICE=GPU \
+llama-openvino:server --no-warmup -c 1024 -m /models/Llama-3.2-1B-Instruct-Q4_K_M.gguf --host 0.0.0.0
+
+# Run the llama-openvino:server Docker container with Intel NPU access
+docker run --rm -it -v ~/models:/models \
+--device=/dev/accel --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) -u $(id -u):$(id -g) \
+-p 8080:8080 --env=GGML_OPENVINO_DEVICE=NPU \
+llama-openvino:server --no-warmup -c 1024 -m /models/Llama-3.2-1B-Instruct-Q4_K_M.gguf --host 0.0.0.0
+
+# Or Using llama-server executable
+./build/ReleaseOV/bin/llama-server -m ~/models/Llama-3.2-1B-Instruct-Q4_K_M.gguf --port 8080 -c 1024
# Option 1: Open your browser to http://localhost:8080 to access the web UI for the llama.cpp server.
# Option 2: In a NEW terminal, test the server with curl
+# If you are behind a proxy, make sure to set NO_PROXY to avoid proxy for localhost
+export NO_PROXY=localhost,127.0.0.1
+
# Test health endpoint
curl -f http://localhost:8080/health
@@ -320,24 +701,26 @@ curl -X POST "http://localhost:8080/v1/chat/completions" -H "Content-Type: appli
-d '{"messages":[{"role":"user","content":"Write a poem about OpenVINO"}],"max_tokens":100}' | jq .
```
-## Runtime Configuration
+## GGML OpenVINO Backend Runtime Configurations
The OpenVINO backend can be configured using the following environment variables at runtime to control device selection, caching, debugging, and profiling behavior.
-
-### Configuration Options
-
-| Variable | Default | Description |
-|-----------------------------------|------------|-------------------------------------------------------------------------------------------------------------|
-| `GGML_OPENVINO_DEVICE` | `CPU` | Specify the target device (CPU, GPU, NPU). On systems with multiple GPUs, use `GPU.0` or `GPU.1` to explicitly target specific GPU. See [OpenVINO GPU Device](https://docs.openvino.ai/2026/openvino-workflow/running-inference/inference-devices-and-modes/gpu-device.html). When set to **NPU**, static compilation mode is enabled for optimal performance. |
-| `GGML_OPENVINO_CACHE_DIR` | `not set` | Directory for OpenVINO model caching (recommended: `/tmp/ov_cache`). Enables model caching when set. **Not supported on NPU devices.** |
-| `GGML_OPENVINO_PREFILL_CHUNK_SIZE`| `256` | Token chunk size for **NPU** prefill. |
-| `GGML_OPENVINO_STATEFUL_EXECUTION`| `0` | Enable stateful KV cache on for better performance. Recommended on CPU, GPU. |
-| `GGML_OPENVINO_PROFILING` | `0` | Enable execution-time profiling. |
-| `GGML_OPENVINO_DUMP_CGRAPH` | `0` | Dump the GGML compute graph to `cgraph_ov.txt`. |
-| `GGML_OPENVINO_DUMP_IR` | `0` | Serialize OpenVINO IR files with timestamps. |
-| `GGML_OPENVINO_DEBUG_INPUT` | `0` | Enable input debugging and print input tensor info. |
-| `GGML_OPENVINO_DEBUG_OUTPUT` | `0` | Enable output debugging and print output tensor info. |
-| `GGML_OPENVINO_PRINT_CGRAPH_TENSOR_ADDRESS` | `0` | Print tensor address map once. |
+Boolean flags follow a uniform convention: set to a **positive integer** (e.g. `1`) to enable; unset, empty, `0`, negative, or non-numeric values are treated as disabled.
+
+| Variable | Type | Default | Description |
+|-----------------------------------|-----------|------------|-------------------------------------------------------------------------------------------------------------|
+| `GGML_OPENVINO_DEVICE` | String | `CPU` | Specify the target device (CPU, GPU, NPU). On systems with multiple GPUs, use `GPU.0` or `GPU.1` to explicitly target a specific GPU. See [OpenVINO GPU Device](https://docs.openvino.ai/2026/openvino-workflow/running-inference/inference-devices-and-modes/gpu-device.html). When set to **NPU**, static compilation mode is enabled for optimal performance. |
+| `GGML_OPENVINO_CACHE_DIR` | String | `not set` | Directory for OpenVINO model caching (recommended: `/tmp/ov_cache`). Enables model caching when set. **Not supported on NPU devices.** |
+| `GGML_OPENVINO_PREFILL_CHUNK_SIZE`| Integer | `256` | Token chunk size for **NPU** prefill (NPU-only; ignored on CPU/GPU). Must be a positive integer; otherwise the default is used. |
+| `GGML_OPENVINO_STATEFUL_EXECUTION`| Boolean | `0` | Enable stateful KV cache for better performance. Recommended on CPU, GPU. |
+| `GGML_OPENVINO_DISABLE_CACHE` | Boolean | `0` | Disable the in-process compiled-model / decoder cache (cache is on by default). Set to `1` to disable. |
+| `GGML_OPENVINO_DISABLE_KV_SLICE` | Boolean | `0` | Disable the KV-cache input-tensor slicing optimization (slicing is on by default on CPU/GPU). Set to `1` to disable. |
+| `GGML_OPENVINO_MANUAL_GQA_ATTN` | Boolean | device-based | Tri-state. When **unset**, manual GQA attention is enabled by default on `GPU` and disabled on other devices. Set to a positive integer to force-enable, or `0` to force-disable. |
+| `GGML_OPENVINO_PROFILING` | Boolean | `0` | Enable execution-time profiling. |
+| `GGML_OPENVINO_DUMP_CGRAPH` | Boolean | `0` | Dump the GGML compute graph to `cgraph_ov.txt`. |
+| `GGML_OPENVINO_DUMP_IR` | Boolean | `0` | Serialize OpenVINO IR files with timestamps. |
+| `GGML_OPENVINO_DEBUG_INPUT` | Boolean | `0` | Enable input debugging and print input tensor info. |
+| `GGML_OPENVINO_DEBUG_OUTPUT` | Boolean | `0` | Enable output debugging and print output tensor info. |
+| `GGML_OPENVINO_PRINT_CGRAPH_TENSOR_ADDRESS` | Boolean | `0` | Print tensor address map once. |
> [!NOTE]
>`GGML_OPENVINO_STATEFUL_EXECUTION` is an **Experimental** feature to allow stateful execution for managing the KV cache internally inside the OpenVINO model, improving performance on CPUs and GPUs. Stateful execution is not effective on NPUs, and not all models currently support this feature. This feature is experimental and has been validated only with the llama-simple, llama-cli, llama-bench, and llama-run applications and is recommended to enable for the best performance. Other applications, such as llama-server and llama-perplexity, are not yet supported.
@@ -355,7 +738,7 @@ export GGML_OPENVINO_PROFILING=1
export GGML_OPENVINO_DEVICE=GPU
export GGML_OPENVINO_STATEFUL_EXECUTION=1
-./build/ReleaseOV/bin/llama-simple -m ~/models/Llama-3.2-1B-Instruct-Q4_0.gguf -n 50 "The story of AI is "
+./build/ReleaseOV/bin/llama-simple -m ~/models/Llama-3.2-1B-Instruct-Q4_K_M.gguf -n 50 "The story of AI is "
# Windows Command Line
set GGML_OPENVINO_CACHE_DIR=C:\tmp\ov_cache
@@ -369,19 +752,39 @@ $env:GGML_OPENVINO_PROFILING = "1"
$env:GGML_OPENVINO_DEVICE = "GPU"
$env:GGML_OPENVINO_STATEFUL_EXECUTION = "1"
-build\ReleaseOV\bin\llama-simple.exe -m "C:\models\Llama-3.2-1B-Instruct-Q4_0.gguf" -n 50 "The story of AI is "
+build\ReleaseOV\bin\llama-simple.exe -m "C:\models\Llama-3.2-1B-Instruct-Q4_K_M.gguf" -n 50 "The story of AI is "
```
-## Llama.cpp Tools
+## Known Limitations
-The following tools work with the OpenVINO backend on CPU, GPU, NPU:
-- llama-bench
-- llama-cli
-- llama-completion
-- llama-perplexity
-- llama-server
-- llama-simple
+**General (all devices)**
+
+- Llama.cpp OpenVINO backend currently supports a subset of GGML ops and text-only models. Unsupported ops or unsupported op shapes/cases fail during OpenVINO translation.
+- Multimodal features (audio/image/video) are a work in progress.
+- Limited Embedding and Reranking model support.
+- Llama.cpp tool coverage across CPU/GPU/NPU is not uniform.
+
+**Tool-specific**
+
+- `llama-bench`: requires `-fa 1` (flash-attention).
+- `llama-cli --context-shift`: stateless only (`GGML_OPENVINO_STATEFUL_EXECUTION=0`). In stateful mode the KV cache is owned by the OpenVINO model and cannot be shifted externally.
+- `llama-server`: supports only one chat session/thread when `GGML_OPENVINO_STATEFUL_EXECUTION=1`.
+
+**GPU-specific**
+
+- `llama-server -np > 1`: concurrent requests are batched together, which may slightly reduce per-request throughput.
+
+**NPU-specific**
+
+- Default context resolves to the model's training context (e.g. 131072 for Llama 3.2 1B), which can cause out-of-memory errors, failures, or degraded performance on NPU. Inspect the resolved value with `-lv 3`.
+ - **Workaround:** Pass an explicit `-c <N>`, e.g. `-c 1024`.
+- NPU device uses a static graph with a fixed prefill chunk size (defaults to 256), configurable with `GGML_OPENVINO_PREFILL_CHUNK_SIZE`. Large prefill/batch settings may need tuning.
+- `llama-server -np > 1` (multiple parallel sequences) is not supported.
+- `llama-perplexity`: requires `-b 512` or smaller.
+
+> [!NOTE]
+> The OpenVINO backend is actively under development. Fixes and improvements are underway, and this document will continue to be updated.
## Work in Progress
diff --git a/ggml/src/ggml-openvino/.clang-format b/ggml/src/ggml-openvino/.clang-format
index a2a24d7d33a0..4a5c7c208676 100644
--- a/ggml/src/ggml-openvino/.clang-format
+++ b/ggml/src/ggml-openvino/.clang-format
@@ -2,12 +2,7 @@
# Override root .clang-format
AlignConsecutiveAssignments: false
AlignConsecutiveDeclarations: false
-Cpp11BracedListStyle: true
-SpacesInContainerLiterals: false
-BreakBeforeBraces: Attach
AccessModifierOffset: -4
-IndentCaseBlocks: false
-IndentCaseLabels: false
Language: Cpp
AlignAfterOpenBracket: Align
diff --git a/ggml/src/ggml-openvino/CMakeLists.txt b/ggml/src/ggml-openvino/CMakeLists.txt
index 175b585661d3..cc089b721fc3 100644
--- a/ggml/src/ggml-openvino/CMakeLists.txt
+++ b/ggml/src/ggml-openvino/CMakeLists.txt
@@ -1,8 +1,6 @@
-find_package(OpenVINO REQUIRED)
+find_package(OpenVINO REQUIRED COMPONENTS Runtime Threading)
find_package(OpenCL REQUIRED)
-include("${OpenVINO_DIR}/../3rdparty/tbb/lib/cmake/TBB/TBBConfig.cmake")
-
file(GLOB_RECURSE GGML_HEADERS_OPENVINO "*.h" "*.hpp")
file(GLOB_RECURSE GGML_SOURCES_OPENVINO "*.cpp")
@@ -11,7 +9,7 @@ ggml_add_backend_library(ggml-openvino
${GGML_HEADERS_OPENVINO}
)
-target_link_libraries(ggml-openvino PRIVATE openvino::runtime TBB::tbb OpenCL::OpenCL)
+target_link_libraries(ggml-openvino PRIVATE openvino::runtime openvino::threading OpenCL::OpenCL)
if (GGML_OPENVINO)
if (CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64")
diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp
index 5095e7998493..b479ece177da 100644
--- a/ggml/src/ggml-openvino/ggml-decoder.cpp
+++ b/ggml/src/ggml-openvino/ggml-decoder.cpp
@@ -1,20 +1,17 @@
#include "ggml-decoder.h"
-#include "ggml-backend-impl.h"
-#include "ggml-backend.h"
+#include "ggml-impl.h"
#include "ggml-openvino-extra.h"
#include "ggml-openvino.h"
#include "ggml-quants.h"
-
-#include
-#include
+#include "ggml.h"
+#include "utils.h"
#include
#include
#include
#include
#include
-#include
#include
#include
#include