diff --git a/.devops/openvino.Dockerfile b/.devops/openvino.Dockerfile index fec72b1c7deb..bcaea16cc3f1 100644 --- a/.devops/openvino.Dockerfile +++ b/.devops/openvino.Dockerfile @@ -1,12 +1,12 @@ -ARG OPENVINO_VERSION_MAJOR=2026.2 -ARG OPENVINO_VERSION_FULL=2026.2.0.21903.52ddc073857 +ARG OPENVINO_VERSION_MAJOR=2026.2.1 +ARG OPENVINO_VERSION_FULL=2026.2.1.21919.ede283a88e3 ARG UBUNTU_VERSION=24.04 # Intel GPU driver versions. https://github.com/intel/compute-runtime/releases -ARG IGC_VERSION=v2.34.4 -ARG IGC_VERSION_FULL=2_2.34.4+21428 -ARG COMPUTE_RUNTIME_VERSION=26.18.38308.1 -ARG COMPUTE_RUNTIME_VERSION_FULL=26.18.38308.1-0 +ARG IGC_VERSION=v2.36.3 +ARG IGC_VERSION_FULL=2_2.36.3+21719 +ARG COMPUTE_RUNTIME_VERSION=26.22.38646.4 +ARG COMPUTE_RUNTIME_VERSION_FULL=26.22.38646.4-0 ARG IGDGMM_VERSION=22.10.0 # Intel NPU driver versions. https://github.com/intel/linux-npu-driver/releases diff --git a/.github/workflows/build-cache.yml b/.github/workflows/build-cache.yml index b36c6e1ea89b..327f71978bf1 100644 --- a/.github/workflows/build-cache.yml +++ b/.github/workflows/build-cache.yml @@ -68,8 +68,8 @@ jobs: env: # Sync versions in build.yml, build-self-hosted.yml, release.yml, build-cache.yml, .devops/openvino.Dockerfile - OPENVINO_VERSION_MAJOR: "2026.2" - OPENVINO_VERSION_FULL: "2026.2.0.21903.52ddc073857" + OPENVINO_VERSION_MAJOR: "2026.2.1" + OPENVINO_VERSION_FULL: "2026.2.1.21919.ede283a88e3" steps: - name: Clone @@ -96,8 +96,8 @@ jobs: env: # Sync versions in build.yml, build-self-hosted.yml, release.yml, build-cache.yml, .devops/openvino.Dockerfile - OPENVINO_VERSION_MAJOR: "2026.2" - OPENVINO_VERSION_FULL: "2026.2.0.21903.52ddc073857" + OPENVINO_VERSION_MAJOR: "2026.2.1" + OPENVINO_VERSION_FULL: "2026.2.1.21919.ede283a88e3" steps: - name: Clone diff --git a/.github/workflows/build-openvino.yml b/.github/workflows/build-openvino.yml index 49ab13695cbf..938cde3f20ff 100644 --- a/.github/workflows/build-openvino.yml +++ b/.github/workflows/build-openvino.yml @@ -39,8 +39,8 @@ jobs: env: # Sync versions in build-openvino.yml, build-self-hosted.yml, release.yml, build-cache.yml, .devops/openvino.Dockerfile - OPENVINO_VERSION_MAJOR: "2026.2" - OPENVINO_VERSION_FULL: "2026.2.0.21903.52ddc073857" + OPENVINO_VERSION_MAJOR: "2026.2.1" + OPENVINO_VERSION_FULL: "2026.2.1.21919.ede283a88e3" steps: - name: Clone @@ -96,8 +96,8 @@ jobs: env: # Sync versions in build-openvino.yml, build-self-hosted.yml, release.yml, build-cache.yml, .devops/openvino.Dockerfile - OPENVINO_VERSION_MAJOR: "2026.2" - OPENVINO_VERSION_FULL: "2026.2.0.21903.52ddc073857" + OPENVINO_VERSION_MAJOR: "2026.2.1" + OPENVINO_VERSION_FULL: "2026.2.1.21919.ede283a88e3" steps: - name: Clone diff --git a/.github/workflows/build-self-hosted.yml b/.github/workflows/build-self-hosted.yml index c4366ece3e59..1a71ed827729 100644 --- a/.github/workflows/build-self-hosted.yml +++ b/.github/workflows/build-self-hosted.yml @@ -266,8 +266,8 @@ jobs: env: # Sync versions in build.yml, build-self-hosted.yml, release.yml, build-cache.yml, .devops/openvino.Dockerfile - OPENVINO_VERSION_MAJOR: "2026.2" - OPENVINO_VERSION_FULL: "2026.2.0.21903.52ddc073857" + OPENVINO_VERSION_MAJOR: "2026.2.1" + OPENVINO_VERSION_FULL: "2026.2.1.21919.ede283a88e3" steps: - name: Clone diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index c7b67e49255f..eb7e1f20d468 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -446,8 +446,8 @@ jobs: env: # Sync versions in build-openvino.yml, build-self-hosted.yml, release.yml, build-cache.yml, .devops/openvino.Dockerfile - OPENVINO_VERSION_MAJOR: "2026.2" - OPENVINO_VERSION_FULL: "2026.2.0.21903.52ddc073857" + OPENVINO_VERSION_MAJOR: "2026.2.1" + OPENVINO_VERSION_FULL: "2026.2.1.21919.ede283a88e3" steps: - name: Set OpenVINO version output @@ -506,8 +506,11 @@ jobs: cmake -B build/ReleaseOV -G Ninja \ -DCMAKE_BUILD_TYPE=Release \ -DGGML_OPENVINO=ON \ - -DHF_UI_VERSION=${{ needs.get-version.outputs.ui_version }} - cmake --build build/ReleaseOV --config Release -j $(nproc) + -DCMAKE_INSTALL_RPATH='$ORIGIN' \ + -DCMAKE_BUILD_WITH_INSTALL_RPATH=ON \ + -DHF_UI_VERSION=${{ needs.get-version.outputs.ui_version }} \ + ${{ env.CMAKE_ARGS }} + cmake --build build/ReleaseOV --config Release --parallel - name: ccache-clear uses: ./.github/actions/ccache-clear @@ -521,8 +524,26 @@ jobs: - name: Pack artifacts id: pack_artifacts run: | - cp LICENSE ./build/ReleaseOV/bin/ - tar -czvf llama-${{ steps.tag.outputs.name }}-bin-ubuntu-openvino-${{ env.OPENVINO_VERSION_MAJOR }}-x64.tar.gz --transform "s,^\.,llama-${{ steps.tag.outputs.name }}," -C ./build/ReleaseOV/bin . + dest=./build/ReleaseOV/bin + OPENVINO_ROOT=./openvino_toolkit + ov_lib="$OPENVINO_ROOT/runtime/lib/intel64" + + # Bundle OpenVINO runtime libs + TBB. Binaries built with RPATH=$ORIGIN + # load these siblings without setupvars.sh / LD_LIBRARY_PATH. + cp -P "$ov_lib"/libopenvino.so* \ + "$ov_lib"/libopenvino_c.so* \ + "$ov_lib"/libopenvino_*_plugin.so \ + "$ov_lib"/libopenvino_intel_npu_compiler*.so \ + "$OPENVINO_ROOT"/runtime/3rdparty/tbb/lib/*.so* \ + "$dest" + cp -P /usr/lib/x86_64-linux-gnu/libOpenCL.so.1* "$dest" 2>/dev/null || true + cp "$ov_lib"/cache.json "$dest" 2>/dev/null || true + + # OpenVINO licensing + cp -r "$OPENVINO_ROOT"/docs/licensing "$dest"/openvino-licensing + + cp LICENSE "$dest" + tar -czvf llama-${{ steps.tag.outputs.name }}-bin-ubuntu-openvino-${{ env.OPENVINO_VERSION_MAJOR }}-x64.tar.gz --transform "s,^\.,llama-${{ steps.tag.outputs.name }}," -C "$dest" . - name: Upload artifacts uses: actions/upload-artifact@v6 @@ -538,8 +559,8 @@ jobs: env: # Sync versions in build-openvino.yml, build-self-hosted.yml, release.yml, build-cache.yml, .devops/openvino.Dockerfile - OPENVINO_VERSION_MAJOR: "2026.2" - OPENVINO_VERSION_FULL: "2026.2.0.21903.52ddc073857" + OPENVINO_VERSION_MAJOR: "2026.2.1" + OPENVINO_VERSION_FULL: "2026.2.1.21919.ede283a88e3" steps: - name: Set OpenVINO version output @@ -607,7 +628,9 @@ jobs: -A x64 ^ -DCMAKE_BUILD_TYPE=Release ^ -DGGML_OPENVINO=ON ^ - -DCMAKE_TOOLCHAIN_FILE=C:\vcpkg\scripts\buildsystems\vcpkg.cmake + -DLLAMA_BUILD_BORINGSSL=ON ^ + -DCMAKE_TOOLCHAIN_FILE=C:\vcpkg\scripts\buildsystems\vcpkg.cmake ^ + ${{ env.CMAKE_ARGS }} cmake --build build\ReleaseOV --config Release -- /m @@ -624,8 +647,29 @@ jobs: id: pack_artifacts shell: powershell run: | - Copy-Item LICENSE .\build\ReleaseOV\bin\ - 7z a -snl llama-${{ steps.tag.outputs.name }}-bin-win-openvino-${{ env.OPENVINO_VERSION_MAJOR }}-x64.zip .\build\ReleaseOV\bin\* + # Locate the extracted OpenVINO toolkit root (same pattern as the Build step). + $OPENVINO_ROOT = (Get-ChildItem -Directory openvino_toolkit | Select-Object -First 1).FullName + if (-not $OPENVINO_ROOT) { + Write-Error "OpenVINO toolkit folder not found under .\openvino_toolkit" + exit 1 + } + + $dest = ".\build\ReleaseOV\bin\Release" + + $ovBin = Join-Path $OPENVINO_ROOT 'runtime\bin\intel64\Release' + Copy-Item -Path (Join-Path $ovBin '*.dll') -Destination $dest -Force + Copy-Item -Path (Join-Path $ovBin 'cache.json') -Destination $dest -Force + + $tbbBin = Join-Path $OPENVINO_ROOT 'runtime\3rdparty\tbb\bin' + Copy-Item -Path (Join-Path $tbbBin 'tbb*.dll') -Destination $dest -Force + + # OpenVINO licensing + $licensingDest = Join-Path $dest 'openvino-licensing' + New-Item -ItemType Directory -Force -Path $licensingDest | Out-Null + Copy-Item -Path (Join-Path $OPENVINO_ROOT 'docs\licensing\*') -Destination $licensingDest -Recurse -Force + + Copy-Item LICENSE $dest + 7z a -snl llama-${{ steps.tag.outputs.name }}-bin-win-openvino-${{ env.OPENVINO_VERSION_MAJOR }}-x64.zip $dest\* - name: Upload artifacts uses: actions/upload-artifact@v6 diff --git a/docs/backend/OPENVINO.md b/docs/backend/OPENVINO.md index 631d4bc3bf78..d5c6f46e299d 100644 --- a/docs/backend/OPENVINO.md +++ b/docs/backend/OPENVINO.md @@ -237,8 +237,8 @@ chmod +x ubuntu-llamacpp-ov-install.sh # ============================================ set -euo pipefail -OPENVINO_VERSION_MAJOR="2026.2" -OPENVINO_VERSION_FULL="2026.2.0.21903.52ddc073857" +OPENVINO_VERSION_MAJOR="2026.2.1" +OPENVINO_VERSION_FULL="2026.2.1.21919.ede283a88e3" SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" OPENVINO_INSTALL_DIR="/opt/intel/openvino_${OPENVINO_VERSION_MAJOR}" @@ -334,7 +334,7 @@ echo " ./build/ReleaseOV/bin/llama-cli -m model.gguf" ``` > [!NOTE] -> The script pins OpenVINO `2026.2` via the `OPENVINO_VERSION_MAJOR` / `OPENVINO_VERSION_FULL` variables at the top — edit them to track a different release. +> The script pins OpenVINO `2026.2.1` via the `OPENVINO_VERSION_MAJOR` / `OPENVINO_VERSION_FULL` variables at the top — edit them to track a different release. @@ -364,8 +364,8 @@ REM ============================================ REM llama.cpp OpenVINO Build Script (Ninja) REM ============================================ -set "OPENVINO_VERSION_MAJOR=2026.2" -set "OPENVINO_VERSION_FULL=2026.2.0.21903.52ddc073857" +set "OPENVINO_VERSION_MAJOR=2026.2.1" +set "OPENVINO_VERSION_FULL=2026.2.1.21919.ede283a88e3" set "SCRIPT_DIR=%~dp0" set "VCPKG_DIR=C:\vcpkg" @@ -547,7 +547,7 @@ endlocal ``` > [!NOTE] -> The script pins OpenVINO `2026.2` via the `OPENVINO_VERSION_MAJOR` / `OPENVINO_VERSION_FULL` variables at the top — edit them to track a different release. From any new shell, source the matching `setupvars` script via the junction — `call "C:\Intel\openvino\setupvars.bat"` from `cmd`, or `& "C:\Intel\openvino\setupvars.ps1"` from PowerShell. If `winget` cannot register Visual Studio Build Tools on first run, install them once manually and re-run the script from an elevated **Developer Command Prompt for VS 2022**. +> The script pins OpenVINO `2026.2.1` via the `OPENVINO_VERSION_MAJOR` / `OPENVINO_VERSION_FULL` variables at the top — edit them to track a different release. From any new shell, source the matching `setupvars` script via the junction — `call "C:\Intel\openvino\setupvars.bat"` from `cmd`, or `& "C:\Intel\openvino\setupvars.ps1"` from PowerShell. If `winget` cannot register Visual Studio Build Tools on first run, install them once manually and re-run the script from an elevated **Developer Command Prompt for VS 2022**. diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index b6df4f0fbb7a..48c63e4d70fa 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -1270,77 +1270,14 @@ void GgmlOvDecoder::visit_subgraph(std::function ops = { - {GGML_OP_NONE, "GGML_OP_NONE" }, - {GGML_OP_ACC, "GGML_OP_ACC" }, - {GGML_OP_ADD, "GGML_OP_ADD" }, - {GGML_OP_ADD1, "GGML_OP_ADD1" }, - {GGML_OP_ADD_ID, "GGML_OP_ADD_ID" }, - {GGML_OP_CONCAT, "GGML_OP_CONCAT" }, - {GGML_OP_CONT, "GGML_OP_CONT" }, - {GGML_OP_DIV, "GGML_OP_DIV" }, - {GGML_OP_DUP, "GGML_OP_DUP" }, - {GGML_OP_GET_ROWS, "GGML_OP_GET_ROWS" }, - {GGML_OP_MUL, "GGML_OP_MUL" }, - {GGML_OP_MUL_MAT, "GGML_OP_MUL_MAT" }, - {GGML_OP_MUL_MAT_ID, "GGML_OP_MUL_MAT_ID" }, - {GGML_OP_PERMUTE, "GGML_OP_PERMUTE" }, - {GGML_OP_RESHAPE, "GGML_OP_RESHAPE" }, - {GGML_OP_RMS_NORM, "GGML_OP_RMS_NORM" }, - {GGML_OP_NORM, "GGML_OP_NORM" }, - {GGML_OP_ROPE, "GGML_OP_ROPE" }, - {GGML_OP_SCALE, "GGML_OP_SCALE" }, - {GGML_OP_SOFT_MAX, "GGML_OP_SOFT_MAX" }, - {GGML_OP_SUM_ROWS, "GGML_OP_SUM_ROWS" }, - {GGML_OP_SUB, "GGML_OP_SUB" }, - {GGML_OP_TRANSPOSE, "GGML_OP_TRANSPOSE" }, - {GGML_OP_VIEW, "GGML_OP_VIEW" }, - {GGML_OP_SET_ROWS, "GGML_OP_SET_ROWS" }, - {GGML_OP_CPY, "GGML_OP_CPY" }, - {GGML_OP_FLASH_ATTN_EXT, "GGML_OP_FLASH_ATTN_EXT" }, - {GGML_OP_L2_NORM, "GGML_OP_L2_NORM" }, - {GGML_OP_CLAMP, "GGML_OP_CLAMP" }, - {GGML_OP_PAD, "GGML_OP_PAD" }, - {GGML_OP_SSM_CONV, "GGML_OP_SSM_CONV" }, - {GGML_OP_GATED_DELTA_NET, "GGML_OP_GATED_DELTA_NET"}, - {GGML_OP_ARGSORT, "GGML_OP_ARGSORT" }, - {GGML_OP_REPEAT, "GGML_OP_REPEAT" }, - {GGML_OP_IM2COL, "GGML_OP_IM2COL" } - }; - static const std::map unary_ops = { - {GGML_UNARY_OP_ABS, "GGML_UNARY_OP_ABS" }, - {GGML_UNARY_OP_SGN, "GGML_UNARY_OP_SGN" }, - {GGML_UNARY_OP_NEG, "GGML_UNARY_OP_NEG" }, - {GGML_UNARY_OP_STEP, "GGML_UNARY_OP_STEP" }, - {GGML_UNARY_OP_TANH, "GGML_UNARY_OP_TANH" }, - {GGML_UNARY_OP_ELU, "GGML_UNARY_OP_ELU" }, - {GGML_UNARY_OP_RELU, "GGML_UNARY_OP_RELU" }, - {GGML_UNARY_OP_SIGMOID, "GGML_UNARY_OP_SIGMOID" }, - {GGML_UNARY_OP_GELU, "GGML_UNARY_OP_GELU" }, - {GGML_UNARY_OP_GELU_QUICK, "GGML_UNARY_OP_GELU_QUICK" }, - {GGML_UNARY_OP_SILU, "GGML_UNARY_OP_SILU" }, - {GGML_UNARY_OP_SOFTPLUS, "GGML_UNARY_OP_SOFTPLUS" }, - {GGML_UNARY_OP_HARDSWISH, "GGML_UNARY_OP_HARDSWISH" }, - {GGML_UNARY_OP_HARDSIGMOID, "GGML_UNARY_OP_HARDSIGMOID"}, - {GGML_UNARY_OP_EXP, "GGML_UNARY_OP_EXP" }, - {GGML_UNARY_OP_COUNT, "GGML_UNARY_OP_COUNT" } - }; - static const std::map glu_ops = { - {GGML_GLU_OP_SWIGLU, "GGML_GLU_OP_SWIGLU"}, - {GGML_GLU_OP_GEGLU, "GGML_GLU_OP_GEGLU" }, - {GGML_GLU_OP_REGLU, "GGML_GLU_OP_REGLU" } - }; - switch (node->op) { case GGML_OP_UNARY: - return unary_ops.at(ggml_get_unary_op(node)); + return std::string("GGML_UNARY_OP_") + ggml_unary_op_name(ggml_get_unary_op(node)); case GGML_OP_GLU: - return glu_ops.at(ggml_get_glu_op(node)); + return std::string("GGML_GLU_OP_") + ggml_glu_op_name(ggml_get_glu_op(node)); default: - return ops.at(node->op); + return std::string("GGML_OP_") + ggml_op_name(node->op); } - static const std::string unknown_op = "UNKNOWN_GGML_OP"; - return unknown_op; } const std::string & GgmlOvDecoder::get_op_type(int node_idx) const { diff --git a/ggml/src/ggml-openvino/openvino/op/add_id.cpp b/ggml/src/ggml-openvino/openvino/op/add_id.cpp index c8bf08152242..e54d700d421a 100644 --- a/ggml/src/ggml-openvino/openvino/op/add_id.cpp +++ b/ggml/src/ggml-openvino/openvino/op/add_id.cpp @@ -17,6 +17,22 @@ namespace frontend { namespace ggml { namespace op { +static ov::Output reshape_add_id_input_to_2d(const ov::Output & input, + const ov::PartialShape & input_shape, + const std::vector & dims) { + const auto actual_shape = input.get_partial_shape(); + if (actual_shape.rank().is_static() && actual_shape.rank().get_length() == 2) { + return input; + } + + if (input_shape.rank().is_static() && input_shape.rank().get_length() == 2) { + return input; + } + + auto shape = std::make_shared(input, ov::element::i64); + return std::make_shared(input, get_dimensions(shape, dims), false); +} + OutputVector translate_add_id(const NodeContext & context) { num_inputs_check(context, 3, 3); @@ -28,11 +44,9 @@ OutputVector translate_add_id(const NodeContext & context) { // input: [1, n_token, n_used, n_embd] // bias: [1, 1, n_expert, n_embd] // ids: [1, 1, n_token, n_used] - auto bias_shape_4d = std::make_shared(bias, ov::element::i64); - auto ids_shape_4d = std::make_shared(ids, ov::element::i64); - - bias = std::make_shared(bias, get_dimensions(bias_shape_4d, {2, 3}), false); - ids = std::make_shared(ids, get_dimensions(ids_shape_4d, {2, 3}), false); + // Model bias constants may already be stored as [n_expert, n_embd]. + bias = reshape_add_id_input_to_2d(bias, context.get_input_shape(1), {2, 3}); + ids = reshape_add_id_input_to_2d(ids, context.get_input_shape(2), {2, 3}); if (ids.get_element_type() != ov::element::i32 && ids.get_element_type() != ov::element::i64) { ids = std::make_shared(ids, ov::element::i32); diff --git a/ggml/src/ggml-openvino/openvino/op/glu_swiglu.cpp b/ggml/src/ggml-openvino/openvino/op/glu_swiglu.cpp index 5c46e071375e..d220f2f584a5 100644 --- a/ggml/src/ggml-openvino/openvino/op/glu_swiglu.cpp +++ b/ggml/src/ggml-openvino/openvino/op/glu_swiglu.cpp @@ -3,8 +3,11 @@ #include "../utils.h" #include +#include #include #include +#include +#include #include #include #include @@ -15,7 +18,7 @@ namespace frontend { namespace ggml { namespace op { -OutputVector translate_glu_swiglu(const NodeContext & context) { +static std::pair, ov::Output> get_glu_inputs(const NodeContext & context) { num_inputs_check(context, 1, 2); ov::Output src0; @@ -52,6 +55,12 @@ OutputVector translate_glu_swiglu(const NodeContext & context) { std::swap(src0, src1); } + return {src0, src1}; +} + +OutputVector translate_glu_swiglu(const NodeContext & context) { + auto [src0, src1] = get_glu_inputs(context); + auto sigmoid = std::make_shared(src0); auto silu = std::make_shared(src0, sigmoid); auto res = std::make_shared(silu, src1); @@ -59,6 +68,27 @@ OutputVector translate_glu_swiglu(const NodeContext & context) { return rename_outputs_with_suffix({res}, context.get_name()); } +OutputVector translate_glu_swiglu_oai(const NodeContext & context) { + auto [src0, src1] = get_glu_inputs(context); + + const int32_t * params = context.get_output_op_params(); + const float alpha = reinterpret_cast(params)[2]; + const float limit = reinterpret_cast(params)[3]; + + auto gate = std::make_shared(src0, -std::numeric_limits::infinity(), limit); + auto alpha_const = ov::op::v0::Constant::create(ov::element::f32, {}, {alpha}); + auto scaled_gate = std::make_shared(gate, alpha_const); + auto sigmoid = std::make_shared(scaled_gate); + auto out_glu = std::make_shared(gate, sigmoid); + + auto up = std::make_shared(src1, -limit, limit); + auto one = ov::op::v0::Constant::create(ov::element::f32, {}, {1.0f}); + auto up_plus_one = std::make_shared(up, one); + auto res = std::make_shared(out_glu, up_plus_one); + + return rename_outputs_with_suffix({res}, context.get_name()); +} + } // namespace op } // namespace ggml } // namespace frontend diff --git a/ggml/src/ggml-openvino/openvino/op/mul_mat_id.cpp b/ggml/src/ggml-openvino/openvino/op/mul_mat_id.cpp index 09e29d4cce2a..6df2784c2e45 100644 --- a/ggml/src/ggml-openvino/openvino/op/mul_mat_id.cpp +++ b/ggml/src/ggml-openvino/openvino/op/mul_mat_id.cpp @@ -2,23 +2,135 @@ #include "../op_table.h" #include "../utils.h" +#include +#include +#include #include +#include +#include #include #include #include #include #include #include +#include #include #include -#include +#include #include +#include namespace ov { namespace frontend { namespace ggml { namespace op { +namespace { + +std::shared_ptr const_i64(const std::vector & values) { + return ov::op::v0::Constant::create(ov::element::i64, ov::Shape{values.size()}, values); +} + +ov::Output slice_axis(const ov::Output & input, int64_t axis, int64_t begin, int64_t end) { + return std::make_shared(input, const_i64({begin}), const_i64({end}), const_i64({1}), + const_i64({axis})); +} + +ov::Output translate_mul_mat_id_mxfp4_packed(const NodeContext & context, + ov::Output expert_weights, + ov::Output activations, + ov::Output ids) { + auto packed_shape = expert_weights.get_partial_shape().to_shape(); + FRONT_END_OP_CONVERSION_CHECK(packed_shape.size() == 5 && packed_shape[4] == 17, + "Expected packed MXFP4 expert weights with shape [1, n_expert, m, k_blocks, 17]"); + + const int64_t n_expert = static_cast(packed_shape[1]); + const int64_t rows = static_cast(packed_shape[2]); + const int64_t k_blocks = static_cast(packed_shape[3]); + const int64_t qk = 32; + const int64_t cols = k_blocks * qk; + + auto packed_shape_4d = const_i64({n_expert, rows, k_blocks, 17}); + expert_weights = std::make_shared(expert_weights, packed_shape_4d, false); + + auto activations_shape_4d = std::make_shared(activations, ov::element::i64); + auto ids_shape_4d = std::make_shared(ids, ov::element::i64); + auto activations_shape_3d = get_dimensions(activations_shape_4d, {1, 2, 3}); + auto ids_shape_2d = get_dimensions(ids_shape_4d, {2, 3}); + + activations = std::make_shared(activations, activations_shape_3d, false); + ids = std::make_shared(ids, ids_shape_2d, false); + if (ids.get_element_type() != ov::element::i32 && ids.get_element_type() != ov::element::i64) { + ids = std::make_shared(ids, ov::element::i32); + } + + auto gather_axis = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{}, {0}); + + static const std::vector f4e2m1_lut = {0.0f, 0.5f, 1.0f, 1.5f, 2.0f, 3.0f, 4.0f, 6.0f, + -0.0f, -0.5f, -1.0f, -1.5f, -2.0f, -3.0f, -4.0f, -6.0f}; + std::vector e8m0_lut(256); + for (size_t i = 0; i < e8m0_lut.size(); ++i) { + uint32_t bits = static_cast(i) << 23; + memcpy(&e8m0_lut[i], &bits, sizeof(float)); + } + e8m0_lut[0] = std::numeric_limits::min() / 2.0f; + e8m0_lut[255] = std::numeric_limits::quiet_NaN(); + + auto f4_lut = ov::op::v0::Constant::create(ov::element::f32, ov::Shape{f4e2m1_lut.size()}, f4e2m1_lut); + auto scale_lut = ov::op::v0::Constant::create(ov::element::f32, ov::Shape{e8m0_lut.size()}, e8m0_lut); + + auto selected_packed_weights = std::make_shared(expert_weights, ids, gather_axis); + auto scale_byte = slice_axis(selected_packed_weights, 4, 0, 1); + auto qs = slice_axis(selected_packed_weights, 4, 1, 17); + auto low = std::make_shared( + qs, ov::op::v0::Constant::create(ov::element::u8, ov::Shape{}, {0x0F}), ov::op::AutoBroadcastType::NUMPY); + auto high_shift = std::make_shared( + qs, ov::op::v0::Constant::create(ov::element::u8, ov::Shape{}, {4}), ov::op::AutoBroadcastType::NUMPY); + auto nibbles = std::make_shared(ov::OutputVector{low, high_shift}, 4); + auto nibble_indices = std::make_shared(nibbles, ov::element::i32); + auto weights_f32 = std::make_shared(f4_lut, nibble_indices, gather_axis); + + auto scale_indices = std::make_shared(scale_byte, ov::element::i32); + auto scales_f32 = std::make_shared(scale_lut, scale_indices, gather_axis); + ov::Output selected_weights = std::make_shared(weights_f32, scales_f32, + ov::op::AutoBroadcastType::NUMPY); + + auto ids_shape = std::make_shared(ids, ov::element::i64); + auto selected_weights_target_dims = std::make_shared( + ov::OutputVector{get_dimensions(ids_shape, {0, 1}), const_i64({rows, cols})}, 0); + selected_weights = std::make_shared(selected_weights, selected_weights_target_dims, false); + + auto activations_shape = std::make_shared(activations, ov::element::i64); + ov::Output acts_target_dims = std::make_shared( + ov::OutputVector{ + get_dimensions(activations_shape, {0}), + get_dimensions(ids_shape, {1}), + get_dimensions(activations_shape, {2}), + }, + 0); + ov::Output acts_broadcasted = + std::make_shared(activations, acts_target_dims, ov::op::BroadcastType::BIDIRECTIONAL); + + auto activations_expanded = std::make_shared(acts_broadcasted, const_i64({2})); + ov::Output result = + std::make_shared(activations_expanded, selected_weights, false, true); + + auto batch_dim = ov::op::v0::Constant::create(ov::element::i64, {1}, {1}); + auto row_dim = ov::op::v0::Constant::create(ov::element::i64, {1}, {rows}); + auto result_target_dims = std::make_shared( + ov::OutputVector{batch_dim, get_dimensions(ids_shape, {0, 1}), row_dim}, 0); + result = std::make_shared(result, result_target_dims, false); + + const auto output_type = context.get_output_type(); + if (result.get_element_type() != output_type) { + result = std::make_shared(result, output_type); + } + return result; +} + +} // namespace + OutputVector translate_mul_mat_id(const NodeContext & context) { num_inputs_check(context, 3, 3); @@ -26,6 +138,12 @@ OutputVector translate_mul_mat_id(const NodeContext & context) { auto activations = process_view_input_new(context, 1); auto ids = process_view_input_new(context, 2); + if (expert_weights.get_element_type() == ov::element::u8 && expert_weights.get_partial_shape().rank().is_static() && + expert_weights.get_partial_shape().rank().get_length() == 5) { + return rename_outputs_with_suffix({translate_mul_mat_id_mxfp4_packed(context, expert_weights, activations, ids)}, + context.get_name()); + } + // OpenVINO sees GGML tensors in reversed dimension order: // weights: [1, n_expert, m, k] // activations: [1, n_tokens, n_used_or_1, k] diff --git a/ggml/src/ggml-openvino/openvino/op/softmax.cpp b/ggml/src/ggml-openvino/openvino/op/softmax.cpp index 287faedbb531..b391d3f91075 100644 --- a/ggml/src/ggml-openvino/openvino/op/softmax.cpp +++ b/ggml/src/ggml-openvino/openvino/op/softmax.cpp @@ -6,12 +6,16 @@ #include #include #include +#include #include #include +#include #include #include #include #include +#include +#include #include #include @@ -20,12 +24,31 @@ namespace frontend { namespace ggml { namespace op { +static bool is_static_one(const ov::Dimension & dim) { + return dim.is_static() && dim.get_length() == 1; +} + +static bool same_static_dim(const ov::Dimension & lhs, const ov::Dimension & rhs) { + return lhs.is_static() && rhs.is_static() && lhs.get_length() == rhs.get_length(); +} + +static bool is_attention_sinks_input_shape(const ov::PartialShape & candidate, const ov::PartialShape & logits_shape) { + if (candidate.rank().is_dynamic() || logits_shape.rank().is_dynamic() || candidate.rank().get_length() != 4 || + logits_shape.rank().get_length() != 4) { + return false; + } + + return is_static_one(candidate[0]) && is_static_one(candidate[1]) && is_static_one(candidate[2]) && + same_static_dim(candidate[3], logits_shape[1]); +} + // Reimplementation of GGML_OP_SOFT_MAX semantics for OpenVINO backend: // 1) logits = src0 * scale // 2) logits += mask (if provided) -// 3) softmax over the last dimension +// 3) append attention sinks as hidden logits (if provided) +// 4) softmax over the last dimension and remove the hidden sink column OutputVector translate_soft_max(const NodeContext & context) { - num_inputs_check(context, 1, 2); + num_inputs_check(context, 1, 3); float scale = 1.0f; float max_bias = 0.0f; @@ -33,6 +56,11 @@ OutputVector translate_soft_max(const NodeContext & context) { memcpy(&max_bias, (float *) context.get_output_op_params() + 1, sizeof(float)); ov::Output logits = context.get_input(0); + const bool second_input_is_sinks = + context.get_input_size() == 2 && is_attention_sinks_input_shape(context.get_input_shape(1), context.get_output_shape()); + const bool has_mask = context.get_input_size() > 1 && !second_input_is_sinks; + const bool has_sinks = second_input_is_sinks || context.get_input_size() > 2; + const size_t sinks_input_idx = second_input_is_sinks ? 1 : 2; // Apply scale first: logits = src0 * scale if (scale != 1.0f) { @@ -41,12 +69,12 @@ OutputVector translate_soft_max(const NodeContext & context) { logits = std::make_shared(logits, scale_const); } - FRONT_END_CHECK_IMPLEMENTED(!(max_bias > 0.0f && context.get_input_size() < 2), + FRONT_END_CHECK_IMPLEMENTED(!(max_bias > 0.0f && !has_mask), "OpenVINO softmax ALiBi path requires mask input"); // Optional mask add: logits += mask // For max_bias > 0 (ALiBi), apply per-head slope to mask before adding. - if (context.get_input_size() > 1) { + if (has_mask) { ov::Output mask = context.get_input(1); // For stateful @@ -94,8 +122,40 @@ OutputVector translate_soft_max(const NodeContext & context) { logits = std::make_shared(logits, mask); } + ov::Output softmax_input = logits; + if (has_sinks) { + ov::Output sinks = context.get_input(sinks_input_idx); + if (sinks.get_element_type() != logits.get_element_type()) { + sinks = std::make_shared(sinks, logits.get_element_type()); + } + + auto sink_shape = ov::op::v0::Constant::create(ov::element::i64, {4}, {1, -1, 1, 1}); + auto sinks_4d = std::make_shared(sinks, sink_shape, false); + + auto logits_shape = std::make_shared(logits, ov::element::i64); + auto zero = ov::op::v0::Constant::create(ov::element::i64, {1}, {0}); + auto one = ov::op::v0::Constant::create(ov::element::i64, {1}, {1}); + auto three = ov::op::v0::Constant::create(ov::element::i64, {1}, {3}); + auto four = ov::op::v0::Constant::create(ov::element::i64, {1}, {4}); + auto shape_axis = ov::op::v0::Constant::create(ov::element::i64, {1}, {0}); + + auto sink_prefix_shape = std::make_shared(logits_shape, zero, three, one, shape_axis); + auto sink_last_dim = ov::op::v0::Constant::create(ov::element::i64, {1}, {1}); + auto sink_broadcast_shape = std::make_shared( + ov::OutputVector{sink_prefix_shape, sink_last_dim}, 0); + auto sink_column = std::make_shared(sinks_4d, sink_broadcast_shape, + ov::op::BroadcastType::BIDIRECTIONAL); + softmax_input = std::make_shared(ov::OutputVector{logits, sink_column}, 3); + + auto softmax_with_sink = std::make_shared(softmax_input, -1); + auto original_last_dim = std::make_shared(logits_shape, three, four, one, shape_axis); + auto res = std::make_shared(softmax_with_sink, zero, original_last_dim, one, three); + + return rename_outputs_with_suffix({res}, context.get_name()); + } + // Softmax along last dimension (equivalent to ggml softmax over ne[0]). - auto res = std::make_shared(logits, -1); + auto res = std::make_shared(softmax_input, -1); return rename_outputs_with_suffix({res}, context.get_name()); } diff --git a/ggml/src/ggml-openvino/openvino/op_table.cpp b/ggml/src/ggml-openvino/openvino/op_table.cpp index f84a1bf931ae..59fd26df8cd5 100644 --- a/ggml/src/ggml-openvino/openvino/op_table.cpp +++ b/ggml/src/ggml-openvino/openvino/op_table.cpp @@ -47,6 +47,7 @@ std::unordered_map get_supported_ops() { {"GGML_UNARY_OP_TANH", op::translate_1to1_match_1_input }, {"GGML_OP_VIEW", op::translate_view }, {"GGML_GLU_OP_SWIGLU", op::translate_glu_swiglu }, + {"GGML_GLU_OP_SWIGLU_OAI", op::translate_glu_swiglu_oai }, {"GGML_GLU_OP_GEGLU", op::translate_glu_geglu }, {"GGML_OP_SET_ROWS", op::translate_set_rows }, {"GGML_OP_CPY", op::translate_cpy }, diff --git a/ggml/src/ggml-openvino/openvino/op_table.h b/ggml/src/ggml-openvino/openvino/op_table.h index c90ff8377908..1d695fa12588 100644 --- a/ggml/src/ggml-openvino/openvino/op_table.h +++ b/ggml/src/ggml-openvino/openvino/op_table.h @@ -32,6 +32,7 @@ GGML_OP_CONVERTER(translate_soft_max); GGML_OP_CONVERTER(translate_transpose); GGML_OP_CONVERTER(translate_view); GGML_OP_CONVERTER(translate_glu_swiglu); +GGML_OP_CONVERTER(translate_glu_swiglu_oai); GGML_OP_CONVERTER(translate_glu_geglu); GGML_OP_CONVERTER(translate_set_rows); GGML_OP_CONVERTER(translate_cpy);