Skip to content
Merged
Show file tree
Hide file tree
Changes from 24 commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
635ac99
Add SQ8↔FP16 x86 SIMD distance kernels [MOD-14954] (#970)
dor-forer Jun 1, 2026
f0f2ec4
Add design spec for SQ8↔FP16 ARM SIMD kernels [MOD-14972]
dor-forer May 28, 2026
e5e7475
Add implementation plan for SQ8↔FP16 ARM SIMD kernels [MOD-14972]
dor-forer May 28, 2026
d076b67
Add NEON_HP SQ8↔FP16 IP kernel header [MOD-14972]
dor-forer May 28, 2026
eedde9d
Add NEON_HP SQ8↔FP16 L2 kernel header [MOD-14972]
dor-forer May 28, 2026
1b435d8
Wire NEON_HP SQ8↔FP16 choosers [MOD-14972]
dor-forer May 28, 2026
33e751d
Dispatch SQ8↔FP16 to NEON_HP tier on AArch64 [MOD-14972]
dor-forer May 28, 2026
0d53e1f
Extend SQ8↔FP16 tier-walk tests with NEON_HP [MOD-14972]
dor-forer May 28, 2026
0089295
Add SVE SQ8↔FP16 IP kernel header [MOD-14972]
dor-forer May 28, 2026
98c8bab
Add SVE SQ8↔FP16 L2 kernel header [MOD-14972]
dor-forer May 28, 2026
ad387e1
Wire SVE/SVE2 SQ8↔FP16 choosers [MOD-14972]
dor-forer May 28, 2026
e8a121c
Dispatch SQ8↔FP16 to SVE/SVE2 tiers on AArch64 [MOD-14972]
dor-forer May 28, 2026
9a0b858
Extend SQ8↔FP16 tier-walk tests with SVE/SVE2 [MOD-14972]
dor-forer May 28, 2026
d76325e
Register ARM SQ8↔FP16 microbenchmarks [MOD-14972]
dor-forer May 28, 2026
88de731
Add missing alignment=0 assertions to SQ8↔FP16 ARM tier-walk tests [M…
dor-forer May 31, 2026
c242391
Fix SVE SQ8↔FP16 kernel: use svzip1 to correct FP16→FP32 widening [MO…
dor-forer May 31, 2026
f7bb4b1
Optimize ARM SQ8↔FP16 kernels and align with codebase conventions [MO…
dor-forer May 31, 2026
72f9a98
Apply clang-format [MOD-14972]
dor-forer May 31, 2026
d7576c3
Trim PR churn: remove docs, dispatcher comments, and test verbosity […
dor-forer May 31, 2026
966e36a
Apply clang-format 18.1.8 (matches CI) [MOD-14972]
dor-forer May 31, 2026
b47be94
bench: register spaces_sq8_fp16 in benchmark setups
lerman25 Jun 1, 2026
7ece249
perf(arm): optimize SQ8<->FP16 NEON_HP widening and add SVE2 FMLALB/F…
lerman25 Jun 1, 2026
db1e68f
style: clang-format SVE2.cpp
lerman25 Jun 1, 2026
1472684
perf(arm): add NEON_FHM FMLAL widening-FMA kernel for SQ8<->FP16 [MOD…
dor-forer Jun 2, 2026
f798f5c
Merge remote-tracking branch 'origin/main' into dor-forer-sq8-fp16-ar…
dor-forer Jun 3, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 18 additions & 0 deletions cmake/x86_64InstructionFlags.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,24 @@ if(CXX_AVX512F AND CXX_AVX512BW AND CXX_AVX512VL AND CXX_AVX512VNNI)
add_compile_definitions(OPT_AVX512_F_BW_VL_VNNI)
endif()

# OPT_F16C is unusual compared to the other OPT_* macros above:
#
# 1. It is a *capability* gate, not a dispatch tier. Every other OPT_* maps 1:1 to a single
# ISA tier that owns its own translation unit (OPT_AVX2 -> AVX2.cpp, OPT_SSE4 -> SSE4.cpp).
# F16C owns no tier of its own; it only enables the vcvtph2ps (FP16<->FP32) conversion that
# several tiers need. So it is hoisted *around* multiple tiers (AVX2_FMA / AVX2 / SSE4 for
# the SQ8<->FP16 kernels) rather than selecting one.
#
# 2. It is a compound guard (CXX_F16C AND CXX_FMA AND CXX_AVX), not a single flag. F16C is
# VEX-encoded, so vcvtph2ps requires AVX state to execute -- emitting it without AVX is
# invalid. Defining OPT_F16C therefore implies AVX is present, and the F16C kernels must be
# compiled with -mf16c added *on top of* -mavx (see functions/*_F16C.cpp in
# src/VecSim/spaces/CMakeLists.txt). The base AVX2.cpp / SSE4.cpp objects stay F16C-free so
# they still run on CPUs without F16C.
#
# 3. The AVX-512 tier deliberately does NOT use this gate: _mm512_cvtph_ps is part of AVX512F
# itself, so the AVX-512 SQ8<->FP16 path needs only OPT_AVX512F and lives outside any
# OPT_F16C guard.
if(CXX_F16C AND CXX_FMA AND CXX_AVX)
add_compile_definitions(OPT_F16C)
endif()
Expand Down
30 changes: 30 additions & 0 deletions src/VecSim/spaces/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -50,18 +50,40 @@ if(CMAKE_SYSTEM_PROCESSOR MATCHES "(x86_64)|(AMD64|amd64)|(^i.86$)")
list(APPEND OPTIMIZATIONS functions/AVX512F_BW_VL_VNNI.cpp)
endif()

# F16C is VEX-encoded and requires AVX state, so it is only meaningful when the toolchain
# can also emit AVX/FMA. Mirrors the OPT_F16C macro condition in x86_64InstructionFlags.cmake.
set(_has_full_f16c FALSE)
if(CXX_F16C AND CXX_FMA AND CXX_AVX)
set(_has_full_f16c TRUE)
endif()

# Base AVX2 / AVX2+FMA dispatcher TUs hold only kernels with no F16C dependency.
# SQ8↔FP16 kernels (which require F16C) live in sibling TUs functions/AVX2_F16C.cpp and
# functions/AVX2_FMA_F16C.cpp, compiled only when _has_full_f16c is true.
if(CXX_AVX2)
message("Building with AVX2")
set_source_files_properties(functions/AVX2.cpp PROPERTIES COMPILE_FLAGS -mavx2)
list(APPEND OPTIMIZATIONS functions/AVX2.cpp)
endif()

if(CXX_AVX2 AND _has_full_f16c)
message("Building functions/AVX2_F16C.cpp with AVX2 and F16C")
set_source_files_properties(functions/AVX2_F16C.cpp PROPERTIES COMPILE_FLAGS "-mavx2 -mf16c")
list(APPEND OPTIMIZATIONS functions/AVX2_F16C.cpp)
endif()

if(CXX_AVX2 AND CXX_FMA)
message("Building with AVX2 and FMA")
set_source_files_properties(functions/AVX2_FMA.cpp PROPERTIES COMPILE_FLAGS "-mavx2 -mfma")
list(APPEND OPTIMIZATIONS functions/AVX2_FMA.cpp)
endif()

if(CXX_AVX2 AND CXX_FMA AND _has_full_f16c)
message("Building functions/AVX2_FMA_F16C.cpp with AVX2, FMA, and F16C")
set_source_files_properties(functions/AVX2_FMA_F16C.cpp PROPERTIES COMPILE_FLAGS "-mavx2 -mfma -mf16c")
list(APPEND OPTIMIZATIONS functions/AVX2_FMA_F16C.cpp)
endif()

if(CXX_F16C AND CXX_FMA AND CXX_AVX)
message("Building with CXX_F16C")
set_source_files_properties(functions/F16C.cpp PROPERTIES COMPILE_FLAGS "-mf16c -mfma -mavx")
Expand All @@ -86,6 +108,14 @@ if(CMAKE_SYSTEM_PROCESSOR MATCHES "(x86_64)|(AMD64|amd64)|(^i.86$)")
list(APPEND OPTIMIZATIONS functions/SSE4.cpp)
endif()

# SSE4 SQ8↔FP16 kernels need F16C, which is VEX-encoded → require -mavx alongside -mf16c
# (mirrors the F16C.cpp recipe above).
if(CXX_SSE4 AND _has_full_f16c)
message("Building functions/SSE4_F16C.cpp with SSE4.1, AVX, and F16C")
set_source_files_properties(functions/SSE4_F16C.cpp PROPERTIES COMPILE_FLAGS "-msse4.1 -mavx -mf16c")
list(APPEND OPTIMIZATIONS functions/SSE4_F16C.cpp)
endif()

if(CXX_SSE)
message("Building with SSE")
set_source_files_properties(functions/SSE.cpp PROPERTIES COMPILE_FLAGS -msse)
Expand Down
102 changes: 102 additions & 0 deletions src/VecSim/spaces/IP/IP_AVX2_FMA_SQ8_FP16.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
/*
* Copyright (c) 2006-Present, Redis Ltd.
* All rights reserved.
*
* Licensed under your choice of the Redis Source Available License 2.0
* (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the
* GNU Affero General Public License v3 (AGPLv3).
*/
#pragma once
#include "VecSim/spaces/space_includes.h"
#include "VecSim/spaces/AVX_utils.h"
#include "VecSim/types/sq8.h"
#include "VecSim/types/float16.h"
#include "VecSim/utils/alignment.h"

using sq8 = vecsim_types::sq8;
using float16 = vecsim_types::float16;

/*
* Asymmetric SQ8 (storage) <-> FP16 (query) inner product using algebraic identity:
* IP(x, y) = min * y_sum + delta * Σ(q_i * y_i)
*
* FP16 query lanes are widened to FP32 per 8-lane chunk via _mm256_cvtph_ps (F16C);
* inner-loop arithmetic runs in FP32 with _mm256_fmadd_ps.
*/

// 8-wide AVX2+FMA step: 8 SQ8 lanes + 8 FP16 lanes -> 8 FP32 fused-multiply-add.
static inline void SQ8_FP16_InnerProductStep_AVX2_FMA(const uint8_t *&pVect1,
const float16 *&pVect2, __m256 &sum256) {
__m128i v1_128 = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(pVect1));
pVect1 += 8;
__m256i v1_256 = _mm256_cvtepu8_epi32(v1_128);
__m256 v1_f = _mm256_cvtepi32_ps(v1_256);

__m128i v2_128 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pVect2));
__m256 v2_f = _mm256_cvtph_ps(v2_128);
pVect2 += 8;

sum256 = _mm256_fmadd_ps(v1_f, v2_f, sum256);
}

// pVec1v = SQ8 storage, pVec2v = FP16 query. Precondition: dim >= 16 (enforced by dispatcher).
template <unsigned char residual> // 0..15
float SQ8_FP16_InnerProductImp_AVX2_FMA(const void *pVec1v, const void *pVec2v, size_t dimension) {
const uint8_t *pVec1 = static_cast<const uint8_t *>(pVec1v);
const float16 *pVec2 = static_cast<const float16 *>(pVec2v);
const uint8_t *pEnd1 = pVec1 + dimension;

// Two accumulators break the FMA dependency chain across consecutive iterations.
__m256 sum_a = _mm256_setzero_ps();
__m256 sum_b = _mm256_setzero_ps();

if constexpr (residual % 8) {
constexpr int mask = (1 << (residual % 8)) - 1;

__m128i v1_128 = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(pVec1));
pVec1 += residual % 8;
__m256i v1_256 = _mm256_cvtepu8_epi32(v1_128);
__m256 v1_f = _mm256_cvtepi32_ps(v1_256);

__m128i v2_128 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pVec2));
__m256 v2_f = _mm256_cvtph_ps(v2_128);
v2_f = _mm256_blend_ps(_mm256_setzero_ps(), v2_f, mask);
pVec2 += residual % 8;

sum_a = _mm256_mul_ps(v1_f, v2_f);
}

if constexpr (residual >= 8) {
SQ8_FP16_InnerProductStep_AVX2_FMA(pVec1, pVec2, sum_b);
}

do {
SQ8_FP16_InnerProductStep_AVX2_FMA(pVec1, pVec2, sum_a);
SQ8_FP16_InnerProductStep_AVX2_FMA(pVec1, pVec2, sum_b);
} while (pVec1 < pEnd1);

__m256 sum256 = _mm256_add_ps(sum_a, sum_b);
float quantized_dot = my_mm256_reduce_add_ps(sum256);

const uint8_t *pVec1Base = static_cast<const uint8_t *>(pVec1v);
const uint8_t *params_bytes = pVec1Base + dimension;
const float min_val = load_unaligned<float>(params_bytes + sq8::MIN_VAL * sizeof(float));
const float delta = load_unaligned<float>(params_bytes + sq8::DELTA * sizeof(float));

const float16 *pVec2Base = static_cast<const float16 *>(pVec2v);
const auto *query_meta_bytes = reinterpret_cast<const uint8_t *>(pVec2Base + dimension);
const float y_sum = load_unaligned<float>(query_meta_bytes + sq8::SUM_QUERY * sizeof(float));

return min_val * y_sum + delta * quantized_dot;
}

template <unsigned char residual> // 0..15
float SQ8_FP16_InnerProductSIMD16_AVX2_FMA(const void *pVec1v, const void *pVec2v,
size_t dimension) {
return 1.0f - SQ8_FP16_InnerProductImp_AVX2_FMA<residual>(pVec1v, pVec2v, dimension);
}

template <unsigned char residual> // 0..15
float SQ8_FP16_CosineSIMD16_AVX2_FMA(const void *pVec1v, const void *pVec2v, size_t dimension) {
return SQ8_FP16_InnerProductSIMD16_AVX2_FMA<residual>(pVec1v, pVec2v, dimension);
}
101 changes: 101 additions & 0 deletions src/VecSim/spaces/IP/IP_AVX2_SQ8_FP16.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
/*
* Copyright (c) 2006-Present, Redis Ltd.
* All rights reserved.
*
* Licensed under your choice of the Redis Source Available License 2.0
* (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the
* GNU Affero General Public License v3 (AGPLv3).
*/
#pragma once
#include "VecSim/spaces/space_includes.h"
#include "VecSim/spaces/AVX_utils.h"
#include "VecSim/types/sq8.h"
#include "VecSim/types/float16.h"
#include "VecSim/utils/alignment.h"

using sq8 = vecsim_types::sq8;
using float16 = vecsim_types::float16;

/*
* Asymmetric SQ8 (storage) <-> FP16 (query) inner product using algebraic identity:
* IP(x, y) = min * y_sum + delta * Σ(q_i * y_i)
*
* FP16 query lanes are widened to FP32 per 8-lane chunk via _mm256_cvtph_ps (F16C);
* inner-loop arithmetic runs in FP32 with separate _mm256_mul_ps + _mm256_add_ps (no FMA).
*/

// 8-wide AVX2 step (no FMA): 8 SQ8 lanes + 8 FP16 lanes -> mul + add into sum.
static inline void SQ8_FP16_InnerProductStep_AVX2(const uint8_t *&pVect1, const float16 *&pVect2,
__m256 &sum256) {
__m128i v1_128 = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(pVect1));
pVect1 += 8;
__m256i v1_256 = _mm256_cvtepu8_epi32(v1_128);
__m256 v1_f = _mm256_cvtepi32_ps(v1_256);

__m128i v2_128 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pVect2));
__m256 v2_f = _mm256_cvtph_ps(v2_128);
pVect2 += 8;

sum256 = _mm256_add_ps(sum256, _mm256_mul_ps(v1_f, v2_f));
}

// pVec1v = SQ8 storage, pVec2v = FP16 query. Precondition: dim >= 16 (enforced by dispatcher).
template <unsigned char residual> // 0..15
float SQ8_FP16_InnerProductImp_AVX2(const void *pVec1v, const void *pVec2v, size_t dimension) {
const uint8_t *pVec1 = static_cast<const uint8_t *>(pVec1v);
const float16 *pVec2 = static_cast<const float16 *>(pVec2v);
const uint8_t *pEnd1 = pVec1 + dimension;

// Two accumulators break the mul->add dependency chain (no FMA on this tier).
__m256 sum_a = _mm256_setzero_ps();
__m256 sum_b = _mm256_setzero_ps();

if constexpr (residual % 8) {
constexpr int mask = (1 << (residual % 8)) - 1;

__m128i v1_128 = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(pVec1));
pVec1 += residual % 8;
__m256i v1_256 = _mm256_cvtepu8_epi32(v1_128);
__m256 v1_f = _mm256_cvtepi32_ps(v1_256);

__m128i v2_128 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pVec2));
__m256 v2_f = _mm256_cvtph_ps(v2_128);
v2_f = _mm256_blend_ps(_mm256_setzero_ps(), v2_f, mask);
pVec2 += residual % 8;

sum_a = _mm256_mul_ps(v1_f, v2_f);
}

if constexpr (residual >= 8) {
SQ8_FP16_InnerProductStep_AVX2(pVec1, pVec2, sum_b);
}

do {
SQ8_FP16_InnerProductStep_AVX2(pVec1, pVec2, sum_a);
SQ8_FP16_InnerProductStep_AVX2(pVec1, pVec2, sum_b);
} while (pVec1 < pEnd1);

__m256 sum256 = _mm256_add_ps(sum_a, sum_b);
float quantized_dot = my_mm256_reduce_add_ps(sum256);

const uint8_t *pVec1Base = static_cast<const uint8_t *>(pVec1v);
const uint8_t *params_bytes = pVec1Base + dimension;
const float min_val = load_unaligned<float>(params_bytes + sq8::MIN_VAL * sizeof(float));
const float delta = load_unaligned<float>(params_bytes + sq8::DELTA * sizeof(float));

const float16 *pVec2Base = static_cast<const float16 *>(pVec2v);
const auto *query_meta_bytes = reinterpret_cast<const uint8_t *>(pVec2Base + dimension);
const float y_sum = load_unaligned<float>(query_meta_bytes + sq8::SUM_QUERY * sizeof(float));

return min_val * y_sum + delta * quantized_dot;
}

template <unsigned char residual> // 0..15
float SQ8_FP16_InnerProductSIMD16_AVX2(const void *pVec1v, const void *pVec2v, size_t dimension) {
return 1.0f - SQ8_FP16_InnerProductImp_AVX2<residual>(pVec1v, pVec2v, dimension);
}

template <unsigned char residual> // 0..15
float SQ8_FP16_CosineSIMD16_AVX2(const void *pVec1v, const void *pVec2v, size_t dimension) {
return SQ8_FP16_InnerProductSIMD16_AVX2<residual>(pVec1v, pVec2v, dimension);
}
113 changes: 113 additions & 0 deletions src/VecSim/spaces/IP/IP_AVX512F_SQ8_FP16.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
/*
* Copyright (c) 2006-Present, Redis Ltd.
* All rights reserved.
*
* Licensed under your choice of the Redis Source Available License 2.0
* (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the
* GNU Affero General Public License v3 (AGPLv3).
*/
#pragma once
#include "VecSim/spaces/space_includes.h"
#include "VecSim/types/sq8.h"
#include "VecSim/types/float16.h"
#include "VecSim/utils/alignment.h"

using sq8 = vecsim_types::sq8;
using float16 = vecsim_types::float16;

/*
* Asymmetric SQ8 (storage) <-> FP16 (query) inner product using algebraic identity:
* IP(x, y) = min * y_sum + delta * Σ(q_i * y_i)
*
* FP16 query lanes are widened to FP32 per 16-lane chunk via _mm512_cvtph_ps (AVX512F);
* inner-loop arithmetic runs in FP32 with _mm512_fmadd_ps.
*/

// 16-wide AVX512F step: 16 SQ8 lanes + 16 FP16 lanes -> 16 FP32 fused-multiply-add.
static inline void SQ8_FP16_InnerProductStep_AVX512(const uint8_t *&pVec1, const float16 *&pVec2,
__m512 &sum) {
__m128i v1_128 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pVec1));
__m512i v1_512 = _mm512_cvtepu8_epi32(v1_128);
__m512 v1_f = _mm512_cvtepi32_ps(v1_512);

__m256i v2_16 = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(pVec2));
__m512 v2_f = _mm512_cvtph_ps(v2_16);

sum = _mm512_fmadd_ps(v1_f, v2_f, sum);

pVec1 += 16;
pVec2 += 16;
}

// pVec1v = SQ8 storage, pVec2v = FP16 query. Precondition: dim >= 16 (enforced by dispatcher).
template <unsigned char residual> // 0..15
float SQ8_FP16_InnerProductImp_AVX512(const void *pVec1v, const void *pVec2v, size_t dimension) {
const uint8_t *pVec1 = static_cast<const uint8_t *>(pVec1v);
const float16 *pVec2 = static_cast<const float16 *>(pVec2v);
const uint8_t *pEnd1 = pVec1 + dimension;

// Four accumulators break the FMA dependency chain to saturate both FMA ports.
__m512 sum0 = _mm512_setzero_ps();
__m512 sum1 = _mm512_setzero_ps();
__m512 sum2 = _mm512_setzero_ps();
__m512 sum3 = _mm512_setzero_ps();

if constexpr (residual > 0) {
__mmask16 mask = (1U << residual) - 1;

__m128i v1_128 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pVec1));
__m512i v1_512 = _mm512_cvtepu8_epi32(v1_128);
__m512 v1_f = _mm512_cvtepi32_ps(v1_512);

__m256i v2_16 = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(pVec2));
__m512 v2_f = _mm512_cvtph_ps(v2_16);

sum0 = _mm512_maskz_mul_ps(mask, v1_f, v2_f);

pVec1 += residual;
pVec2 += residual;
}

// Main loop: 4 chunks of 16 lanes per iteration, one chunk per accumulator.
while (static_cast<size_t>(pEnd1 - pVec1) >= 64) {
SQ8_FP16_InnerProductStep_AVX512(pVec1, pVec2, sum0);
SQ8_FP16_InnerProductStep_AVX512(pVec1, pVec2, sum1);
SQ8_FP16_InnerProductStep_AVX512(pVec1, pVec2, sum2);
SQ8_FP16_InnerProductStep_AVX512(pVec1, pVec2, sum3);
}

// Tail: at most three remaining 16-lane chunks (post-residual remainder is a multiple of 16).
// Keep chunks on distinct accumulators to preserve ILP when the main loop did not run.
const size_t remaining = pEnd1 - pVec1;
if (remaining >= 16)
SQ8_FP16_InnerProductStep_AVX512(pVec1, pVec2, sum0);
if (remaining >= 32)
SQ8_FP16_InnerProductStep_AVX512(pVec1, pVec2, sum1);
if (remaining >= 48)
SQ8_FP16_InnerProductStep_AVX512(pVec1, pVec2, sum2);

__m512 sum = _mm512_add_ps(_mm512_add_ps(sum0, sum1), _mm512_add_ps(sum2, sum3));
float quantized_dot = _mm512_reduce_add_ps(sum);

const uint8_t *pVec1Base = static_cast<const uint8_t *>(pVec1v);
const uint8_t *params_bytes = pVec1Base + dimension;
const float min_val = load_unaligned<float>(params_bytes + sq8::MIN_VAL * sizeof(float));
const float delta = load_unaligned<float>(params_bytes + sq8::DELTA * sizeof(float));

const float16 *pVec2Base = static_cast<const float16 *>(pVec2v);
const auto *query_meta_bytes = reinterpret_cast<const uint8_t *>(pVec2Base + dimension);
const float y_sum = load_unaligned<float>(query_meta_bytes + sq8::SUM_QUERY * sizeof(float));

return min_val * y_sum + delta * quantized_dot;
}

template <unsigned char residual> // 0..15
float SQ8_FP16_InnerProductSIMD16_AVX512F(const void *pVec1v, const void *pVec2v,
size_t dimension) {
return 1.0f - SQ8_FP16_InnerProductImp_AVX512<residual>(pVec1v, pVec2v, dimension);
}

template <unsigned char residual> // 0..15
float SQ8_FP16_CosineSIMD16_AVX512F(const void *pVec1v, const void *pVec2v, size_t dimension) {
return SQ8_FP16_InnerProductSIMD16_AVX512F<residual>(pVec1v, pVec2v, dimension);
}
Loading