diff --git a/src/VecSim/spaces/IP/IP.cpp b/src/VecSim/spaces/IP/IP.cpp index 29cbc21a7..e96cf0bc3 100644 --- a/src/VecSim/spaces/IP/IP.cpp +++ b/src/VecSim/spaces/IP/IP.cpp @@ -17,22 +17,23 @@ using float16 = vecsim_types::float16; using sq8 = vecsim_types::sq8; /* - * Optimized asymmetric SQ8 inner product using algebraic identity: + * Optimized asymmetric SQ8-FP32 inner product using algebraic identity: * IP(x, y) = Σ(x_i * y_i) * ≈ Σ((min + delta * q_i) * y_i) * = min * Σy_i + delta * Σ(q_i * y_i) * = min * y_sum + delta * quantized_dot_product * * Uses 4x loop unrolling with multiple accumulators for ILP. - * pVect1 is query (FP32): [float values (dim)] [y_sum] [y_sum_squares (L2 only)] - * pVect2 is storage (SQ8): [uint8_t values (dim)] [min_val] [delta] [x_sum] [x_sum_squares (L2 + * pVect1 is storage (SQ8): [uint8_t values (dim)] [min_val] [delta] [x_sum] [x_sum_squares (L2 * only)] + * pVect2 is query (FP32): [float values (dim)] [y_sum] [y_sum_squares (L2 only)] * - * Returns raw inner product value (not distance). Used by SQ8_InnerProduct, SQ8_Cosine, SQ8_L2Sqr. + * Returns raw inner product value (not distance). Used by SQ8_FP32_InnerProduct, SQ8_FP32_Cosine, + * SQ8_FP32_L2Sqr. 
*/ -float SQ8_InnerProduct_Impl(const void *pVect1v, const void *pVect2v, size_t dimension) { - const auto *pVect1 = static_cast<const float *>(pVect1v); - const auto *pVect2 = static_cast<const uint8_t *>(pVect2v); +float SQ8_FP32_InnerProduct_Impl(const void *pVect1v, const void *pVect2v, size_t dimension) { + const auto *pVect1 = static_cast<const uint8_t *>(pVect1v); + const auto *pVect2 = static_cast<const float *>(pVect2v); // Use 4 accumulators for instruction-level parallelism float sum0 = 0, sum1 = 0, sum2 = 0, sum3 = 0; @@ -41,38 +42,38 @@ float SQ8_InnerProduct_Impl(const void *pVect1v, const void *pVect2v, size_t dim size_t i = 0; size_t dim4 = dimension & ~size_t(3); // dim4 is a multiple of 4 for (; i < dim4; i += 4) { - sum0 += pVect1[i + 0] * static_cast<float>(pVect2[i + 0]); - sum1 += pVect1[i + 1] * static_cast<float>(pVect2[i + 1]); - sum2 += pVect1[i + 2] * static_cast<float>(pVect2[i + 2]); - sum3 += pVect1[i + 3] * static_cast<float>(pVect2[i + 3]); + sum0 += static_cast<float>(pVect1[i + 0]) * pVect2[i + 0]; + sum1 += static_cast<float>(pVect1[i + 1]) * pVect2[i + 1]; + sum2 += static_cast<float>(pVect1[i + 2]) * pVect2[i + 2]; + sum3 += static_cast<float>(pVect1[i + 3]) * pVect2[i + 3]; } // Handle remainder (0-3 elements) for (; i < dimension; i++) { - sum0 += pVect1[i] * static_cast<float>(pVect2[i]); + sum0 += static_cast<float>(pVect1[i]) * pVect2[i]; } // Combine accumulators float quantized_dot = (sum0 + sum1) + (sum2 + sum3); - // Get quantization parameters from stored vector - const float *params = reinterpret_cast<const float *>(pVect2 + dimension); + // Get quantization parameters from stored vector (pVect1 is SQ8) + const float *params = reinterpret_cast<const float *>(pVect1 + dimension); const float min_val = params[sq8::MIN_VAL]; const float delta = params[sq8::DELTA]; - // Get precomputed y_sum from query blob (stored after the dim floats) - const float y_sum = pVect1[dimension + sq8::SUM_QUERY]; + // Get precomputed y_sum from query blob (pVect2 is FP32, stored after the dim floats) + const float y_sum = pVect2[dimension + sq8::SUM_QUERY]; // Apply formula: IP = min * y_sum + delta *
Σ(q_i * y_i) return min_val * y_sum + delta * quantized_dot; } -float SQ8_InnerProduct(const void *pVect1v, const void *pVect2v, size_t dimension) { - return 1.0f - SQ8_InnerProduct_Impl(pVect1v, pVect2v, dimension); +float SQ8_FP32_InnerProduct(const void *pVect1v, const void *pVect2v, size_t dimension) { + return 1.0f - SQ8_FP32_InnerProduct_Impl(pVect1v, pVect2v, dimension); } -float SQ8_Cosine(const void *pVect1v, const void *pVect2v, size_t dimension) { - return SQ8_InnerProduct(pVect1v, pVect2v, dimension); +float SQ8_FP32_Cosine(const void *pVect1v, const void *pVect2v, size_t dimension) { + return SQ8_FP32_InnerProduct(pVect1v, pVect2v, dimension); } // SQ8-to-SQ8: Common inner product implementation that returns the raw inner product value diff --git a/src/VecSim/spaces/IP/IP.h b/src/VecSim/spaces/IP/IP.h index 8b6a50389..64f2003ec 100644 --- a/src/VecSim/spaces/IP/IP.h +++ b/src/VecSim/spaces/IP/IP.h @@ -10,18 +10,18 @@ #include <cstddef> -// FP32-to-SQ8: Common inner product implementation that returns the raw inner product value -// (not distance). Used by SQ8_InnerProduct, SQ8_Cosine, and SQ8_L2Sqr. -// pVect1 is query (FP32): [float values (dim)] [y_sum] [y_sum_squares (L2 only)] -// pVect2 is storage (SQ8): [uint8_t values (dim)] [min_val] [delta] [x_sum] [x_sum_squares (L2 +// SQ8-FP32: Common inner product implementation that returns the raw inner product value +// (not distance). Used by SQ8_FP32_InnerProduct, SQ8_FP32_Cosine, and SQ8_FP32_L2Sqr. 
+// pVect1 is storage (SQ8): [uint8_t values (dim)] [min_val] [delta] [x_sum] [x_sum_squares (L2 // only)] -float SQ8_InnerProduct_Impl(const void *pVect1v, const void *pVect2v, size_t dimension); +// pVect2 is query (FP32): [float values (dim)] [y_sum] [y_sum_squares (L2 only)] +float SQ8_FP32_InnerProduct_Impl(const void *pVect1v, const void *pVect2v, size_t dimension); -// pVect1v vector of type fp32 and pVect2v vector of type uint8 -float SQ8_InnerProduct(const void *pVect1v, const void *pVect2v, size_t dimension); +// pVect1v vector of type uint8 (SQ8) and pVect2v vector of type fp32 +float SQ8_FP32_InnerProduct(const void *pVect1v, const void *pVect2v, size_t dimension); -// pVect1v vector of type fp32 and pVect2v vector of type uint8 -float SQ8_Cosine(const void *pVect1v, const void *pVect2v, size_t dimension); +// pVect1v vector of type uint8 (SQ8) and pVect2v vector of type fp32 +float SQ8_FP32_Cosine(const void *pVect1v, const void *pVect2v, size_t dimension); // SQ8-to-SQ8: Common inner product implementation that returns the raw inner product value // (not distance). Used by both SQ8_SQ8_InnerProduct, SQ8_SQ8_Cosine, and SQ8_SQ8_L2Sqr. 
diff --git a/src/VecSim/spaces/IP/IP_AVX2_FMA_SQ8.h b/src/VecSim/spaces/IP/IP_AVX2_FMA_SQ8_FP32.h similarity index 59% rename from src/VecSim/spaces/IP/IP_AVX2_FMA_SQ8.h rename to src/VecSim/spaces/IP/IP_AVX2_FMA_SQ8_FP32.h index 0aae41a04..5767a4828 100644 --- a/src/VecSim/spaces/IP/IP_AVX2_FMA_SQ8.h +++ b/src/VecSim/spaces/IP/IP_AVX2_FMA_SQ8_FP32.h @@ -27,28 +27,30 @@ using sq8 = vecsim_types::sq8; */ // Helper: compute Σ(q_i * y_i) for 8 elements using FMA (no dequantization) -static inline void InnerProductStepSQ8_FMA(const float *&pVect1, const uint8_t *&pVect2, +// pVect1 = SQ8 storage (quantized values), pVect2 = FP32 query +static inline void InnerProductStepSQ8_FMA(const uint8_t *&pVect1, const float *&pVect2, __m256 &sum256) { - // Load 8 float elements from query - __m256 v1 = _mm256_loadu_ps(pVect1); + // Load 8 uint8 elements and convert to float + __m128i v1_128 = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(pVect1)); pVect1 += 8; - // Load 8 uint8 elements and convert to float - __m128i v2_128 = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(pVect2)); - pVect2 += 8; + __m256i v1_256 = _mm256_cvtepu8_epi32(v1_128); + __m256 v1_f = _mm256_cvtepi32_ps(v1_256); - __m256i v2_256 = _mm256_cvtepu8_epi32(v2_128); - __m256 v2_f = _mm256_cvtepi32_ps(v2_256); + // Load 8 float elements from query + __m256 v2 = _mm256_loadu_ps(pVect2); + pVect2 += 8; // Accumulate q_i * y_i using FMA (no dequantization!) 
- sum256 = _mm256_fmadd_ps(v2_f, v1, sum256); + sum256 = _mm256_fmadd_ps(v1_f, v2, sum256); } +// pVect1v = SQ8 storage, pVect2v = FP32 query template <unsigned char residual> // 0..15 -float SQ8_InnerProductImp_FMA(const void *pVect1v, const void *pVect2v, size_t dimension) { - const float *pVect1 = static_cast<const float *>(pVect1v); - const uint8_t *pVect2 = static_cast<const uint8_t *>(pVect2v); - const float *pEnd1 = pVect1 + dimension; +float SQ8_FP32_InnerProductImp_FMA(const void *pVect1v, const void *pVect2v, size_t dimension) { + const uint8_t *pVect1 = static_cast<const uint8_t *>(pVect1v); // SQ8 storage + const float *pVect2 = static_cast<const float *>(pVect2v); // FP32 query + const uint8_t *pEnd1 = pVect1 + dimension; // Initialize sum accumulator for Σ(q_i * y_i) __m256 sum256 = _mm256_setzero_ps(); @@ -56,18 +58,20 @@ float SQ8_InnerProductImp_FMA(const void *pVect1v, const void *pVect2v, size_t d // Handle residual elements first (0-7 elements) if constexpr (residual % 8) { __mmask8 constexpr mask = (1 << (residual % 8)) - 1; - __m256 v1 = my_mm256_maskz_loadu_ps<mask>(pVect1); - pVect1 += residual % 8; // Load uint8 elements and convert to float - __m128i v2_128 = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(pVect2)); - pVect2 += residual % 8; + __m128i v1_128 = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(pVect1)); + pVect1 += residual % 8; - __m256i v2_256 = _mm256_cvtepu8_epi32(v2_128); - __m256 v2_f = _mm256_cvtepi32_ps(v2_256); + __m256i v1_256 = _mm256_cvtepu8_epi32(v1_128); + __m256 v1_f = _mm256_cvtepi32_ps(v1_256); + + // Load masked float elements from query + __m256 v2 = my_mm256_maskz_loadu_ps<mask>(pVect2); + pVect2 += residual % 8; // Compute q_i * y_i (no dequantization) - sum256 = _mm256_mul_ps(v1, v2_f); + sum256 = _mm256_mul_ps(v1_f, v2); } // If the residual is >=8, have another step of 8 floats @@ -86,25 +90,26 @@ float SQ8_InnerProductImp_FMA(const void *pVect1v, const void *pVect2v, size_t d float quantized_dot = my_mm256_reduce_add_ps(sum256); // Get quantization parameters from stored vector (after quantized data) - const uint8_t *pVect2Base = 
static_cast<const uint8_t *>(pVect2v); - const float *params2 = reinterpret_cast<const float *>(pVect2Base + dimension); - const float min_val = params2[sq8::MIN_VAL]; - const float delta = params2[sq8::DELTA]; + const uint8_t *pVect1Base = static_cast<const uint8_t *>(pVect1v); + const float *params1 = reinterpret_cast<const float *>(pVect1Base + dimension); + const float min_val = params1[sq8::MIN_VAL]; + const float delta = params1[sq8::DELTA]; // Get precomputed y_sum from query blob (stored after the dim floats) - const float y_sum = static_cast<const float *>(pVect1v)[dimension + sq8::SUM_QUERY]; + const float y_sum = static_cast<const float *>(pVect2v)[dimension + sq8::SUM_QUERY]; // Apply the algebraic formula: IP = min * y_sum + delta * Σ(q_i * y_i) return min_val * y_sum + delta * quantized_dot; } template <unsigned char residual> // 0..15 -float SQ8_InnerProductSIMD16_AVX2_FMA(const void *pVect1v, const void *pVect2v, size_t dimension) { - return 1.0f - SQ8_InnerProductImp_FMA<residual>(pVect1v, pVect2v, dimension); +float SQ8_FP32_InnerProductSIMD16_AVX2_FMA(const void *pVect1v, const void *pVect2v, + size_t dimension) { + return 1.0f - SQ8_FP32_InnerProductImp_FMA<residual>(pVect1v, pVect2v, dimension); } template <unsigned char residual> // 0..15 -float SQ8_CosineSIMD16_AVX2_FMA(const void *pVect1v, const void *pVect2v, size_t dimension) { +float SQ8_FP32_CosineSIMD16_AVX2_FMA(const void *pVect1v, const void *pVect2v, size_t dimension) { // Cosine distance = 1 - IP (vectors are pre-normalized) - return SQ8_InnerProductSIMD16_AVX2_FMA<residual>(pVect1v, pVect2v, dimension); + return SQ8_FP32_InnerProductSIMD16_AVX2_FMA<residual>(pVect1v, pVect2v, dimension); } diff --git a/src/VecSim/spaces/IP/IP_AVX2_SQ8.h b/src/VecSim/spaces/IP/IP_AVX2_SQ8_FP32.h similarity index 54% rename from src/VecSim/spaces/IP/IP_AVX2_SQ8.h rename to src/VecSim/spaces/IP/IP_AVX2_SQ8_FP32.h index dca88696d..dea167eb3 100644 --- a/src/VecSim/spaces/IP/IP_AVX2_SQ8.h +++ b/src/VecSim/spaces/IP/IP_AVX2_SQ8_FP32.h @@ -26,29 +26,31 @@ using sq8 = vecsim_types::sq8; */ // Helper: compute Σ(q_i * y_i) for 8 elements (no dequantization) -static inline void 
InnerProductStepSQ8(const float *&pVect1, const uint8_t *&pVect2, - __m256 &sum256) { - // Load 8 float elements from query - __m256 v1 = _mm256_loadu_ps(pVect1); +// pVect1 = SQ8 storage (quantized values), pVect2 = FP32 query +static inline void InnerProductStepSQ8_FP32(const uint8_t *&pVect1, const float *&pVect2, + __m256 &sum256) { + // Load 8 uint8 elements and convert to float + __m128i v1_128 = _mm_loadl_epi64(reinterpret_cast(pVect1)); pVect1 += 8; - // Load 8 uint8 elements and convert to float - __m128i v2_128 = _mm_loadl_epi64(reinterpret_cast(pVect2)); - pVect2 += 8; + __m256i v1_256 = _mm256_cvtepu8_epi32(v1_128); + __m256 v1_f = _mm256_cvtepi32_ps(v1_256); - __m256i v2_256 = _mm256_cvtepu8_epi32(v2_128); - __m256 v2_f = _mm256_cvtepi32_ps(v2_256); + // Load 8 float elements from query + __m256 v2 = _mm256_loadu_ps(pVect2); + pVect2 += 8; // Accumulate q_i * y_i (no dequantization!) // Using mul + add since this is the non-FMA version - sum256 = _mm256_add_ps(sum256, _mm256_mul_ps(v2_f, v1)); + sum256 = _mm256_add_ps(sum256, _mm256_mul_ps(v1_f, v2)); } +// pVect1v = SQ8 storage, pVect2v = FP32 query template // 0..15 -float SQ8_InnerProductImp_AVX2(const void *pVect1v, const void *pVect2v, size_t dimension) { - const float *pVect1 = static_cast(pVect1v); - const uint8_t *pVect2 = static_cast(pVect2v); - const float *pEnd1 = pVect1 + dimension; +float SQ8_FP32_InnerProductImp_AVX2(const void *pVect1v, const void *pVect2v, size_t dimension) { + const uint8_t *pVect1 = static_cast(pVect1v); // SQ8 storage + const float *pVect2 = static_cast(pVect2v); // FP32 query + const uint8_t *pEnd1 = pVect1 + dimension; // Initialize sum accumulator for Σ(q_i * y_i) __m256 sum256 = _mm256_setzero_ps(); @@ -56,55 +58,57 @@ float SQ8_InnerProductImp_AVX2(const void *pVect1v, const void *pVect2v, size_t // Handle residual elements first (0-7 elements) if constexpr (residual % 8) { __mmask8 constexpr mask = (1 << (residual % 8)) - 1; - __m256 v1 = 
my_mm256_maskz_loadu_ps(pVect1); - pVect1 += residual % 8; // Load uint8 elements and convert to float - __m128i v2_128 = _mm_loadl_epi64(reinterpret_cast(pVect2)); - pVect2 += residual % 8; + __m128i v1_128 = _mm_loadl_epi64(reinterpret_cast(pVect1)); + pVect1 += residual % 8; - __m256i v2_256 = _mm256_cvtepu8_epi32(v2_128); - __m256 v2_f = _mm256_cvtepi32_ps(v2_256); + __m256i v1_256 = _mm256_cvtepu8_epi32(v1_128); + __m256 v1_f = _mm256_cvtepi32_ps(v1_256); + + // Load masked float elements from query + __m256 v2 = my_mm256_maskz_loadu_ps(pVect2); + pVect2 += residual % 8; // Compute q_i * y_i (no dequantization) - sum256 = _mm256_mul_ps(v1, v2_f); + sum256 = _mm256_mul_ps(v1_f, v2); } // If the residual is >=8, have another step of 8 floats if constexpr (residual >= 8) { - InnerProductStepSQ8(pVect1, pVect2, sum256); + InnerProductStepSQ8_FP32(pVect1, pVect2, sum256); } // Process remaining full chunks of 16 elements (2x8) // Using do-while since dim > 16 guarantees at least one iteration do { - InnerProductStepSQ8(pVect1, pVect2, sum256); - InnerProductStepSQ8(pVect1, pVect2, sum256); + InnerProductStepSQ8_FP32(pVect1, pVect2, sum256); + InnerProductStepSQ8_FP32(pVect1, pVect2, sum256); } while (pVect1 < pEnd1); // Reduce to get Σ(q_i * y_i) float quantized_dot = my_mm256_reduce_add_ps(sum256); // Get quantization parameters from stored vector (after quantized data) - const uint8_t *pVect2Base = static_cast(pVect2v); - const float *params2 = reinterpret_cast(pVect2Base + dimension); - const float min_val = params2[sq8::MIN_VAL]; - const float delta = params2[sq8::DELTA]; + const uint8_t *pVect1Base = static_cast(pVect1v); + const float *params1 = reinterpret_cast(pVect1Base + dimension); + const float min_val = params1[sq8::MIN_VAL]; + const float delta = params1[sq8::DELTA]; // Get precomputed y_sum from query blob (stored after the dim floats) - const float y_sum = static_cast(pVect1v)[dimension + sq8::SUM_QUERY]; + const float y_sum = 
static_cast(pVect2v)[dimension + sq8::SUM_QUERY]; // Apply the algebraic formula: IP = min * y_sum + delta * Σ(q_i * y_i) return min_val * y_sum + delta * quantized_dot; } template // 0..15 -float SQ8_InnerProductSIMD16_AVX2(const void *pVect1v, const void *pVect2v, size_t dimension) { - return 1.0f - SQ8_InnerProductImp_AVX2(pVect1v, pVect2v, dimension); +float SQ8_FP32_InnerProductSIMD16_AVX2(const void *pVect1v, const void *pVect2v, size_t dimension) { + return 1.0f - SQ8_FP32_InnerProductImp_AVX2(pVect1v, pVect2v, dimension); } template // 0..15 -float SQ8_CosineSIMD16_AVX2(const void *pVect1v, const void *pVect2v, size_t dimension) { +float SQ8_FP32_CosineSIMD16_AVX2(const void *pVect1v, const void *pVect2v, size_t dimension) { // Calculate inner product using common implementation with normalization - return SQ8_InnerProductSIMD16_AVX2(pVect1v, pVect2v, dimension); + return SQ8_FP32_InnerProductSIMD16_AVX2(pVect1v, pVect2v, dimension); } diff --git a/src/VecSim/spaces/IP/IP_AVX512F_BW_VL_VNNI_SQ8.h b/src/VecSim/spaces/IP/IP_AVX512F_BW_VL_VNNI_SQ8_FP32.h similarity index 55% rename from src/VecSim/spaces/IP/IP_AVX512F_BW_VL_VNNI_SQ8.h rename to src/VecSim/spaces/IP/IP_AVX512F_BW_VL_VNNI_SQ8_FP32.h index 13dce0c0a..76a590519 100644 --- a/src/VecSim/spaces/IP/IP_AVX512F_BW_VL_VNNI_SQ8.h +++ b/src/VecSim/spaces/IP/IP_AVX512F_BW_VL_VNNI_SQ8_FP32.h @@ -25,28 +25,31 @@ using sq8 = vecsim_types::sq8; */ // Helper: compute Σ(q_i * y_i) for 16 elements -static inline void SQ8_InnerProductStep(const float *&pVec1, const uint8_t *&pVec2, __m512 &sum) { - // Load 16 float elements from query (pVec1) - __m512 v1 = _mm512_loadu_ps(pVec1); - +// pVec1 = SQ8 storage (quantized values), pVec2 = FP32 query +static inline void SQ8_FP32_InnerProductStep(const uint8_t *&pVec1, const float *&pVec2, + __m512 &sum) { // Load 16 uint8 elements from quantized vector and convert to float - __m128i v2_128 = _mm_loadu_si128(reinterpret_cast(pVec2)); - __m512i v2_512 = 
_mm512_cvtepu8_epi32(v2_128); - __m512 v2_f = _mm512_cvtepi32_ps(v2_512); + __m128i v1_128 = _mm_loadu_si128(reinterpret_cast(pVec1)); + __m512i v1_512 = _mm512_cvtepu8_epi32(v1_128); + __m512 v1_f = _mm512_cvtepi32_ps(v1_512); + + // Load 16 float elements from query (pVec2) + __m512 v2 = _mm512_loadu_ps(pVec2); // Accumulate q_i * y_i (no dequantization!) - sum = _mm512_fmadd_ps(v2_f, v1, sum); + sum = _mm512_fmadd_ps(v1_f, v2, sum); pVec1 += 16; pVec2 += 16; } // Common implementation for both inner product and cosine similarity +// pVec1v = SQ8 storage, pVec2v = FP32 query template // 0..15 -float SQ8_InnerProductImp_AVX512(const void *pVec1v, const void *pVec2v, size_t dimension) { - const float *pVec1 = static_cast(pVec1v); - const uint8_t *pVec2 = static_cast(pVec2v); - const float *pEnd1 = pVec1 + dimension; +float SQ8_FP32_InnerProductImp_AVX512(const void *pVec1v, const void *pVec2v, size_t dimension) { + const uint8_t *pVec1 = static_cast(pVec1v); // SQ8 storage + const float *pVec2 = static_cast(pVec2v); // FP32 query + const uint8_t *pEnd1 = pVec1 + dimension; // Initialize sum accumulator for Σ(q_i * y_i) __m512 sum = _mm512_setzero_ps(); @@ -55,16 +58,16 @@ float SQ8_InnerProductImp_AVX512(const void *pVec1v, const void *pVec2v, size_t if constexpr (residual > 0) { __mmask16 mask = (1U << residual) - 1; - // Load masked float elements from query - __m512 v1 = _mm512_maskz_loadu_ps(mask, pVec1); - // Load uint8 elements (safe to load 16 bytes due to padding) - __m128i v2_128 = _mm_loadu_si128(reinterpret_cast(pVec2)); - __m512i v2_512 = _mm512_cvtepu8_epi32(v2_128); - __m512 v2_f = _mm512_cvtepi32_ps(v2_512); + __m128i v1_128 = _mm_loadu_si128(reinterpret_cast(pVec1)); + __m512i v1_512 = _mm512_cvtepu8_epi32(v1_128); + __m512 v1_f = _mm512_cvtepi32_ps(v1_512); + + // Load masked float elements from query + __m512 v2 = _mm512_maskz_loadu_ps(mask, pVec2); // Compute q_i * y_i with mask (no dequantization) - sum = _mm512_maskz_mul_ps(mask, v2_f, v1); + 
sum = _mm512_maskz_mul_ps(mask, v1_f, v2); pVec1 += residual; pVec2 += residual; @@ -73,37 +76,37 @@ float SQ8_InnerProductImp_AVX512(const void *pVec1v, const void *pVec2v, size_t // Process full chunks of 16 elements // Using do-while since dim > 16 guarantees at least one iteration do { - SQ8_InnerProductStep(pVec1, pVec2, sum); + SQ8_FP32_InnerProductStep(pVec1, pVec2, sum); } while (pVec1 < pEnd1); // Reduce to get Σ(q_i * y_i) float quantized_dot = _mm512_reduce_add_ps(sum); // Get quantization parameters from stored vector (after quantized data) - // Use the original base pointer since pVec2 has been advanced - const uint8_t *pVec2Base = static_cast(pVec2v); - const float *params2 = reinterpret_cast(pVec2Base + dimension); - const float min_val = params2[sq8::MIN_VAL]; - const float delta = params2[sq8::DELTA]; + // Use the original base pointer since pVec1 has been advanced + const uint8_t *pVec1Base = static_cast(pVec1v); + const float *params1 = reinterpret_cast(pVec1Base + dimension); + const float min_val = params1[sq8::MIN_VAL]; + const float delta = params1[sq8::DELTA]; // Get precomputed y_sum from query blob (stored after the dim floats) - // Use the original base pointer since pVec1 has been advanced - const float y_sum = static_cast(pVec1v)[dimension + sq8::SUM_QUERY]; + // Use the original base pointer since pVec2 has been advanced + const float y_sum = static_cast(pVec2v)[dimension + sq8::SUM_QUERY]; // Apply the algebraic formula: IP = min * y_sum + delta * Σ(q_i * y_i) return min_val * y_sum + delta * quantized_dot; } template // 0..15 -float SQ8_InnerProductSIMD16_AVX512F_BW_VL_VNNI(const void *pVec1v, const void *pVec2v, - size_t dimension) { +float SQ8_FP32_InnerProductSIMD16_AVX512F_BW_VL_VNNI(const void *pVec1v, const void *pVec2v, + size_t dimension) { // The inner product similarity is 1 - ip - return 1.0f - SQ8_InnerProductImp_AVX512(pVec1v, pVec2v, dimension); + return 1.0f - SQ8_FP32_InnerProductImp_AVX512(pVec1v, pVec2v, dimension); 
} template // 0..15 -float SQ8_CosineSIMD16_AVX512F_BW_VL_VNNI(const void *pVec1v, const void *pVec2v, - size_t dimension) { +float SQ8_FP32_CosineSIMD16_AVX512F_BW_VL_VNNI(const void *pVec1v, const void *pVec2v, + size_t dimension) { // Cosine distance = 1 - IP (vectors are pre-normalized) - return SQ8_InnerProductSIMD16_AVX512F_BW_VL_VNNI(pVec1v, pVec2v, dimension); + return SQ8_FP32_InnerProductSIMD16_AVX512F_BW_VL_VNNI(pVec1v, pVec2v, dimension); } diff --git a/src/VecSim/spaces/IP/IP_NEON_SQ8.h b/src/VecSim/spaces/IP/IP_NEON_SQ8_FP32.h similarity index 53% rename from src/VecSim/spaces/IP/IP_NEON_SQ8.h rename to src/VecSim/spaces/IP/IP_NEON_SQ8_FP32.h index 495608762..53a89bc7d 100644 --- a/src/VecSim/spaces/IP/IP_NEON_SQ8.h +++ b/src/VecSim/spaces/IP/IP_NEON_SQ8_FP32.h @@ -25,27 +25,30 @@ using sq8 = vecsim_types::sq8; */ // Helper: compute Σ(q_i * y_i) for 4 elements (no dequantization) -static inline void InnerProductStepSQ8(const float *&pVect1, const uint8_t *&pVect2, - float32x4_t &sum) { - // Load 4 float elements from query - float32x4_t v1 = vld1q_f32(pVect1); +// pVect1 = SQ8 storage (quantized values), pVect2 = FP32 query +static inline void InnerProductStepSQ8_FP32(const uint8_t *&pVect1, const float *&pVect2, + float32x4_t &sum) { + // Load 4 uint8 elements and convert to float + uint8x8_t v1_u8 = vld1_u8(pVect1); pVect1 += 4; - // Load 4 uint8 elements and convert to float - uint8x8_t v2_u8 = vld1_u8(pVect2); - pVect2 += 4; + uint32x4_t v1_u32 = vmovl_u16(vget_low_u16(vmovl_u8(v1_u8))); + float32x4_t v1_f = vcvtq_f32_u32(v1_u32); - uint32x4_t v2_u32 = vmovl_u16(vget_low_u16(vmovl_u8(v2_u8))); - float32x4_t v2_f = vcvtq_f32_u32(v2_u32); + // Load 4 float elements from query + float32x4_t v2 = vld1q_f32(pVect2); + pVect2 += 4; // Accumulate q_i * y_i (no dequantization!) 
- sum = vmlaq_f32(sum, v2_f, v1); + sum = vmlaq_f32(sum, v1_f, v2); } +// pVect1v = SQ8 storage, pVect2v = FP32 query template // 0..15 -float SQ8_InnerProductSIMD16_NEON_IMP(const void *pVect1v, const void *pVect2v, size_t dimension) { - const float *pVect1 = static_cast(pVect1v); - const uint8_t *pVect2 = static_cast(pVect2v); +float SQ8_FP32_InnerProductSIMD16_NEON_IMP(const void *pVect1v, const void *pVect2v, + size_t dimension) { + const uint8_t *pVect1 = static_cast(pVect1v); // SQ8 storage + const float *pVect2 = static_cast(pVect2v); // FP32 query // Multiple accumulators for ILP float32x4_t sum0 = vdupq_n_f32(0.0f); @@ -57,47 +60,47 @@ float SQ8_InnerProductSIMD16_NEON_IMP(const void *pVect1v, const void *pVect2v, // Process 16 elements at a time in the main loop for (size_t i = 0; i < num_of_chunks; i++) { - InnerProductStepSQ8(pVect1, pVect2, sum0); - InnerProductStepSQ8(pVect1, pVect2, sum1); - InnerProductStepSQ8(pVect1, pVect2, sum2); - InnerProductStepSQ8(pVect1, pVect2, sum3); + InnerProductStepSQ8_FP32(pVect1, pVect2, sum0); + InnerProductStepSQ8_FP32(pVect1, pVect2, sum1); + InnerProductStepSQ8_FP32(pVect1, pVect2, sum2); + InnerProductStepSQ8_FP32(pVect1, pVect2, sum3); } // Handle remaining complete 4-element blocks within residual if constexpr (residual >= 4) { - InnerProductStepSQ8(pVect1, pVect2, sum0); + InnerProductStepSQ8_FP32(pVect1, pVect2, sum0); } if constexpr (residual >= 8) { - InnerProductStepSQ8(pVect1, pVect2, sum1); + InnerProductStepSQ8_FP32(pVect1, pVect2, sum1); } if constexpr (residual >= 12) { - InnerProductStepSQ8(pVect1, pVect2, sum2); + InnerProductStepSQ8_FP32(pVect1, pVect2, sum2); } // Handle final residual elements (0-3 elements) constexpr size_t final_residual = residual % 4; if constexpr (final_residual > 0) { - float32x4_t v1 = vdupq_n_f32(0.0f); - float32x4_t v2_f = vdupq_n_f32(0.0f); + float32x4_t v1_f = vdupq_n_f32(0.0f); + float32x4_t v2 = vdupq_n_f32(0.0f); if constexpr (final_residual >= 1) { - v1 = 
vld1q_lane_f32(pVect1, v1, 0); - float q0 = static_cast(pVect2[0]); - v2_f = vld1q_lane_f32(&q0, v2_f, 0); + float q0 = static_cast(pVect1[0]); + v1_f = vld1q_lane_f32(&q0, v1_f, 0); + v2 = vld1q_lane_f32(pVect2, v2, 0); } if constexpr (final_residual >= 2) { - v1 = vld1q_lane_f32(pVect1 + 1, v1, 1); - float q1 = static_cast(pVect2[1]); - v2_f = vld1q_lane_f32(&q1, v2_f, 1); + float q1 = static_cast(pVect1[1]); + v1_f = vld1q_lane_f32(&q1, v1_f, 1); + v2 = vld1q_lane_f32(pVect2 + 1, v2, 1); } if constexpr (final_residual >= 3) { - v1 = vld1q_lane_f32(pVect1 + 2, v1, 2); - float q2 = static_cast(pVect2[2]); - v2_f = vld1q_lane_f32(&q2, v2_f, 2); + float q2 = static_cast(pVect1[2]); + v1_f = vld1q_lane_f32(&q2, v1_f, 2); + v2 = vld1q_lane_f32(pVect2 + 2, v2, 2); } // Compute q_i * y_i (no dequantization) - sum3 = vmlaq_f32(sum3, v2_f, v1); + sum3 = vmlaq_f32(sum3, v1_f, v2); } // Combine all four sum accumulators @@ -109,25 +112,25 @@ float SQ8_InnerProductSIMD16_NEON_IMP(const void *pVect1v, const void *pVect2v, float quantized_dot = vget_lane_f32(summed, 0); // Get quantization parameters from stored vector (after quantized data) - const uint8_t *pVect2Base = static_cast(pVect2v); - const float *params2 = reinterpret_cast(pVect2Base + dimension); - const float min_val = params2[sq8::MIN_VAL]; - const float delta = params2[sq8::DELTA]; + const uint8_t *pVect1Base = static_cast(pVect1v); + const float *params1 = reinterpret_cast(pVect1Base + dimension); + const float min_val = params1[sq8::MIN_VAL]; + const float delta = params1[sq8::DELTA]; // Get precomputed y_sum from query blob (stored after the dim floats) - const float y_sum = static_cast(pVect1v)[dimension + sq8::SUM_QUERY]; + const float y_sum = static_cast(pVect2v)[dimension + sq8::SUM_QUERY]; // Apply the algebraic formula: IP = min * y_sum + delta * Σ(q_i * y_i) return min_val * y_sum + delta * quantized_dot; } template // 0..15 -float SQ8_InnerProductSIMD16_NEON(const void *pVect1v, const void *pVect2v, 
size_t dimension) { - return 1.0f - SQ8_InnerProductSIMD16_NEON_IMP(pVect1v, pVect2v, dimension); +float SQ8_FP32_InnerProductSIMD16_NEON(const void *pVect1v, const void *pVect2v, size_t dimension) { + return 1.0f - SQ8_FP32_InnerProductSIMD16_NEON_IMP(pVect1v, pVect2v, dimension); } template // 0..15 -float SQ8_CosineSIMD16_NEON(const void *pVect1v, const void *pVect2v, size_t dimension) { +float SQ8_FP32_CosineSIMD16_NEON(const void *pVect1v, const void *pVect2v, size_t dimension) { // Cosine distance = 1 - IP (vectors are pre-normalized) - return SQ8_InnerProductSIMD16_NEON(pVect1v, pVect2v, dimension); + return SQ8_FP32_InnerProductSIMD16_NEON(pVect1v, pVect2v, dimension); } diff --git a/src/VecSim/spaces/IP/IP_SSE4_SQ8.h b/src/VecSim/spaces/IP/IP_SSE4_SQ8_FP32.h similarity index 51% rename from src/VecSim/spaces/IP/IP_SSE4_SQ8.h rename to src/VecSim/spaces/IP/IP_SSE4_SQ8_FP32.h index 302c902b4..45f9f31f4 100644 --- a/src/VecSim/spaces/IP/IP_SSE4_SQ8.h +++ b/src/VecSim/spaces/IP/IP_SSE4_SQ8_FP32.h @@ -24,71 +24,75 @@ using sq8 = vecsim_types::sq8; */ // Helper: compute Σ(q_i * y_i) for 4 elements (no dequantization) -static inline void InnerProductStepSQ8(const float *&pVect1, const uint8_t *&pVect2, __m128 &sum) { - // Load 4 float elements from query - __m128 v1 = _mm_loadu_ps(pVect1); +// pVect1 = SQ8 storage (quantized values), pVect2 = FP32 query +static inline void InnerProductStepSQ8_FP32(const uint8_t *&pVect1, const float *&pVect2, + __m128 &sum) { + // Load 4 uint8 elements and convert to float + __m128i v1_i = _mm_cvtepu8_epi32(_mm_cvtsi32_si128(*reinterpret_cast(pVect1))); pVect1 += 4; - // Load 4 uint8 elements and convert to float - __m128i v2_i = _mm_cvtepu8_epi32(_mm_cvtsi32_si128(*reinterpret_cast(pVect2))); - pVect2 += 4; + __m128 v1_f = _mm_cvtepi32_ps(v1_i); - __m128 v2_f = _mm_cvtepi32_ps(v2_i); + // Load 4 float elements from query + __m128 v2 = _mm_loadu_ps(pVect2); + pVect2 += 4; // Accumulate q_i * y_i (no dequantization!) 
// SSE doesn't have FMA, so use mul + add - sum = _mm_add_ps(sum, _mm_mul_ps(v2_f, v1)); + sum = _mm_add_ps(sum, _mm_mul_ps(v1_f, v2)); } +// pVect1v = SQ8 storage, pVect2v = FP32 query template // 0..15 -float SQ8_InnerProductSIMD16_SSE4_IMP(const void *pVect1v, const void *pVect2v, size_t dimension) { - const float *pVect1 = static_cast(pVect1v); - const uint8_t *pVect2 = static_cast(pVect2v); - const float *pEnd1 = pVect1 + dimension; +float SQ8_FP32_InnerProductSIMD16_SSE4_IMP(const void *pVect1v, const void *pVect2v, + size_t dimension) { + const uint8_t *pVect1 = static_cast(pVect1v); // SQ8 storage + const float *pVect2 = static_cast(pVect2v); // FP32 query + const uint8_t *pEnd1 = pVect1 + dimension; // Initialize sum accumulator for Σ(q_i * y_i) __m128 sum = _mm_setzero_ps(); // Process residual elements first (1-3 elements) if constexpr (residual % 4) { - __m128 v1; - __m128 v2_f; + __m128 v1_f; + __m128 v2; if constexpr (residual % 4 == 3) { - v1 = _mm_set_ps(0.0f, pVect1[2], pVect1[1], pVect1[0]); - v2_f = _mm_set_ps(0.0f, static_cast(pVect2[2]), static_cast(pVect2[1]), - static_cast(pVect2[0])); + v1_f = _mm_set_ps(0.0f, static_cast(pVect1[2]), static_cast(pVect1[1]), + static_cast(pVect1[0])); + v2 = _mm_set_ps(0.0f, pVect2[2], pVect2[1], pVect2[0]); } else if constexpr (residual % 4 == 2) { - v1 = _mm_set_ps(0.0f, 0.0f, pVect1[1], pVect1[0]); - v2_f = _mm_set_ps(0.0f, 0.0f, static_cast(pVect2[1]), - static_cast(pVect2[0])); + v1_f = _mm_set_ps(0.0f, 0.0f, static_cast(pVect1[1]), + static_cast(pVect1[0])); + v2 = _mm_set_ps(0.0f, 0.0f, pVect2[1], pVect2[0]); } else if constexpr (residual % 4 == 1) { - v1 = _mm_set_ps(0.0f, 0.0f, 0.0f, pVect1[0]); - v2_f = _mm_set_ps(0.0f, 0.0f, 0.0f, static_cast(pVect2[0])); + v1_f = _mm_set_ps(0.0f, 0.0f, 0.0f, static_cast(pVect1[0])); + v2 = _mm_set_ps(0.0f, 0.0f, 0.0f, pVect2[0]); } pVect1 += residual % 4; pVect2 += residual % 4; // Compute q_i * y_i (no dequantization) - sum = _mm_mul_ps(v1, v2_f); + sum = 
_mm_mul_ps(v1_f, v2); } // Handle remaining residual in chunks of 4 (for residual 4-15) if constexpr (residual >= 4) { - InnerProductStepSQ8(pVect1, pVect2, sum); + InnerProductStepSQ8_FP32(pVect1, pVect2, sum); } if constexpr (residual >= 8) { - InnerProductStepSQ8(pVect1, pVect2, sum); + InnerProductStepSQ8_FP32(pVect1, pVect2, sum); } if constexpr (residual >= 12) { - InnerProductStepSQ8(pVect1, pVect2, sum); + InnerProductStepSQ8_FP32(pVect1, pVect2, sum); } // Process remaining full chunks of 4 elements // Using do-while since dim > 16 guarantees at least one iteration do { - InnerProductStepSQ8(pVect1, pVect2, sum); + InnerProductStepSQ8_FP32(pVect1, pVect2, sum); } while (pVect1 < pEnd1); // Horizontal sum to get Σ(q_i * y_i) @@ -97,26 +101,26 @@ float SQ8_InnerProductSIMD16_SSE4_IMP(const void *pVect1v, const void *pVect2v, float quantized_dot = TmpRes[0] + TmpRes[1] + TmpRes[2] + TmpRes[3]; // Get quantization parameters from stored vector (after quantized data) - const uint8_t *pVect2Base = static_cast(pVect2v); - const float *params2 = reinterpret_cast(pVect2Base + dimension); - const float min_val = params2[sq8::MIN_VAL]; - const float delta = params2[sq8::DELTA]; + const uint8_t *pVect1Base = static_cast(pVect1v); + const float *params1 = reinterpret_cast(pVect1Base + dimension); + const float min_val = params1[sq8::MIN_VAL]; + const float delta = params1[sq8::DELTA]; // Get precomputed y_sum from query blob (stored after the dim floats) - const float *pVect1Base = static_cast(pVect1v); - const float y_sum = pVect1Base[dimension + sq8::SUM_QUERY]; + const float *pVect2Base = static_cast(pVect2v); + const float y_sum = pVect2Base[dimension + sq8::SUM_QUERY]; // Apply the algebraic formula: IP = min * y_sum + delta * Σ(q_i * y_i) return min_val * y_sum + delta * quantized_dot; } template // 0..15 -float SQ8_InnerProductSIMD16_SSE4(const void *pVect1v, const void *pVect2v, size_t dimension) { - return 1.0f - SQ8_InnerProductSIMD16_SSE4_IMP(pVect1v, 
pVect2v, dimension); +float SQ8_FP32_InnerProductSIMD16_SSE4(const void *pVect1v, const void *pVect2v, size_t dimension) { + return 1.0f - SQ8_FP32_InnerProductSIMD16_SSE4_IMP(pVect1v, pVect2v, dimension); } template // 0..15 -float SQ8_CosineSIMD16_SSE4(const void *pVect1v, const void *pVect2v, size_t dimension) { +float SQ8_FP32_CosineSIMD16_SSE4(const void *pVect1v, const void *pVect2v, size_t dimension) { // Cosine distance = 1 - IP (vectors are pre-normalized) - return SQ8_InnerProductSIMD16_SSE4(pVect1v, pVect2v, dimension); + return SQ8_FP32_InnerProductSIMD16_SSE4(pVect1v, pVect2v, dimension); } diff --git a/src/VecSim/spaces/IP/IP_SVE_SQ8.h b/src/VecSim/spaces/IP/IP_SVE_SQ8_FP32.h similarity index 60% rename from src/VecSim/spaces/IP/IP_SVE_SQ8.h rename to src/VecSim/spaces/IP/IP_SVE_SQ8_FP32.h index 735f36906..c4d5dbd7f 100644 --- a/src/VecSim/spaces/IP/IP_SVE_SQ8.h +++ b/src/VecSim/spaces/IP/IP_SVE_SQ8_FP32.h @@ -25,29 +25,32 @@ using sq8 = vecsim_types::sq8; */ // Helper: compute Σ(q_i * y_i) for one SVE vector width (no dequantization) -static inline void InnerProductStepSQ8(const float *pVect1, const uint8_t *pVect2, size_t &offset, - svfloat32_t &sum, const size_t chunk) { +// pVect1 = SQ8 storage (quantized values), pVect2 = FP32 query +static inline void InnerProductStepSQ8_FP32(const uint8_t *pVect1, const float *pVect2, + size_t &offset, svfloat32_t &sum, const size_t chunk) { svbool_t pg = svptrue_b32(); - // Load float elements from query - svfloat32_t v1 = svld1_f32(pg, pVect1 + offset); - // Load uint8 elements and zero-extend to uint32 - svuint32_t v2_u32 = svld1ub_u32(pg, pVect2 + offset); + svuint32_t v1_u32 = svld1ub_u32(pg, pVect1 + offset); // Convert uint32 to float32 - svfloat32_t v2_f = svcvt_f32_u32_x(pg, v2_u32); + svfloat32_t v1_f = svcvt_f32_u32_x(pg, v1_u32); + + // Load float elements from query + svfloat32_t v2 = svld1_f32(pg, pVect2 + offset); // Accumulate q_i * y_i (no dequantization!) 
- sum = svmla_f32_x(pg, sum, v2_f, v1); + sum = svmla_f32_x(pg, sum, v1_f, v2); offset += chunk; } +// pVect1v = SQ8 storage, pVect2v = FP32 query template -float SQ8_InnerProductSIMD_SVE_IMP(const void *pVect1v, const void *pVect2v, size_t dimension) { - const float *pVect1 = static_cast(pVect1v); - const uint8_t *pVect2 = static_cast(pVect2v); +float SQ8_FP32_InnerProductSIMD_SVE_IMP(const void *pVect1v, const void *pVect2v, + size_t dimension) { + const uint8_t *pVect1 = static_cast(pVect1v); // SQ8 storage + const float *pVect2 = static_cast(pVect2v); // FP32 query size_t offset = 0; svbool_t pg = svptrue_b32(); @@ -69,17 +72,17 @@ float SQ8_InnerProductSIMD_SVE_IMP(const void *pVect1v, const void *pVect2v, siz svbool_t pg_partial = svwhilelt_b32(static_cast(0), static_cast(remaining)); - // Load float elements from query with predicate - svfloat32_t v1 = svld1_f32(pg_partial, pVect1); - // Load uint8 elements and zero-extend to uint32 - svuint32_t v2_u32 = svld1ub_u32(pg_partial, pVect2 + offset); + svuint32_t v1_u32 = svld1ub_u32(pg_partial, pVect1 + offset); // Convert uint32 to float32 - svfloat32_t v2_f = svcvt_f32_u32_z(pg_partial, v2_u32); + svfloat32_t v1_f = svcvt_f32_u32_z(pg_partial, v1_u32); + + // Load float elements from query with predicate + svfloat32_t v2 = svld1_f32(pg_partial, pVect2); // Compute q_i * y_i (no dequantization) - sum0 = svmla_f32_z(pg_partial, sum0, v2_f, v1); + sum0 = svmla_f32_z(pg_partial, sum0, v1_f, v2); offset += remaining; } @@ -91,21 +94,21 @@ float SQ8_InnerProductSIMD_SVE_IMP(const void *pVect1v, const void *pVect2v, siz (dimension - (partial_chunk ? 
dimension % chunk : 0)) / chunk_size; for (size_t i = 0; i < number_of_chunks; i++) { - InnerProductStepSQ8(pVect1, pVect2, offset, sum0, chunk); - InnerProductStepSQ8(pVect1, pVect2, offset, sum1, chunk); - InnerProductStepSQ8(pVect1, pVect2, offset, sum2, chunk); - InnerProductStepSQ8(pVect1, pVect2, offset, sum3, chunk); + InnerProductStepSQ8_FP32(pVect1, pVect2, offset, sum0, chunk); + InnerProductStepSQ8_FP32(pVect1, pVect2, offset, sum1, chunk); + InnerProductStepSQ8_FP32(pVect1, pVect2, offset, sum2, chunk); + InnerProductStepSQ8_FP32(pVect1, pVect2, offset, sum3, chunk); } // Handle remaining steps (0-3) if constexpr (additional_steps > 0) { - InnerProductStepSQ8(pVect1, pVect2, offset, sum0, chunk); + InnerProductStepSQ8_FP32(pVect1, pVect2, offset, sum0, chunk); } if constexpr (additional_steps > 1) { - InnerProductStepSQ8(pVect1, pVect2, offset, sum1, chunk); + InnerProductStepSQ8_FP32(pVect1, pVect2, offset, sum1, chunk); } if constexpr (additional_steps > 2) { - InnerProductStepSQ8(pVect1, pVect2, offset, sum2, chunk); + InnerProductStepSQ8_FP32(pVect1, pVect2, offset, sum2, chunk); } // Combine the accumulators @@ -117,25 +120,26 @@ float SQ8_InnerProductSIMD_SVE_IMP(const void *pVect1v, const void *pVect2v, siz float quantized_dot = svaddv_f32(pg, sum); // Get quantization parameters from stored vector (after quantized data) - const float *params2 = reinterpret_cast(pVect2 + dimension); - const float min_val = params2[sq8::MIN_VAL]; - const float delta = params2[sq8::DELTA]; + const float *params1 = reinterpret_cast(pVect1 + dimension); + const float min_val = params1[sq8::MIN_VAL]; + const float delta = params1[sq8::DELTA]; // Get precomputed y_sum from query blob (stored after the dim floats) - const float y_sum = pVect1[dimension + sq8::SUM_QUERY]; + const float y_sum = pVect2[dimension + sq8::SUM_QUERY]; // Apply the algebraic formula: IP = min * y_sum + delta * Σ(q_i * y_i) return min_val * y_sum + delta * quantized_dot; } template -float 
SQ8_InnerProductSIMD_SVE(const void *pVect1v, const void *pVect2v, size_t dimension) { - return 1.0f - SQ8_InnerProductSIMD_SVE_IMP(pVect1v, pVect2v, - dimension); +float SQ8_FP32_InnerProductSIMD_SVE(const void *pVect1v, const void *pVect2v, size_t dimension) { + return 1.0f - SQ8_FP32_InnerProductSIMD_SVE_IMP( + pVect1v, pVect2v, dimension); } template -float SQ8_CosineSIMD_SVE(const void *pVect1v, const void *pVect2v, size_t dimension) { +float SQ8_FP32_CosineSIMD_SVE(const void *pVect1v, const void *pVect2v, size_t dimension) { // Cosine distance = 1 - IP (vectors are pre-normalized) - return SQ8_InnerProductSIMD_SVE(pVect1v, pVect2v, dimension); + return SQ8_FP32_InnerProductSIMD_SVE(pVect1v, pVect2v, + dimension); } diff --git a/src/VecSim/spaces/IP_space.cpp b/src/VecSim/spaces/IP_space.cpp index c25f0d043..859b90271 100644 --- a/src/VecSim/spaces/IP_space.cpp +++ b/src/VecSim/spaces/IP_space.cpp @@ -35,29 +35,31 @@ using bfloat16 = vecsim_types::bfloat16; using float16 = vecsim_types::float16; namespace spaces { -dist_func_t IP_SQ8_GetDistFunc(size_t dim, unsigned char *alignment, const void *arch_opt) { +// SQ8-FP32: asymmetric distance between SQ8 storage and FP32 query +dist_func_t IP_SQ8_FP32_GetDistFunc(size_t dim, unsigned char *alignment, + const void *arch_opt) { unsigned char dummy_alignment; if (alignment == nullptr) { alignment = &dummy_alignment; } - dist_func_t ret_dist_func = SQ8_InnerProduct; + dist_func_t ret_dist_func = SQ8_FP32_InnerProduct; [[maybe_unused]] auto features = getCpuOptimizationFeatures(arch_opt); #ifdef CPU_FEATURES_ARCH_AARCH64 #ifdef OPT_SVE2 if (features.sve2) { - return Choose_SQ8_IP_implementation_SVE2(dim); + return Choose_SQ8_FP32_IP_implementation_SVE2(dim); } #endif #ifdef OPT_SVE if (features.sve) { - return Choose_SQ8_IP_implementation_SVE(dim); + return Choose_SQ8_FP32_IP_implementation_SVE(dim); } #endif #ifdef OPT_NEON if (features.asimd) { - return Choose_SQ8_IP_implementation_NEON(dim); + return 
Choose_SQ8_FP32_IP_implementation_NEON(dim); } #endif @@ -70,52 +72,53 @@ dist_func_t IP_SQ8_GetDistFunc(size_t dim, unsigned char *alignment, cons } #ifdef OPT_AVX512_F_BW_VL_VNNI if (features.avx512f && features.avx512bw && features.avx512vnni) { - return Choose_SQ8_IP_implementation_AVX512F_BW_VL_VNNI(dim); + return Choose_SQ8_FP32_IP_implementation_AVX512F_BW_VL_VNNI(dim); } #endif #ifdef OPT_AVX2_FMA if (features.avx2 && features.fma3) { - return Choose_SQ8_IP_implementation_AVX2_FMA(dim); + return Choose_SQ8_FP32_IP_implementation_AVX2_FMA(dim); } #endif #ifdef OPT_AVX2 if (features.avx2) { - return Choose_SQ8_IP_implementation_AVX2(dim); + return Choose_SQ8_FP32_IP_implementation_AVX2(dim); } #endif #ifdef OPT_SSE4 if (features.sse4_1) { - return Choose_SQ8_IP_implementation_SSE4(dim); + return Choose_SQ8_FP32_IP_implementation_SSE4(dim); } #endif #endif // __x86_64__ return ret_dist_func; } -dist_func_t Cosine_SQ8_GetDistFunc(size_t dim, unsigned char *alignment, - const void *arch_opt) { +// SQ8-FP32: asymmetric cosine distance between SQ8 storage and FP32 query +dist_func_t Cosine_SQ8_FP32_GetDistFunc(size_t dim, unsigned char *alignment, + const void *arch_opt) { unsigned char dummy_alignment; if (alignment == nullptr) { alignment = &dummy_alignment; } - dist_func_t ret_dist_func = SQ8_Cosine; + dist_func_t ret_dist_func = SQ8_FP32_Cosine; [[maybe_unused]] auto features = getCpuOptimizationFeatures(arch_opt); #ifdef CPU_FEATURES_ARCH_AARCH64 #ifdef OPT_SVE2 if (features.sve2) { - return Choose_SQ8_Cosine_implementation_SVE2(dim); + return Choose_SQ8_FP32_Cosine_implementation_SVE2(dim); } #endif #ifdef OPT_SVE if (features.sve) { - return Choose_SQ8_Cosine_implementation_SVE(dim); + return Choose_SQ8_FP32_Cosine_implementation_SVE(dim); } #endif #ifdef OPT_NEON if (features.asimd) { - return Choose_SQ8_Cosine_implementation_NEON(dim); + return Choose_SQ8_FP32_Cosine_implementation_NEON(dim); } #endif @@ -128,22 +131,22 @@ dist_func_t 
Cosine_SQ8_GetDistFunc(size_t dim, unsigned char *alignment, } #ifdef OPT_AVX512_F_BW_VL_VNNI if (features.avx512f && features.avx512bw && features.avx512vnni) { - return Choose_SQ8_Cosine_implementation_AVX512F_BW_VL_VNNI(dim); + return Choose_SQ8_FP32_Cosine_implementation_AVX512F_BW_VL_VNNI(dim); } #endif #ifdef OPT_AVX2_FMA if (features.avx2 && features.fma3) { - return Choose_SQ8_Cosine_implementation_AVX2_FMA(dim); + return Choose_SQ8_FP32_Cosine_implementation_AVX2_FMA(dim); } #endif #ifdef OPT_AVX2 if (features.avx2) { - return Choose_SQ8_Cosine_implementation_AVX2(dim); + return Choose_SQ8_FP32_Cosine_implementation_AVX2(dim); } #endif #ifdef OPT_SSE4 if (features.sse4_1) { - return Choose_SQ8_Cosine_implementation_SSE4(dim); + return Choose_SQ8_FP32_Cosine_implementation_SSE4(dim); } #endif #endif // __x86_64__ diff --git a/src/VecSim/spaces/IP_space.h b/src/VecSim/spaces/IP_space.h index 9a03c6a96..b258ff481 100644 --- a/src/VecSim/spaces/IP_space.h +++ b/src/VecSim/spaces/IP_space.h @@ -10,8 +10,9 @@ #include "VecSim/spaces/spaces.h" namespace spaces { -dist_func_t IP_SQ8_GetDistFunc(size_t dim, unsigned char *alignment = nullptr, - const void *arch_opt = nullptr); +// SQ8-FP32: asymmetric distance between FP32 query and SQ8 storage +dist_func_t IP_SQ8_FP32_GetDistFunc(size_t dim, unsigned char *alignment = nullptr, + const void *arch_opt = nullptr); dist_func_t IP_FP32_GetDistFunc(size_t dim, unsigned char *alignment = nullptr, const void *arch_opt = nullptr); @@ -29,8 +30,9 @@ dist_func_t IP_UINT8_GetDistFunc(size_t dim, unsigned char *alignment = n const void *arch_opt = nullptr); dist_func_t Cosine_UINT8_GetDistFunc(size_t dim, unsigned char *alignment = nullptr, const void *arch_opt = nullptr); -dist_func_t Cosine_SQ8_GetDistFunc(size_t dim, unsigned char *alignment = nullptr, - const void *arch_opt = nullptr); +// SQ8-FP32: asymmetric cosine distance between FP32 query and SQ8 storage +dist_func_t Cosine_SQ8_FP32_GetDistFunc(size_t dim, unsigned 
char *alignment = nullptr, + const void *arch_opt = nullptr); // SQ8-to-SQ8 distance functions (both vectors are uint8 quantized with precomputed sum) dist_func_t IP_SQ8_SQ8_GetDistFunc(size_t dim, unsigned char *alignment = nullptr, const void *arch_opt = nullptr); diff --git a/src/VecSim/spaces/L2/L2.cpp b/src/VecSim/spaces/L2/L2.cpp index 17af68519..7761df920 100644 --- a/src/VecSim/spaces/L2/L2.cpp +++ b/src/VecSim/spaces/L2/L2.cpp @@ -19,26 +19,26 @@ using float16 = vecsim_types::float16; using sq8 = vecsim_types::sq8; /* - * Optimized asymmetric SQ8 L2 squared distance using algebraic identity: + * Optimized asymmetric SQ8-FP32 L2 squared distance using algebraic identity: * ||x - y||² = Σx_i² - 2*IP(x, y) + Σy_i² * = x_sum_squares - 2 * IP(x, y) + y_sum_squares * where IP(x, y) = min * y_sum + delta * Σ(q_i * y_i) * - * pVect1 is query (FP32): [float values (dim)] [y_sum] [y_sum_squares] - * pVect2 is storage (SQ8): [uint8_t values (dim)] [min_val] [delta] [x_sum] [x_sum_squares] + * pVect1 is storage (SQ8): [uint8_t values (dim)] [min_val] [delta] [x_sum] [x_sum_squares] + * pVect2 is query (FP32): [float values (dim)] [y_sum] [y_sum_squares] */ -float SQ8_L2Sqr(const void *pVect1v, const void *pVect2v, size_t dimension) { +float SQ8_FP32_L2Sqr(const void *pVect1v, const void *pVect2v, size_t dimension) { // Get the raw inner product using the common implementation - const float ip = SQ8_InnerProduct_Impl(pVect1v, pVect2v, dimension); + const float ip = SQ8_FP32_InnerProduct_Impl(pVect1v, pVect2v, dimension); - // Get precomputed sum of squares from storage blob - const auto *pVect2 = static_cast(pVect2v); - const float *params = reinterpret_cast(pVect2 + dimension); + // Get precomputed sum of squares from storage blob (pVect1 is SQ8) + const auto *pVect1 = static_cast(pVect1v); + const float *params = reinterpret_cast(pVect1 + dimension); const float x_sum_sq = params[sq8::SUM_SQUARES]; - // Get precomputed sum of squares from query blob - const auto 
*pVect1 = static_cast(pVect1v); - const float y_sum_sq = pVect1[dimension + sq8::SUM_SQUARES_QUERY]; + // Get precomputed sum of squares from query blob (pVect2 is FP32) + const auto *pVect2 = static_cast(pVect2v); + const float y_sum_sq = pVect2[dimension + sq8::SUM_SQUARES_QUERY]; // L2² = ||x||² + ||y||² - 2*IP(x, y) return x_sum_sq + y_sum_sq - 2.0f * ip; diff --git a/src/VecSim/spaces/L2/L2.h b/src/VecSim/spaces/L2/L2.h index 381bd9199..d055760f9 100644 --- a/src/VecSim/spaces/L2/L2.h +++ b/src/VecSim/spaces/L2/L2.h @@ -10,8 +10,8 @@ #include -// pVect1v vector of type fp32 and pVect2v vector of type uint8 -float SQ8_L2Sqr(const void *pVect1v, const void *pVect2v, size_t dimension); +// SQ8-FP32: pVect1v vector of type uint8 (SQ8) and pVect2v vector of type fp32 +float SQ8_FP32_L2Sqr(const void *pVect1v, const void *pVect2v, size_t dimension); float FP32_L2Sqr(const void *pVect1v, const void *pVect2v, size_t dimension); diff --git a/src/VecSim/spaces/L2/L2_AVX2_SQ8.h b/src/VecSim/spaces/L2/L2_AVX2_FMA_SQ8_FP32.h similarity index 62% rename from src/VecSim/spaces/L2/L2_AVX2_SQ8.h rename to src/VecSim/spaces/L2/L2_AVX2_FMA_SQ8_FP32.h index b13c87c4f..46eb4cc6e 100644 --- a/src/VecSim/spaces/L2/L2_AVX2_SQ8.h +++ b/src/VecSim/spaces/L2/L2_AVX2_FMA_SQ8_FP32.h @@ -9,7 +9,7 @@ #pragma once #include "VecSim/spaces/space_includes.h" #include "VecSim/spaces/AVX_utils.h" -#include "VecSim/spaces/IP/IP_AVX2_SQ8.h" +#include "VecSim/spaces/IP/IP_AVX2_FMA_SQ8_FP32.h" #include "VecSim/types/sq8.h" using sq8 = vecsim_types::sq8; @@ -21,24 +21,25 @@ using sq8 = vecsim_types::sq8; * = x_sum_squares - 2 * IP(x, y) + y_sum_squares * * where: - * - IP(x, y) = min * y_sum + delta * Σ(q_i * y_i) (computed via SQ8_InnerProductImp_AVX2) + * - IP(x, y) = min * y_sum + delta * Σ(q_i * y_i) (computed via SQ8_FP32_InnerProductImp_FMA) * - x_sum_squares and y_sum_squares are precomputed * * This avoids dequantization in the hot loop. 
*/ +// pVect1v = SQ8 storage, pVect2v = FP32 query template // 0..15 -float SQ8_L2SqrSIMD16_AVX2(const void *pVect1v, const void *pVect2v, size_t dimension) { +float SQ8_FP32_L2SqrSIMD16_AVX2_FMA(const void *pVect1v, const void *pVect2v, size_t dimension) { // Get the raw inner product using the common SIMD implementation - const float ip = SQ8_InnerProductImp_AVX2(pVect1v, pVect2v, dimension); + const float ip = SQ8_FP32_InnerProductImp_FMA(pVect1v, pVect2v, dimension); - // Get precomputed sum of squares from storage blob - const uint8_t *pVect2 = static_cast(pVect2v); - const float *params = reinterpret_cast(pVect2 + dimension); + // Get precomputed sum of squares from storage blob (pVect1v is SQ8 storage) + const uint8_t *pVect1 = static_cast(pVect1v); + const float *params = reinterpret_cast(pVect1 + dimension); const float x_sum_sq = params[sq8::SUM_SQUARES]; - // Get precomputed sum of squares from query blob - const float y_sum_sq = static_cast(pVect1v)[dimension + sq8::SUM_SQUARES_QUERY]; + // Get precomputed sum of squares from query blob (pVect2v is FP32 query) + const float y_sum_sq = static_cast(pVect2v)[dimension + sq8::SUM_SQUARES_QUERY]; // L2² = ||x||² + ||y||² - 2*IP(x, y) return x_sum_sq + y_sum_sq - 2.0f * ip; diff --git a/src/VecSim/spaces/L2/L2_AVX2_FMA_SQ8.h b/src/VecSim/spaces/L2/L2_AVX2_SQ8_FP32.h similarity index 62% rename from src/VecSim/spaces/L2/L2_AVX2_FMA_SQ8.h rename to src/VecSim/spaces/L2/L2_AVX2_SQ8_FP32.h index c06dd95e6..cc1fa4272 100644 --- a/src/VecSim/spaces/L2/L2_AVX2_FMA_SQ8.h +++ b/src/VecSim/spaces/L2/L2_AVX2_SQ8_FP32.h @@ -9,7 +9,7 @@ #pragma once #include "VecSim/spaces/space_includes.h" #include "VecSim/spaces/AVX_utils.h" -#include "VecSim/spaces/IP/IP_AVX2_FMA_SQ8.h" +#include "VecSim/spaces/IP/IP_AVX2_SQ8_FP32.h" #include "VecSim/types/sq8.h" using sq8 = vecsim_types::sq8; @@ -21,24 +21,25 @@ using sq8 = vecsim_types::sq8; * = x_sum_squares - 2 * IP(x, y) + y_sum_squares * * where: - * - IP(x, y) = min * y_sum + 
delta * Σ(q_i * y_i) (computed via SQ8_InnerProductImp_FMA) + * - IP(x, y) = min * y_sum + delta * Σ(q_i * y_i) (computed via SQ8_FP32_InnerProductImp_AVX2) * - x_sum_squares and y_sum_squares are precomputed * * This avoids dequantization in the hot loop. */ +// pVect1v = SQ8 storage, pVect2v = FP32 query template // 0..15 -float SQ8_L2SqrSIMD16_AVX2_FMA(const void *pVect1v, const void *pVect2v, size_t dimension) { +float SQ8_FP32_L2SqrSIMD16_AVX2(const void *pVect1v, const void *pVect2v, size_t dimension) { // Get the raw inner product using the common SIMD implementation - const float ip = SQ8_InnerProductImp_FMA(pVect1v, pVect2v, dimension); + const float ip = SQ8_FP32_InnerProductImp_AVX2(pVect1v, pVect2v, dimension); - // Get precomputed sum of squares from storage blob - const uint8_t *pVect2 = static_cast(pVect2v); - const float *params = reinterpret_cast(pVect2 + dimension); + // Get precomputed sum of squares from storage blob (pVect1v is SQ8 storage) + const uint8_t *pVect1 = static_cast(pVect1v); + const float *params = reinterpret_cast(pVect1 + dimension); const float x_sum_sq = params[sq8::SUM_SQUARES]; - // Get precomputed sum of squares from query blob - const float y_sum_sq = static_cast(pVect1v)[dimension + sq8::SUM_SQUARES_QUERY]; + // Get precomputed sum of squares from query blob (pVect2v is FP32 query) + const float y_sum_sq = static_cast(pVect2v)[dimension + sq8::SUM_SQUARES_QUERY]; // L2² = ||x||² + ||y||² - 2*IP(x, y) return x_sum_sq + y_sum_sq - 2.0f * ip; diff --git a/src/VecSim/spaces/L2/L2_SSE4_SQ8.h b/src/VecSim/spaces/L2/L2_AVX512F_BW_VL_VNNI_SQ8_FP32.h similarity index 60% rename from src/VecSim/spaces/L2/L2_SSE4_SQ8.h rename to src/VecSim/spaces/L2/L2_AVX512F_BW_VL_VNNI_SQ8_FP32.h index f9372658c..57db23fb9 100644 --- a/src/VecSim/spaces/L2/L2_SSE4_SQ8.h +++ b/src/VecSim/spaces/L2/L2_AVX512F_BW_VL_VNNI_SQ8_FP32.h @@ -8,7 +8,7 @@ */ #pragma once #include "VecSim/spaces/space_includes.h" -#include "VecSim/spaces/IP/IP_SSE4_SQ8.h" 
+#include "VecSim/spaces/IP/IP_AVX512F_BW_VL_VNNI_SQ8_FP32.h" #include "VecSim/types/sq8.h" using sq8 = vecsim_types::sq8; @@ -20,24 +20,26 @@ using sq8 = vecsim_types::sq8; * = x_sum_squares - 2 * IP(x, y) + y_sum_squares * * where: - * - IP(x, y) = min * y_sum + delta * Σ(q_i * y_i) (computed via SQ8_InnerProductSIMD16_SSE4_IMP) + * - IP(x, y) = min * y_sum + delta * Σ(q_i * y_i) (computed via SQ8_FP32_InnerProductImp_AVX512) * - x_sum_squares and y_sum_squares are precomputed * * This avoids dequantization in the hot loop. */ +// pVect1v = SQ8 storage, pVect2v = FP32 query template // 0..15 -float SQ8_L2SqrSIMD16_SSE4(const void *pVect1v, const void *pVect2v, size_t dimension) { +float SQ8_FP32_L2SqrSIMD16_AVX512F_BW_VL_VNNI(const void *pVect1v, const void *pVect2v, + size_t dimension) { // Get the raw inner product using the common SIMD implementation - const float ip = SQ8_InnerProductSIMD16_SSE4_IMP(pVect1v, pVect2v, dimension); + const float ip = SQ8_FP32_InnerProductImp_AVX512(pVect1v, pVect2v, dimension); - // Get precomputed sum of squares from storage blob - const uint8_t *pVect2 = static_cast(pVect2v); - const float *params = reinterpret_cast(pVect2 + dimension); + // Get precomputed sum of squares from storage blob (pVect1v is SQ8 storage) + const uint8_t *pVect1 = static_cast(pVect1v); + const float *params = reinterpret_cast(pVect1 + dimension); const float x_sum_sq = params[sq8::SUM_SQUARES]; - // Get precomputed sum of squares from query blob - const float y_sum_sq = static_cast(pVect1v)[dimension + sq8::SUM_SQUARES_QUERY]; + // Get precomputed sum of squares from query blob (pVect2v is FP32 query) + const float y_sum_sq = static_cast(pVect2v)[dimension + sq8::SUM_SQUARES_QUERY]; // L2² = ||x||² + ||y||² - 2*IP(x, y) return x_sum_sq + y_sum_sq - 2.0f * ip; diff --git a/src/VecSim/spaces/L2/L2_NEON_SQ8.h b/src/VecSim/spaces/L2/L2_NEON_SQ8_FP32.h similarity index 61% rename from src/VecSim/spaces/L2/L2_NEON_SQ8.h rename to 
src/VecSim/spaces/L2/L2_NEON_SQ8_FP32.h index cd76a640e..e98beb13e 100644 --- a/src/VecSim/spaces/L2/L2_NEON_SQ8.h +++ b/src/VecSim/spaces/L2/L2_NEON_SQ8_FP32.h @@ -8,7 +8,7 @@ */ #pragma once #include "VecSim/spaces/space_includes.h" -#include "VecSim/spaces/IP/IP_NEON_SQ8.h" +#include "VecSim/spaces/IP/IP_NEON_SQ8_FP32.h" #include "VecSim/types/sq8.h" #include @@ -21,24 +21,26 @@ using sq8 = vecsim_types::sq8; * = x_sum_squares - 2 * IP(x, y) + y_sum_squares * * where: - * - IP(x, y) = min * y_sum + delta * Σ(q_i * y_i) (computed via SQ8_InnerProductSIMD16_NEON_IMP) + * - IP(x, y) = min * y_sum + delta * Σ(q_i * y_i) (computed via + * SQ8_FP32_InnerProductSIMD16_NEON_IMP) * - x_sum_squares and y_sum_squares are precomputed * * This avoids dequantization in the hot loop. */ +// pVect1v = SQ8 storage, pVect2v = FP32 query template // 0..15 -float SQ8_L2SqrSIMD16_NEON(const void *pVect1v, const void *pVect2v, size_t dimension) { +float SQ8_FP32_L2SqrSIMD16_NEON(const void *pVect1v, const void *pVect2v, size_t dimension) { // Get the raw inner product using the common SIMD implementation - const float ip = SQ8_InnerProductSIMD16_NEON_IMP(pVect1v, pVect2v, dimension); + const float ip = SQ8_FP32_InnerProductSIMD16_NEON_IMP(pVect1v, pVect2v, dimension); - // Get precomputed sum of squares from storage blob - const uint8_t *pVect2 = static_cast(pVect2v); - const float *params = reinterpret_cast(pVect2 + dimension); + // Get precomputed sum of squares from storage blob (pVect1v is SQ8 storage) + const uint8_t *pVect1 = static_cast(pVect1v); + const float *params = reinterpret_cast(pVect1 + dimension); const float x_sum_sq = params[sq8::SUM_SQUARES]; - // Get precomputed sum of squares from query blob - const float y_sum_sq = static_cast(pVect1v)[dimension + sq8::SUM_SQUARES_QUERY]; + // Get precomputed sum of squares from query blob (pVect2v is FP32 query) + const float y_sum_sq = static_cast(pVect2v)[dimension + sq8::SUM_SQUARES_QUERY]; // L2² = ||x||² + ||y||² - 
2*IP(x, y) return x_sum_sq + y_sum_sq - 2.0f * ip; diff --git a/src/VecSim/spaces/L2/L2_AVX512F_BW_VL_VNNI_SQ8.h b/src/VecSim/spaces/L2/L2_SSE4_SQ8_FP32.h similarity index 61% rename from src/VecSim/spaces/L2/L2_AVX512F_BW_VL_VNNI_SQ8.h rename to src/VecSim/spaces/L2/L2_SSE4_SQ8_FP32.h index d29ef705d..29c662786 100644 --- a/src/VecSim/spaces/L2/L2_AVX512F_BW_VL_VNNI_SQ8.h +++ b/src/VecSim/spaces/L2/L2_SSE4_SQ8_FP32.h @@ -8,7 +8,7 @@ */ #pragma once #include "VecSim/spaces/space_includes.h" -#include "VecSim/spaces/IP/IP_AVX512F_BW_VL_VNNI_SQ8.h" +#include "VecSim/spaces/IP/IP_SSE4_SQ8_FP32.h" #include "VecSim/types/sq8.h" using sq8 = vecsim_types::sq8; @@ -20,25 +20,26 @@ using sq8 = vecsim_types::sq8; * = x_sum_squares - 2 * IP(x, y) + y_sum_squares * * where: - * - IP(x, y) = min * y_sum + delta * Σ(q_i * y_i) (computed via SQ8_InnerProductImp_AVX512) + * - IP(x, y) = min * y_sum + delta * Σ(q_i * y_i) (computed via + * SQ8_FP32_InnerProductSIMD16_SSE4_IMP) * - x_sum_squares and y_sum_squares are precomputed * * This avoids dequantization in the hot loop. 
*/ +// pVect1v = SQ8 storage, pVect2v = FP32 query template // 0..15 -float SQ8_L2SqrSIMD16_AVX512F_BW_VL_VNNI(const void *pVect1v, const void *pVect2v, - size_t dimension) { +float SQ8_FP32_L2SqrSIMD16_SSE4(const void *pVect1v, const void *pVect2v, size_t dimension) { // Get the raw inner product using the common SIMD implementation - const float ip = SQ8_InnerProductImp_AVX512(pVect1v, pVect2v, dimension); + const float ip = SQ8_FP32_InnerProductSIMD16_SSE4_IMP(pVect1v, pVect2v, dimension); - // Get precomputed sum of squares from storage blob - const uint8_t *pVect2 = static_cast(pVect2v); - const float *params = reinterpret_cast(pVect2 + dimension); + // Get precomputed sum of squares from storage blob (pVect1v is SQ8 storage) + const uint8_t *pVect1 = static_cast(pVect1v); + const float *params = reinterpret_cast(pVect1 + dimension); const float x_sum_sq = params[sq8::SUM_SQUARES]; - // Get precomputed sum of squares from query blob - const float y_sum_sq = static_cast(pVect1v)[dimension + sq8::SUM_SQUARES_QUERY]; + // Get precomputed sum of squares from query blob (pVect2v is FP32 query) + const float y_sum_sq = static_cast(pVect2v)[dimension + sq8::SUM_SQUARES_QUERY]; // L2² = ||x||² + ||y||² - 2*IP(x, y) return x_sum_sq + y_sum_sq - 2.0f * ip; diff --git a/src/VecSim/spaces/L2/L2_SVE_SQ8.h b/src/VecSim/spaces/L2/L2_SVE_SQ8_FP32.h similarity index 57% rename from src/VecSim/spaces/L2/L2_SVE_SQ8.h rename to src/VecSim/spaces/L2/L2_SVE_SQ8_FP32.h index 5bb133fe9..0ae9fec74 100644 --- a/src/VecSim/spaces/L2/L2_SVE_SQ8.h +++ b/src/VecSim/spaces/L2/L2_SVE_SQ8_FP32.h @@ -8,38 +8,40 @@ */ #pragma once #include "VecSim/spaces/space_includes.h" -#include "VecSim/spaces/IP/IP_SVE_SQ8.h" +#include "VecSim/spaces/IP/IP_SVE_SQ8_FP32.h" #include "VecSim/types/sq8.h" #include using sq8 = vecsim_types::sq8; /* - * Optimized asymmetric SQ8 L2 squared distance using algebraic identity: + * Optimized asymmetric SQ8-FP32 L2 squared distance using algebraic identity: * * ||x - 
y||² = Σx_i² - 2*IP(x, y) + Σy_i² * = x_sum_squares - 2 * IP(x, y) + y_sum_squares * * where: - * - IP(x, y) = min * y_sum + delta * Σ(q_i * y_i) (computed via SQ8_InnerProductSIMD_SVE_IMP) + * - IP(x, y) = min * y_sum + delta * Σ(q_i * y_i) (computed via + * SQ8_FP32_InnerProductSIMD_SVE_IMP) * - x_sum_squares and y_sum_squares are precomputed * * This avoids dequantization in the hot loop. */ +// pVect1v = SQ8 storage, pVect2v = FP32 query template -float SQ8_L2SqrSIMD_SVE(const void *pVect1v, const void *pVect2v, size_t dimension) { +float SQ8_FP32_L2SqrSIMD_SVE(const void *pVect1v, const void *pVect2v, size_t dimension) { // Get the raw inner product using the common SIMD implementation - const float ip = - SQ8_InnerProductSIMD_SVE_IMP(pVect1v, pVect2v, dimension); + const float ip = SQ8_FP32_InnerProductSIMD_SVE_IMP( + pVect1v, pVect2v, dimension); - // Get precomputed sum of squares from storage blob - const uint8_t *pVect2 = static_cast(pVect2v); - const float *params = reinterpret_cast(pVect2 + dimension); + // Get precomputed sum of squares from storage blob (pVect1v is SQ8 storage) + const uint8_t *pVect1 = static_cast(pVect1v); + const float *params = reinterpret_cast(pVect1 + dimension); const float x_sum_sq = params[sq8::SUM_SQUARES]; - // Get precomputed sum of squares from query blob - const float y_sum_sq = static_cast(pVect1v)[dimension + sq8::SUM_SQUARES_QUERY]; + // Get precomputed sum of squares from query blob (pVect2v is FP32 query) + const float y_sum_sq = static_cast(pVect2v)[dimension + sq8::SUM_SQUARES_QUERY]; // L2² = ||x||² + ||y||² - 2*IP(x, y) return x_sum_sq + y_sum_sq - 2.0f * ip; diff --git a/src/VecSim/spaces/L2_space.cpp b/src/VecSim/spaces/L2_space.cpp index f43ae618e..dcccd513f 100644 --- a/src/VecSim/spaces/L2_space.cpp +++ b/src/VecSim/spaces/L2_space.cpp @@ -35,29 +35,31 @@ using float16 = vecsim_types::float16; namespace spaces { -dist_func_t L2_SQ8_GetDistFunc(size_t dim, unsigned char *alignment, const void *arch_opt) { 
+// SQ8-FP32: asymmetric L2 distance between SQ8 storage and FP32 query +dist_func_t L2_SQ8_FP32_GetDistFunc(size_t dim, unsigned char *alignment, + const void *arch_opt) { unsigned char dummy_alignment; if (!alignment) { alignment = &dummy_alignment; } - dist_func_t ret_dist_func = SQ8_L2Sqr; + dist_func_t ret_dist_func = SQ8_FP32_L2Sqr; [[maybe_unused]] auto features = getCpuOptimizationFeatures(arch_opt); #ifdef CPU_FEATURES_ARCH_AARCH64 #ifdef OPT_SVE2 if (features.sve2) { - return Choose_SQ8_L2_implementation_SVE2(dim); + return Choose_SQ8_FP32_L2_implementation_SVE2(dim); } #endif #ifdef OPT_SVE if (features.sve) { - return Choose_SQ8_L2_implementation_SVE(dim); + return Choose_SQ8_FP32_L2_implementation_SVE(dim); } #endif #ifdef OPT_NEON if (features.asimd) { - return Choose_SQ8_L2_implementation_NEON(dim); + return Choose_SQ8_FP32_L2_implementation_NEON(dim); } #endif #endif @@ -70,22 +72,22 @@ dist_func_t L2_SQ8_GetDistFunc(size_t dim, unsigned char *alignment, cons } #ifdef OPT_AVX512_F_BW_VL_VNNI if (features.avx512f && features.avx512bw && features.avx512vnni) { - return Choose_SQ8_L2_implementation_AVX512F_BW_VL_VNNI(dim); + return Choose_SQ8_FP32_L2_implementation_AVX512F_BW_VL_VNNI(dim); } #endif #ifdef OPT_AVX2_FMA if (features.avx2 && features.fma3) { - return Choose_SQ8_L2_implementation_AVX2_FMA(dim); + return Choose_SQ8_FP32_L2_implementation_AVX2_FMA(dim); } #endif #ifdef OPT_AVX2 if (features.avx2) { - return Choose_SQ8_L2_implementation_AVX2(dim); + return Choose_SQ8_FP32_L2_implementation_AVX2(dim); } #endif #ifdef OPT_SSE4 if (features.sse4_1) { - return Choose_SQ8_L2_implementation_SSE4(dim); + return Choose_SQ8_FP32_L2_implementation_SSE4(dim); } #endif #endif // __x86_64__ diff --git a/src/VecSim/spaces/L2_space.h b/src/VecSim/spaces/L2_space.h index f7d779f14..dd2dfec0c 100644 --- a/src/VecSim/spaces/L2_space.h +++ b/src/VecSim/spaces/L2_space.h @@ -22,8 +22,9 @@ dist_func_t L2_INT8_GetDistFunc(size_t dim, unsigned char *alignment = nu 
const void *arch_opt = nullptr); dist_func_t L2_UINT8_GetDistFunc(size_t dim, unsigned char *alignment = nullptr, const void *arch_opt = nullptr); -dist_func_t L2_SQ8_GetDistFunc(size_t dim, unsigned char *alignment = nullptr, - const void *arch_opt = nullptr); +// SQ8-FP32: asymmetric L2 distance between FP32 query and SQ8 storage +dist_func_t L2_SQ8_FP32_GetDistFunc(size_t dim, unsigned char *alignment = nullptr, + const void *arch_opt = nullptr); dist_func_t L2_SQ8_SQ8_GetDistFunc(size_t dim, unsigned char *alignment = nullptr, const void *arch_opt = nullptr); } // namespace spaces diff --git a/src/VecSim/spaces/functions/AVX2.cpp b/src/VecSim/spaces/functions/AVX2.cpp index 3b24060c1..322ed0aec 100644 --- a/src/VecSim/spaces/functions/AVX2.cpp +++ b/src/VecSim/spaces/functions/AVX2.cpp @@ -10,8 +10,8 @@ #include "VecSim/spaces/IP/IP_AVX2_BF16.h" #include "VecSim/spaces/L2/L2_AVX2_BF16.h" -#include "VecSim/spaces/IP/IP_AVX2_SQ8.h" -#include "VecSim/spaces/L2/L2_AVX2_SQ8.h" +#include "VecSim/spaces/IP/IP_AVX2_SQ8_FP32.h" +#include "VecSim/spaces/L2/L2_AVX2_SQ8_FP32.h" namespace spaces { @@ -29,21 +29,21 @@ dist_func_t Choose_BF16_L2_implementation_AVX2(size_t dim) { return ret_dist_func; } -dist_func_t Choose_SQ8_IP_implementation_AVX2(size_t dim) { +dist_func_t Choose_SQ8_FP32_IP_implementation_AVX2(size_t dim) { dist_func_t ret_dist_func; - CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, SQ8_InnerProductSIMD16_AVX2); + CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, SQ8_FP32_InnerProductSIMD16_AVX2); return ret_dist_func; } -dist_func_t Choose_SQ8_Cosine_implementation_AVX2(size_t dim) { +dist_func_t Choose_SQ8_FP32_Cosine_implementation_AVX2(size_t dim) { dist_func_t ret_dist_func; - CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, SQ8_CosineSIMD16_AVX2); + CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, SQ8_FP32_CosineSIMD16_AVX2); return ret_dist_func; } -dist_func_t Choose_SQ8_L2_implementation_AVX2(size_t dim) { +dist_func_t 
Choose_SQ8_FP32_L2_implementation_AVX2(size_t dim) { dist_func_t ret_dist_func; - CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, SQ8_L2SqrSIMD16_AVX2); + CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, SQ8_FP32_L2SqrSIMD16_AVX2); return ret_dist_func; } diff --git a/src/VecSim/spaces/functions/AVX2.h b/src/VecSim/spaces/functions/AVX2.h index ecc28f01f..081c42a4e 100644 --- a/src/VecSim/spaces/functions/AVX2.h +++ b/src/VecSim/spaces/functions/AVX2.h @@ -12,9 +12,9 @@ namespace spaces { -dist_func_t Choose_SQ8_IP_implementation_AVX2(size_t dim); -dist_func_t Choose_SQ8_Cosine_implementation_AVX2(size_t dim); -dist_func_t Choose_SQ8_L2_implementation_AVX2(size_t dim); +dist_func_t Choose_SQ8_FP32_IP_implementation_AVX2(size_t dim); +dist_func_t Choose_SQ8_FP32_Cosine_implementation_AVX2(size_t dim); +dist_func_t Choose_SQ8_FP32_L2_implementation_AVX2(size_t dim); dist_func_t Choose_BF16_IP_implementation_AVX2(size_t dim); dist_func_t Choose_BF16_L2_implementation_AVX2(size_t dim); diff --git a/src/VecSim/spaces/functions/AVX2_FMA.cpp b/src/VecSim/spaces/functions/AVX2_FMA.cpp index 4dc627c57..c859128b2 100644 --- a/src/VecSim/spaces/functions/AVX2_FMA.cpp +++ b/src/VecSim/spaces/functions/AVX2_FMA.cpp @@ -7,27 +7,27 @@ * GNU Affero General Public License v3 (AGPLv3). 
*/ #include "AVX2_FMA.h" -#include "VecSim/spaces/L2/L2_AVX2_FMA_SQ8.h" -#include "VecSim/spaces/IP/IP_AVX2_FMA_SQ8.h" +#include "VecSim/spaces/L2/L2_AVX2_FMA_SQ8_FP32.h" +#include "VecSim/spaces/IP/IP_AVX2_FMA_SQ8_FP32.h" namespace spaces { #include "implementation_chooser.h" // FMA optimized implementations -dist_func_t Choose_SQ8_IP_implementation_AVX2_FMA(size_t dim) { +dist_func_t Choose_SQ8_FP32_IP_implementation_AVX2_FMA(size_t dim) { dist_func_t ret_dist_func; - CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, SQ8_InnerProductSIMD16_AVX2_FMA); + CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, SQ8_FP32_InnerProductSIMD16_AVX2_FMA); return ret_dist_func; } -dist_func_t Choose_SQ8_Cosine_implementation_AVX2_FMA(size_t dim) { +dist_func_t Choose_SQ8_FP32_Cosine_implementation_AVX2_FMA(size_t dim) { dist_func_t ret_dist_func; - CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, SQ8_CosineSIMD16_AVX2_FMA); + CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, SQ8_FP32_CosineSIMD16_AVX2_FMA); return ret_dist_func; } -dist_func_t Choose_SQ8_L2_implementation_AVX2_FMA(size_t dim) { +dist_func_t Choose_SQ8_FP32_L2_implementation_AVX2_FMA(size_t dim) { dist_func_t ret_dist_func; - CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, SQ8_L2SqrSIMD16_AVX2_FMA); + CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, SQ8_FP32_L2SqrSIMD16_AVX2_FMA); return ret_dist_func; } diff --git a/src/VecSim/spaces/functions/AVX2_FMA.h b/src/VecSim/spaces/functions/AVX2_FMA.h index b81dfd5ab..b20b1a588 100644 --- a/src/VecSim/spaces/functions/AVX2_FMA.h +++ b/src/VecSim/spaces/functions/AVX2_FMA.h @@ -12,8 +12,8 @@ namespace spaces { -dist_func_t Choose_SQ8_IP_implementation_AVX2_FMA(size_t dim); -dist_func_t Choose_SQ8_Cosine_implementation_AVX2_FMA(size_t dim); -dist_func_t Choose_SQ8_L2_implementation_AVX2_FMA(size_t dim); +dist_func_t Choose_SQ8_FP32_IP_implementation_AVX2_FMA(size_t dim); +dist_func_t Choose_SQ8_FP32_Cosine_implementation_AVX2_FMA(size_t dim); +dist_func_t 
Choose_SQ8_FP32_L2_implementation_AVX2_FMA(size_t dim); } // namespace spaces diff --git a/src/VecSim/spaces/functions/AVX512F.h b/src/VecSim/spaces/functions/AVX512F.h index 450c3d6bc..fd36f312f 100644 --- a/src/VecSim/spaces/functions/AVX512F.h +++ b/src/VecSim/spaces/functions/AVX512F.h @@ -20,7 +20,4 @@ dist_func_t Choose_FP16_L2_implementation_AVX512F(size_t dim); dist_func_t Choose_FP32_L2_implementation_AVX512F(size_t dim); dist_func_t Choose_FP64_L2_implementation_AVX512F(size_t dim); -dist_func_t Choose_SQ8_IP_implementation_AVX512F(size_t dim); -dist_func_t Choose_SQ8_Cosine_implementation_AVX512F(size_t dim); - } // namespace spaces diff --git a/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.cpp b/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.cpp index 204edd700..3b8813b89 100644 --- a/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.cpp +++ b/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.cpp @@ -14,8 +14,8 @@ #include "VecSim/spaces/L2/L2_AVX512F_BW_VL_VNNI_UINT8.h" #include "VecSim/spaces/IP/IP_AVX512F_BW_VL_VNNI_UINT8.h" -#include "VecSim/spaces/IP/IP_AVX512F_BW_VL_VNNI_SQ8.h" -#include "VecSim/spaces/L2/L2_AVX512F_BW_VL_VNNI_SQ8.h" +#include "VecSim/spaces/IP/IP_AVX512F_BW_VL_VNNI_SQ8_FP32.h" +#include "VecSim/spaces/L2/L2_AVX512F_BW_VL_VNNI_SQ8_FP32.h" #include "VecSim/spaces/IP/IP_AVX512F_BW_VL_VNNI_SQ8_SQ8.h" #include "VecSim/spaces/L2/L2_AVX512F_BW_VL_VNNI_SQ8_SQ8.h" @@ -60,19 +60,19 @@ dist_func_t Choose_UINT8_Cosine_implementation_AVX512F_BW_VL_VNNI(size_t return ret_dist_func; } -dist_func_t Choose_SQ8_IP_implementation_AVX512F_BW_VL_VNNI(size_t dim) { +dist_func_t Choose_SQ8_FP32_IP_implementation_AVX512F_BW_VL_VNNI(size_t dim) { dist_func_t ret_dist_func; - CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, SQ8_InnerProductSIMD16_AVX512F_BW_VL_VNNI); + CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, SQ8_FP32_InnerProductSIMD16_AVX512F_BW_VL_VNNI); return ret_dist_func; } -dist_func_t Choose_SQ8_Cosine_implementation_AVX512F_BW_VL_VNNI(size_t dim) { 
+dist_func_t Choose_SQ8_FP32_Cosine_implementation_AVX512F_BW_VL_VNNI(size_t dim) { dist_func_t ret_dist_func; - CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, SQ8_CosineSIMD16_AVX512F_BW_VL_VNNI); + CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, SQ8_FP32_CosineSIMD16_AVX512F_BW_VL_VNNI); return ret_dist_func; } -dist_func_t Choose_SQ8_L2_implementation_AVX512F_BW_VL_VNNI(size_t dim) { +dist_func_t Choose_SQ8_FP32_L2_implementation_AVX512F_BW_VL_VNNI(size_t dim) { dist_func_t ret_dist_func; - CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, SQ8_L2SqrSIMD16_AVX512F_BW_VL_VNNI); + CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, SQ8_FP32_L2SqrSIMD16_AVX512F_BW_VL_VNNI); return ret_dist_func; } // SQ8-to-SQ8 distance functions (both vectors are uint8 quantized with precomputed sum) diff --git a/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.h b/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.h index 41585ae9f..fe1583491 100644 --- a/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.h +++ b/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.h @@ -20,9 +20,9 @@ dist_func_t Choose_UINT8_L2_implementation_AVX512F_BW_VL_VNNI(size_t dim) dist_func_t Choose_UINT8_IP_implementation_AVX512F_BW_VL_VNNI(size_t dim); dist_func_t Choose_UINT8_Cosine_implementation_AVX512F_BW_VL_VNNI(size_t dim); -dist_func_t Choose_SQ8_IP_implementation_AVX512F_BW_VL_VNNI(size_t dim); -dist_func_t Choose_SQ8_Cosine_implementation_AVX512F_BW_VL_VNNI(size_t dim); -dist_func_t Choose_SQ8_L2_implementation_AVX512F_BW_VL_VNNI(size_t dim); +dist_func_t Choose_SQ8_FP32_IP_implementation_AVX512F_BW_VL_VNNI(size_t dim); +dist_func_t Choose_SQ8_FP32_Cosine_implementation_AVX512F_BW_VL_VNNI(size_t dim); +dist_func_t Choose_SQ8_FP32_L2_implementation_AVX512F_BW_VL_VNNI(size_t dim); // SQ8-to-SQ8 distance functions (both vectors are uint8 quantized with precomputed sum) dist_func_t Choose_SQ8_SQ8_IP_implementation_AVX512F_BW_VL_VNNI(size_t dim); diff --git a/src/VecSim/spaces/functions/NEON.cpp 
b/src/VecSim/spaces/functions/NEON.cpp index ad3a52697..0c9a286e3 100644 --- a/src/VecSim/spaces/functions/NEON.cpp +++ b/src/VecSim/spaces/functions/NEON.cpp @@ -15,8 +15,8 @@ #include "VecSim/spaces/IP/IP_NEON_UINT8.h" #include "VecSim/spaces/L2/L2_NEON_FP64.h" #include "VecSim/spaces/IP/IP_NEON_FP64.h" -#include "VecSim/spaces/L2/L2_NEON_SQ8.h" -#include "VecSim/spaces/IP/IP_NEON_SQ8.h" +#include "VecSim/spaces/L2/L2_NEON_SQ8_FP32.h" +#include "VecSim/spaces/IP/IP_NEON_SQ8_FP32.h" #include "VecSim/spaces/IP/IP_NEON_SQ8_SQ8.h" #include "VecSim/spaces/L2/L2_NEON_SQ8_SQ8.h" @@ -83,21 +83,21 @@ dist_func_t Choose_FP64_L2_implementation_NEON(size_t dim) { return ret_dist_func; } -dist_func_t Choose_SQ8_L2_implementation_NEON(size_t dim) { +dist_func_t Choose_SQ8_FP32_L2_implementation_NEON(size_t dim) { dist_func_t ret_dist_func; - CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, SQ8_L2SqrSIMD16_NEON); + CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, SQ8_FP32_L2SqrSIMD16_NEON); return ret_dist_func; } -dist_func_t Choose_SQ8_IP_implementation_NEON(size_t dim) { +dist_func_t Choose_SQ8_FP32_IP_implementation_NEON(size_t dim) { dist_func_t ret_dist_func; - CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, SQ8_InnerProductSIMD16_NEON); + CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, SQ8_FP32_InnerProductSIMD16_NEON); return ret_dist_func; } -dist_func_t Choose_SQ8_Cosine_implementation_NEON(size_t dim) { +dist_func_t Choose_SQ8_FP32_Cosine_implementation_NEON(size_t dim) { dist_func_t ret_dist_func; - CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, SQ8_CosineSIMD16_NEON); + CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, SQ8_FP32_CosineSIMD16_NEON); return ret_dist_func; } diff --git a/src/VecSim/spaces/functions/NEON.h b/src/VecSim/spaces/functions/NEON.h index 203f88725..08060b402 100644 --- a/src/VecSim/spaces/functions/NEON.h +++ b/src/VecSim/spaces/functions/NEON.h @@ -26,9 +26,9 @@ dist_func_t Choose_FP32_L2_implementation_NEON(size_t dim); dist_func_t 
Choose_FP64_IP_implementation_NEON(size_t dim); dist_func_t Choose_FP64_L2_implementation_NEON(size_t dim); -dist_func_t Choose_SQ8_L2_implementation_NEON(size_t dim); -dist_func_t Choose_SQ8_IP_implementation_NEON(size_t dim); -dist_func_t Choose_SQ8_Cosine_implementation_NEON(size_t dim); +dist_func_t Choose_SQ8_FP32_L2_implementation_NEON(size_t dim); +dist_func_t Choose_SQ8_FP32_IP_implementation_NEON(size_t dim); +dist_func_t Choose_SQ8_FP32_Cosine_implementation_NEON(size_t dim); // SQ8-to-SQ8 distance functions (both vectors are uint8 quantized with precomputed sum) dist_func_t Choose_SQ8_SQ8_IP_implementation_NEON(size_t dim); diff --git a/src/VecSim/spaces/functions/SSE.cpp b/src/VecSim/spaces/functions/SSE.cpp index d25463531..9963fa86f 100644 --- a/src/VecSim/spaces/functions/SSE.cpp +++ b/src/VecSim/spaces/functions/SSE.cpp @@ -10,11 +10,11 @@ #include "VecSim/spaces/L2/L2_SSE_FP32.h" #include "VecSim/spaces/L2/L2_SSE_FP64.h" -#include "VecSim/spaces/L2/L2_SSE4_SQ8.h" +#include "VecSim/spaces/L2/L2_SSE4_SQ8_FP32.h" #include "VecSim/spaces/IP/IP_SSE_FP32.h" #include "VecSim/spaces/IP/IP_SSE_FP64.h" -#include "VecSim/spaces/IP/IP_SSE4_SQ8.h" +#include "VecSim/spaces/IP/IP_SSE4_SQ8_FP32.h" namespace spaces { diff --git a/src/VecSim/spaces/functions/SSE4.cpp b/src/VecSim/spaces/functions/SSE4.cpp index d8dd51448..5f5bbc1ba 100644 --- a/src/VecSim/spaces/functions/SSE4.cpp +++ b/src/VecSim/spaces/functions/SSE4.cpp @@ -7,28 +7,28 @@ * GNU Affero General Public License v3 (AGPLv3). 
*/ #include "SSE4.h" -#include "VecSim/spaces/IP/IP_SSE4_SQ8.h" -#include "VecSim/spaces/L2/L2_SSE4_SQ8.h" +#include "VecSim/spaces/IP/IP_SSE4_SQ8_FP32.h" +#include "VecSim/spaces/L2/L2_SSE4_SQ8_FP32.h" namespace spaces { #include "implementation_chooser.h" -dist_func_t Choose_SQ8_IP_implementation_SSE4(size_t dim) { +dist_func_t Choose_SQ8_FP32_IP_implementation_SSE4(size_t dim) { dist_func_t ret_dist_func; - CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, SQ8_InnerProductSIMD16_SSE4); + CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, SQ8_FP32_InnerProductSIMD16_SSE4); return ret_dist_func; } -dist_func_t Choose_SQ8_Cosine_implementation_SSE4(size_t dim) { +dist_func_t Choose_SQ8_FP32_Cosine_implementation_SSE4(size_t dim) { dist_func_t ret_dist_func; - CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, SQ8_CosineSIMD16_SSE4); + CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, SQ8_FP32_CosineSIMD16_SSE4); return ret_dist_func; } -dist_func_t Choose_SQ8_L2_implementation_SSE4(size_t dim) { +dist_func_t Choose_SQ8_FP32_L2_implementation_SSE4(size_t dim) { dist_func_t ret_dist_func; - CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, SQ8_L2SqrSIMD16_SSE4); + CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, SQ8_FP32_L2SqrSIMD16_SSE4); return ret_dist_func; } diff --git a/src/VecSim/spaces/functions/SSE4.h b/src/VecSim/spaces/functions/SSE4.h index 27bbae0e0..e47948137 100644 --- a/src/VecSim/spaces/functions/SSE4.h +++ b/src/VecSim/spaces/functions/SSE4.h @@ -12,8 +12,8 @@ namespace spaces { -dist_func_t Choose_SQ8_IP_implementation_SSE4(size_t dim); -dist_func_t Choose_SQ8_Cosine_implementation_SSE4(size_t dim); -dist_func_t Choose_SQ8_L2_implementation_SSE4(size_t dim); +dist_func_t Choose_SQ8_FP32_IP_implementation_SSE4(size_t dim); +dist_func_t Choose_SQ8_FP32_Cosine_implementation_SSE4(size_t dim); +dist_func_t Choose_SQ8_FP32_L2_implementation_SSE4(size_t dim); } // namespace spaces diff --git a/src/VecSim/spaces/functions/SVE.cpp b/src/VecSim/spaces/functions/SVE.cpp index 
d2a95cbd6..fde853db2 100644 --- a/src/VecSim/spaces/functions/SVE.cpp +++ b/src/VecSim/spaces/functions/SVE.cpp @@ -22,8 +22,8 @@ #include "VecSim/spaces/L2/L2_SVE_UINT8.h" #include "VecSim/spaces/IP/IP_SVE_UINT8.h" -#include "VecSim/spaces/IP/IP_SVE_SQ8.h" -#include "VecSim/spaces/L2/L2_SVE_SQ8.h" +#include "VecSim/spaces/IP/IP_SVE_SQ8_FP32.h" +#include "VecSim/spaces/L2/L2_SVE_SQ8_FP32.h" #include "VecSim/spaces/IP/IP_SVE_SQ8_SQ8.h" #include "VecSim/spaces/L2/L2_SVE_SQ8_SQ8.h" @@ -101,21 +101,21 @@ dist_func_t Choose_UINT8_Cosine_implementation_SVE(size_t dim) { return ret_dist_func; } -dist_func_t Choose_SQ8_IP_implementation_SVE(size_t dim) { +dist_func_t Choose_SQ8_FP32_IP_implementation_SVE(size_t dim) { dist_func_t ret_dist_func; - CHOOSE_SVE_IMPLEMENTATION(ret_dist_func, SQ8_InnerProductSIMD_SVE, dim, svcntw); + CHOOSE_SVE_IMPLEMENTATION(ret_dist_func, SQ8_FP32_InnerProductSIMD_SVE, dim, svcntw); return ret_dist_func; } -dist_func_t Choose_SQ8_Cosine_implementation_SVE(size_t dim) { +dist_func_t Choose_SQ8_FP32_Cosine_implementation_SVE(size_t dim) { dist_func_t ret_dist_func; - CHOOSE_SVE_IMPLEMENTATION(ret_dist_func, SQ8_CosineSIMD_SVE, dim, svcntw); + CHOOSE_SVE_IMPLEMENTATION(ret_dist_func, SQ8_FP32_CosineSIMD_SVE, dim, svcntw); return ret_dist_func; } -dist_func_t Choose_SQ8_L2_implementation_SVE(size_t dim) { +dist_func_t Choose_SQ8_FP32_L2_implementation_SVE(size_t dim) { dist_func_t ret_dist_func; - CHOOSE_SVE_IMPLEMENTATION(ret_dist_func, SQ8_L2SqrSIMD_SVE, dim, svcntw); + CHOOSE_SVE_IMPLEMENTATION(ret_dist_func, SQ8_FP32_L2SqrSIMD_SVE, dim, svcntw); return ret_dist_func; } diff --git a/src/VecSim/spaces/functions/SVE.h b/src/VecSim/spaces/functions/SVE.h index 4b3eabde0..bd3bc97c3 100644 --- a/src/VecSim/spaces/functions/SVE.h +++ b/src/VecSim/spaces/functions/SVE.h @@ -29,9 +29,9 @@ dist_func_t Choose_UINT8_L2_implementation_SVE(size_t dim); dist_func_t Choose_UINT8_Cosine_implementation_SVE(size_t dim); dist_func_t 
Choose_UINT8_IP_implementation_SVE(size_t dim); -dist_func_t Choose_SQ8_IP_implementation_SVE(size_t dim); -dist_func_t Choose_SQ8_Cosine_implementation_SVE(size_t dim); -dist_func_t Choose_SQ8_L2_implementation_SVE(size_t dim); +dist_func_t Choose_SQ8_FP32_IP_implementation_SVE(size_t dim); +dist_func_t Choose_SQ8_FP32_Cosine_implementation_SVE(size_t dim); +dist_func_t Choose_SQ8_FP32_L2_implementation_SVE(size_t dim); // SQ8-to-SQ8 distance functions (both vectors are uint8 quantized with precomputed sum) dist_func_t Choose_SQ8_SQ8_IP_implementation_SVE(size_t dim); diff --git a/src/VecSim/spaces/functions/SVE2.cpp b/src/VecSim/spaces/functions/SVE2.cpp index c0c8329c2..4215d79cf 100644 --- a/src/VecSim/spaces/functions/SVE2.cpp +++ b/src/VecSim/spaces/functions/SVE2.cpp @@ -16,14 +16,14 @@ #include "VecSim/spaces/IP/IP_SVE_FP64.h" #include "VecSim/spaces/L2/L2_SVE_FP64.h" -#include "VecSim/spaces/L2/L2_SVE_INT8.h" // SVE2 implementation is identical to SVE -#include "VecSim/spaces/IP/IP_SVE_INT8.h" // SVE2 implementation is identical to SVE -#include "VecSim/spaces/L2/L2_SVE_UINT8.h" // SVE2 implementation is identical to SVE -#include "VecSim/spaces/IP/IP_SVE_UINT8.h" // SVE2 implementation is identical to SVE -#include "VecSim/spaces/IP/IP_SVE_SQ8.h" // SVE2 implementation is identical to SVE -#include "VecSim/spaces/L2/L2_SVE_SQ8.h" // SVE2 implementation is identical to SVE -#include "VecSim/spaces/IP/IP_SVE_SQ8_SQ8.h" // SVE2 implementation is identical to SVE -#include "VecSim/spaces/L2/L2_SVE_SQ8_SQ8.h" // SVE2 implementation is identical to SVE +#include "VecSim/spaces/L2/L2_SVE_INT8.h" // SVE2 implementation is identical to SVE +#include "VecSim/spaces/IP/IP_SVE_INT8.h" // SVE2 implementation is identical to SVE +#include "VecSim/spaces/L2/L2_SVE_UINT8.h" // SVE2 implementation is identical to SVE +#include "VecSim/spaces/IP/IP_SVE_UINT8.h" // SVE2 implementation is identical to SVE +#include "VecSim/spaces/IP/IP_SVE_SQ8_FP32.h" // SVE2 implementation 
is identical to SVE +#include "VecSim/spaces/L2/L2_SVE_SQ8_FP32.h" // SVE2 implementation is identical to SVE +#include "VecSim/spaces/IP/IP_SVE_SQ8_SQ8.h" // SVE2 implementation is identical to SVE +#include "VecSim/spaces/L2/L2_SVE_SQ8_SQ8.h" // SVE2 implementation is identical to SVE namespace spaces { @@ -98,21 +98,21 @@ dist_func_t Choose_UINT8_Cosine_implementation_SVE2(size_t dim) { return ret_dist_func; } -dist_func_t Choose_SQ8_IP_implementation_SVE2(size_t dim) { +dist_func_t Choose_SQ8_FP32_IP_implementation_SVE2(size_t dim) { dist_func_t ret_dist_func; - CHOOSE_SVE_IMPLEMENTATION(ret_dist_func, SQ8_InnerProductSIMD_SVE, dim, svcntw); + CHOOSE_SVE_IMPLEMENTATION(ret_dist_func, SQ8_FP32_InnerProductSIMD_SVE, dim, svcntw); return ret_dist_func; } -dist_func_t Choose_SQ8_Cosine_implementation_SVE2(size_t dim) { +dist_func_t Choose_SQ8_FP32_Cosine_implementation_SVE2(size_t dim) { dist_func_t ret_dist_func; - CHOOSE_SVE_IMPLEMENTATION(ret_dist_func, SQ8_CosineSIMD_SVE, dim, svcntw); + CHOOSE_SVE_IMPLEMENTATION(ret_dist_func, SQ8_FP32_CosineSIMD_SVE, dim, svcntw); return ret_dist_func; } -dist_func_t Choose_SQ8_L2_implementation_SVE2(size_t dim) { +dist_func_t Choose_SQ8_FP32_L2_implementation_SVE2(size_t dim) { dist_func_t ret_dist_func; - CHOOSE_SVE_IMPLEMENTATION(ret_dist_func, SQ8_L2SqrSIMD_SVE, dim, svcntw); + CHOOSE_SVE_IMPLEMENTATION(ret_dist_func, SQ8_FP32_L2SqrSIMD_SVE, dim, svcntw); return ret_dist_func; } diff --git a/src/VecSim/spaces/functions/SVE2.h b/src/VecSim/spaces/functions/SVE2.h index efb3bceeb..04078a91e 100644 --- a/src/VecSim/spaces/functions/SVE2.h +++ b/src/VecSim/spaces/functions/SVE2.h @@ -29,9 +29,9 @@ dist_func_t Choose_UINT8_L2_implementation_SVE2(size_t dim); dist_func_t Choose_UINT8_Cosine_implementation_SVE2(size_t dim); dist_func_t Choose_UINT8_IP_implementation_SVE2(size_t dim); -dist_func_t Choose_SQ8_IP_implementation_SVE2(size_t dim); -dist_func_t Choose_SQ8_Cosine_implementation_SVE2(size_t dim); -dist_func_t 
Choose_SQ8_L2_implementation_SVE2(size_t dim); +dist_func_t Choose_SQ8_FP32_IP_implementation_SVE2(size_t dim); +dist_func_t Choose_SQ8_FP32_Cosine_implementation_SVE2(size_t dim); +dist_func_t Choose_SQ8_FP32_L2_implementation_SVE2(size_t dim); // SQ8-to-SQ8 distance functions (both vectors are uint8 quantized) dist_func_t Choose_SQ8_SQ8_IP_implementation_SVE2(size_t dim); diff --git a/src/VecSim/spaces/spaces.cpp b/src/VecSim/spaces/spaces.cpp index 42569ba80..baf5c886f 100644 --- a/src/VecSim/spaces/spaces.cpp +++ b/src/VecSim/spaces/spaces.cpp @@ -120,11 +120,11 @@ dist_func_t GetDistFunc(VecSimMetric met unsigned char *alignment) { switch (metric) { case VecSimMetric_Cosine: - return Cosine_SQ8_GetDistFunc(dim, alignment); + return Cosine_SQ8_FP32_GetDistFunc(dim, alignment); case VecSimMetric_IP: - return IP_SQ8_GetDistFunc(dim, alignment); + return IP_SQ8_FP32_GetDistFunc(dim, alignment); case VecSimMetric_L2: - return L2_SQ8_GetDistFunc(dim, alignment); + return L2_SQ8_FP32_GetDistFunc(dim, alignment); } throw std::invalid_argument("Invalid metric"); } diff --git a/tests/benchmark/CMakeLists.txt b/tests/benchmark/CMakeLists.txt index 052207214..859f2c0af 100644 --- a/tests/benchmark/CMakeLists.txt +++ b/tests/benchmark/CMakeLists.txt @@ -39,7 +39,7 @@ endif() # Spaces benchmarks # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # -set(DATA_TYPE fp32 fp64 bf16 fp16 int8 uint8 sq8 sq8_sq8) +set(DATA_TYPE fp32 fp64 bf16 fp16 int8 uint8 sq8_fp32 sq8_sq8) foreach(data_type IN LISTS DATA_TYPE) add_executable(bm_spaces_${data_type} spaces_benchmarks/bm_spaces_${data_type}.cpp) target_link_libraries(bm_spaces_${data_type} VectorSimilarity benchmark::benchmark) diff --git a/tests/benchmark/benchmarks.sh b/tests/benchmark/benchmarks.sh index 00eaf47a0..f00095a54 100755 --- a/tests/benchmark/benchmarks.sh +++ b/tests/benchmark/benchmarks.sh @@ -19,7 +19,7 @@ if [ -z "$BM_TYPE" ] || [ "$BM_TYPE" = "benchmarks-all" ]; then echo 
spaces_fp16 echo spaces_int8 echo spaces_uint8 - echo spaces_sq8 + echo spaces_sq8_fp32 echo spaces_sq8_sq8 elif [ "$BM_TYPE" = "benchmarks-default" ]; then @@ -31,7 +31,7 @@ elif [ "$BM_TYPE" = "benchmarks-default" ]; then echo spaces_fp16 echo spaces_int8 echo spaces_uint8 - echo spaces_sq8 + echo spaces_sq8_fp32 echo spaces_sq8_sq8 @@ -100,9 +100,10 @@ elif [ "$BM_TYPE" = "bm-basics-svs-fp32-single" ] ; then echo basics_svs_single_fp32 echo basics_svs_single_fp32_LVQ8 elif [ "$BM_TYPE" = "bm-spaces-sq8-full" ] ; then - echo spaces_sq8 + echo spaces_sq8_fp32 echo spaces_sq8_sq8 + # Spaces benchmarks elif [ "$BM_TYPE" = "bm-spaces" ] ; then echo spaces_fp32 @@ -111,7 +112,7 @@ elif [ "$BM_TYPE" = "bm-spaces" ] ; then echo spaces_bf16 echo spaces_int8 echo spaces_uint8 - echo spaces_sq8 + echo spaces_sq8_fp32 echo spaces_sq8_sq8 elif [ "$BM_TYPE" = "bm-spaces-fp32" ] ; then @@ -126,8 +127,8 @@ elif [ "$BM_TYPE" = "bm-spaces-int8" ] ; then echo spaces_int8 elif [ "$BM_TYPE" = "bm-spaces-uint8" ] ; then echo spaces_uint8 -elif [ "$BM_TYPE" = "bm-spaces-sq8" ] ; then - echo spaces_sq8 +elif [ "$BM_TYPE" = "bm-spaces-sq8-fp32" ] ; then + echo spaces_sq8_fp32 elif [ "$BM_TYPE" = "bm-spaces-sq8-sq8" ] ; then echo spaces_sq8_sq8 fi diff --git a/tests/benchmark/spaces_benchmarks/bm_spaces_sq8.cpp b/tests/benchmark/spaces_benchmarks/bm_spaces_sq8_fp32.cpp similarity index 52% rename from tests/benchmark/spaces_benchmarks/bm_spaces_sq8.cpp rename to tests/benchmark/spaces_benchmarks/bm_spaces_sq8_fp32.cpp index 66c477deb..320818c02 100644 --- a/tests/benchmark/spaces_benchmarks/bm_spaces_sq8.cpp +++ b/tests/benchmark/spaces_benchmarks/bm_spaces_sq8_fp32.cpp @@ -11,7 +11,7 @@ using sq8 = vecsim_types::sq8; -class BM_VecSimSpaces_SQ8 : public benchmark::Fixture { +class BM_VecSimSpaces_SQ8_FP32 : public benchmark::Fixture { protected: std::mt19937 rng; size_t dim; @@ -19,14 +19,14 @@ class BM_VecSimSpaces_SQ8 : public benchmark::Fixture { uint8_t *v2; public: - 
BM_VecSimSpaces_SQ8() { rng.seed(47); } - ~BM_VecSimSpaces_SQ8() = default; + BM_VecSimSpaces_SQ8_FP32() { rng.seed(47); } + ~BM_VecSimSpaces_SQ8_FP32() = default; void SetUp(const ::benchmark::State &state) { dim = state.range(0); - size_t query_size = (dim + sq8::query_metadata_count()); + size_t query_size = dim + sq8::query_metadata_count(); v1 = new float[query_size]; - test_utils::populate_fp32_sq8_query(v1, dim, true, 123); + test_utils::populate_sq8_fp32_query(v1, dim, true, 123); size_t quantized_size = dim * sizeof(uint8_t) + sq8::storage_metadata_count() * sizeof(float); v2 = new uint8_t[quantized_size]; @@ -44,20 +44,20 @@ cpu_features::Aarch64Features opt = cpu_features::GetAarch64Info().features; // NEON implementation for ARMv8-a #ifdef OPT_NEON bool neon_supported = opt.asimd; // ARMv8-a always supports NEON -INITIALIZE_BENCHMARKS_SET_L2_IP(BM_VecSimSpaces_SQ8, SQ8, NEON, 16, neon_supported); -INITIALIZE_BENCHMARKS_SET_Cosine(BM_VecSimSpaces_SQ8, SQ8, NEON, 16, neon_supported); +INITIALIZE_BENCHMARKS_SET_L2_IP(BM_VecSimSpaces_SQ8_FP32, SQ8_FP32, NEON, 16, neon_supported); +INITIALIZE_BENCHMARKS_SET_Cosine(BM_VecSimSpaces_SQ8_FP32, SQ8_FP32, NEON, 16, neon_supported); #endif // SVE implementation #ifdef OPT_SVE bool sve_supported = opt.sve; // Check for SVE support -INITIALIZE_BENCHMARKS_SET_L2_IP(BM_VecSimSpaces_SQ8, SQ8, SVE, 16, sve_supported); -INITIALIZE_BENCHMARKS_SET_Cosine(BM_VecSimSpaces_SQ8, SQ8, SVE, 16, sve_supported); +INITIALIZE_BENCHMARKS_SET_L2_IP(BM_VecSimSpaces_SQ8_FP32, SQ8_FP32, SVE, 16, sve_supported); +INITIALIZE_BENCHMARKS_SET_Cosine(BM_VecSimSpaces_SQ8_FP32, SQ8_FP32, SVE, 16, sve_supported); #endif // SVE2 implementation #ifdef OPT_SVE2 bool sve2_supported = opt.sve2; // Check for SVE2 support -INITIALIZE_BENCHMARKS_SET_L2_IP(BM_VecSimSpaces_SQ8, SQ8, SVE2, 16, sve2_supported); -INITIALIZE_BENCHMARKS_SET_Cosine(BM_VecSimSpaces_SQ8, SQ8, SVE2, 16, sve2_supported); +INITIALIZE_BENCHMARKS_SET_L2_IP(BM_VecSimSpaces_SQ8_FP32, 
SQ8_FP32, SVE2, 16, sve2_supported); +INITIALIZE_BENCHMARKS_SET_Cosine(BM_VecSimSpaces_SQ8_FP32, SQ8_FP32, SVE2, 16, sve2_supported); #endif #endif // AARCH64 @@ -67,38 +67,40 @@ cpu_features::X86Features opt = cpu_features::GetX86Info().features; // AVX512_F_BW_VL_VNNI functions #ifdef OPT_AVX512_F_BW_VL_VNNI bool avx512_f_bw_vl_vnni_supported = opt.avx512f && opt.avx512bw && opt.avx512vl && opt.avx512vnni; -INITIALIZE_BENCHMARKS_SET_L2_IP(BM_VecSimSpaces_SQ8, SQ8, AVX512F_BW_VL_VNNI, 16, +INITIALIZE_BENCHMARKS_SET_L2_IP(BM_VecSimSpaces_SQ8_FP32, SQ8_FP32, AVX512F_BW_VL_VNNI, 16, avx512_f_bw_vl_vnni_supported); -INITIALIZE_BENCHMARKS_SET_Cosine(BM_VecSimSpaces_SQ8, SQ8, AVX512F_BW_VL_VNNI, 16, +INITIALIZE_BENCHMARKS_SET_Cosine(BM_VecSimSpaces_SQ8_FP32, SQ8_FP32, AVX512F_BW_VL_VNNI, 16, avx512_f_bw_vl_vnni_supported); #endif // AVX512_F_BW_VL_VNNI #ifdef OPT_AVX2_FMA bool avx2_fma3_supported = opt.avx2 && opt.fma3; -INITIALIZE_BENCHMARKS_SET_L2_IP(BM_VecSimSpaces_SQ8, SQ8, AVX2_FMA, 16, avx2_fma3_supported); -INITIALIZE_BENCHMARKS_SET_Cosine(BM_VecSimSpaces_SQ8, SQ8, AVX2_FMA, 16, avx2_fma3_supported); +INITIALIZE_BENCHMARKS_SET_L2_IP(BM_VecSimSpaces_SQ8_FP32, SQ8_FP32, AVX2_FMA, 16, + avx2_fma3_supported); +INITIALIZE_BENCHMARKS_SET_Cosine(BM_VecSimSpaces_SQ8_FP32, SQ8_FP32, AVX2_FMA, 16, + avx2_fma3_supported); #endif // AVX2_FMA #ifdef OPT_AVX2 // AVX2 functions bool avx2_supported = opt.avx2; -INITIALIZE_BENCHMARKS_SET_L2_IP(BM_VecSimSpaces_SQ8, SQ8, AVX2, 16, avx2_supported); -INITIALIZE_BENCHMARKS_SET_Cosine(BM_VecSimSpaces_SQ8, SQ8, AVX2, 16, avx2_supported); +INITIALIZE_BENCHMARKS_SET_L2_IP(BM_VecSimSpaces_SQ8_FP32, SQ8_FP32, AVX2, 16, avx2_supported); +INITIALIZE_BENCHMARKS_SET_Cosine(BM_VecSimSpaces_SQ8_FP32, SQ8_FP32, AVX2, 16, avx2_supported); #endif // AVX2 // SSE4 functions #ifdef OPT_SSE4 bool sse4_supported = opt.sse4_1; -INITIALIZE_BENCHMARKS_SET_L2_IP(BM_VecSimSpaces_SQ8, SQ8, SSE4, 16, sse4_supported); 
-INITIALIZE_BENCHMARKS_SET_Cosine(BM_VecSimSpaces_SQ8, SQ8, SSE4, 16, sse4_supported); +INITIALIZE_BENCHMARKS_SET_L2_IP(BM_VecSimSpaces_SQ8_FP32, SQ8_FP32, SSE4, 16, sse4_supported); +INITIALIZE_BENCHMARKS_SET_Cosine(BM_VecSimSpaces_SQ8_FP32, SQ8_FP32, SSE4, 16, sse4_supported); #endif // SSE4 #endif // x86_64 // Naive algorithms -INITIALIZE_NAIVE_BM(BM_VecSimSpaces_SQ8, SQ8, InnerProduct, 16); -INITIALIZE_NAIVE_BM(BM_VecSimSpaces_SQ8, SQ8, Cosine, 16); -INITIALIZE_NAIVE_BM(BM_VecSimSpaces_SQ8, SQ8, L2Sqr, 16); +INITIALIZE_NAIVE_BM(BM_VecSimSpaces_SQ8_FP32, SQ8_FP32, InnerProduct, 16); +INITIALIZE_NAIVE_BM(BM_VecSimSpaces_SQ8_FP32, SQ8_FP32, Cosine, 16); +INITIALIZE_NAIVE_BM(BM_VecSimSpaces_SQ8_FP32, SQ8_FP32, L2Sqr, 16); // Naive diff --git a/tests/unit/test_spaces.cpp b/tests/unit/test_spaces.cpp index f371c8595..75b66febf 100644 --- a/tests/unit/test_spaces.cpp +++ b/tests/unit/test_spaces.cpp @@ -312,14 +312,14 @@ TEST_F(SpacesTest, uint8_Cosine_no_optimization_func_test) { /* ======================== Tests SQ8 ========================= */ -TEST_F(SpacesTest, SQ8_ip_no_optimization_norm_func_test) { +TEST_F(SpacesTest, SQ8_FP32_ip_no_optimization_norm_func_test) { size_t dim = 5; // Create V1 fp32 query with precomputed sum and sum_squares // Query layout: [float values (dim)] [sum] [sum_squares] size_t query_size = dim + sq8::query_metadata_count(); std::vector v1_orig(query_size); - test_utils::populate_fp32_sq8_query(v1_orig.data(), dim, true, 1234); + test_utils::populate_sq8_fp32_query(v1_orig.data(), dim, true, 1234); // Create V2 as SQ8 quantized vector with different seed size_t quantized_size = @@ -328,22 +328,22 @@ TEST_F(SpacesTest, SQ8_ip_no_optimization_norm_func_test) { test_utils::populate_float_vec_to_sq8_with_metadata(v2_compressed.data(), dim, true, 5678); float baseline = - test_utils::SQ8_NotOptimized_InnerProduct(v1_orig.data(), v2_compressed.data(), dim); + test_utils::SQ8_FP32_NotOptimized_InnerProduct(v2_compressed.data(), 
v1_orig.data(), dim); - float dist = - SQ8_InnerProduct((const void *)v1_orig.data(), (const void *)v2_compressed.data(), dim); + float dist = SQ8_FP32_InnerProduct((const void *)v2_compressed.data(), + (const void *)v1_orig.data(), dim); - ASSERT_NEAR(dist, baseline, 0.01) << "SQ8_InnerProduct failed to match expected distance"; + ASSERT_NEAR(dist, baseline, 0.01) << "SQ8_FP32_InnerProduct failed to match expected distance"; } -TEST_F(SpacesTest, SQ8_l2sqr_no_optimization_func_test) { +TEST_F(SpacesTest, SQ8_FP32_l2sqr_no_optimization_func_test) { size_t dim = 5; // Create V1 fp32 query with precomputed sum and sum_squares // Query layout: [float values (dim)] [sum] [sum_squares] size_t query_size = dim + sq8::query_metadata_count(); std::vector v1_orig(query_size); - test_utils::populate_fp32_sq8_query(v1_orig.data(), dim, false, 1234); + test_utils::populate_sq8_fp32_query(v1_orig.data(), dim, false, 1234); // Create V2 as SQ8 quantized vector with different seed // Storage layout: [uint8_t values (dim)] [min_val] [delta] [sum] [sum_squares] @@ -352,11 +352,13 @@ TEST_F(SpacesTest, SQ8_l2sqr_no_optimization_func_test) { std::vector v2_compressed(quantized_size); test_utils::populate_float_vec_to_sq8_with_metadata(v2_compressed.data(), dim, false, 5678); - float baseline = test_utils::SQ8_NotOptimized_L2Sqr(v1_orig.data(), v2_compressed.data(), dim); + float baseline = + test_utils::SQ8_FP32_NotOptimized_L2Sqr(v2_compressed.data(), v1_orig.data(), dim); - float dist = SQ8_L2Sqr((const void *)v1_orig.data(), (const void *)v2_compressed.data(), dim); + float dist = + SQ8_FP32_L2Sqr((const void *)v2_compressed.data(), (const void *)v1_orig.data(), dim); - ASSERT_NEAR(dist, baseline, 0.01) << "SQ8_L2Sqr failed to match expected distance"; + ASSERT_NEAR(dist, baseline, 0.01) << "SQ8_FP32_L2Sqr failed to match expected distance"; } /* ======================== Test Getters ======================== */ @@ -422,9 +424,9 @@ TEST_F(SpacesTest, GetDistFuncSQ8Asymmetric) { 
auto l2_func = spaces::GetDistFunc(VecSimMetric_L2, dim, nullptr); auto ip_func = spaces::GetDistFunc(VecSimMetric_IP, dim, nullptr); auto cosine_func = spaces::GetDistFunc(VecSimMetric_Cosine, dim, nullptr); - ASSERT_EQ(l2_func, L2_SQ8_GetDistFunc(dim, nullptr)); - ASSERT_EQ(ip_func, IP_SQ8_GetDistFunc(dim, nullptr)); - ASSERT_EQ(cosine_func, Cosine_SQ8_GetDistFunc(dim, nullptr)); + ASSERT_EQ(l2_func, L2_SQ8_FP32_GetDistFunc(dim, nullptr)); + ASSERT_EQ(ip_func, IP_SQ8_FP32_GetDistFunc(dim, nullptr)); + ASSERT_EQ(cosine_func, Cosine_SQ8_FP32_GetDistFunc(dim, nullptr)); } #ifdef CPU_FEATURES_ARCH_X86_64 @@ -1978,9 +1980,9 @@ TEST_P(UINT8SpacesOptimizationTest, UINT8_full_range_test) { INSTANTIATE_TEST_SUITE_P(UINT8OptFuncs, UINT8SpacesOptimizationTest, testing::Range(32UL, 64 * 2UL + 1)); -class SQ8SpacesOptimizationTest : public testing::TestWithParam {}; +class SQ8_FP32_SpacesOptimizationTest : public testing::TestWithParam {}; -TEST_P(SQ8SpacesOptimizationTest, SQ8L2SqrTest) { +TEST_P(SQ8_FP32_SpacesOptimizationTest, SQ8_FP32_L2SqrTest) { auto optimization = getCpuOptimizationFeatures(); size_t dim = GetParam(); @@ -1988,7 +1990,7 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8L2SqrTest) { // Query layout: [float values (dim)] [sum] [sum_squares] size_t query_size = dim + sq8::query_metadata_count(); std::vector v1_orig(query_size); - test_utils::populate_fp32_sq8_query(v1_orig.data(), dim, false, 1234); + test_utils::populate_sq8_fp32_query(v1_orig.data(), dim, false, 1234); // Create V2 as SQ8 quantized vector with different seed // Storage layout: [uint8_t values (dim)] [min_val] [delta] [sum] [sum_squares] @@ -2003,15 +2005,15 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8L2SqrTest) { }; dist_func_t arch_opt_func; - float baseline = SQ8_L2Sqr(v1_orig.data(), v2_compressed.data(), dim); + float baseline = SQ8_FP32_L2Sqr(v2_compressed.data(), v1_orig.data(), dim); // Test different optimizations based on CPU features #ifdef OPT_AVX512_F_BW_VL_VNNI if (optimization.avx512f 
&& optimization.avx512bw && optimization.avx512vnni) { unsigned char alignment = 0; - arch_opt_func = L2_SQ8_GetDistFunc(dim, &alignment, &optimization); - ASSERT_EQ(arch_opt_func, Choose_SQ8_L2_implementation_AVX512F_BW_VL_VNNI(dim)) + arch_opt_func = L2_SQ8_FP32_GetDistFunc(dim, &alignment, &optimization); + ASSERT_EQ(arch_opt_func, Choose_SQ8_FP32_L2_implementation_AVX512F_BW_VL_VNNI(dim)) << "Unexpected distance function chosen for dim " << dim; - ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01) + ASSERT_NEAR(baseline, arch_opt_func(v2_compressed.data(), v1_orig.data(), dim), 0.01) << "AVX512 with dim " << dim; // ASSERT_EQ(alignment, expected_alignment(512, dim)) << "AVX512 with dim " << dim; // Unset optimizations flag, so we'll choose the next optimization. @@ -2021,10 +2023,10 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8L2SqrTest) { #ifdef OPT_AVX2_FMA if (optimization.avx2 && optimization.fma3) { unsigned char alignment = 0; - arch_opt_func = L2_SQ8_GetDistFunc(dim, &alignment, &optimization); - ASSERT_EQ(arch_opt_func, Choose_SQ8_L2_implementation_AVX2_FMA(dim)) + arch_opt_func = L2_SQ8_FP32_GetDistFunc(dim, &alignment, &optimization); + ASSERT_EQ(arch_opt_func, Choose_SQ8_FP32_L2_implementation_AVX2_FMA(dim)) << "Unexpected distance function chosen for dim " << dim; - ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01) + ASSERT_NEAR(baseline, arch_opt_func(v2_compressed.data(), v1_orig.data(), dim), 0.01) << "AVX with dim " << dim; // ASSERT_EQ(alignment, expected_alignment(256, dim)) << "AVX with dim " << dim; // Unset optimizations flag, so we'll choose the next optimization. 
@@ -2034,10 +2036,10 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8L2SqrTest) { #ifdef OPT_AVX2 if (optimization.avx2) { unsigned char alignment = 0; - arch_opt_func = L2_SQ8_GetDistFunc(dim, &alignment, &optimization); - ASSERT_EQ(arch_opt_func, Choose_SQ8_L2_implementation_AVX2(dim)) + arch_opt_func = L2_SQ8_FP32_GetDistFunc(dim, &alignment, &optimization); + ASSERT_EQ(arch_opt_func, Choose_SQ8_FP32_L2_implementation_AVX2(dim)) << "Unexpected distance function chosen for dim " << dim; - ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01) + ASSERT_NEAR(baseline, arch_opt_func(v2_compressed.data(), v1_orig.data(), dim), 0.01) << "AVX with dim " << dim; // ASSERT_EQ(alignment, expected_alignment(256, dim)) << "AVX with dim " << dim; // Unset avx flag as well, so we'll choose the next optimization (SSE). @@ -2047,10 +2049,10 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8L2SqrTest) { #ifdef OPT_SSE4 if (optimization.sse4_1) { unsigned char alignment = 0; - arch_opt_func = L2_SQ8_GetDistFunc(dim, &alignment, &optimization); - ASSERT_EQ(arch_opt_func, Choose_SQ8_L2_implementation_SSE4(dim)) + arch_opt_func = L2_SQ8_FP32_GetDistFunc(dim, &alignment, &optimization); + ASSERT_EQ(arch_opt_func, Choose_SQ8_FP32_L2_implementation_SSE4(dim)) << "Unexpected distance function chosen for dim " << dim; - ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01) + ASSERT_NEAR(baseline, arch_opt_func(v2_compressed.data(), v1_orig.data(), dim), 0.01) << "SSE with dim " << dim; // ASSERT_EQ(alignment, expected_alignment(128, dim)) << "SSE with dim " << dim; // Unset sse flag as well, so we'll choose the next optimization (default). 
@@ -2061,10 +2063,10 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8L2SqrTest) { #ifdef OPT_SVE2 if (optimization.sve2) { unsigned char alignment = 0; - arch_opt_func = L2_SQ8_GetDistFunc(dim, &alignment, &optimization); - ASSERT_EQ(arch_opt_func, Choose_SQ8_L2_implementation_SVE2(dim)) + arch_opt_func = L2_SQ8_FP32_GetDistFunc(dim, &alignment, &optimization); + ASSERT_EQ(arch_opt_func, Choose_SQ8_FP32_L2_implementation_SVE2(dim)) << "Unexpected distance function chosen for dim " << dim; - ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01) + ASSERT_NEAR(baseline, arch_opt_func(v2_compressed.data(), v1_orig.data(), dim), 0.01) << "SVE2 with dim " << dim; ASSERT_EQ(alignment, 0) << "No optimization with dim " << dim; // Unset sve2 flag as well, so we'll choose the next option (default). @@ -2074,10 +2076,10 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8L2SqrTest) { #ifdef OPT_SVE if (optimization.sve) { unsigned char alignment = 0; - arch_opt_func = L2_SQ8_GetDistFunc(dim, &alignment, &optimization); - ASSERT_EQ(arch_opt_func, Choose_SQ8_L2_implementation_SVE(dim)) + arch_opt_func = L2_SQ8_FP32_GetDistFunc(dim, &alignment, &optimization); + ASSERT_EQ(arch_opt_func, Choose_SQ8_FP32_L2_implementation_SVE(dim)) << "Unexpected distance function chosen for dim " << dim; - ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01) + ASSERT_NEAR(baseline, arch_opt_func(v2_compressed.data(), v1_orig.data(), dim), 0.01) << "SVE with dim " << dim; ASSERT_EQ(alignment, 0) << "No optimization with dim " << dim; // Unset sve flag as well, so we'll choose the next option (default). 
@@ -2087,10 +2089,10 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8L2SqrTest) { #ifdef OPT_NEON if (optimization.asimd) { unsigned char alignment = 0; - arch_opt_func = L2_SQ8_GetDistFunc(dim, &alignment, &optimization); - ASSERT_EQ(arch_opt_func, Choose_SQ8_L2_implementation_NEON(dim)) + arch_opt_func = L2_SQ8_FP32_GetDistFunc(dim, &alignment, &optimization); + ASSERT_EQ(arch_opt_func, Choose_SQ8_FP32_L2_implementation_NEON(dim)) << "Unexpected distance function chosen for dim " << dim; - ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01) + ASSERT_NEAR(baseline, arch_opt_func(v2_compressed.data(), v1_orig.data(), dim), 0.01) << "NEON with dim " << dim; ASSERT_EQ(alignment, 0) << "No optimization with dim " << dim; // Unset optimizations flag, so we'll choose the next optimization. @@ -2100,14 +2102,15 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8L2SqrTest) { // Test default implementation unsigned char alignment = 0; - arch_opt_func = L2_SQ8_GetDistFunc(dim, &alignment, &optimization); - ASSERT_EQ(arch_opt_func, SQ8_L2Sqr) << "Unexpected distance function chosen for dim " << dim; - ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01) + arch_opt_func = L2_SQ8_FP32_GetDistFunc(dim, &alignment, &optimization); + ASSERT_EQ(arch_opt_func, SQ8_FP32_L2Sqr) + << "Unexpected distance function chosen for dim " << dim; + ASSERT_NEAR(baseline, arch_opt_func(v2_compressed.data(), v1_orig.data(), dim), 0.01) << "No optimization with dim " << dim; ASSERT_EQ(alignment, 0) << "No optimization with dim " << dim; } -TEST_P(SQ8SpacesOptimizationTest, SQ8InnerProductTest) { +TEST_P(SQ8_FP32_SpacesOptimizationTest, SQ8_FP32_InnerProductTest) { auto optimization = getCpuOptimizationFeatures(); size_t dim = GetParam(); @@ -2115,7 +2118,7 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8InnerProductTest) { // Query layout: [float values (dim)] [sum] [sum_squares] size_t query_size = dim + sq8::query_metadata_count(); std::vector 
v1_orig(query_size); - test_utils::populate_fp32_sq8_query(v1_orig.data(), dim, true, 1234); + test_utils::populate_sq8_fp32_query(v1_orig.data(), dim, true, 1234); size_t quantized_size = dim * sizeof(uint8_t) + sq8::storage_metadata_count() * sizeof(float); std::vector v2_compressed(quantized_size); @@ -2127,16 +2130,16 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8InnerProductTest) { }; dist_func_t arch_opt_func; - float baseline = SQ8_InnerProduct(v1_orig.data(), v2_compressed.data(), dim); + float baseline = SQ8_FP32_InnerProduct(v2_compressed.data(), v1_orig.data(), dim); // Test different optimizations based on CPU features #ifdef OPT_AVX512_F_BW_VL_VNNI if (optimization.avx512f && optimization.avx512bw && optimization.avx512vnni) { unsigned char alignment = 0; - arch_opt_func = IP_SQ8_GetDistFunc(dim, &alignment, &optimization); - ASSERT_EQ(arch_opt_func, Choose_SQ8_IP_implementation_AVX512F_BW_VL_VNNI(dim)) + arch_opt_func = IP_SQ8_FP32_GetDistFunc(dim, &alignment, &optimization); + ASSERT_EQ(arch_opt_func, Choose_SQ8_FP32_IP_implementation_AVX512F_BW_VL_VNNI(dim)) << "Unexpected distance function chosen for dim " << dim; - ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01) + ASSERT_NEAR(baseline, arch_opt_func(v2_compressed.data(), v1_orig.data(), dim), 0.01) << "AVX512 with dim " << dim; optimization.avx512f = 0; } @@ -2144,10 +2147,10 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8InnerProductTest) { #ifdef OPT_AVX2_FMA if (optimization.avx2 && optimization.fma3) { unsigned char alignment = 0; - arch_opt_func = IP_SQ8_GetDistFunc(dim, &alignment, &optimization); - ASSERT_EQ(arch_opt_func, Choose_SQ8_IP_implementation_AVX2_FMA(dim)) + arch_opt_func = IP_SQ8_FP32_GetDistFunc(dim, &alignment, &optimization); + ASSERT_EQ(arch_opt_func, Choose_SQ8_FP32_IP_implementation_AVX2_FMA(dim)) << "Unexpected distance function chosen for dim " << dim; - ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01) + 
ASSERT_NEAR(baseline, arch_opt_func(v2_compressed.data(), v1_orig.data(), dim), 0.01) << "AVX with dim " << dim; optimization.fma3 = 0; } @@ -2155,10 +2158,10 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8InnerProductTest) { #ifdef OPT_AVX2 if (optimization.avx2) { unsigned char alignment = 0; - arch_opt_func = IP_SQ8_GetDistFunc(dim, &alignment, &optimization); - ASSERT_EQ(arch_opt_func, Choose_SQ8_IP_implementation_AVX2(dim)) + arch_opt_func = IP_SQ8_FP32_GetDistFunc(dim, &alignment, &optimization); + ASSERT_EQ(arch_opt_func, Choose_SQ8_FP32_IP_implementation_AVX2(dim)) << "Unexpected distance function chosen for dim " << dim; - ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01) + ASSERT_NEAR(baseline, arch_opt_func(v2_compressed.data(), v1_orig.data(), dim), 0.01) << "AVX with dim " << dim; optimization.avx2 = 0; } @@ -2166,10 +2169,10 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8InnerProductTest) { #ifdef OPT_SSE if (optimization.sse4_1) { unsigned char alignment = 0; - arch_opt_func = IP_SQ8_GetDistFunc(dim, &alignment, &optimization); - ASSERT_EQ(arch_opt_func, Choose_SQ8_IP_implementation_SSE4(dim)) + arch_opt_func = IP_SQ8_FP32_GetDistFunc(dim, &alignment, &optimization); + ASSERT_EQ(arch_opt_func, Choose_SQ8_FP32_IP_implementation_SSE4(dim)) << "Unexpected distance function chosen for dim " << dim; - ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01) + ASSERT_NEAR(baseline, arch_opt_func(v2_compressed.data(), v1_orig.data(), dim), 0.01) << "SSE with dim " << dim; optimization.sse4_1 = 0; } @@ -2177,10 +2180,10 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8InnerProductTest) { #ifdef OPT_SVE2 if (optimization.sve2) { unsigned char alignment = 0; - arch_opt_func = IP_SQ8_GetDistFunc(dim, &alignment, &optimization); - ASSERT_EQ(arch_opt_func, Choose_SQ8_IP_implementation_SVE2(dim)) + arch_opt_func = IP_SQ8_FP32_GetDistFunc(dim, &alignment, &optimization); + ASSERT_EQ(arch_opt_func, 
Choose_SQ8_FP32_IP_implementation_SVE2(dim)) << "Unexpected distance function chosen for dim " << dim; - ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01) + ASSERT_NEAR(baseline, arch_opt_func(v2_compressed.data(), v1_orig.data(), dim), 0.01) << "SVE2 with dim " << dim; optimization.sve2 = 0; } @@ -2188,10 +2191,10 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8InnerProductTest) { #ifdef OPT_SVE if (optimization.sve) { unsigned char alignment = 0; - arch_opt_func = IP_SQ8_GetDistFunc(dim, &alignment, &optimization); - ASSERT_EQ(arch_opt_func, Choose_SQ8_IP_implementation_SVE(dim)) + arch_opt_func = IP_SQ8_FP32_GetDistFunc(dim, &alignment, &optimization); + ASSERT_EQ(arch_opt_func, Choose_SQ8_FP32_IP_implementation_SVE(dim)) << "Unexpected distance function chosen for dim " << dim; - ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01) + ASSERT_NEAR(baseline, arch_opt_func(v2_compressed.data(), v1_orig.data(), dim), 0.01) << "SVE with dim " << dim; optimization.sve = 0; } @@ -2199,10 +2202,10 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8InnerProductTest) { #ifdef OPT_NEON if (optimization.asimd) { unsigned char alignment = 0; - arch_opt_func = IP_SQ8_GetDistFunc(dim, &alignment, &optimization); - ASSERT_EQ(arch_opt_func, Choose_SQ8_IP_implementation_NEON(dim)) + arch_opt_func = IP_SQ8_FP32_GetDistFunc(dim, &alignment, &optimization); + ASSERT_EQ(arch_opt_func, Choose_SQ8_FP32_IP_implementation_NEON(dim)) << "Unexpected distance function chosen for dim " << dim; - ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01) + ASSERT_NEAR(baseline, arch_opt_func(v2_compressed.data(), v1_orig.data(), dim), 0.01) << "NEON with dim " << dim; optimization.asimd = 0; } @@ -2210,19 +2213,19 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8InnerProductTest) { // Test default implementation unsigned char alignment = 0; - arch_opt_func = IP_SQ8_GetDistFunc(dim, &alignment, &optimization); - 
ASSERT_EQ(arch_opt_func, SQ8_InnerProduct) + arch_opt_func = IP_SQ8_FP32_GetDistFunc(dim, &alignment, &optimization); + ASSERT_EQ(arch_opt_func, SQ8_FP32_InnerProduct) << "Unexpected distance function chosen for dim " << dim; - ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01) + ASSERT_NEAR(baseline, arch_opt_func(v2_compressed.data(), v1_orig.data(), dim), 0.01) << "No optimization with dim " << dim; ASSERT_EQ(alignment, 0) << "No optimization with dim " << dim; } // Instantiate the test suite with dimensions to test -INSTANTIATE_TEST_SUITE_P(SQ8InnerProductTest, SQ8SpacesOptimizationTest, +INSTANTIATE_TEST_SUITE_P(SQ8_FP32_Test, SQ8_FP32_SpacesOptimizationTest, testing::Range(16UL, 16 * 2UL + 1)); -TEST_P(SQ8SpacesOptimizationTest, SQ8CosineTest) { +TEST_P(SQ8_FP32_SpacesOptimizationTest, SQ8_FP32_CosineTest) { auto optimization = getCpuOptimizationFeatures(); size_t dim = GetParam(); @@ -2234,7 +2237,7 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8CosineTest) { dim * sizeof(uint8_t) + sq8::storage_metadata_count() * sizeof(float); std::vector v2_quantized(quantized_size); - test_utils::populate_fp32_sq8_query(v1_orig.data(), dim, true, 1234); + test_utils::populate_sq8_fp32_query(v1_orig.data(), dim, true, 1234); test_utils::populate_float_vec_to_sq8_with_metadata(v2_quantized.data(), dim, false, 456); auto expected_alignment = [](size_t reg_bit_size, size_t dim) { @@ -2243,15 +2246,16 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8CosineTest) { }; dist_func_t arch_opt_func; - float baseline = SQ8_Cosine(v1_orig.data(), v2_quantized.data(), dim); + // Arguments: (SQ8_storage, FP32_query, dim) + float baseline = SQ8_FP32_Cosine(v2_quantized.data(), v1_orig.data(), dim); #ifdef OPT_SVE2 if (optimization.sve2) { unsigned char alignment = 0; - arch_opt_func = Cosine_SQ8_GetDistFunc(dim, &alignment, &optimization); - ASSERT_EQ(arch_opt_func, Choose_SQ8_Cosine_implementation_SVE2(dim)) + arch_opt_func = Cosine_SQ8_FP32_GetDistFunc(dim, 
&alignment, &optimization); + ASSERT_EQ(arch_opt_func, Choose_SQ8_FP32_Cosine_implementation_SVE2(dim)) << "Unexpected distance function chosen for dim " << dim; - ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_quantized.data(), dim), 0.01) + ASSERT_NEAR(baseline, arch_opt_func(v2_quantized.data(), v1_orig.data(), dim), 0.01) << "SVE2 with dim " << dim; optimization.sve2 = 0; } @@ -2259,10 +2263,10 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8CosineTest) { #ifdef OPT_SVE if (optimization.sve) { unsigned char alignment = 0; - arch_opt_func = Cosine_SQ8_GetDistFunc(dim, &alignment, &optimization); - ASSERT_EQ(arch_opt_func, Choose_SQ8_Cosine_implementation_SVE(dim)) + arch_opt_func = Cosine_SQ8_FP32_GetDistFunc(dim, &alignment, &optimization); + ASSERT_EQ(arch_opt_func, Choose_SQ8_FP32_Cosine_implementation_SVE(dim)) << "Unexpected distance function chosen for dim " << dim; - ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_quantized.data(), dim), 0.01) + ASSERT_NEAR(baseline, arch_opt_func(v2_quantized.data(), v1_orig.data(), dim), 0.01) << "SVE with dim " << dim; optimization.sve = 0; } @@ -2270,10 +2274,10 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8CosineTest) { #ifdef OPT_NEON if (optimization.asimd) { unsigned char alignment = 0; - arch_opt_func = Cosine_SQ8_GetDistFunc(dim, &alignment, &optimization); - ASSERT_EQ(arch_opt_func, Choose_SQ8_Cosine_implementation_NEON(dim)) + arch_opt_func = Cosine_SQ8_FP32_GetDistFunc(dim, &alignment, &optimization); + ASSERT_EQ(arch_opt_func, Choose_SQ8_FP32_Cosine_implementation_NEON(dim)) << "Unexpected distance function chosen for dim " << dim; - ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_quantized.data(), dim), 0.01) + ASSERT_NEAR(baseline, arch_opt_func(v2_quantized.data(), v1_orig.data(), dim), 0.01) << "NEON with dim " << dim; optimization.asimd = 0; } @@ -2283,10 +2287,10 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8CosineTest) { #ifdef OPT_AVX512_F_BW_VL_VNNI if (optimization.avx512f && 
optimization.avx512bw && optimization.avx512vnni) { unsigned char alignment = 0; - arch_opt_func = Cosine_SQ8_GetDistFunc(dim, &alignment, &optimization); - ASSERT_EQ(arch_opt_func, Choose_SQ8_Cosine_implementation_AVX512F_BW_VL_VNNI(dim)) + arch_opt_func = Cosine_SQ8_FP32_GetDistFunc(dim, &alignment, &optimization); + ASSERT_EQ(arch_opt_func, Choose_SQ8_FP32_Cosine_implementation_AVX512F_BW_VL_VNNI(dim)) << "Unexpected distance function chosen for dim " << dim; - ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_quantized.data(), dim), 0.01) + ASSERT_NEAR(baseline, arch_opt_func(v2_quantized.data(), v1_orig.data(), dim), 0.01) << "AVX512 with dim " << dim; optimization.avx512f = 0; } @@ -2294,10 +2298,10 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8CosineTest) { #ifdef OPT_AVX2_FMA if (optimization.avx2 && optimization.fma3) { unsigned char alignment = 0; - arch_opt_func = Cosine_SQ8_GetDistFunc(dim, &alignment, &optimization); - ASSERT_EQ(arch_opt_func, Choose_SQ8_Cosine_implementation_AVX2_FMA(dim)) + arch_opt_func = Cosine_SQ8_FP32_GetDistFunc(dim, &alignment, &optimization); + ASSERT_EQ(arch_opt_func, Choose_SQ8_FP32_Cosine_implementation_AVX2_FMA(dim)) << "Unexpected distance function chosen for dim " << dim; - ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_quantized.data(), dim), 0.01) + ASSERT_NEAR(baseline, arch_opt_func(v2_quantized.data(), v1_orig.data(), dim), 0.01) << "AVX with dim " << dim; optimization.fma3 = 0; } @@ -2305,10 +2309,10 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8CosineTest) { #ifdef OPT_AVX2 if (optimization.avx2) { unsigned char alignment = 0; - arch_opt_func = Cosine_SQ8_GetDistFunc(dim, &alignment, &optimization); - ASSERT_EQ(arch_opt_func, Choose_SQ8_Cosine_implementation_AVX2(dim)) + arch_opt_func = Cosine_SQ8_FP32_GetDistFunc(dim, &alignment, &optimization); + ASSERT_EQ(arch_opt_func, Choose_SQ8_FP32_Cosine_implementation_AVX2(dim)) << "Unexpected distance function chosen for dim " << dim; - ASSERT_NEAR(baseline, 
arch_opt_func(v1_orig.data(), v2_quantized.data(), dim), 0.01) + ASSERT_NEAR(baseline, arch_opt_func(v2_quantized.data(), v1_orig.data(), dim), 0.01) << "AVX with dim " << dim; optimization.avx2 = 0; } @@ -2317,10 +2321,10 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8CosineTest) { #ifdef OPT_SSE if (optimization.sse4_1) { unsigned char alignment = 0; - arch_opt_func = Cosine_SQ8_GetDistFunc(dim, &alignment, &optimization); - ASSERT_EQ(arch_opt_func, Choose_SQ8_Cosine_implementation_SSE4(dim)) + arch_opt_func = Cosine_SQ8_FP32_GetDistFunc(dim, &alignment, &optimization); + ASSERT_EQ(arch_opt_func, Choose_SQ8_FP32_Cosine_implementation_SSE4(dim)) << "Unexpected distance function chosen for dim " << dim; - ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_quantized.data(), dim), 0.01) + ASSERT_NEAR(baseline, arch_opt_func(v2_quantized.data(), v1_orig.data(), dim), 0.01) << "SSE with dim " << dim; optimization.sse4_1 = 0; } @@ -2328,28 +2332,30 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8CosineTest) { // Test default implementation unsigned char alignment = 0; - arch_opt_func = Cosine_SQ8_GetDistFunc(dim, &alignment, &optimization); - ASSERT_EQ(arch_opt_func, SQ8_Cosine) << "Unexpected distance function chosen for dim " << dim; - ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_quantized.data(), dim), 0.01) + arch_opt_func = Cosine_SQ8_FP32_GetDistFunc(dim, &alignment, &optimization); + ASSERT_EQ(arch_opt_func, SQ8_FP32_Cosine) + << "Unexpected distance function chosen for dim " << dim; + ASSERT_NEAR(baseline, arch_opt_func(v2_quantized.data(), v1_orig.data(), dim), 0.01) << "No optimization with dim " << dim; ASSERT_EQ(alignment, 0) << "No optimization with dim " << dim; } // Test self-distance: distance to itself should be 0 for cosine (normalized vectors) -TEST(SQ8_EdgeCases, SelfDistanceCosine) { +TEST(SQ8_FP32_EdgeCases, SelfDistanceCosine) { auto optimization = getCpuOptimizationFeatures(); size_t dim = 128; // Query layout: [float values (dim)] [sum] 
[sum_squares] size_t query_size = (dim + sq8::query_metadata_count()); std::vector v_orig(query_size); - test_utils::populate_fp32_sq8_query(v_orig.data(), dim, true, 1234); + test_utils::populate_sq8_fp32_query(v_orig.data(), dim, true, 1234); size_t quantized_size = dim * sizeof(uint8_t) + sq8::storage_metadata_count() * sizeof(float); std::vector v_quantized(quantized_size); test_utils::populate_float_vec_to_sq8_with_metadata(v_quantized.data(), dim, true, 1234); - float baseline = SQ8_Cosine(v_orig.data(), v_quantized.data(), dim); + // Arguments: (SQ8_storage, FP32_query, dim) + float baseline = SQ8_FP32_Cosine(v_quantized.data(), v_orig.data(), dim); // Self-distance for cosine should be close to 0 ASSERT_NEAR(baseline, 0.0f, 0.001f) << "Self-distance should be ~0 for cosine"; @@ -2357,8 +2363,8 @@ TEST(SQ8_EdgeCases, SelfDistanceCosine) { #ifdef OPT_SVE2 if (optimization.sve2) { unsigned char alignment = 0; - auto arch_opt_func = Cosine_SQ8_GetDistFunc(dim, &alignment, &optimization); - float result = arch_opt_func(v_orig.data(), v_quantized.data(), dim); + auto arch_opt_func = Cosine_SQ8_FP32_GetDistFunc(dim, &alignment, &optimization); + float result = arch_opt_func(v_quantized.data(), v_orig.data(), dim); ASSERT_NEAR(result, baseline, 0.01f) << "Optimized self-distance should match baseline"; optimization.sve2 = 0; } @@ -2366,8 +2372,8 @@ TEST(SQ8_EdgeCases, SelfDistanceCosine) { #ifdef OPT_SVE if (optimization.sve) { unsigned char alignment = 0; - auto arch_opt_func = Cosine_SQ8_GetDistFunc(dim, &alignment, &optimization); - float result = arch_opt_func(v_orig.data(), v_quantized.data(), dim); + auto arch_opt_func = Cosine_SQ8_FP32_GetDistFunc(dim, &alignment, &optimization); + float result = arch_opt_func(v_quantized.data(), v_orig.data(), dim); ASSERT_NEAR(result, baseline, 0.01f) << "Optimized self-distance should match baseline"; optimization.sve = 0; } @@ -2375,8 +2381,8 @@ TEST(SQ8_EdgeCases, SelfDistanceCosine) { #ifdef OPT_NEON_DOTPROD if 
(optimization.asimddp) { unsigned char alignment = 0; - auto arch_opt_func = Cosine_SQ8_GetDistFunc(dim, &alignment, &optimization); - float result = arch_opt_func(v_orig.data(), v_quantized.data(), dim); + auto arch_opt_func = Cosine_SQ8_FP32_GetDistFunc(dim, &alignment, &optimization); + float result = arch_opt_func(v_quantized.data(), v_orig.data(), dim); ASSERT_NEAR(result, baseline, 0.01f) << "Optimized self-distance should match baseline"; optimization.asimddp = 0; } @@ -2384,8 +2390,8 @@ TEST(SQ8_EdgeCases, SelfDistanceCosine) { #ifdef OPT_NEON if (optimization.asimd) { unsigned char alignment = 0; - auto arch_opt_func = Cosine_SQ8_GetDistFunc(dim, &alignment, &optimization); - float result = arch_opt_func(v_orig.data(), v_quantized.data(), dim); + auto arch_opt_func = Cosine_SQ8_FP32_GetDistFunc(dim, &alignment, &optimization); + float result = arch_opt_func(v_quantized.data(), v_orig.data(), dim); ASSERT_NEAR(result, baseline, 0.01f) << "Optimized self-distance should match baseline"; optimization.asimd = 0; } @@ -2393,36 +2399,36 @@ TEST(SQ8_EdgeCases, SelfDistanceCosine) { #ifdef OPT_AVX512_F_BW_VL_VNNI if (optimization.avx512f && optimization.avx512bw && optimization.avx512vnni) { unsigned char alignment = 0; - auto arch_opt_func = Cosine_SQ8_GetDistFunc(dim, &alignment, &optimization); - float result = arch_opt_func(v_orig.data(), v_quantized.data(), dim); + auto arch_opt_func = Cosine_SQ8_FP32_GetDistFunc(dim, &alignment, &optimization); + float result = arch_opt_func(v_quantized.data(), v_orig.data(), dim); ASSERT_NEAR(result, baseline, 0.01f) << "Optimized self-distance should match baseline"; optimization.avx512f = 0; } #endif unsigned char alignment = 0; - auto arch_opt_func = Cosine_SQ8_GetDistFunc(dim, &alignment, &optimization); - auto result = arch_opt_func(v_orig.data(), v_quantized.data(), dim); + auto arch_opt_func = Cosine_SQ8_FP32_GetDistFunc(dim, &alignment, &optimization); + auto result = arch_opt_func(v_quantized.data(), v_orig.data(), 
dim); ASSERT_NEAR(baseline, result, 0.00001) << "No optimization self-distance should match baseline"; ASSERT_EQ(alignment, 0) << "No optimization with dim " << dim; } // Test self-distance: distance to itself should be 0 for L2 -TEST(SQ8_EdgeCases, SelfDistanceL2) { +TEST(SQ8_FP32_EdgeCases, SelfDistanceL2) { auto optimization = getCpuOptimizationFeatures(); size_t dim = 128; // Create fp32 query with precomputed sum and sum_squares // Query layout: [float values (dim)] [sum] [sum_squares] size_t query_size = (dim + sq8::query_metadata_count()); std::vector v_orig(query_size); - test_utils::populate_fp32_sq8_query(v_orig.data(), dim, false, 1234); + test_utils::populate_sq8_fp32_query(v_orig.data(), dim, false, 1234); size_t quantized_size = dim * sizeof(uint8_t) + sq8::storage_metadata_count() * sizeof(float); std::vector v_quantized(quantized_size); test_utils::populate_float_vec_to_sq8_with_metadata(v_quantized.data(), dim, false, 1234); - float baseline = SQ8_L2Sqr(v_orig.data(), v_quantized.data(), dim); + float baseline = SQ8_FP32_L2Sqr(v_quantized.data(), v_orig.data(), dim); // Self-distance for L2 should be close to 0 (due to quantization effects, small errors are // expected) @@ -2431,8 +2437,8 @@ TEST(SQ8_EdgeCases, SelfDistanceL2) { #ifdef OPT_SVE2 if (optimization.sve2) { unsigned char alignment = 0; - auto arch_opt_func = L2_SQ8_GetDistFunc(dim, &alignment, &optimization); - float result = arch_opt_func(v_orig.data(), v_quantized.data(), dim); + auto arch_opt_func = L2_SQ8_FP32_GetDistFunc(dim, &alignment, &optimization); + float result = arch_opt_func(v_quantized.data(), v_orig.data(), dim); ASSERT_NEAR(result, baseline, 0.01f) << "Optimized self-distance should match baseline"; optimization.sve2 = 0; } @@ -2440,8 +2446,8 @@ TEST(SQ8_EdgeCases, SelfDistanceL2) { #ifdef OPT_SVE if (optimization.sve) { unsigned char alignment = 0; - auto arch_opt_func = L2_SQ8_GetDistFunc(dim, &alignment, &optimization); - float result = arch_opt_func(v_orig.data(), 
v_quantized.data(), dim); + auto arch_opt_func = L2_SQ8_FP32_GetDistFunc(dim, &alignment, &optimization); + float result = arch_opt_func(v_quantized.data(), v_orig.data(), dim); ASSERT_NEAR(result, baseline, 0.01f) << "Optimized self-distance should match baseline"; optimization.sve = 0; } @@ -2449,8 +2455,8 @@ TEST(SQ8_EdgeCases, SelfDistanceL2) { #ifdef OPT_NEON_DOTPROD if (optimization.asimddp) { unsigned char alignment = 0; - auto arch_opt_func = L2_SQ8_GetDistFunc(dim, &alignment, &optimization); - float result = arch_opt_func(v_orig.data(), v_quantized.data(), dim); + auto arch_opt_func = L2_SQ8_FP32_GetDistFunc(dim, &alignment, &optimization); + float result = arch_opt_func(v_quantized.data(), v_orig.data(), dim); ASSERT_NEAR(result, baseline, 0.01f) << "Optimized self-distance should match baseline"; optimization.asimddp = 0; } @@ -2458,8 +2464,8 @@ TEST(SQ8_EdgeCases, SelfDistanceL2) { #ifdef OPT_NEON if (optimization.asimd) { unsigned char alignment = 0; - auto arch_opt_func = L2_SQ8_GetDistFunc(dim, &alignment, &optimization); - float result = arch_opt_func(v_orig.data(), v_quantized.data(), dim); + auto arch_opt_func = L2_SQ8_FP32_GetDistFunc(dim, &alignment, &optimization); + float result = arch_opt_func(v_quantized.data(), v_orig.data(), dim); ASSERT_NEAR(result, baseline, 0.01f) << "Optimized self-distance should match baseline"; optimization.asimd = 0; } @@ -2467,30 +2473,31 @@ TEST(SQ8_EdgeCases, SelfDistanceL2) { #ifdef OPT_AVX512_F_BW_VL_VNNI if (optimization.avx512f && optimization.avx512bw && optimization.avx512vnni) { unsigned char alignment = 0; - auto arch_opt_func = L2_SQ8_GetDistFunc(dim, &alignment, &optimization); - float result = arch_opt_func(v_orig.data(), v_quantized.data(), dim); + auto arch_opt_func = L2_SQ8_FP32_GetDistFunc(dim, &alignment, &optimization); + float result = arch_opt_func(v_quantized.data(), v_orig.data(), dim); ASSERT_NEAR(result, baseline, 0.01f) << "Optimized self-distance should match baseline"; 
optimization.avx512f = 0; } #endif unsigned char alignment = 0; - auto arch_opt_func = L2_SQ8_GetDistFunc(dim, &alignment, &optimization); - auto result = arch_opt_func(v_orig.data(), v_quantized.data(), dim); + auto arch_opt_func = L2_SQ8_FP32_GetDistFunc(dim, &alignment, &optimization); + auto result = arch_opt_func(v_quantized.data(), v_orig.data(), dim); ASSERT_NEAR(baseline, result, 0.00001) << "No optimization self-distance should match baseline"; ASSERT_EQ(alignment, 0) << "No optimization with dim " << dim; } // Test symmetry: dist(v1, v2) == dist(v2, v1) -TEST(SQ8_EdgeCases, CosineSymmetryTest) { +// For asymmetric SQ8_FP32, symmetry means: dist(sq8_1, fp32_2) == dist(sq8_2, fp32_1) +TEST(SQ8_FP32_EdgeCases, CosineSymmetryTest) { size_t dim = 128; auto optimization = getCpuOptimizationFeatures(); // Query layout: [float values (dim)] [sum] [sum_squares] size_t query_size = dim + sq8::query_metadata_count(); std::vector v1_fp32(query_size); - test_utils::populate_fp32_sq8_query(v1_fp32.data(), dim, true, 1234); + test_utils::populate_sq8_fp32_query(v1_fp32.data(), dim, true, 1234); std::vector v2_fp32(query_size); - test_utils::populate_fp32_sq8_query(v2_fp32.data(), dim, true, 456); + test_utils::populate_sq8_fp32_query(v2_fp32.data(), dim, true, 456); size_t quantized_size = dim * sizeof(uint8_t) + sq8::storage_metadata_count() * sizeof(float); @@ -2498,9 +2505,9 @@ TEST(SQ8_EdgeCases, CosineSymmetryTest) { test_utils::populate_float_vec_to_sq8_with_metadata(v1_quantized.data(), dim, true, 1234); std::vector v2_quantized(quantized_size); test_utils::populate_float_vec_to_sq8_with_metadata(v2_quantized.data(), dim, true, 456); - - float baseline_1 = SQ8_Cosine(v1_fp32.data(), v2_quantized.data(), dim); - float baseline_2 = SQ8_Cosine(v2_fp32.data(), v1_quantized.data(), dim); + // Arguments: (SQ8_storage, FP32_query, dim) + float baseline_1 = SQ8_FP32_Cosine(v2_quantized.data(), v1_fp32.data(), dim); + float baseline_2 = 
SQ8_FP32_Cosine(v1_quantized.data(), v2_fp32.data(), dim); ASSERT_NEAR(baseline_1, baseline_2, 0.001f) << "Cosine should be symmetric"; unsigned char alignment = 0; @@ -2508,9 +2515,9 @@ TEST(SQ8_EdgeCases, CosineSymmetryTest) { #ifdef OPT_SVE2 if (optimization.sve2) { unsigned char alignment = 0; - auto arch_opt_func = Cosine_SQ8_GetDistFunc(dim, &alignment, &optimization); - float cos_12 = arch_opt_func(v1_fp32.data(), v2_quantized.data(), dim); - float cos_21 = arch_opt_func(v2_fp32.data(), v1_quantized.data(), dim); + auto arch_opt_func = Cosine_SQ8_FP32_GetDistFunc(dim, &alignment, &optimization); + float cos_12 = arch_opt_func(v2_quantized.data(), v1_fp32.data(), dim); + float cos_21 = arch_opt_func(v1_quantized.data(), v2_fp32.data(), dim); ASSERT_NEAR(cos_12, cos_21, 0.001f) << "Optimized cosine should be symmetric"; optimization.sve2 = 0; } @@ -2518,9 +2525,9 @@ TEST(SQ8_EdgeCases, CosineSymmetryTest) { #ifdef OPT_SVE if (optimization.sve) { unsigned char alignment = 0; - auto arch_opt_func = Cosine_SQ8_GetDistFunc(dim, &alignment, &optimization); - float cos_12 = arch_opt_func(v1_fp32.data(), v2_quantized.data(), dim); - float cos_21 = arch_opt_func(v2_fp32.data(), v1_quantized.data(), dim); + auto arch_opt_func = Cosine_SQ8_FP32_GetDistFunc(dim, &alignment, &optimization); + float cos_12 = arch_opt_func(v2_quantized.data(), v1_fp32.data(), dim); + float cos_21 = arch_opt_func(v1_quantized.data(), v2_fp32.data(), dim); ASSERT_NEAR(cos_12, cos_21, 0.001f) << "Optimized cosine should be symmetric"; optimization.sve = 0; } @@ -2528,9 +2535,9 @@ TEST(SQ8_EdgeCases, CosineSymmetryTest) { #ifdef OPT_NEON_DOTPROD if (optimization.asimddp) { unsigned char alignment = 0; - auto arch_opt_func = Cosine_SQ8_GetDistFunc(dim, &alignment, &optimization); - float cos_12 = arch_opt_func(v1_fp32.data(), v2_quantized.data(), dim); - float cos_21 = arch_opt_func(v2_fp32.data(), v1_quantized.data(), dim); + auto arch_opt_func = Cosine_SQ8_FP32_GetDistFunc(dim, &alignment, 
&optimization); + float cos_12 = arch_opt_func(v2_quantized.data(), v1_fp32.data(), dim); + float cos_21 = arch_opt_func(v1_quantized.data(), v2_fp32.data(), dim); ASSERT_NEAR(cos_12, cos_21, 0.001f) << "Optimized cosine should be symmetric"; optimization.asimddp = 0; } @@ -2538,9 +2545,9 @@ TEST(SQ8_EdgeCases, CosineSymmetryTest) { #ifdef OPT_NEON if (optimization.asimd) { unsigned char alignment = 0; - auto arch_opt_func = Cosine_SQ8_GetDistFunc(dim, &alignment, &optimization); - float cos_12 = arch_opt_func(v1_fp32.data(), v2_quantized.data(), dim); - float cos_21 = arch_opt_func(v2_fp32.data(), v1_quantized.data(), dim); + auto arch_opt_func = Cosine_SQ8_FP32_GetDistFunc(dim, &alignment, &optimization); + float cos_12 = arch_opt_func(v2_quantized.data(), v1_fp32.data(), dim); + float cos_21 = arch_opt_func(v1_quantized.data(), v2_fp32.data(), dim); ASSERT_NEAR(cos_12, cos_21, 0.001f) << "Optimized cosine should be symmetric"; optimization.asimd = 0; } @@ -2548,21 +2555,21 @@ TEST(SQ8_EdgeCases, CosineSymmetryTest) { #ifdef OPT_AVX512_F_BW_VL_VNNI if (optimization.avx512f && optimization.avx512bw && optimization.avx512vnni) { unsigned char alignment = 0; - auto arch_opt_func = Cosine_SQ8_GetDistFunc(dim, &alignment, &optimization); - float cos_12 = arch_opt_func(v1_fp32.data(), v2_quantized.data(), dim); - float cos_21 = arch_opt_func(v2_fp32.data(), v1_quantized.data(), dim); + auto arch_opt_func = Cosine_SQ8_FP32_GetDistFunc(dim, &alignment, &optimization); + float cos_12 = arch_opt_func(v2_quantized.data(), v1_fp32.data(), dim); + float cos_21 = arch_opt_func(v1_quantized.data(), v2_fp32.data(), dim); ASSERT_NEAR(cos_12, cos_21, 0.001f) << "Optimized cosine should be symmetric"; optimization.avx512f = 0; } #endif - auto cosine_func = Cosine_SQ8_GetDistFunc(dim, &alignment, nullptr); - float cos_12 = cosine_func(v1_fp32.data(), v2_quantized.data(), dim); - float cos_21 = cosine_func(v2_fp32.data(), v1_quantized.data(), dim); + auto cosine_func = 
Cosine_SQ8_FP32_GetDistFunc(dim, &alignment, nullptr); + float cos_12 = cosine_func(v2_quantized.data(), v1_fp32.data(), dim); + float cos_21 = cosine_func(v1_quantized.data(), v2_fp32.data(), dim); ASSERT_NEAR(cos_12, cos_21, 0.001f) << "Cosine should be symmetric"; } // Test with zero vector -TEST(SQ8_EdgeCases, CosineZeroVectorTest) { +TEST(SQ8_FP32_EdgeCases, CosineZeroVectorTest) { auto optimization = getCpuOptimizationFeatures(); size_t dim = 128; size_t query_size = dim + sq8::query_metadata_count(); @@ -2573,13 +2580,14 @@ TEST(SQ8_EdgeCases, CosineZeroVectorTest) { std::vector v_nonzero_quantized(quantized_size); test_utils::populate_float_vec_to_sq8_with_metadata(v_nonzero_quantized.data(), dim, true); - float baseline = SQ8_Cosine(v_zero.data(), v_nonzero_quantized.data(), dim); + // Arguments: (SQ8_storage, FP32_query, dim) + float baseline = SQ8_FP32_Cosine(v_nonzero_quantized.data(), v_zero.data(), dim); #ifdef OPT_SVE2 if (optimization.sve2) { unsigned char alignment = 0; - auto arch_opt_func = Cosine_SQ8_GetDistFunc(dim, &alignment, &optimization); - float result = arch_opt_func(v_zero.data(), v_nonzero_quantized.data(), dim); + auto arch_opt_func = Cosine_SQ8_FP32_GetDistFunc(dim, &alignment, &optimization); + float result = arch_opt_func(v_nonzero_quantized.data(), v_zero.data(), dim); ASSERT_NEAR(result, baseline, 0.01f) << "Optimized zero vector IP should match baseline"; optimization.sve2 = 0; } @@ -2587,8 +2595,8 @@ TEST(SQ8_EdgeCases, CosineZeroVectorTest) { #ifdef OPT_SVE if (optimization.sve) { unsigned char alignment = 0; - auto arch_opt_func = Cosine_SQ8_GetDistFunc(dim, &alignment, &optimization); - float result = arch_opt_func(v_zero.data(), v_nonzero_quantized.data(), dim); + auto arch_opt_func = Cosine_SQ8_FP32_GetDistFunc(dim, &alignment, &optimization); + float result = arch_opt_func(v_nonzero_quantized.data(), v_zero.data(), dim); ASSERT_NEAR(result, baseline, 0.01f) << "Optimized zero vector IP should match baseline"; 
optimization.sve = 0; } @@ -2596,8 +2604,8 @@ TEST(SQ8_EdgeCases, CosineZeroVectorTest) { #ifdef OPT_NEON_DOTPROD if (optimization.asimddp) { unsigned char alignment = 0; - auto arch_opt_func = Cosine_SQ8_GetDistFunc(dim, &alignment, &optimization); - float result = arch_opt_func(v_zero.data(), v_nonzero_quantized.data(), dim); + auto arch_opt_func = Cosine_SQ8_FP32_GetDistFunc(dim, &alignment, &optimization); + float result = arch_opt_func(v_nonzero_quantized.data(), v_zero.data(), dim); ASSERT_NEAR(result, baseline, 0.01f) << "Optimized zero vector IP should match baseline"; optimization.asimddp = 0; } @@ -2605,8 +2613,8 @@ TEST(SQ8_EdgeCases, CosineZeroVectorTest) { #ifdef OPT_NEON if (optimization.asimd) { unsigned char alignment = 0; - auto arch_opt_func = Cosine_SQ8_GetDistFunc(dim, &alignment, &optimization); - float result = arch_opt_func(v_zero.data(), v_nonzero_quantized.data(), dim); + auto arch_opt_func = Cosine_SQ8_FP32_GetDistFunc(dim, &alignment, &optimization); + float result = arch_opt_func(v_nonzero_quantized.data(), v_zero.data(), dim); ASSERT_NEAR(result, baseline, 0.01f) << "Optimized zero vector IP should match baseline"; optimization.asimd = 0; } @@ -2614,21 +2622,21 @@ TEST(SQ8_EdgeCases, CosineZeroVectorTest) { #ifdef OPT_AVX512_F_BW_VL_VNNI if (optimization.avx512f && optimization.avx512bw && optimization.avx512vnni) { unsigned char alignment = 0; - auto arch_opt_func = Cosine_SQ8_GetDistFunc(dim, &alignment, &optimization); - float result = arch_opt_func(v_zero.data(), v_nonzero_quantized.data(), dim); + auto arch_opt_func = Cosine_SQ8_FP32_GetDistFunc(dim, &alignment, &optimization); + float result = arch_opt_func(v_nonzero_quantized.data(), v_zero.data(), dim); ASSERT_NEAR(result, baseline, 0.01f) << "Optimized zero vector IP should match baseline"; optimization.avx512f = 0; } #endif unsigned char alignment = 0; - auto arch_opt_func = Cosine_SQ8_GetDistFunc(dim, &alignment, nullptr); - float result = arch_opt_func(v_zero.data(), 
v_nonzero_quantized.data(), dim); + auto arch_opt_func = Cosine_SQ8_FP32_GetDistFunc(dim, &alignment, nullptr); + float result = arch_opt_func(v_nonzero_quantized.data(), v_zero.data(), dim); ASSERT_EQ(result, baseline) << "Zero vector Cosine should match baseline"; } // Test with constant quantized vector (all same values - edge case where delta = 0) -TEST(SQ8_EdgeCases, CosineConstantVectorTest) { +TEST(SQ8_FP32_EdgeCases, CosineConstantVectorTest) { auto optimization = getCpuOptimizationFeatures(); size_t dim = 128; @@ -2637,7 +2645,7 @@ TEST(SQ8_EdgeCases, CosineConstantVectorTest) { size_t query_size = dim + sq8::query_metadata_count(); std::vector v_query(query_size); test_utils::populate_float_vec(v_query.data(), dim); - test_utils::preprocess_fp32_sq8_query(v_query.data(), dim); + test_utils::preprocess_sq8_fp32_query(v_query.data(), dim); // Create a constant quantized vector (all same values) // This tests the edge case where delta = 0 (or set to 1.0 to avoid division by zero) @@ -2649,12 +2657,13 @@ TEST(SQ8_EdgeCases, CosineConstantVectorTest) { test_utils::quantize_float_vec_to_sq8_with_metadata(v_const.data(), dim, v_const_quantized.data()); - float baseline = SQ8_Cosine(v_query.data(), v_const_quantized.data(), dim); + // Arguments: (SQ8_storage, FP32_query, dim) + float baseline = SQ8_FP32_Cosine(v_const_quantized.data(), v_query.data(), dim); #ifdef OPT_SVE2 if (optimization.sve2) { unsigned char alignment = 0; - auto arch_opt_func = Cosine_SQ8_GetDistFunc(dim, &alignment, &optimization); - float result = arch_opt_func(v_query.data(), v_const_quantized.data(), dim); + auto arch_opt_func = Cosine_SQ8_FP32_GetDistFunc(dim, &alignment, &optimization); + float result = arch_opt_func(v_const_quantized.data(), v_query.data(), dim); ASSERT_NEAR(result, baseline, 0.01f) << "Optimized constant vector Cosine should match baseline"; optimization.sve2 = 0; @@ -2663,8 +2672,8 @@ TEST(SQ8_EdgeCases, CosineConstantVectorTest) { #ifdef OPT_SVE if 
(optimization.sve) { unsigned char alignment = 0; - auto arch_opt_func = Cosine_SQ8_GetDistFunc(dim, &alignment, &optimization); - float result = arch_opt_func(v_query.data(), v_const_quantized.data(), dim); + auto arch_opt_func = Cosine_SQ8_FP32_GetDistFunc(dim, &alignment, &optimization); + float result = arch_opt_func(v_const_quantized.data(), v_query.data(), dim); ASSERT_NEAR(result, baseline, 0.01f) << "Optimized constant vector Cosine should match baseline"; optimization.sve = 0; @@ -2673,8 +2682,8 @@ TEST(SQ8_EdgeCases, CosineConstantVectorTest) { #ifdef OPT_NEON_DOTPROD if (optimization.asimddp) { unsigned char alignment = 0; - auto arch_opt_func = Cosine_SQ8_GetDistFunc(dim, &alignment, &optimization); - float result = arch_opt_func(v_query.data(), v_const_quantized.data(), dim); + auto arch_opt_func = Cosine_SQ8_FP32_GetDistFunc(dim, &alignment, &optimization); + float result = arch_opt_func(v_const_quantized.data(), v_query.data(), dim); ASSERT_NEAR(result, baseline, 0.01f) << "Optimized constant vector Cosine should match baseline"; optimization.asimddp = 0; @@ -2683,8 +2692,8 @@ TEST(SQ8_EdgeCases, CosineConstantVectorTest) { #ifdef OPT_NEON if (optimization.asimd) { unsigned char alignment = 0; - auto arch_opt_func = Cosine_SQ8_GetDistFunc(dim, &alignment, &optimization); - float result = arch_opt_func(v_query.data(), v_const_quantized.data(), dim); + auto arch_opt_func = Cosine_SQ8_FP32_GetDistFunc(dim, &alignment, &optimization); + float result = arch_opt_func(v_const_quantized.data(), v_query.data(), dim); ASSERT_NEAR(result, baseline, 0.01f) << "Optimized constant vector Cosine should match baseline"; optimization.asimd = 0; @@ -2693,23 +2702,23 @@ TEST(SQ8_EdgeCases, CosineConstantVectorTest) { #ifdef OPT_AVX512_F_BW_VL_VNNI if (optimization.avx512f && optimization.avx512bw && optimization.avx512vnni) { unsigned char alignment = 0; - auto arch_opt_func = Cosine_SQ8_GetDistFunc(dim, &alignment, &optimization); - float result = 
arch_opt_func(v_query.data(), v_const_quantized.data(), dim); + auto arch_opt_func = Cosine_SQ8_FP32_GetDistFunc(dim, &alignment, &optimization); + float result = arch_opt_func(v_const_quantized.data(), v_query.data(), dim); ASSERT_NEAR(result, baseline, 0.01f) << "Optimized constant vector Cosine should match baseline"; optimization.avx512f = 0; } #endif unsigned char alignment = 0; - auto arch_opt_func = Cosine_SQ8_GetDistFunc(dim, &alignment, nullptr); - float result = arch_opt_func(v_query.data(), v_const_quantized.data(), dim); + auto arch_opt_func = Cosine_SQ8_FP32_GetDistFunc(dim, &alignment, nullptr); + float result = arch_opt_func(v_const_quantized.data(), v_query.data(), dim); ASSERT_NEAR(result, baseline, 0.01f) << "Constant quantized vector Cosine should match baseline"; } // Test with extreme values (-1 and 1 only) -TEST(SQ8_EdgeCases, CosineExtremeValuesTest) { +TEST(SQ8_FP32_EdgeCases, CosineExtremeValuesTest) { auto optimization = getCpuOptimizationFeatures(); size_t dim = 128; // Query layout: [float values (dim)] [sum] [sum_squares] @@ -2721,19 +2730,20 @@ TEST(SQ8_EdgeCases, CosineExtremeValuesTest) { v1[i] = (i % 2 == 0) ? 1.0f : -1.0f; v2[i] = (i % 3 == 0) ? 
1.0f : -1.0f; } - test_utils::preprocess_fp32_sq8_query(v1.data(), dim); + test_utils::preprocess_sq8_fp32_query(v1.data(), dim); size_t quantized_size = dim * sizeof(uint8_t) + sq8::storage_metadata_count() * sizeof(float); std::vector v2_quantized(quantized_size); test_utils::quantize_float_vec_to_sq8_with_metadata(v2.data(), dim, v2_quantized.data()); - float baseline = SQ8_Cosine(v1.data(), v2_quantized.data(), dim); + // Arguments: (SQ8_storage, FP32_query, dim) + float baseline = SQ8_FP32_Cosine(v2_quantized.data(), v1.data(), dim); #ifdef OPT_SVE2 if (optimization.sve2) { unsigned char alignment = 0; - auto arch_opt_func = Cosine_SQ8_GetDistFunc(dim, &alignment, &optimization); - float result = arch_opt_func(v1.data(), v2_quantized.data(), dim); + auto arch_opt_func = Cosine_SQ8_FP32_GetDistFunc(dim, &alignment, &optimization); + float result = arch_opt_func(v2_quantized.data(), v1.data(), dim); ASSERT_NEAR(result, baseline, 0.01f) << "Optimized extreme values Cosine should match baseline"; optimization.sve2 = 0; @@ -2742,8 +2752,8 @@ TEST(SQ8_EdgeCases, CosineExtremeValuesTest) { #ifdef OPT_SVE if (optimization.sve) { unsigned char alignment = 0; - auto arch_opt_func = Cosine_SQ8_GetDistFunc(dim, &alignment, &optimization); - float result = arch_opt_func(v1.data(), v2_quantized.data(), dim); + auto arch_opt_func = Cosine_SQ8_FP32_GetDistFunc(dim, &alignment, &optimization); + float result = arch_opt_func(v2_quantized.data(), v1.data(), dim); ASSERT_NEAR(result, baseline, 0.01f) << "Optimized extreme values Cosine should match baseline"; optimization.sve = 0; @@ -2752,8 +2762,8 @@ TEST(SQ8_EdgeCases, CosineExtremeValuesTest) { #ifdef OPT_NEON_DOTPROD if (optimization.asimddp) { unsigned char alignment = 0; - auto arch_opt_func = Cosine_SQ8_GetDistFunc(dim, &alignment, &optimization); - float result = arch_opt_func(v1.data(), v2_quantized.data(), dim); + auto arch_opt_func = Cosine_SQ8_FP32_GetDistFunc(dim, &alignment, &optimization); + float result = 
arch_opt_func(v2_quantized.data(), v1.data(), dim); ASSERT_NEAR(result, baseline, 0.01f) << "Optimized extreme values Cosine should match baseline"; optimization.asimddp = 0; @@ -2762,8 +2772,8 @@ TEST(SQ8_EdgeCases, CosineExtremeValuesTest) { #ifdef OPT_NEON if (optimization.asimd) { unsigned char alignment = 0; - auto arch_opt_func = Cosine_SQ8_GetDistFunc(dim, &alignment, &optimization); - float result = arch_opt_func(v1.data(), v2_quantized.data(), dim); + auto arch_opt_func = Cosine_SQ8_FP32_GetDistFunc(dim, &alignment, &optimization); + float result = arch_opt_func(v2_quantized.data(), v1.data(), dim); ASSERT_NEAR(result, baseline, 0.01f) << "Optimized extreme values Cosine should match baseline"; optimization.asimd = 0; @@ -2772,16 +2782,16 @@ TEST(SQ8_EdgeCases, CosineExtremeValuesTest) { #ifdef OPT_AVX512_F_BW_VL_VNNI if (optimization.avx512f && optimization.avx512bw && optimization.avx512vnni) { unsigned char alignment = 0; - auto arch_opt_func = Cosine_SQ8_GetDistFunc(dim, &alignment, &optimization); - float result = arch_opt_func(v1.data(), v2_quantized.data(), dim); + auto arch_opt_func = Cosine_SQ8_FP32_GetDistFunc(dim, &alignment, &optimization); + float result = arch_opt_func(v2_quantized.data(), v1.data(), dim); ASSERT_NEAR(result, baseline, 0.01f) << "Optimized extreme values Cosine should match baseline"; optimization.avx512f = 0; } #endif unsigned char alignment = 0; - auto arch_opt_func = Cosine_SQ8_GetDistFunc(dim, &alignment, nullptr); - float result = arch_opt_func(v1.data(), v2_quantized.data(), dim); + auto arch_opt_func = Cosine_SQ8_FP32_GetDistFunc(dim, &alignment, nullptr); + float result = arch_opt_func(v2_quantized.data(), v1.data(), dim); ASSERT_NEAR(result, baseline, 0.01f) << "Extreme values Cosine should match baseline"; } diff --git a/tests/utils/tests_utils.h b/tests/utils/tests_utils.h index 34a198a53..a23718486 100644 --- a/tests/utils/tests_utils.h +++ b/tests/utils/tests_utils.h @@ -71,23 +71,25 @@ static void 
populate_float16_vec(vecsim_types::float16 *v, const size_t dim, int } /* - * SQ8 distance function without the algebraic optimizations + * SQ8-FP32 distance function without the algebraic optimizations * uses the regular dequantization formula: * IP = Σ((min + delta * q_i) * v_i) + * pVect1 = SQ8 storage (quantized values + metadata) + * pVect2 = FP32 query */ -static float SQ8_NotOptimized_InnerProduct(const void *pVect1v, const void *pVect2v, - size_t dimension) { +static float SQ8_FP32_NotOptimized_InnerProduct(const void *pVect1v, const void *pVect2v, + size_t dimension) { - const auto *pVect1 = static_cast(pVect1v); - const auto *pVect2 = static_cast(pVect2v); + const auto *pVect1 = static_cast(pVect1v); // SQ8 storage + const auto *pVect2 = static_cast(pVect2v); // FP32 query - // Get quantization parameters from pVect2 - const float min_val = *reinterpret_cast(pVect2 + dimension); - const float delta = *reinterpret_cast(pVect2 + dimension + sizeof(float)); + // Get quantization parameters from pVect1 (SQ8 storage) + const float min_val = *reinterpret_cast(pVect1 + dimension); + const float delta = *reinterpret_cast(pVect1 + dimension + sizeof(float)); // Compute inner product with dequantization float res = 0.0f; for (size_t i = 0; i < dimension; i++) { - res += (pVect2[i] * delta + min_val) * pVect1[i]; + res += (pVect1[i] * delta + min_val) * pVect2[i]; } return 1.0f - res; } @@ -96,8 +98,9 @@ static float SQ8_NotOptimized_InnerProduct(const void *pVect1v, const void *pVec * SQ8 Cosine distance function without the algebraic optimizations * For normalized vectors, cosine distance equals inner product distance. 
*/ -static float SQ8_NotOptimized_Cosine(const void *pVect1v, const void *pVect2v, size_t dimension) { - return SQ8_NotOptimized_InnerProduct(pVect1v, pVect2v, dimension); +static float SQ8_FP32_NotOptimized_Cosine(const void *pVect1v, const void *pVect2v, + size_t dimension) { + return SQ8_FP32_NotOptimized_InnerProduct(pVect1v, pVect2v, dimension); } /* @@ -207,7 +210,7 @@ static void quantize_float_vec_to_sq8_with_metadata(const float *v, size_t dim, // Query layout: [float values (dim)] [sum (float)] [sum_squares (float)] // Assuming v is a memory allocation of size (dim + sq8::query_metadata_count()) // defaults to L2 just for testing purposes. -static void preprocess_fp32_sq8_query(float *v, size_t dim) { +static void preprocess_sq8_fp32_query(float *v, size_t dim) { float sum = 0.0f; float sum_squares = 0.0f; for (size_t i = 0; i < dim; i++) { @@ -219,33 +222,36 @@ static void preprocess_fp32_sq8_query(float *v, size_t dim) { } // Assuming v is a memory allocation of size (dim + sq8::query_metadata_count()) -static void populate_fp32_sq8_query(float *v, size_t dim, bool should_normalize = false, +static void populate_sq8_fp32_query(float *v, size_t dim, bool should_normalize = false, int seed = 1234, float min = -1.0f, float max = 1.0f) { populate_float_vec(v, dim, seed, min, max); if (should_normalize) { spaces::GetNormalizeFunc()(v, dim); } - preprocess_fp32_sq8_query(v, dim); + preprocess_sq8_fp32_query(v, dim); } /* - * SQ8 L2 squared distance function without the algebraic optimizations. + * SQ8-FP32 L2 squared distance function without the algebraic optimizations. 
* Uses the regular dequantization formula element-by-element: * L2² = Σ((y_i - (min + delta * q_i))²) + * pVect1 = SQ8 storage (quantized values + metadata) + * pVect2 = FP32 query */ -static float SQ8_NotOptimized_L2Sqr(const void *pVect1v, const void *pVect2v, size_t dimension) { - const auto *pVect1 = static_cast(pVect1v); - const auto *pVect2 = static_cast(pVect2v); +static float SQ8_FP32_NotOptimized_L2Sqr(const void *pVect1v, const void *pVect2v, + size_t dimension) { + const auto *pVect1 = static_cast(pVect1v); // SQ8 storage + const auto *pVect2 = static_cast(pVect2v); // FP32 query - // Get quantization parameters from pVect2 - const float min_val = *reinterpret_cast(pVect2 + dimension); - const float delta = *reinterpret_cast(pVect2 + dimension + sizeof(float)); + // Get quantization parameters from pVect1 (SQ8 storage) + const float min_val = *reinterpret_cast(pVect1 + dimension); + const float delta = *reinterpret_cast(pVect1 + dimension + sizeof(float)); // Compute L2 squared with dequantization float res = 0.0f; for (size_t i = 0; i < dimension; i++) { - float dequantized = pVect2[i] * delta + min_val; - float diff = pVect1[i] - dequantized; + float dequantized = pVect1[i] * delta + min_val; + float diff = pVect2[i] - dequantized; res += diff * diff; } return res;