41 changes: 21 additions & 20 deletions src/VecSim/spaces/IP/IP.cpp
@@ -17,22 +17,23 @@ using float16 = vecsim_types::float16;
using sq8 = vecsim_types::sq8;

/*
* Optimized asymmetric SQ8 inner product using algebraic identity:
* Optimized asymmetric SQ8-FP32 inner product using algebraic identity:
* IP(x, y) = Σ(x_i * y_i)
* ≈ Σ((min + delta * q_i) * y_i)
* = min * Σy_i + delta * Σ(q_i * y_i)
* = min * y_sum + delta * quantized_dot_product
*
* Uses 4x loop unrolling with multiple accumulators for ILP.
* pVect1 is query (FP32): [float values (dim)] [y_sum] [y_sum_squares (L2 only)]
* pVect2 is storage (SQ8): [uint8_t values (dim)] [min_val] [delta] [x_sum] [x_sum_squares (L2
* pVect1 is storage (SQ8): [uint8_t values (dim)] [min_val] [delta] [x_sum] [x_sum_squares (L2
* only)]
* pVect2 is query (FP32): [float values (dim)] [y_sum] [y_sum_squares (L2 only)]
*
* Returns raw inner product value (not distance). Used by SQ8_InnerProduct, SQ8_Cosine, SQ8_L2Sqr.
* Returns raw inner product value (not distance). Used by SQ8_FP32_InnerProduct, SQ8_FP32_Cosine,
* SQ8_FP32_L2Sqr.
*/
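
To see the identity in action, here is a minimal scalar sketch (an annotation on this PR, not library code; all names are illustrative). It quantizes a vector with q_i = round((x_i - min) / delta), a scheme inferred from the formula above, and checks that min * y_sum + delta * Σ(q_i * y_i) matches the dequantize-then-multiply result:

```cpp
#include <cmath>
#include <cstdio>
#include <vector>

int main() {
    // Illustrative data only; names are not from the library.
    std::vector<float> x = {0.2f, -0.5f, 0.9f, 0.1f}; // stored vector, to be quantized
    std::vector<float> y = {0.3f, 0.8f, -0.4f, 0.6f}; // query vector, kept FP32

    // Quantization scheme inferred from the identity: x_i ≈ min + delta * q_i
    const float min_val = -0.5f, max_val = 0.9f;
    const float delta = (max_val - min_val) / 255.0f;

    float quantized_dot = 0.0f, y_sum = 0.0f, reference = 0.0f;
    for (size_t i = 0; i < x.size(); i++) {
        float q = std::round((x[i] - min_val) / delta); // q_i in [0, 255]
        quantized_dot += q * y[i];                      // Σ(q_i * y_i)
        y_sum += y[i];                                  // precomputed per query in the real code
        reference += (min_val + delta * q) * y[i];      // dequantize-then-multiply
    }
    // The two values agree up to float rounding.
    std::printf("identity:  %f\n", min_val * y_sum + delta * quantized_dot);
    std::printf("reference: %f\n", reference);
    return 0;
}
```
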
float SQ8_InnerProduct_Impl(const void *pVect1v, const void *pVect2v, size_t dimension) {
const auto *pVect1 = static_cast<const float *>(pVect1v);
const auto *pVect2 = static_cast<const uint8_t *>(pVect2v);
float SQ8_FP32_InnerProduct_Impl(const void *pVect1v, const void *pVect2v, size_t dimension) {
const auto *pVect1 = static_cast<const uint8_t *>(pVect1v);
const auto *pVect2 = static_cast<const float *>(pVect2v);

// Use 4 accumulators for instruction-level parallelism
float sum0 = 0, sum1 = 0, sum2 = 0, sum3 = 0;
@@ -41,38 +41,38 @@ float SQ8_InnerProduct_Impl(const void *pVect1v, const void *pVect2v, size_t dim
size_t i = 0;
size_t dim4 = dimension & ~size_t(3); // dim4 is a multiple of 4
for (; i < dim4; i += 4) {
sum0 += pVect1[i + 0] * static_cast<float>(pVect2[i + 0]);
sum1 += pVect1[i + 1] * static_cast<float>(pVect2[i + 1]);
sum2 += pVect1[i + 2] * static_cast<float>(pVect2[i + 2]);
sum3 += pVect1[i + 3] * static_cast<float>(pVect2[i + 3]);
sum0 += static_cast<float>(pVect1[i + 0]) * pVect2[i + 0];
sum1 += static_cast<float>(pVect1[i + 1]) * pVect2[i + 1];
sum2 += static_cast<float>(pVect1[i + 2]) * pVect2[i + 2];
sum3 += static_cast<float>(pVect1[i + 3]) * pVect2[i + 3];
}

// Handle remainder (0-3 elements)
for (; i < dimension; i++) {
sum0 += pVect1[i] * static_cast<float>(pVect2[i]);
sum0 += static_cast<float>(pVect1[i]) * pVect2[i];
}

// Combine accumulators
float quantized_dot = (sum0 + sum1) + (sum2 + sum3);

// Get quantization parameters from stored vector
const float *params = reinterpret_cast<const float *>(pVect2 + dimension);
// Get quantization parameters from stored vector (pVect1 is SQ8)
const float *params = reinterpret_cast<const float *>(pVect1 + dimension);
const float min_val = params[sq8::MIN_VAL];
const float delta = params[sq8::DELTA];

// Get precomputed y_sum from query blob (stored after the dim floats)
const float y_sum = pVect1[dimension + sq8::SUM_QUERY];
// Get precomputed y_sum from query blob (pVect2 is FP32, stored after the dim floats)
const float y_sum = pVect2[dimension + sq8::SUM_QUERY];

// Apply formula: IP = min * y_sum + delta * Σ(q_i * y_i)
return min_val * y_sum + delta * quantized_dot;
}

float SQ8_InnerProduct(const void *pVect1v, const void *pVect2v, size_t dimension) {
return 1.0f - SQ8_InnerProduct_Impl(pVect1v, pVect2v, dimension);
float SQ8_FP32_InnerProduct(const void *pVect1v, const void *pVect2v, size_t dimension) {
return 1.0f - SQ8_FP32_InnerProduct_Impl(pVect1v, pVect2v, dimension);
}

float SQ8_Cosine(const void *pVect1v, const void *pVect2v, size_t dimension) {
return SQ8_InnerProduct(pVect1v, pVect2v, dimension);
float SQ8_FP32_Cosine(const void *pVect1v, const void *pVect2v, size_t dimension) {
return SQ8_FP32_InnerProduct(pVect1v, pVect2v, dimension);
}

// SQ8-to-SQ8: Common inner product implementation that returns the raw inner product value
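
To make the blob layouts and the pointer arithmetic in SQ8_FP32_InnerProduct_Impl concrete, here is a hypothetical construction sketch. The field order follows the comments above ([min_val] [delta] [x_sum] after the quantized bytes; [y_sum] after the query floats), but the exact vecsim_types::sq8 index values are an assumption, not something this diff shows:

```cpp
#include <cstdint>
#include <cstring>
#include <vector>

// Assumed parameter indices, mirroring vecsim_types::sq8 (a guess, not from this PR).
enum Sq8Param { MIN_VAL = 0, DELTA = 1, SUM = 2 };

// Storage blob per the comments: [uint8_t values (dim)] [min_val] [delta] [x_sum]
std::vector<uint8_t> make_storage_blob(const std::vector<uint8_t> &q, float min_val,
                                       float delta, float x_sum) {
    std::vector<uint8_t> blob(q.size() + 3 * sizeof(float));
    std::memcpy(blob.data(), q.data(), q.size());
    const float params[3] = {min_val, delta, x_sum};
    std::memcpy(blob.data() + q.size(), params, sizeof(params));
    return blob;
}

// Query blob per the comments: [float values (dim)] [y_sum]
std::vector<float> make_query_blob(const std::vector<float> &y) {
    std::vector<float> blob(y);
    float y_sum = 0.0f;
    for (float v : y)
        y_sum += v;
    blob.push_back(y_sum); // lands at index dim, where pVect2[dimension + SUM_QUERY] reads it
    return blob;
}

// Usage, following the new argument order (storage first, query second):
//   float dist = SQ8_FP32_InnerProduct(storage.data(), query.data(), dim);
```
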
18 changes: 9 additions & 9 deletions src/VecSim/spaces/IP/IP.h
@@ -10,18 +10,18 @@

#include <cstdlib>

// FP32-to-SQ8: Common inner product implementation that returns the raw inner product value
// (not distance). Used by SQ8_InnerProduct, SQ8_Cosine, and SQ8_L2Sqr.
// pVect1 is query (FP32): [float values (dim)] [y_sum] [y_sum_squares (L2 only)]
// pVect2 is storage (SQ8): [uint8_t values (dim)] [min_val] [delta] [x_sum] [x_sum_squares (L2
// SQ8-FP32: Common inner product implementation that returns the raw inner product value
// (not distance). Used by SQ8_FP32_InnerProduct, SQ8_FP32_Cosine, and SQ8_FP32_L2Sqr.
// pVect1 is storage (SQ8): [uint8_t values (dim)] [min_val] [delta] [x_sum] [x_sum_squares (L2
// only)]
float SQ8_InnerProduct_Impl(const void *pVect1v, const void *pVect2v, size_t dimension);
// pVect2 is query (FP32): [float values (dim)] [y_sum] [y_sum_squares (L2 only)]
float SQ8_FP32_InnerProduct_Impl(const void *pVect1v, const void *pVect2v, size_t dimension);

// pVect1v vector of type fp32 and pVect2v vector of type uint8
float SQ8_InnerProduct(const void *pVect1v, const void *pVect2v, size_t dimension);
// pVect1v vector of type uint8 (SQ8) and pVect2v vector of type fp32
float SQ8_FP32_InnerProduct(const void *pVect1v, const void *pVect2v, size_t dimension);

// pVect1v vector of type fp32 and pVect2v vector of type uint8
float SQ8_Cosine(const void *pVect1v, const void *pVect2v, size_t dimension);
// pVect1v vector of type uint8 (SQ8) and pVect2v vector of type fp32
float SQ8_FP32_Cosine(const void *pVect1v, const void *pVect2v, size_t dimension);

// SQ8-to-SQ8: Common inner product implementation that returns the raw inner product value
// (not distance). Used by SQ8_SQ8_InnerProduct, SQ8_SQ8_Cosine, and SQ8_SQ8_L2Sqr.
(Next file, name not captured in this view: the AVX2 FMA variant)
@@ -27,47 +27,51 @@ using sq8 = vecsim_types::sq8;
*/

// Helper: compute Σ(q_i * y_i) for 8 elements using FMA (no dequantization)
static inline void InnerProductStepSQ8_FMA(const float *&pVect1, const uint8_t *&pVect2,
// pVect1 = SQ8 storage (quantized values), pVect2 = FP32 query
static inline void InnerProductStepSQ8_FMA(const uint8_t *&pVect1, const float *&pVect2,
__m256 &sum256) {
// Load 8 float elements from query
__m256 v1 = _mm256_loadu_ps(pVect1);
// Load 8 uint8 elements and convert to float
__m128i v1_128 = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(pVect1));
pVect1 += 8;

// Load 8 uint8 elements and convert to float
__m128i v2_128 = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(pVect2));
pVect2 += 8;
__m256i v1_256 = _mm256_cvtepu8_epi32(v1_128);
__m256 v1_f = _mm256_cvtepi32_ps(v1_256);

__m256i v2_256 = _mm256_cvtepu8_epi32(v2_128);
__m256 v2_f = _mm256_cvtepi32_ps(v2_256);
// Load 8 float elements from query
__m256 v2 = _mm256_loadu_ps(pVect2);
pVect2 += 8;

// Accumulate q_i * y_i using FMA (no dequantization!)
sum256 = _mm256_fmadd_ps(v2_f, v1, sum256);
sum256 = _mm256_fmadd_ps(v1_f, v2, sum256);
}
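
A scalar restatement of what one call to this helper computes, for readers less used to the widening intrinsics (illustrative only, not part of the PR):

```cpp
#include <cstdint>

// Scalar equivalent of one InnerProductStepSQ8_FMA call (illustrative only):
// widen 8 quantized bytes to float, multiply by the query lanes, accumulate.
inline void InnerProductStepScalar(const uint8_t *&p1, const float *&p2, float sum8[8]) {
    for (int lane = 0; lane < 8; ++lane)
        sum8[lane] += static_cast<float>(p1[lane]) * p2[lane];
    p1 += 8; // advance exactly as the SIMD helper does
    p2 += 8;
}
```
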

// pVect1v = SQ8 storage, pVect2v = FP32 query
template <unsigned char residual> // 0..15
float SQ8_InnerProductImp_FMA(const void *pVect1v, const void *pVect2v, size_t dimension) {
const float *pVect1 = static_cast<const float *>(pVect1v);
const uint8_t *pVect2 = static_cast<const uint8_t *>(pVect2v);
const float *pEnd1 = pVect1 + dimension;
float SQ8_FP32_InnerProductImp_FMA(const void *pVect1v, const void *pVect2v, size_t dimension) {
const uint8_t *pVect1 = static_cast<const uint8_t *>(pVect1v); // SQ8 storage
const float *pVect2 = static_cast<const float *>(pVect2v); // FP32 query
const uint8_t *pEnd1 = pVect1 + dimension;

// Initialize sum accumulator for Σ(q_i * y_i)
__m256 sum256 = _mm256_setzero_ps();

// Handle residual elements first (0-7 elements)
if constexpr (residual % 8) {
__mmask8 constexpr mask = (1 << (residual % 8)) - 1;
__m256 v1 = my_mm256_maskz_loadu_ps<mask>(pVect1);
pVect1 += residual % 8;

// Load uint8 elements and convert to float
__m128i v2_128 = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(pVect2));
pVect2 += residual % 8;
__m128i v1_128 = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(pVect1));
pVect1 += residual % 8;

__m256i v2_256 = _mm256_cvtepu8_epi32(v2_128);
__m256 v2_f = _mm256_cvtepi32_ps(v2_256);
__m256i v1_256 = _mm256_cvtepu8_epi32(v1_128);
__m256 v1_f = _mm256_cvtepi32_ps(v1_256);

// Load masked float elements from query
__m256 v2 = my_mm256_maskz_loadu_ps<mask>(pVect2);
pVect2 += residual % 8;

// Compute q_i * y_i (no dequantization)
sum256 = _mm256_mul_ps(v1, v2_f);
sum256 = _mm256_mul_ps(v1_f, v2);
}
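
A worked example of the residual mask (an annotation, not PR code): for residual % 8 == 3 the mask keeps the low three lanes, and the zeroed query lanes guarantee that the extra bytes pulled in by the unconditional 8-byte _mm_loadl_epi64 load cannot leak into the sum:

```cpp
// residual % 8 == 3  =>  mask = (1 << 3) - 1 = 0b00000111
// Lanes 0..2 of the query are loaded, lanes 3..7 are zeroed by the masked load,
// so q_i * 0.0f == 0.0f for every out-of-range lane.
static_assert(((1 << 3) - 1) == 0b00000111, "mask keeps exactly the low 3 lanes");
```
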

// If the residual is >= 8, do one more step of 8 elements
@@ -86,25 +90,26 @@ float SQ8_InnerProductImp_FMA(const void *pVect1v, const void *pVect2v, size_t d
float quantized_dot = my_mm256_reduce_add_ps(sum256);

// Get quantization parameters from stored vector (after quantized data)
const uint8_t *pVect2Base = static_cast<const uint8_t *>(pVect2v);
const float *params2 = reinterpret_cast<const float *>(pVect2Base + dimension);
const float min_val = params2[sq8::MIN_VAL];
const float delta = params2[sq8::DELTA];
const uint8_t *pVect1Base = static_cast<const uint8_t *>(pVect1v);
const float *params1 = reinterpret_cast<const float *>(pVect1Base + dimension);
const float min_val = params1[sq8::MIN_VAL];
const float delta = params1[sq8::DELTA];

// Get precomputed y_sum from query blob (stored after the dim floats)
const float y_sum = static_cast<const float *>(pVect1v)[dimension + sq8::SUM_QUERY];
const float y_sum = static_cast<const float *>(pVect2v)[dimension + sq8::SUM_QUERY];

// Apply the algebraic formula: IP = min * y_sum + delta * Σ(q_i * y_i)
return min_val * y_sum + delta * quantized_dot;
}

template <unsigned char residual> // 0..15
float SQ8_InnerProductSIMD16_AVX2_FMA(const void *pVect1v, const void *pVect2v, size_t dimension) {
return 1.0f - SQ8_InnerProductImp_FMA<residual>(pVect1v, pVect2v, dimension);
float SQ8_FP32_InnerProductSIMD16_AVX2_FMA(const void *pVect1v, const void *pVect2v,
size_t dimension) {
return 1.0f - SQ8_FP32_InnerProductImp_FMA<residual>(pVect1v, pVect2v, dimension);
}

template <unsigned char residual> // 0..15
float SQ8_CosineSIMD16_AVX2_FMA(const void *pVect1v, const void *pVect2v, size_t dimension) {
float SQ8_FP32_CosineSIMD16_AVX2_FMA(const void *pVect1v, const void *pVect2v, size_t dimension) {
// Cosine distance = 1 - IP (vectors are pre-normalized)
return SQ8_InnerProductSIMD16_AVX2_FMA<residual>(pVect1v, pVect2v, dimension);
return SQ8_FP32_InnerProductSIMD16_AVX2_FMA<residual>(pVect1v, pVect2v, dimension);
}
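
A quick sanity check of the 1 - IP distance convention shared by both wrappers, assuming pre-normalized inputs as the cosine comment states (hypothetical check, not library code):

```cpp
#include <cassert>

// Hypothetical check of the 1 - IP distance convention (not library code).
float ip(const float *a, const float *b, int n) {
    float s = 0.0f;
    for (int i = 0; i < n; ++i)
        s += a[i] * b[i];
    return s;
}

int main() {
    float u[2] = {1.0f, 0.0f}, v[2] = {0.0f, 1.0f};
    assert(1.0f - ip(u, u, 2) == 0.0f); // identical unit vectors -> distance 0
    assert(1.0f - ip(u, v, 2) == 1.0f); // orthogonal unit vectors -> distance 1
    return 0;
}
```
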
(Next file, name not captured in this view: the AVX2 non-FMA variant)
@@ -26,85 +26,89 @@ using sq8 = vecsim_types::sq8;
*/

// Helper: compute Σ(q_i * y_i) for 8 elements (no dequantization)
static inline void InnerProductStepSQ8(const float *&pVect1, const uint8_t *&pVect2,
__m256 &sum256) {
// Load 8 float elements from query
__m256 v1 = _mm256_loadu_ps(pVect1);
// pVect1 = SQ8 storage (quantized values), pVect2 = FP32 query
static inline void InnerProductStepSQ8_FP32(const uint8_t *&pVect1, const float *&pVect2,
__m256 &sum256) {
// Load 8 uint8 elements and convert to float
__m128i v1_128 = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(pVect1));
pVect1 += 8;

// Load 8 uint8 elements and convert to float
__m128i v2_128 = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(pVect2));
pVect2 += 8;
__m256i v1_256 = _mm256_cvtepu8_epi32(v1_128);
__m256 v1_f = _mm256_cvtepi32_ps(v1_256);

__m256i v2_256 = _mm256_cvtepu8_epi32(v2_128);
__m256 v2_f = _mm256_cvtepi32_ps(v2_256);
// Load 8 float elements from query
__m256 v2 = _mm256_loadu_ps(pVect2);
pVect2 += 8;

// Accumulate q_i * y_i (no dequantization!)
// Using mul + add since this is the non-FMA version
sum256 = _mm256_add_ps(sum256, _mm256_mul_ps(v2_f, v1));
sum256 = _mm256_add_ps(sum256, _mm256_mul_ps(v1_f, v2));
}
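
For contrast with the FMA helper earlier in this PR (an observation, not a change): fused multiply-add rounds once per element, while this fallback rounds the product and the addition separately, so the two paths can differ in the last bit. A scalar illustration:

```cpp
#include <cmath>

// FMA path: one rounding step per element.
float acc_fma(float a, float b, float sum) { return std::fmaf(a, b, sum); }

// Fallback path: two rounding steps per element (matches the mul + add above).
float acc_mul_add(float a, float b, float sum) { return sum + a * b; }
```
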

// pVect1v = SQ8 storage, pVect2v = FP32 query
template <unsigned char residual> // 0..15
float SQ8_InnerProductImp_AVX2(const void *pVect1v, const void *pVect2v, size_t dimension) {
const float *pVect1 = static_cast<const float *>(pVect1v);
const uint8_t *pVect2 = static_cast<const uint8_t *>(pVect2v);
const float *pEnd1 = pVect1 + dimension;
float SQ8_FP32_InnerProductImp_AVX2(const void *pVect1v, const void *pVect2v, size_t dimension) {
const uint8_t *pVect1 = static_cast<const uint8_t *>(pVect1v); // SQ8 storage
const float *pVect2 = static_cast<const float *>(pVect2v); // FP32 query
const uint8_t *pEnd1 = pVect1 + dimension;

// Initialize sum accumulator for Σ(q_i * y_i)
__m256 sum256 = _mm256_setzero_ps();

// Handle residual elements first (0-7 elements)
if constexpr (residual % 8) {
__mmask8 constexpr mask = (1 << (residual % 8)) - 1;
__m256 v1 = my_mm256_maskz_loadu_ps<mask>(pVect1);
pVect1 += residual % 8;

// Load uint8 elements and convert to float
__m128i v2_128 = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(pVect2));
pVect2 += residual % 8;
__m128i v1_128 = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(pVect1));
pVect1 += residual % 8;

__m256i v2_256 = _mm256_cvtepu8_epi32(v2_128);
__m256 v2_f = _mm256_cvtepi32_ps(v2_256);
__m256i v1_256 = _mm256_cvtepu8_epi32(v1_128);
__m256 v1_f = _mm256_cvtepi32_ps(v1_256);

// Load masked float elements from query
__m256 v2 = my_mm256_maskz_loadu_ps<mask>(pVect2);
pVect2 += residual % 8;

// Compute q_i * y_i (no dequantization)
sum256 = _mm256_mul_ps(v1, v2_f);
sum256 = _mm256_mul_ps(v1_f, v2);
}

// If the residual is >= 8, do one more step of 8 elements
if constexpr (residual >= 8) {
InnerProductStepSQ8(pVect1, pVect2, sum256);
InnerProductStepSQ8_FP32(pVect1, pVect2, sum256);
}

// Process remaining full chunks of 16 elements (2x8)
// Using do-while since dim > 16 guarantees at least one iteration
do {
InnerProductStepSQ8(pVect1, pVect2, sum256);
InnerProductStepSQ8(pVect1, pVect2, sum256);
InnerProductStepSQ8_FP32(pVect1, pVect2, sum256);
InnerProductStepSQ8_FP32(pVect1, pVect2, sum256);
} while (pVect1 < pEnd1);

// Reduce to get Σ(q_i * y_i)
float quantized_dot = my_mm256_reduce_add_ps(sum256);

// Get quantization parameters from stored vector (after quantized data)
const uint8_t *pVect2Base = static_cast<const uint8_t *>(pVect2v);
const float *params2 = reinterpret_cast<const float *>(pVect2Base + dimension);
const float min_val = params2[sq8::MIN_VAL];
const float delta = params2[sq8::DELTA];
const uint8_t *pVect1Base = static_cast<const uint8_t *>(pVect1v);
const float *params1 = reinterpret_cast<const float *>(pVect1Base + dimension);
const float min_val = params1[sq8::MIN_VAL];
const float delta = params1[sq8::DELTA];

// Get precomputed y_sum from query blob (stored after the dim floats)
const float y_sum = static_cast<const float *>(pVect1v)[dimension + sq8::SUM_QUERY];
const float y_sum = static_cast<const float *>(pVect2v)[dimension + sq8::SUM_QUERY];

// Apply the algebraic formula: IP = min * y_sum + delta * Σ(q_i * y_i)
return min_val * y_sum + delta * quantized_dot;
}

template <unsigned char residual> // 0..15
float SQ8_InnerProductSIMD16_AVX2(const void *pVect1v, const void *pVect2v, size_t dimension) {
return 1.0f - SQ8_InnerProductImp_AVX2<residual>(pVect1v, pVect2v, dimension);
float SQ8_FP32_InnerProductSIMD16_AVX2(const void *pVect1v, const void *pVect2v, size_t dimension) {
return 1.0f - SQ8_FP32_InnerProductImp_AVX2<residual>(pVect1v, pVect2v, dimension);
}

template <unsigned char residual> // 0..15
float SQ8_CosineSIMD16_AVX2(const void *pVect1v, const void *pVect2v, size_t dimension) {
float SQ8_FP32_CosineSIMD16_AVX2(const void *pVect1v, const void *pVect2v, size_t dimension) {
// Cosine distance = 1 - IP (vectors are pre-normalized)
return SQ8_InnerProductSIMD16_AVX2<residual>(pVect1v, pVect2v, dimension);
return SQ8_FP32_InnerProductSIMD16_AVX2<residual>(pVect1v, pVect2v, dimension);
}
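
The residual template parameter (0..15) bakes dimension % 16 into each instantiation, so the remainder handling is resolved at compile time. A hypothetical dispatcher along these lines could select the right instantiation; the library's actual selection code is not part of this diff:

```cpp
#include <cstddef>
#include <utility>

using DistFn = float (*)(const void *, const void *, size_t);

// Build a 16-entry table of instantiations and index it with dim % 16
// (hypothetical; assumes the templates above are visible in this translation unit).
template <unsigned char... Rs>
DistFn pick_impl(size_t residual, std::integer_sequence<unsigned char, Rs...>) {
    static constexpr DistFn table[] = {SQ8_FP32_InnerProductSIMD16_AVX2<Rs>...};
    return table[residual];
}

DistFn choose_sq8_fp32_ip(size_t dim) {
    return pick_impl(dim % 16, std::make_integer_sequence<unsigned char, 16>{});
}
```
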