Skip to content
19 changes: 19 additions & 0 deletions cpp/src/neighbors/detail/cagra/cagra_build.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

#include "../../../core/nvtx.hpp"
#include "../../../preprocessing/quantize/vpq_build-ext.cuh"
#include "../../ivf_pq/ivf_pq_fp16_overflow.cuh"
#include "graph_core.cuh"

#include <raft/core/copy.cuh>
Expand Down Expand Up @@ -2197,6 +2198,24 @@ index<T, IdxT> build(
knn_build_params = cagra::graph_build_params::ivf_pq_params(dataset.extents(), params.metric);
}
}

// Predict potential FP16 distance overflow for large-magnitude (e.g. unnormalized) datasets
// -> fall back to FP32.
if (auto* pq = std::get_if<cagra::graph_build_params::ivf_pq_params>(&knn_build_params)) {
const bool using_fp16_distance = pq->search_params.internal_distance_dtype == CUDA_R_16F ||
pq->search_params.coarse_search_dtype == CUDA_R_16F;
if (using_fp16_distance &&
ivf_pq::helpers::estimate_fp16_overflow(res, dataset, params.metric)) {
RAFT_LOG_WARN(
"IVF-PQ internal type of FP16 is likely insufficient for this dataset to avoid overflow in "
"distance computations -> "
"Switching 'internal_distance_dtype' and 'coarse_search_dtype' to FP32");
pq->search_params.internal_distance_dtype = CUDA_R_32F;
pq->search_params.coarse_search_dtype = CUDA_R_32F;
// lut_dtype is left unchanged because its per-subspace terms are smaller by a factor of
// pq_dim and therefore, less likely to overflow.
}
}
RAFT_EXPECTS(
params.metric != cuvs::distance::DistanceType::BitwiseHamming ||
std::holds_alternative<cagra::graph_build_params::iterative_search_params>(
Expand Down
129 changes: 129 additions & 0 deletions cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh
Original file line number Diff line number Diff line change
@@ -0,0 +1,129 @@
/*
* SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION.
* SPDX-License-Identifier: Apache-2.0
*/

#pragma once

#include "../detail/ann_utils.cuh" // cuvs::spatial::knn::detail::utils::mapping

#include <cuvs/distance/distance.hpp>

#include <raft/core/device_mdarray.hpp>
#include <raft/core/error.hpp>
#include <raft/core/mdspan.hpp>
#include <raft/core/operators.hpp>
#include <raft/core/resource/cuda_stream.hpp>
#include <raft/core/resource/device_memory_resource.hpp>
#include <raft/core/resources.hpp>
#include <raft/linalg/map_reduce.cuh>
#include <raft/linalg/reduce.cuh>
#include <raft/util/cuda_dev_essentials.cuh>
#include <raft/util/cudart_utils.hpp>

#include <cstdint>

namespace cuvs::neighbors::ivf_pq::detail {

/**
* Estimate max_i ||x_i||^2 over the dataset.
*/
template <typename DataT, typename Accessor>
float estimate_max_squared_norm(
raft::resources const& handle,
raft::mdspan<const DataT, raft::matrix_extent<int64_t>, raft::row_major, Accessor> dataset)
{
common::nvtx::range<common::nvtx::domain::cuvs> r("estimate_max_squared_norm");
auto stream = raft::resource::get_cuda_stream(handle);
const int64_t n_rows = dataset.extent(0);
const int64_t dim = dataset.extent(1);

// Determine sample size based on a smooth saturation equation. The equation satisfies:
// - n_sample is always less than or equal to n_rows
// - n_sample saturates to kSaturation when n_rows is inf
// - n_sample increases fast for small n_rows and slow to saturation for large n_rows
// Idea: we examine most of the dataset when it is small-sized, and only a small fraction
// (up to a maximum/saturation number) when the dataset size grows large.
// kSaturation and kDelay are selected as a compromise between runtime and outlier recall.
constexpr int64_t kSaturation = 20000;
constexpr int64_t kDelay = kSaturation * 10;
RAFT_EXPECTS(kDelay >= kSaturation,
"kDelay must not be smaller than kSaturation so that n_sample is always less than "
"or equal to n_rows");
int64_t n_sample = raft::ceildiv(n_rows * kSaturation, n_rows + kDelay);

auto mr = raft::resource::get_workspace_resource_ref(handle);
auto sample = raft::make_device_mdarray<DataT>(handle, mr, raft::make_extents<int64_t>(n_sample, dim));
raft::copy(sample.data_handle(), dataset.data_handle(), n_sample * dim, raft::resource::get_cuda_stream(handle));

// Compute float-mapped squared norm
auto d_map_sq_norm = raft::make_device_vector<float, int64_t>(handle, n_sample);
raft::linalg::reduce<raft::Apply::ALONG_ROWS>(
handle,
raft::make_const_mdspan(sample.view()),
d_map_sq_norm.view(),
0.0f,
false,
[] __device__(DataT v, auto) -> float {
float e = cuvs::spatial::knn::detail::utils::mapping<float>{}(v);
return e * e;
},
raft::add_op(),
raft::identity_op());
// Compute max of squared norm vector
auto d_max_sq = raft::make_device_scalar<float>(handle, 0.0f);
raft::linalg::map_reduce(handle,
raft::make_const_mdspan(d_map_sq_norm.view()),
d_max_sq.view(),
0.0f,
raft::identity_op(),
raft::max_op());

float max_sq = 0.0f;
raft::update_host(&max_sq, d_max_sq.data_handle(), 1, stream);
raft::resource::sync_stream(handle);

return max_sq;
}

} // namespace cuvs::neighbors::ivf_pq::detail

namespace cuvs::neighbors::ivf_pq::helpers {

/**
* @brief Estimate whether FP16 is likely insufficient for IVF-PQ's full-magnitude distance
* computations on this dataset (i.e. `internal_distance_dtype` and `coarse_search_dtype`).
*
* We bound the largest achievable score from the dataset's vector norms. With R = max_i ||x_i||
* (estimated from a fraction of the dataset):
* - L2Expanded: ||x - y||^2 = ||x||^2 + ||y||^2 - 2<x,y> <= (||x|| + ||y||)^2 <= 4 * R^2
* - InnerProduct: |<x, y>| <= ||x|| * ||y|| <= R^2
* - CosineExpanded: data is L2-normalized, so |score| <= 1 and overflow is impossible.
*/
template <typename DataT, typename Accessor>
bool estimate_fp16_overflow(
raft::resources const& handle,
raft::mdspan<const DataT, raft::matrix_extent<int64_t>, raft::row_major, Accessor> dataset,
cuvs::distance::DistanceType metric)
{
if (dataset.extent(0) == 0) { return false; }

// Cosine similarity scores does normalization itself, so overflow won't happen
if (metric == cuvs::distance::DistanceType::CosineExpanded) { return false; }

// FP16 largest finite value, with a defensive margin to also avoid precision loss near the limit.
constexpr float kFp16Max = 65504.0f;
constexpr float kFp16DefensiveMargin = 0.25f;
const float overflow_detect_threshold = kFp16DefensiveMargin * kFp16Max;
Comment on lines +115 to +117

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

While the intention is good, I am not sure if we ned so large kFp16DefensiveMargin. E.g. someone could have normalized 4096 dim embeddings, where the current setting would trigger fallback to fp32 (in case of L2 norm). Admittedly, such large embeddings are rare, but I would prefer to only fall back to fp32 when the distance can overflow, and not when we have precision loss due to the limited range of the numeric type. @achirkin what do you think?

@huuanhhuyn huuanhhuyn Jun 26, 2026

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thank you for checking it.

I have some arguments for keeping it this way:

  • the precision loss would break the ivf-pq search when neighbor distances become indifferentiable. This will fall again into the "low self-included ratio" error message.
  • The magnitude looks large but the bitwise distances between 65k and 16k are just 2 bits.

someone could have normalized 4096 dim embeddings, where the current setting would trigger fallback to fp32

Do you have the dataset? I could run a test on it


const float max_vector_sq_norm =
cuvs::neighbors::ivf_pq::detail::estimate_max_squared_norm(handle, dataset);

const float max_distance_sq_norm = metric == cuvs::distance::DistanceType::L2Expanded
? 4.0f * max_vector_sq_norm
: max_vector_sq_norm;

return max_distance_sq_norm > overflow_detect_threshold;
}

} // namespace cuvs::neighbors::ivf_pq::helpers
Loading