From faa16090d43a5c99d13068299e9cb6b4114c6835 Mon Sep 17 00:00:00 2001 From: Huy Nguyen Date: Tue, 16 Jun 2026 07:06:43 +0000 Subject: [PATCH 1/8] IVF-PQ FP16 overflow detection --- .../neighbors/detail/cagra/cagra_build.cuh | 16 ++ .../neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh | 138 ++++++++++++++++++ 2 files changed, 154 insertions(+) create mode 100644 cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh diff --git a/cpp/src/neighbors/detail/cagra/cagra_build.cuh b/cpp/src/neighbors/detail/cagra/cagra_build.cuh index 96ff8344d3..f2db647842 100644 --- a/cpp/src/neighbors/detail/cagra/cagra_build.cuh +++ b/cpp/src/neighbors/detail/cagra/cagra_build.cuh @@ -6,6 +6,7 @@ #include "../../../core/nvtx.hpp" #include "../../../preprocessing/quantize/vpq_build-ext.cuh" +#include "../../ivf_pq/ivf_pq_fp16_overflow.cuh" #include "graph_core.cuh" #include @@ -1621,6 +1622,21 @@ void build_knn_graph( pq.build_params.metric == cuvs::distance::DistanceType::CosineExpanded, "Currently only L2Expanded, InnerProduct and CosineExpanded metrics are supported"); + // Guard against potential FP16 distance overflow for large-magnitude datasets -> back to FP32. + const bool using_fp16_distance = pq.search_params.internal_distance_dtype == CUDA_R_16F || + pq.search_params.coarse_search_dtype == CUDA_R_16F; + if (using_fp16_distance && + ivf_pq::helpers::estimate_fp16_overflow(res, dataset, pq.build_params.metric)) { + RAFT_LOG_WARN( + "IVF-PQ internal type of FP16 is likely insufficient for this dataset to avoid overflow in " + "distance computations -> " + "Switching 'internal_distance_dtype' and 'coarse_search_dtype' to FP32"); + pq.search_params.internal_distance_dtype = CUDA_R_32F; + pq.search_params.coarse_search_dtype = CUDA_R_32F; + // lut_dtype is left unchanged because its per-subspace terms are smaller by a factor of pq_dim + // and therefore, less likely to overflow. + } + uint32_t node_degree = knn_graph.extent(1); raft::common::nvtx::range fun_scope( "cagra::build_knn_graph(%zu, %zu, %u)", diff --git a/cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh b/cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh new file mode 100644 index 0000000000..e67e3a5d9c --- /dev/null +++ b/cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh @@ -0,0 +1,138 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ + +#pragma once + +#include "../detail/ann_utils.cuh" // cuvs::spatial::knn::detail::utils::mapping + +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +namespace cuvs::neighbors::ivf_pq::detail { + +/** Reduce the maximum squared L2 norm over a set of row-major vectors of element type DataT. */ +template +__global__ void kern_max_squared_norm(const DataT* __restrict__ data, + int64_t n_rows, + int64_t dim, + float* __restrict__ out_max_sq) +{ + for (int64_t row = blockIdx.x * blockDim.x + threadIdx.x; row < n_rows; + row += static_cast(gridDim.x) * blockDim.x) { + const DataT* v = data + row * dim; + float sq = 0.0f; + for (int64_t d = 0; d < dim; d++) { + // + float e = cuvs::spatial::knn::detail::utils::mapping{}(v[d]); + printf("raw float(v[d]): %f, mapped: %f\n", static_cast(v[d]), e); + sq += e * e; + } + // - There is no atomicMax for floats, so we embrace the bitwise representation monoticity + // between float and int. This is valid when values are non-negative, which is the case + // for squared norms. + // - Choose global atomic instead of shared memory tree reduction for simplicity, assuming + // low contention. + atomicMax(reinterpret_cast(out_max_sq), __float_as_int(sq)); + } +} + +/** + * Estimate max_i ||x_i||^2 over the dataset by sampling a fraction of its rows. + * + * NOTE: sampling yields a *lower-bound* estimate of the true max norm, so a too-small fraction can + * miss outlier vectors. Increase `sample_fraction` (up to 1.0 for an exact, no-false-negative scan) + * if you observe overflow slipping through. + */ +template +float estimate_max_squared_norm( + raft::resources const& res, + raft::mdspan, raft::row_major, Accessor> dataset, + double sample_fraction) +{ + auto stream = raft::resource::get_cuda_stream(res); + const int64_t n_rows = dataset.extent(0); + const int64_t dim = dataset.extent(1); + + // Determine sample size based on fraction, with bounds + int64_t n_sample = static_cast(sample_fraction * static_cast(n_rows)); + if (n_rows <= 1000) { + n_sample = n_rows; // for small datasets, just use them all and skip the sampling overhead + } else if (n_rows > 100000) { + // cap the sample size to 100k for speed and keep memory use within the limited workspace + // resource + n_sample = 100000; + // + } + + // Sample from the dataset + auto mr = raft::resource::get_workspace_resource_ref(res); + auto sample = + raft::make_device_mdarray(res, mr, raft::make_extents(n_sample, dim)); + raft::matrix::sample_rows( + res, raft::random::RngState{137}, dataset, sample.view()); + + auto d_max_sq = raft::make_device_scalar(res, 0.0f); + constexpr int block_size = 256; + const int grid_size = static_cast((n_sample + block_size - 1) / block_size); + kern_max_squared_norm<<>>( + sample.data_handle(), n_sample, dim, d_max_sq.data_handle()); + RAFT_CUDA_TRY(cudaPeekAtLastError()); + + float h_max_sq = 0.0f; + raft::update_host(&h_max_sq, d_max_sq.data_handle(), 1, stream); + raft::resource::sync_stream(res); + return h_max_sq; +} + +} // namespace cuvs::neighbors::ivf_pq::detail + +namespace cuvs::neighbors::ivf_pq::helpers { + +/** + * @brief Estimate whether FP16 is likely insufficient for IVF-PQ's full-magnitude distance + * computations on this dataset (i.e. `internal_distance_dtype` and `coarse_search_dtype`). + * + * We bound the largest achievable score from the dataset's vector norms. With R = max_i ||x_i|| + * (estimated from a random sample of the dataset): + * - L2Expanded: ||x - y||^2 = ||x||^2 + ||y||^2 - 2 <= (||x|| + ||y||)^2 <= 4 * R^2 + * - InnerProduct: || <= ||x|| * ||y|| <= R^2 + * - CosineExpanded: data is L2-normalized, so |score| <= 1 and overflow is impossible. + */ +template +bool estimate_fp16_overflow( + raft::resources const& res, + raft::mdspan, raft::row_major, Accessor> dataset, + cuvs::distance::DistanceType metric, + double sample_fraction = 0.01) +{ + // Cosine similarity scores does normalization itself, so overflow won't happen + if (metric == cuvs::distance::DistanceType::CosineExpanded) { return false; } + + // FP16 largest finite value, with a defensive margin to also avoid precision loss near the limit. + constexpr float kFp16Max = 65504.0f; + constexpr float kFp16DefensiveMargin = 0.25f; + const float overflow_detect_threshold = kFp16DefensiveMargin * kFp16Max; + + const float max_vector_sq_norm = + cuvs::neighbors::ivf_pq::detail::estimate_max_squared_norm(res, dataset, sample_fraction); + + const float max_distance_sq_norm = metric == cuvs::distance::DistanceType::L2Expanded + ? 4.0f * max_vector_sq_norm + : max_vector_sq_norm; + + return max_distance_sq_norm > overflow_detect_threshold; +} + +} // namespace cuvs::neighbors::ivf_pq::helpers From 3b5130f8d3af2f67d78fecec3b733c098793cdc0 Mon Sep 17 00:00:00 2001 From: Huy Nguyen Date: Tue, 16 Jun 2026 07:28:47 +0000 Subject: [PATCH 2/8] Explain the use of mapping --- cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh b/cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh index e67e3a5d9c..3315607a85 100644 --- a/cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh +++ b/cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh @@ -34,13 +34,14 @@ __global__ void kern_max_squared_norm(const DataT* __restrict__ data, const DataT* v = data + row * dim; float sq = 0.0f; for (int64_t d = 0; d < dim; d++) { - // + // internally, IVF-PQ distance computations will map the input data type (e.g. FP16) to float before + // doing arithmetic, so we need to apply the same mapping here to get a correct estimate of the squared norms + // instead of using static_cast(v[d]) float e = cuvs::spatial::knn::detail::utils::mapping{}(v[d]); - printf("raw float(v[d]): %f, mapped: %f\n", static_cast(v[d]), e); sq += e * e; } // - There is no atomicMax for floats, so we embrace the bitwise representation monoticity - // between float and int. This is valid when values are non-negative, which is the case + // between float and int. This is valid when values are non-negative, which is the case // for squared norms. // - Choose global atomic instead of shared memory tree reduction for simplicity, assuming // low contention. @@ -70,10 +71,8 @@ float estimate_max_squared_norm( if (n_rows <= 1000) { n_sample = n_rows; // for small datasets, just use them all and skip the sampling overhead } else if (n_rows > 100000) { - // cap the sample size to 100k for speed and keep memory use within the limited workspace - // resource + // cap the sample size to 100k for speed and keep memory use within the limited workspace resource n_sample = 100000; - // } // Sample from the dataset From 3a4e4913f5586eb56c093aa1ceaf65a4bfb9b9a8 Mon Sep 17 00:00:00 2001 From: Huy Nguyen Date: Mon, 22 Jun 2026 08:50:43 +0000 Subject: [PATCH 3/8] Move params fallback to params init phase --- .../neighbors/detail/cagra/cagra_build.cuh | 31 +++++++++---------- 1 file changed, 15 insertions(+), 16 deletions(-) diff --git a/cpp/src/neighbors/detail/cagra/cagra_build.cuh b/cpp/src/neighbors/detail/cagra/cagra_build.cuh index f2db647842..b566e43970 100644 --- a/cpp/src/neighbors/detail/cagra/cagra_build.cuh +++ b/cpp/src/neighbors/detail/cagra/cagra_build.cuh @@ -1621,22 +1621,7 @@ void build_knn_graph( pq.build_params.metric == cuvs::distance::DistanceType::InnerProduct || pq.build_params.metric == cuvs::distance::DistanceType::CosineExpanded, "Currently only L2Expanded, InnerProduct and CosineExpanded metrics are supported"); - - // Guard against potential FP16 distance overflow for large-magnitude datasets -> back to FP32. - const bool using_fp16_distance = pq.search_params.internal_distance_dtype == CUDA_R_16F || - pq.search_params.coarse_search_dtype == CUDA_R_16F; - if (using_fp16_distance && - ivf_pq::helpers::estimate_fp16_overflow(res, dataset, pq.build_params.metric)) { - RAFT_LOG_WARN( - "IVF-PQ internal type of FP16 is likely insufficient for this dataset to avoid overflow in " - "distance computations -> " - "Switching 'internal_distance_dtype' and 'coarse_search_dtype' to FP32"); - pq.search_params.internal_distance_dtype = CUDA_R_32F; - pq.search_params.coarse_search_dtype = CUDA_R_32F; - // lut_dtype is left unchanged because its per-subspace terms are smaller by a factor of pq_dim - // and therefore, less likely to overflow. - } - + uint32_t node_degree = knn_graph.extent(1); raft::common::nvtx::range fun_scope( "cagra::build_knn_graph(%zu, %zu, %u)", @@ -2211,6 +2196,20 @@ index build( } else { RAFT_LOG_DEBUG("Selecting IVF-PQ solver"); knn_build_params = cagra::graph_build_params::ivf_pq_params(dataset.extents(), params.metric); + // Guard against potential FP16 distance overflow for large-magnitude datasets -> back to FP32. + const bool using_fp16_distance = pq.search_params.internal_distance_dtype == CUDA_R_16F || + pq.search_params.coarse_search_dtype == CUDA_R_16F; + if (using_fp16_distance && + ivf_pq::helpers::estimate_fp16_overflow(res, dataset, pq.build_params.metric)) { + RAFT_LOG_WARN( + "IVF-PQ internal type of FP16 is likely insufficient for this dataset to avoid overflow in " + "distance computations -> " + "Switching 'internal_distance_dtype' and 'coarse_search_dtype' to FP32"); + pq.search_params.internal_distance_dtype = CUDA_R_32F; + pq.search_params.coarse_search_dtype = CUDA_R_32F; + // lut_dtype is left unchanged because its per-subspace terms are smaller by a factor of pq_dim + // and therefore, less likely to overflow. + } } } RAFT_EXPECTS( From f4031a3fa9dfdc74145aeca52c839727ac473081 Mon Sep 17 00:00:00 2001 From: Huy Nguyen Date: Mon, 22 Jun 2026 08:51:12 +0000 Subject: [PATCH 4/8] Choose sampling equation --- .../neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh | 24 +++++++++---------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh b/cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh index 3315607a85..4c0bc6be94 100644 --- a/cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh +++ b/cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh @@ -59,21 +59,20 @@ __global__ void kern_max_squared_norm(const DataT* __restrict__ data, template float estimate_max_squared_norm( raft::resources const& res, - raft::mdspan, raft::row_major, Accessor> dataset, - double sample_fraction) + raft::mdspan, raft::row_major, Accessor> dataset) { auto stream = raft::resource::get_cuda_stream(res); const int64_t n_rows = dataset.extent(0); const int64_t dim = dataset.extent(1); - // Determine sample size based on fraction, with bounds - int64_t n_sample = static_cast(sample_fraction * static_cast(n_rows)); - if (n_rows <= 1000) { - n_sample = n_rows; // for small datasets, just use them all and skip the sampling overhead - } else if (n_rows > 100000) { - // cap the sample size to 100k for speed and keep memory use within the limited workspace resource - n_sample = 100000; - } + // Determine sample size based on a smooth saturation equation. The equation satisfies: + // - n_sample is always less than or equal to n_rows + // - n_sample saturates to kSaturation when n_rows is inf + // - n_sample increases fast for small n_rows and slow to saturation for large n_rows + // Idea: we greedily sample most of the dataset when it is small-sized, and cap it to kSaturation + // when dataset size grows very large. + constexpr int64_t kSaturation = 100000; + int64_t n_sample = (n_rows * kSaturation + (n_rows + kSaturation - 1)) / (n_rows + kSaturation); // Sample from the dataset auto mr = raft::resource::get_workspace_resource_ref(res); @@ -113,8 +112,7 @@ template bool estimate_fp16_overflow( raft::resources const& res, raft::mdspan, raft::row_major, Accessor> dataset, - cuvs::distance::DistanceType metric, - double sample_fraction = 0.01) + cuvs::distance::DistanceType metric) { // Cosine similarity scores does normalization itself, so overflow won't happen if (metric == cuvs::distance::DistanceType::CosineExpanded) { return false; } @@ -125,7 +123,7 @@ bool estimate_fp16_overflow( const float overflow_detect_threshold = kFp16DefensiveMargin * kFp16Max; const float max_vector_sq_norm = - cuvs::neighbors::ivf_pq::detail::estimate_max_squared_norm(res, dataset, sample_fraction); + cuvs::neighbors::ivf_pq::detail::estimate_max_squared_norm(res, dataset); const float max_distance_sq_norm = metric == cuvs::distance::DistanceType::L2Expanded ? 4.0f * max_vector_sq_norm From f330a7fa67a3901e17522091c7bb4ee08c4018cc Mon Sep 17 00:00:00 2001 From: Huy Nguyen Date: Mon, 22 Jun 2026 10:18:20 +0000 Subject: [PATCH 5/8] Use raft built-in kernels and remove manual one. --- .../neighbors/detail/cagra/cagra_build.cuh | 34 ++++--- .../neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh | 98 +++++++++---------- 2 files changed, 64 insertions(+), 68 deletions(-) diff --git a/cpp/src/neighbors/detail/cagra/cagra_build.cuh b/cpp/src/neighbors/detail/cagra/cagra_build.cuh index b566e43970..515e464bb8 100644 --- a/cpp/src/neighbors/detail/cagra/cagra_build.cuh +++ b/cpp/src/neighbors/detail/cagra/cagra_build.cuh @@ -1621,7 +1621,7 @@ void build_knn_graph( pq.build_params.metric == cuvs::distance::DistanceType::InnerProduct || pq.build_params.metric == cuvs::distance::DistanceType::CosineExpanded, "Currently only L2Expanded, InnerProduct and CosineExpanded metrics are supported"); - + uint32_t node_degree = knn_graph.extent(1); raft::common::nvtx::range fun_scope( "cagra::build_knn_graph(%zu, %zu, %u)", @@ -2196,20 +2196,24 @@ index build( } else { RAFT_LOG_DEBUG("Selecting IVF-PQ solver"); knn_build_params = cagra::graph_build_params::ivf_pq_params(dataset.extents(), params.metric); - // Guard against potential FP16 distance overflow for large-magnitude datasets -> back to FP32. - const bool using_fp16_distance = pq.search_params.internal_distance_dtype == CUDA_R_16F || - pq.search_params.coarse_search_dtype == CUDA_R_16F; - if (using_fp16_distance && - ivf_pq::helpers::estimate_fp16_overflow(res, dataset, pq.build_params.metric)) { - RAFT_LOG_WARN( - "IVF-PQ internal type of FP16 is likely insufficient for this dataset to avoid overflow in " - "distance computations -> " - "Switching 'internal_distance_dtype' and 'coarse_search_dtype' to FP32"); - pq.search_params.internal_distance_dtype = CUDA_R_32F; - pq.search_params.coarse_search_dtype = CUDA_R_32F; - // lut_dtype is left unchanged because its per-subspace terms are smaller by a factor of pq_dim - // and therefore, less likely to overflow. - } + } + } + + // Guard against potential FP16 distance overflow for large-magnitude datasets. Applies to any + // IVF-PQ graph build (auto-selected above or explicitly set in params) -> fall back to FP32. + if (auto* pq = std::get_if(&knn_build_params)) { + const bool using_fp16_distance = pq->search_params.internal_distance_dtype == CUDA_R_16F || + pq->search_params.coarse_search_dtype == CUDA_R_16F; + if (using_fp16_distance && + ivf_pq::helpers::estimate_fp16_overflow(res, dataset, params.metric)) { + RAFT_LOG_WARN( + "IVF-PQ internal type of FP16 is likely insufficient for this dataset to avoid overflow in " + "distance computations -> " + "Switching 'internal_distance_dtype' and 'coarse_search_dtype' to FP32"); + pq->search_params.internal_distance_dtype = CUDA_R_32F; + pq->search_params.coarse_search_dtype = CUDA_R_32F; + // lut_dtype is left unchanged because its per-subspace terms are smaller by a factor of + // pq_dim and therefore, less likely to overflow. } } RAFT_EXPECTS( diff --git a/cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh b/cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh index 4c0bc6be94..a184fb9701 100644 --- a/cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh +++ b/cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh @@ -11,9 +11,12 @@ #include #include +#include #include #include #include +#include +#include #include #include #include @@ -22,46 +25,15 @@ namespace cuvs::neighbors::ivf_pq::detail { -/** Reduce the maximum squared L2 norm over a set of row-major vectors of element type DataT. */ -template -__global__ void kern_max_squared_norm(const DataT* __restrict__ data, - int64_t n_rows, - int64_t dim, - float* __restrict__ out_max_sq) -{ - for (int64_t row = blockIdx.x * blockDim.x + threadIdx.x; row < n_rows; - row += static_cast(gridDim.x) * blockDim.x) { - const DataT* v = data + row * dim; - float sq = 0.0f; - for (int64_t d = 0; d < dim; d++) { - // internally, IVF-PQ distance computations will map the input data type (e.g. FP16) to float before - // doing arithmetic, so we need to apply the same mapping here to get a correct estimate of the squared norms - // instead of using static_cast(v[d]) - float e = cuvs::spatial::knn::detail::utils::mapping{}(v[d]); - sq += e * e; - } - // - There is no atomicMax for floats, so we embrace the bitwise representation monoticity - // between float and int. This is valid when values are non-negative, which is the case - // for squared norms. - // - Choose global atomic instead of shared memory tree reduction for simplicity, assuming - // low contention. - atomicMax(reinterpret_cast(out_max_sq), __float_as_int(sq)); - } -} - /** - * Estimate max_i ||x_i||^2 over the dataset by sampling a fraction of its rows. - * - * NOTE: sampling yields a *lower-bound* estimate of the true max norm, so a too-small fraction can - * miss outlier vectors. Increase `sample_fraction` (up to 1.0 for an exact, no-false-negative scan) - * if you observe overflow slipping through. + * Estimate max_i ||x_i||^2 over the dataset by sampling from its rows. */ template float estimate_max_squared_norm( - raft::resources const& res, + raft::resources const& handle, raft::mdspan, raft::row_major, Accessor> dataset) { - auto stream = raft::resource::get_cuda_stream(res); + auto stream = raft::resource::get_cuda_stream(handle); const int64_t n_rows = dataset.extent(0); const int64_t dim = dataset.extent(1); @@ -69,29 +41,47 @@ float estimate_max_squared_norm( // - n_sample is always less than or equal to n_rows // - n_sample saturates to kSaturation when n_rows is inf // - n_sample increases fast for small n_rows and slow to saturation for large n_rows - // Idea: we greedily sample most of the dataset when it is small-sized, and cap it to kSaturation - // when dataset size grows very large. + // Idea: we sample most of the dataset when it is small-sized, and only a small fraction + // (up to a maximum/saturation number) when the dataset size grows large. + // kSaturation is selected as a compromise between runtime and recall. constexpr int64_t kSaturation = 100000; int64_t n_sample = (n_rows * kSaturation + (n_rows + kSaturation - 1)) / (n_rows + kSaturation); // Sample from the dataset - auto mr = raft::resource::get_workspace_resource_ref(res); + auto mr = raft::resource::get_workspace_resource_ref(handle); auto sample = - raft::make_device_mdarray(res, mr, raft::make_extents(n_sample, dim)); + raft::make_device_mdarray(handle, mr, raft::make_extents(n_sample, dim)); raft::matrix::sample_rows( - res, raft::random::RngState{137}, dataset, sample.view()); - - auto d_max_sq = raft::make_device_scalar(res, 0.0f); - constexpr int block_size = 256; - const int grid_size = static_cast((n_sample + block_size - 1) / block_size); - kern_max_squared_norm<<>>( - sample.data_handle(), n_sample, dim, d_max_sq.data_handle()); - RAFT_CUDA_TRY(cudaPeekAtLastError()); - - float h_max_sq = 0.0f; - raft::update_host(&h_max_sq, d_max_sq.data_handle(), 1, stream); - raft::resource::sync_stream(res); - return h_max_sq; + handle, raft::random::RngState{137}, dataset, sample.view()); + + // Compute mapped squared norm + auto d_map_sq_norm = raft::make_device_vector(handle, n_sample); + raft::linalg::reduce( + handle, + raft::make_const_mdspan(sample.view()), + d_map_sq_norm.view(), + 0.0f, + false, + [] __device__(DataT v, auto) -> float { + float e = cuvs::spatial::knn::detail::utils::mapping{}(v); + return e * e; + }, + raft::add_op(), + raft::identity_op()); + // Compute max of squared norm vector + auto d_max_sq = raft::make_device_scalar(handle, 0.0f); + raft::linalg::map_reduce(handle, + raft::make_const_mdspan(d_map_sq_norm.view()), + d_max_sq.view(), + 0.0f, + raft::identity_op(), + raft::max_op()); + + float max_sq = 0.0f; + raft::update_host(&max_sq, d_max_sq.data_handle(), 1, stream); + raft::resource::sync_stream(handle); + + return max_sq; } } // namespace cuvs::neighbors::ivf_pq::detail @@ -110,10 +100,12 @@ namespace cuvs::neighbors::ivf_pq::helpers { */ template bool estimate_fp16_overflow( - raft::resources const& res, + raft::resources const& handle, raft::mdspan, raft::row_major, Accessor> dataset, cuvs::distance::DistanceType metric) { + if (dataset.extent(0) == 0) { return false; } + // Cosine similarity scores does normalization itself, so overflow won't happen if (metric == cuvs::distance::DistanceType::CosineExpanded) { return false; } @@ -123,7 +115,7 @@ bool estimate_fp16_overflow( const float overflow_detect_threshold = kFp16DefensiveMargin * kFp16Max; const float max_vector_sq_norm = - cuvs::neighbors::ivf_pq::detail::estimate_max_squared_norm(res, dataset); + cuvs::neighbors::ivf_pq::detail::estimate_max_squared_norm(handle, dataset); const float max_distance_sq_norm = metric == cuvs::distance::DistanceType::L2Expanded ? 4.0f * max_vector_sq_norm From 22878f1255f37baf14d636c522288a8de19fe7a2 Mon Sep 17 00:00:00 2001 From: Huy Nguyen Date: Mon, 22 Jun 2026 10:55:38 +0000 Subject: [PATCH 6/8] Add kDelay param to adjust the sampling fraction growth speed --- cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh b/cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh index a184fb9701..b50e7e9a4e 100644 --- a/cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh +++ b/cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh @@ -10,6 +10,7 @@ #include #include +#include #include #include #include @@ -43,9 +44,13 @@ float estimate_max_squared_norm( // - n_sample increases fast for small n_rows and slow to saturation for large n_rows // Idea: we sample most of the dataset when it is small-sized, and only a small fraction // (up to a maximum/saturation number) when the dataset size grows large. - // kSaturation is selected as a compromise between runtime and recall. + // kSaturation and kDelay are selected as a compromise between runtime and outlier recall. constexpr int64_t kSaturation = 100000; - int64_t n_sample = (n_rows * kSaturation + (n_rows + kSaturation - 1)) / (n_rows + kSaturation); + constexpr int64_t kDelay = kSaturation * 10; + RAFT_EXPECTS(kDelay >= kSaturation, + "kDelay must not be smaller than kSaturation so that n_sample is always less than " + "or equal to n_rows"); + int64_t n_sample = (n_rows * kSaturation + (n_rows + kDelay - 1)) / (n_rows + kDelay); // Sample from the dataset auto mr = raft::resource::get_workspace_resource_ref(handle); From a2d9369166386c3484d30988e55083f83d04f908 Mon Sep 17 00:00:00 2001 From: Huy Nguyen Date: Mon, 22 Jun 2026 11:24:33 +0000 Subject: [PATCH 7/8] Reduce kSaturation to 20k (just sufficient to detect FP16 overflow in SIFT 1M) --- cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh b/cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh index b50e7e9a4e..99f2c9e60c 100644 --- a/cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh +++ b/cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh @@ -45,7 +45,7 @@ float estimate_max_squared_norm( // Idea: we sample most of the dataset when it is small-sized, and only a small fraction // (up to a maximum/saturation number) when the dataset size grows large. // kSaturation and kDelay are selected as a compromise between runtime and outlier recall. - constexpr int64_t kSaturation = 100000; + constexpr int64_t kSaturation = 20000; constexpr int64_t kDelay = kSaturation * 10; RAFT_EXPECTS(kDelay >= kSaturation, "kDelay must not be smaller than kSaturation so that n_sample is always less than " From 34150927980f335134901a0f7dcddd82fceeec0d Mon Sep 17 00:00:00 2001 From: Huy Nguyen Date: Mon, 22 Jun 2026 12:18:55 +0000 Subject: [PATCH 8/8] Explain uniform sampling decision --- cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh b/cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh index 99f2c9e60c..2f99da9e06 100644 --- a/cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh +++ b/cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh @@ -27,7 +27,12 @@ namespace cuvs::neighbors::ivf_pq::detail { /** - * Estimate max_i ||x_i||^2 over the dataset by sampling from its rows. + * Estimate max_i ||x_i||^2 over the dataset by uniformly sampling a fraction from it. + * + * Decision: Uniform sampling is selected as it is sufficient to detect FP16 overflow in + * the datasets, where overflow-causing large vectors are frequent (e.g. SIFT 1M). For + * dataset with rare large outliers, we might preferably sample biasedly towards large vectors, + * e.g. via top-k selection over the vectors with largest L_inf norm. */ template float estimate_max_squared_norm( @@ -59,7 +64,7 @@ float estimate_max_squared_norm( raft::matrix::sample_rows( handle, raft::random::RngState{137}, dataset, sample.view()); - // Compute mapped squared norm + // Compute float-mapped squared norm auto d_map_sq_norm = raft::make_device_vector(handle, n_sample); raft::linalg::reduce( handle,