From faa16090d43a5c99d13068299e9cb6b4114c6835 Mon Sep 17 00:00:00 2001
From: Huy Nguyen <huuanhhuyn@nvidia.com>
Date: Tue, 16 Jun 2026 07:06:43 +0000
Subject: [PATCH 1/8] IVF-PQ FP16 overflow detection

---
 .../neighbors/detail/cagra/cagra_build.cuh    |  16 ++
 .../neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh | 138 ++++++++++++++++++
 2 files changed, 154 insertions(+)
 create mode 100644 cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh
diff --git a/cpp/src/neighbors/detail/cagra/cagra_build.cuh b/cpp/src/neighbors/detail/cagra/cagra_build.cuh
index 96ff8344d3..f2db647842 100644
--- a/cpp/src/neighbors/detail/cagra/cagra_build.cuh
+++ b/cpp/src/neighbors/detail/cagra/cagra_build.cuh
@@ -6,6 +6,7 @@
 
 #include "../../../core/nvtx.hpp"
 #include "../../../preprocessing/quantize/vpq_build-ext.cuh"
+#include "../../ivf_pq/ivf_pq_fp16_overflow.cuh"
 #include "graph_core.cuh"
 
 #include <raft/core/copy.cuh>
@@ -1621,6 +1622,21 @@ void build_knn_graph(
                  pq.build_params.metric == cuvs::distance::DistanceType::CosineExpanded,
                "Currently only L2Expanded, InnerProduct and CosineExpanded metrics are supported");
 
+  // Guard against potential FP16 distance overflow for large-magnitude datasets -> back to FP32.
+  const bool using_fp16_distance = pq.search_params.internal_distance_dtype == CUDA_R_16F ||
+                                   pq.search_params.coarse_search_dtype == CUDA_R_16F;
+  if (using_fp16_distance &&
+      ivf_pq::helpers::estimate_fp16_overflow(res, dataset, pq.build_params.metric)) {
+    RAFT_LOG_WARN(
+      "IVF-PQ internal type of FP16 is likely insufficient for this dataset to avoid overflow in "
+      "distance computations -> "
+      "Switching 'internal_distance_dtype' and 'coarse_search_dtype' to FP32");
+    pq.search_params.internal_distance_dtype = CUDA_R_32F;
+    pq.search_params.coarse_search_dtype     = CUDA_R_32F;
+    // lut_dtype is left unchanged because its per-subspace terms are smaller by a factor of pq_dim
+    // and therefore, less likely to overflow.
+  }
+
   uint32_t node_degree = knn_graph.extent(1);
   raft::common::nvtx::range<cuvs::common::nvtx::domain::cuvs> fun_scope(
     "cagra::build_knn_graph<IVF-PQ>(%zu, %zu, %u)",
diff --git a/cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh b/cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh
new file mode 100644
index 0000000000..e67e3a5d9c
--- /dev/null
+++ b/cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh
@@ -0,0 +1,138 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#pragma once
+
+#include "../detail/ann_utils.cuh"  // cuvs::spatial::knn::detail::utils::mapping
+
+#include <cuvs/distance/distance.hpp>
+
+#include <raft/core/device_mdarray.hpp>
+#include <raft/core/mdspan.hpp>
+#include <raft/core/resource/cuda_stream.hpp>
+#include <raft/core/resource/device_memory_resource.hpp>
+#include <raft/core/resources.hpp>
+#include <raft/matrix/sample_rows.cuh>
+#include <raft/random/rng.cuh>
+#include <raft/util/cudart_utils.hpp>
+
+#include <cstdint>
+
+namespace cuvs::neighbors::ivf_pq::detail {
+
+/** Reduce the maximum squared L2 norm over a set of row-major vectors of element type DataT. */
+template <typename DataT>
+__global__ void kern_max_squared_norm(const DataT* __restrict__ data,
+                                      int64_t n_rows,
+                                      int64_t dim,
+                                      float* __restrict__ out_max_sq)
+{
+  for (int64_t row = blockIdx.x * blockDim.x + threadIdx.x; row < n_rows;
+       row += static_cast<int64_t>(gridDim.x) * blockDim.x) {
+    const DataT* v = data + row * dim;
+    float sq       = 0.0f;
+    for (int64_t d = 0; d < dim; d++) {
+      //
+      float e = cuvs::spatial::knn::detail::utils::mapping<float>{}(v[d]);
+      printf("raw float(v[d]): %f, mapped: %f\n", static_cast<float>(v[d]), e);
+      sq += e * e;
+    }
+    // - There is no atomicMax for floats, so we embrace the bitwise representation monoticity
+    //   between float and int. This is valid when values are non-negative, which is the case
+    //   for squared norms.
+    // - Choose global atomic instead of shared memory tree reduction for simplicity, assuming
+    //   low contention.
+    atomicMax(reinterpret_cast<int*>(out_max_sq), __float_as_int(sq));
+  }
+}
+
+/**
+ * Estimate max_i ||x_i||^2 over the dataset by sampling a fraction of its rows.
+ *
+ * NOTE: sampling yields a *lower-bound* estimate of the true max norm, so a too-small fraction can
+ * miss outlier vectors. Increase `sample_fraction` (up to 1.0 for an exact, no-false-negative scan)
+ * if you observe overflow slipping through.
+ */
+template <typename DataT, typename Accessor>
+float estimate_max_squared_norm(
+  raft::resources const& res,
+  raft::mdspan<const DataT, raft::matrix_extent<int64_t>, raft::row_major, Accessor> dataset,
+  double sample_fraction)
+{
+  auto stream          = raft::resource::get_cuda_stream(res);
+  const int64_t n_rows = dataset.extent(0);
+  const int64_t dim    = dataset.extent(1);
+
+  // Determine sample size based on fraction, with bounds
+  int64_t n_sample = static_cast<int64_t>(sample_fraction * static_cast<double>(n_rows));
+  if (n_rows <= 1000) {
+    n_sample = n_rows;  // for small datasets, just use them all and skip the sampling overhead
+  } else if (n_rows > 100000) {
+    // cap the sample size to 100k for speed and keep memory use within the limited workspace
+    // resource
+    n_sample = 100000;
+    //
+  }
+
+  // Sample from the dataset
+  auto mr = raft::resource::get_workspace_resource_ref(res);
+  auto sample =
+    raft::make_device_mdarray<DataT>(res, mr, raft::make_extents<int64_t>(n_sample, dim));
+  raft::matrix::sample_rows<DataT, int64_t>(
+    res, raft::random::RngState{137}, dataset, sample.view());
+
+  auto d_max_sq            = raft::make_device_scalar<float>(res, 0.0f);
+  constexpr int block_size = 256;
+  const int grid_size      = static_cast<int>((n_sample + block_size - 1) / block_size);
+  kern_max_squared_norm<DataT><<<grid_size, block_size, 0, stream>>>(
+    sample.data_handle(), n_sample, dim, d_max_sq.data_handle());
+  RAFT_CUDA_TRY(cudaPeekAtLastError());
+
+  float h_max_sq = 0.0f;
+  raft::update_host(&h_max_sq, d_max_sq.data_handle(), 1, stream);
+  raft::resource::sync_stream(res);
+  return h_max_sq;
+}
+
+}  // namespace cuvs::neighbors::ivf_pq::detail
+
+namespace cuvs::neighbors::ivf_pq::helpers {
+
+/**
+ * @brief Estimate whether FP16 is likely insufficient for IVF-PQ's full-magnitude distance
+ * computations on this dataset (i.e. `internal_distance_dtype` and `coarse_search_dtype`).
+ *
+ * We bound the largest achievable score from the dataset's vector norms. With R = max_i ||x_i||
+ * (estimated from a random sample of the dataset):
+ *   - L2Expanded:     ||x - y||^2 = ||x||^2 + ||y||^2 - 2<x,y> <= (||x|| + ||y||)^2 <= 4 * R^2
+ *   - InnerProduct:   |<x, y>|    <= ||x|| * ||y||                                  <=     R^2
+ *   - CosineExpanded: data is L2-normalized, so |score| <= 1 and overflow is impossible.
+ */
+template <typename DataT, typename Accessor>
+bool estimate_fp16_overflow(
+  raft::resources const& res,
+  raft::mdspan<const DataT, raft::matrix_extent<int64_t>, raft::row_major, Accessor> dataset,
+  cuvs::distance::DistanceType metric,
+  double sample_fraction = 0.01)
+{
+  // Cosine similarity scores does normalization itself, so overflow won't happen
+  if (metric == cuvs::distance::DistanceType::CosineExpanded) { return false; }
+
+  // FP16 largest finite value, with a defensive margin to also avoid precision loss near the limit.
+  constexpr float kFp16Max              = 65504.0f;
+  constexpr float kFp16DefensiveMargin  = 0.25f;
+  const float overflow_detect_threshold = kFp16DefensiveMargin * kFp16Max;
+
+  const float max_vector_sq_norm =
+    cuvs::neighbors::ivf_pq::detail::estimate_max_squared_norm(res, dataset, sample_fraction);
+
+  const float max_distance_sq_norm = metric == cuvs::distance::DistanceType::L2Expanded
+                                       ? 4.0f * max_vector_sq_norm
+                                       : max_vector_sq_norm;
+
+  return max_distance_sq_norm > overflow_detect_threshold;
+}
+
+}  // namespace cuvs::neighbors::ivf_pq::helpers

From 3b5130f8d3af2f67d78fecec3b733c098793cdc0 Mon Sep 17 00:00:00 2001
From: Huy Nguyen <huuanhhuyn@nvidia.com>
Date: Tue, 16 Jun 2026 07:28:47 +0000
Subject: [PATCH 2/8] Explain the use of mapping

---
 cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh b/cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh
index e67e3a5d9c..3315607a85 100644
--- a/cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh
+++ b/cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh
@@ -34,13 +34,14 @@ __global__ void kern_max_squared_norm(const DataT* __restrict__ data,
     const DataT* v = data + row * dim;
     float sq       = 0.0f;
     for (int64_t d = 0; d < dim; d++) {
-      //
+      // internally, IVF-PQ distance computations will map the input data type (e.g. FP16) to float before
+      // doing arithmetic, so we need to apply the same mapping here to get a correct estimate of the squared norms
+      // instead of using static_cast<float>(v[d])
       float e = cuvs::spatial::knn::detail::utils::mapping<float>{}(v[d]);
-      printf("raw float(v[d]): %f, mapped: %f\n", static_cast<float>(v[d]), e);
       sq += e * e;
     }
     // - There is no atomicMax for floats, so we embrace the bitwise representation monoticity
-    //   between float and int. This is valid when values are non-negative, which is the case
+    //   between float and int. This is valid when values are non-negative, which is the case 
     //   for squared norms.
     // - Choose global atomic instead of shared memory tree reduction for simplicity, assuming
     //   low contention.
@@ -70,10 +71,8 @@ float estimate_max_squared_norm(
   if (n_rows <= 1000) {
     n_sample = n_rows;  // for small datasets, just use them all and skip the sampling overhead
   } else if (n_rows > 100000) {
-    // cap the sample size to 100k for speed and keep memory use within the limited workspace
-    // resource
+    // cap the sample size to 100k for speed and keep memory use within the limited workspace resource
     n_sample = 100000;
-    //
   }
 
   // Sample from the dataset

From 3a4e4913f5586eb56c093aa1ceaf65a4bfb9b9a8 Mon Sep 17 00:00:00 2001
From: Huy Nguyen <huuanhhuyn@nvidia.com>
Date: Mon, 22 Jun 2026 08:50:43 +0000
Subject: [PATCH 3/8] Move params fallback to params init phase

---
 .../neighbors/detail/cagra/cagra_build.cuh    | 31 +++++++++----------
 1 file changed, 15 insertions(+), 16 deletions(-)

diff --git a/cpp/src/neighbors/detail/cagra/cagra_build.cuh b/cpp/src/neighbors/detail/cagra/cagra_build.cuh
index f2db647842..b566e43970 100644
--- a/cpp/src/neighbors/detail/cagra/cagra_build.cuh
+++ b/cpp/src/neighbors/detail/cagra/cagra_build.cuh
@@ -1621,22 +1621,7 @@ void build_knn_graph(
                  pq.build_params.metric == cuvs::distance::DistanceType::InnerProduct ||
                  pq.build_params.metric == cuvs::distance::DistanceType::CosineExpanded,
                "Currently only L2Expanded, InnerProduct and CosineExpanded metrics are supported");
-
-  // Guard against potential FP16 distance overflow for large-magnitude datasets -> back to FP32.
-  const bool using_fp16_distance = pq.search_params.internal_distance_dtype == CUDA_R_16F ||
-                                   pq.search_params.coarse_search_dtype == CUDA_R_16F;
-  if (using_fp16_distance &&
-      ivf_pq::helpers::estimate_fp16_overflow(res, dataset, pq.build_params.metric)) {
-    RAFT_LOG_WARN(
-      "IVF-PQ internal type of FP16 is likely insufficient for this dataset to avoid overflow in "
-      "distance computations -> "
-      "Switching 'internal_distance_dtype' and 'coarse_search_dtype' to FP32");
-    pq.search_params.internal_distance_dtype = CUDA_R_32F;
-    pq.search_params.coarse_search_dtype     = CUDA_R_32F;
-    // lut_dtype is left unchanged because its per-subspace terms are smaller by a factor of pq_dim
-    // and therefore, less likely to overflow.
-  }
-
+    
   uint32_t node_degree = knn_graph.extent(1);
   raft::common::nvtx::range<cuvs::common::nvtx::domain::cuvs> fun_scope(
     "cagra::build_knn_graph<IVF-PQ>(%zu, %zu, %u)",
@@ -2211,6 +2196,20 @@ index<T, IdxT> build(
     } else {
       RAFT_LOG_DEBUG("Selecting IVF-PQ solver");
       knn_build_params = cagra::graph_build_params::ivf_pq_params(dataset.extents(), params.metric);
+       // Guard against potential FP16 distance overflow for large-magnitude datasets -> back to FP32.
+      const bool using_fp16_distance = pq.search_params.internal_distance_dtype == CUDA_R_16F ||
+                                      pq.search_params.coarse_search_dtype == CUDA_R_16F;
+      if (using_fp16_distance &&
+          ivf_pq::helpers::estimate_fp16_overflow(res, dataset, pq.build_params.metric)) {
+        RAFT_LOG_WARN(
+          "IVF-PQ internal type of FP16 is likely insufficient for this dataset to avoid overflow in "
+          "distance computations -> "
+          "Switching 'internal_distance_dtype' and 'coarse_search_dtype' to FP32");
+        pq.search_params.internal_distance_dtype = CUDA_R_32F;
+        pq.search_params.coarse_search_dtype     = CUDA_R_32F;
+        // lut_dtype is left unchanged because its per-subspace terms are smaller by a factor of pq_dim
+        // and therefore, less likely to overflow.
+      }
     }
   }
   RAFT_EXPECTS(

From f4031a3fa9dfdc74145aeca52c839727ac473081 Mon Sep 17 00:00:00 2001
From: Huy Nguyen <huuanhhuyn@nvidia.com>
Date: Mon, 22 Jun 2026 08:51:12 +0000
Subject: [PATCH 4/8] Choose sampling equation

---
 .../neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh | 24 +++++++++----------
 1 file changed, 11 insertions(+), 13 deletions(-)

diff --git a/cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh b/cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh
index 3315607a85..4c0bc6be94 100644
--- a/cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh
+++ b/cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh
@@ -59,21 +59,20 @@ __global__ void kern_max_squared_norm(const DataT* __restrict__ data,
 template <typename DataT, typename Accessor>
 float estimate_max_squared_norm(
   raft::resources const& res,
-  raft::mdspan<const DataT, raft::matrix_extent<int64_t>, raft::row_major, Accessor> dataset,
-  double sample_fraction)
+  raft::mdspan<const DataT, raft::matrix_extent<int64_t>, raft::row_major, Accessor> dataset)
 {
   auto stream          = raft::resource::get_cuda_stream(res);
   const int64_t n_rows = dataset.extent(0);
   const int64_t dim    = dataset.extent(1);
 
-  // Determine sample size based on fraction, with bounds
-  int64_t n_sample = static_cast<int64_t>(sample_fraction * static_cast<double>(n_rows));
-  if (n_rows <= 1000) {
-    n_sample = n_rows;  // for small datasets, just use them all and skip the sampling overhead
-  } else if (n_rows > 100000) {
-    // cap the sample size to 100k for speed and keep memory use within the limited workspace resource
-    n_sample = 100000;
-  }
+  // Determine sample size based on a smooth saturation equation. The equation satisfies:
+  // - n_sample is always less than or equal to n_rows
+  // - n_sample saturates to kSaturation when n_rows is inf
+  // - n_sample increases fast for small n_rows and slow to saturation for large n_rows
+  // Idea: we greedily sample most of the dataset when it is small-sized, and cap it to kSaturation
+  // when dataset size grows very large.
+  constexpr int64_t kSaturation = 100000;
+  int64_t n_sample = (n_rows * kSaturation + (n_rows + kSaturation - 1)) / (n_rows + kSaturation);
 
   // Sample from the dataset
   auto mr = raft::resource::get_workspace_resource_ref(res);
@@ -113,8 +112,7 @@ template <typename DataT, typename Accessor>
 bool estimate_fp16_overflow(
   raft::resources const& res,
   raft::mdspan<const DataT, raft::matrix_extent<int64_t>, raft::row_major, Accessor> dataset,
-  cuvs::distance::DistanceType metric,
-  double sample_fraction = 0.01)
+  cuvs::distance::DistanceType metric)
 {
   // Cosine similarity scores does normalization itself, so overflow won't happen
   if (metric == cuvs::distance::DistanceType::CosineExpanded) { return false; }
@@ -125,7 +123,7 @@ bool estimate_fp16_overflow(
   const float overflow_detect_threshold = kFp16DefensiveMargin * kFp16Max;
 
   const float max_vector_sq_norm =
-    cuvs::neighbors::ivf_pq::detail::estimate_max_squared_norm(res, dataset, sample_fraction);
+    cuvs::neighbors::ivf_pq::detail::estimate_max_squared_norm(res, dataset);
 
   const float max_distance_sq_norm = metric == cuvs::distance::DistanceType::L2Expanded
                                        ? 4.0f * max_vector_sq_norm

From f330a7fa67a3901e17522091c7bb4ee08c4018cc Mon Sep 17 00:00:00 2001
From: Huy Nguyen <huuanhhuyn@nvidia.com>
Date: Mon, 22 Jun 2026 10:18:20 +0000
Subject: [PATCH 5/8] Use raft built-in kernels and remove manual one.

---
 .../neighbors/detail/cagra/cagra_build.cuh    | 34 ++++---
 .../neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh | 98 +++++++++----------
 2 files changed, 64 insertions(+), 68 deletions(-)

diff --git a/cpp/src/neighbors/detail/cagra/cagra_build.cuh b/cpp/src/neighbors/detail/cagra/cagra_build.cuh
index b566e43970..515e464bb8 100644
--- a/cpp/src/neighbors/detail/cagra/cagra_build.cuh
+++ b/cpp/src/neighbors/detail/cagra/cagra_build.cuh
@@ -1621,7 +1621,7 @@ void build_knn_graph(
                  pq.build_params.metric == cuvs::distance::DistanceType::InnerProduct ||
                  pq.build_params.metric == cuvs::distance::DistanceType::CosineExpanded,
                "Currently only L2Expanded, InnerProduct and CosineExpanded metrics are supported");
-    
+
   uint32_t node_degree = knn_graph.extent(1);
   raft::common::nvtx::range<cuvs::common::nvtx::domain::cuvs> fun_scope(
     "cagra::build_knn_graph<IVF-PQ>(%zu, %zu, %u)",
@@ -2196,20 +2196,24 @@ index<T, IdxT> build(
     } else {
       RAFT_LOG_DEBUG("Selecting IVF-PQ solver");
       knn_build_params = cagra::graph_build_params::ivf_pq_params(dataset.extents(), params.metric);
-       // Guard against potential FP16 distance overflow for large-magnitude datasets -> back to FP32.
-      const bool using_fp16_distance = pq.search_params.internal_distance_dtype == CUDA_R_16F ||
-                                      pq.search_params.coarse_search_dtype == CUDA_R_16F;
-      if (using_fp16_distance &&
-          ivf_pq::helpers::estimate_fp16_overflow(res, dataset, pq.build_params.metric)) {
-        RAFT_LOG_WARN(
-          "IVF-PQ internal type of FP16 is likely insufficient for this dataset to avoid overflow in "
-          "distance computations -> "
-          "Switching 'internal_distance_dtype' and 'coarse_search_dtype' to FP32");
-        pq.search_params.internal_distance_dtype = CUDA_R_32F;
-        pq.search_params.coarse_search_dtype     = CUDA_R_32F;
-        // lut_dtype is left unchanged because its per-subspace terms are smaller by a factor of pq_dim
-        // and therefore, less likely to overflow.
-      }
+    }
+  }
+
+  // Guard against potential FP16 distance overflow for large-magnitude datasets. Applies to any
+  // IVF-PQ graph build (auto-selected above or explicitly set in params) -> fall back to FP32.
+  if (auto* pq = std::get_if<cagra::graph_build_params::ivf_pq_params>(&knn_build_params)) {
+    const bool using_fp16_distance = pq->search_params.internal_distance_dtype == CUDA_R_16F ||
+                                     pq->search_params.coarse_search_dtype == CUDA_R_16F;
+    if (using_fp16_distance &&
+        ivf_pq::helpers::estimate_fp16_overflow(res, dataset, params.metric)) {
+      RAFT_LOG_WARN(
+        "IVF-PQ internal type of FP16 is likely insufficient for this dataset to avoid overflow in "
+        "distance computations -> "
+        "Switching 'internal_distance_dtype' and 'coarse_search_dtype' to FP32");
+      pq->search_params.internal_distance_dtype = CUDA_R_32F;
+      pq->search_params.coarse_search_dtype     = CUDA_R_32F;
+      // lut_dtype is left unchanged because its per-subspace terms are smaller by a factor of
+      // pq_dim and therefore, less likely to overflow.
     }
   }
   RAFT_EXPECTS(
diff --git a/cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh b/cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh
index 4c0bc6be94..a184fb9701 100644
--- a/cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh
+++ b/cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh
@@ -11,9 +11,12 @@
 
 #include <raft/core/device_mdarray.hpp>
 #include <raft/core/mdspan.hpp>
+#include <raft/core/operators.hpp>
 #include <raft/core/resource/cuda_stream.hpp>
 #include <raft/core/resource/device_memory_resource.hpp>
 #include <raft/core/resources.hpp>
+#include <raft/linalg/map_reduce.cuh>
+#include <raft/linalg/reduce.cuh>
 #include <raft/matrix/sample_rows.cuh>
 #include <raft/random/rng.cuh>
 #include <raft/util/cudart_utils.hpp>
@@ -22,46 +25,15 @@
 
 namespace cuvs::neighbors::ivf_pq::detail {
 
-/** Reduce the maximum squared L2 norm over a set of row-major vectors of element type DataT. */
-template <typename DataT>
-__global__ void kern_max_squared_norm(const DataT* __restrict__ data,
-                                      int64_t n_rows,
-                                      int64_t dim,
-                                      float* __restrict__ out_max_sq)
-{
-  for (int64_t row = blockIdx.x * blockDim.x + threadIdx.x; row < n_rows;
-       row += static_cast<int64_t>(gridDim.x) * blockDim.x) {
-    const DataT* v = data + row * dim;
-    float sq       = 0.0f;
-    for (int64_t d = 0; d < dim; d++) {
-      // internally, IVF-PQ distance computations will map the input data type (e.g. FP16) to float before
-      // doing arithmetic, so we need to apply the same mapping here to get a correct estimate of the squared norms
-      // instead of using static_cast<float>(v[d])
-      float e = cuvs::spatial::knn::detail::utils::mapping<float>{}(v[d]);
-      sq += e * e;
-    }
-    // - There is no atomicMax for floats, so we embrace the bitwise representation monoticity
-    //   between float and int. This is valid when values are non-negative, which is the case 
-    //   for squared norms.
-    // - Choose global atomic instead of shared memory tree reduction for simplicity, assuming
-    //   low contention.
-    atomicMax(reinterpret_cast<int*>(out_max_sq), __float_as_int(sq));
-  }
-}
-
 /**
- * Estimate max_i ||x_i||^2 over the dataset by sampling a fraction of its rows.
- *
- * NOTE: sampling yields a *lower-bound* estimate of the true max norm, so a too-small fraction can
- * miss outlier vectors. Increase `sample_fraction` (up to 1.0 for an exact, no-false-negative scan)
- * if you observe overflow slipping through.
+ * Estimate max_i ||x_i||^2 over the dataset by sampling from its rows.
  */
 template <typename DataT, typename Accessor>
 float estimate_max_squared_norm(
-  raft::resources const& res,
+  raft::resources const& handle,
   raft::mdspan<const DataT, raft::matrix_extent<int64_t>, raft::row_major, Accessor> dataset)
 {
-  auto stream          = raft::resource::get_cuda_stream(res);
+  auto stream          = raft::resource::get_cuda_stream(handle);
   const int64_t n_rows = dataset.extent(0);
   const int64_t dim    = dataset.extent(1);
 
@@ -69,29 +41,47 @@ float estimate_max_squared_norm(
   // - n_sample is always less than or equal to n_rows
   // - n_sample saturates to kSaturation when n_rows is inf
   // - n_sample increases fast for small n_rows and slow to saturation for large n_rows
-  // Idea: we greedily sample most of the dataset when it is small-sized, and cap it to kSaturation
-  // when dataset size grows very large.
+  // Idea: we sample most of the dataset when it is small-sized, and only a small fraction
+  // (up to a maximum/saturation number) when the dataset size grows large.
+  // kSaturation is selected as a compromise between runtime and recall.
   constexpr int64_t kSaturation = 100000;
   int64_t n_sample = (n_rows * kSaturation + (n_rows + kSaturation - 1)) / (n_rows + kSaturation);
 
   // Sample from the dataset
-  auto mr = raft::resource::get_workspace_resource_ref(res);
+  auto mr = raft::resource::get_workspace_resource_ref(handle);
   auto sample =
-    raft::make_device_mdarray<DataT>(res, mr, raft::make_extents<int64_t>(n_sample, dim));
+    raft::make_device_mdarray<DataT>(handle, mr, raft::make_extents<int64_t>(n_sample, dim));
   raft::matrix::sample_rows<DataT, int64_t>(
-    res, raft::random::RngState{137}, dataset, sample.view());
-
-  auto d_max_sq            = raft::make_device_scalar<float>(res, 0.0f);
-  constexpr int block_size = 256;
-  const int grid_size      = static_cast<int>((n_sample + block_size - 1) / block_size);
-  kern_max_squared_norm<DataT><<<grid_size, block_size, 0, stream>>>(
-    sample.data_handle(), n_sample, dim, d_max_sq.data_handle());
-  RAFT_CUDA_TRY(cudaPeekAtLastError());
-
-  float h_max_sq = 0.0f;
-  raft::update_host(&h_max_sq, d_max_sq.data_handle(), 1, stream);
-  raft::resource::sync_stream(res);
-  return h_max_sq;
+    handle, raft::random::RngState{137}, dataset, sample.view());
+
+  // Compute mapped squared norm
+  auto d_map_sq_norm = raft::make_device_vector<float, int64_t>(handle, n_sample);
+  raft::linalg::reduce<raft::Apply::ALONG_ROWS>(
+    handle,
+    raft::make_const_mdspan(sample.view()),
+    d_map_sq_norm.view(),
+    0.0f,
+    false,
+    [] __device__(DataT v, auto) -> float {
+      float e = cuvs::spatial::knn::detail::utils::mapping<float>{}(v);
+      return e * e;
+    },
+    raft::add_op(),
+    raft::identity_op());
+  // Compute max of squared norm vector
+  auto d_max_sq = raft::make_device_scalar<float>(handle, 0.0f);
+  raft::linalg::map_reduce(handle,
+                           raft::make_const_mdspan(d_map_sq_norm.view()),
+                           d_max_sq.view(),
+                           0.0f,
+                           raft::identity_op(),
+                           raft::max_op());
+
+  float max_sq = 0.0f;
+  raft::update_host(&max_sq, d_max_sq.data_handle(), 1, stream);
+  raft::resource::sync_stream(handle);
+
+  return max_sq;
 }
 
 }  // namespace cuvs::neighbors::ivf_pq::detail
@@ -110,10 +100,12 @@ namespace cuvs::neighbors::ivf_pq::helpers {
  */
 template <typename DataT, typename Accessor>
 bool estimate_fp16_overflow(
-  raft::resources const& res,
+  raft::resources const& handle,
   raft::mdspan<const DataT, raft::matrix_extent<int64_t>, raft::row_major, Accessor> dataset,
   cuvs::distance::DistanceType metric)
 {
+  if (dataset.extent(0) == 0) { return false; }
+
   // Cosine similarity scores does normalization itself, so overflow won't happen
   if (metric == cuvs::distance::DistanceType::CosineExpanded) { return false; }
 
@@ -123,7 +115,7 @@ bool estimate_fp16_overflow(
   const float overflow_detect_threshold = kFp16DefensiveMargin * kFp16Max;
 
   const float max_vector_sq_norm =
-    cuvs::neighbors::ivf_pq::detail::estimate_max_squared_norm(res, dataset);
+    cuvs::neighbors::ivf_pq::detail::estimate_max_squared_norm(handle, dataset);
 
   const float max_distance_sq_norm = metric == cuvs::distance::DistanceType::L2Expanded
                                        ? 4.0f * max_vector_sq_norm

From 22878f1255f37baf14d636c522288a8de19fe7a2 Mon Sep 17 00:00:00 2001
From: Huy Nguyen <huuanhhuyn@nvidia.com>
Date: Mon, 22 Jun 2026 10:55:38 +0000
Subject: [PATCH 6/8] Add kDelay param to adjust the sampling fraction growth
 speed

---
 cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh b/cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh
index a184fb9701..b50e7e9a4e 100644
--- a/cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh
+++ b/cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh
@@ -10,6 +10,7 @@
 #include <cuvs/distance/distance.hpp>
 
 #include <raft/core/device_mdarray.hpp>
+#include <raft/core/error.hpp>
 #include <raft/core/mdspan.hpp>
 #include <raft/core/operators.hpp>
 #include <raft/core/resource/cuda_stream.hpp>
@@ -43,9 +44,13 @@ float estimate_max_squared_norm(
   // - n_sample increases fast for small n_rows and slow to saturation for large n_rows
   // Idea: we sample most of the dataset when it is small-sized, and only a small fraction
   // (up to a maximum/saturation number) when the dataset size grows large.
-  // kSaturation is selected as a compromise between runtime and recall.
+  // kSaturation and kDelay are selected as a compromise between runtime and outlier recall.
   constexpr int64_t kSaturation = 100000;
-  int64_t n_sample = (n_rows * kSaturation + (n_rows + kSaturation - 1)) / (n_rows + kSaturation);
+  constexpr int64_t kDelay      = kSaturation * 10;
+  RAFT_EXPECTS(kDelay >= kSaturation,
+               "kDelay must not be smaller than kSaturation so that n_sample is always less than "
+               "or equal to n_rows");
+  int64_t n_sample = (n_rows * kSaturation + (n_rows + kDelay - 1)) / (n_rows + kDelay);
 
   // Sample from the dataset
   auto mr = raft::resource::get_workspace_resource_ref(handle);

From a2d9369166386c3484d30988e55083f83d04f908 Mon Sep 17 00:00:00 2001
From: Huy Nguyen <huuanhhuyn@nvidia.com>
Date: Mon, 22 Jun 2026 11:24:33 +0000
Subject: [PATCH 7/8] Reduce kSaturation to 20k (just sufficient to detect FP16
 overflow in SIFT 1M)

---
 cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh b/cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh
index b50e7e9a4e..99f2c9e60c 100644
--- a/cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh
+++ b/cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh
@@ -45,7 +45,7 @@ float estimate_max_squared_norm(
   // Idea: we sample most of the dataset when it is small-sized, and only a small fraction
   // (up to a maximum/saturation number) when the dataset size grows large.
   // kSaturation and kDelay are selected as a compromise between runtime and outlier recall.
-  constexpr int64_t kSaturation = 100000;
+  constexpr int64_t kSaturation = 20000;
   constexpr int64_t kDelay      = kSaturation * 10;
   RAFT_EXPECTS(kDelay >= kSaturation,
                "kDelay must not be smaller than kSaturation so that n_sample is always less than "

From 34150927980f335134901a0f7dcddd82fceeec0d Mon Sep 17 00:00:00 2001
From: Huy Nguyen <huuanhhuyn@nvidia.com>
Date: Mon, 22 Jun 2026 12:18:55 +0000
Subject: [PATCH 8/8] Explain uniform sampling decision

---
 cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh b/cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh
index 99f2c9e60c..2f99da9e06 100644
--- a/cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh
+++ b/cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh
@@ -27,7 +27,12 @@
 namespace cuvs::neighbors::ivf_pq::detail {
 
 /**
- * Estimate max_i ||x_i||^2 over the dataset by sampling from its rows.
+ * Estimate max_i ||x_i||^2 over the dataset by uniformly sampling a fraction from it.
+ *
+ * Decision: Uniform sampling is selected as it is sufficient to detect FP16 overflow in
+ * the datasets, where overflow-causing large vectors are frequent (e.g. SIFT 1M). For 
+ * dataset with rare large outliers, we might preferably sample biasedly towards large vectors,
+ * e.g. via top-k selection over the vectors with largest L_inf norm.
  */
 template <typename DataT, typename Accessor>
 float estimate_max_squared_norm(
@@ -59,7 +64,7 @@ float estimate_max_squared_norm(
   raft::matrix::sample_rows<DataT, int64_t>(
     handle, raft::random::RngState{137}, dataset, sample.view());
 
-  // Compute mapped squared norm
+  // Compute float-mapped squared norm
   auto d_map_sq_norm = raft::make_device_vector<float, int64_t>(handle, n_sample);
   raft::linalg::reduce<raft::Apply::ALONG_ROWS>(
     handle,