diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 227c2906cc..f467137d75 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -1142,6 +1142,7 @@ if(NOT BUILD_CPU_ONLY)
     src/cluster/single_linkage_float.cu
     src/cluster/spectral.cu
     src/core/bitset.cu
+    src/core/roaring/roaring.cu
     src/core/omp_wrapper.cpp
     src/util/file_io.cpp
     src/util/host_memory.cpp
diff --git a/cpp/include/cuvs/core/roaring.hpp b/cpp/include/cuvs/core/roaring.hpp
new file mode 100644
index 0000000000..0f19d8ebe5
--- /dev/null
+++ b/cpp/include/cuvs/core/roaring.hpp
@@ -0,0 +1,373 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#pragma once
+
+#include <cuvs/core/bitset.hpp>
+
+#include <raft/core/device_mdarray.hpp>
+#include <raft/core/resource/cuda_stream.hpp>
+#include <raft/core/resources.hpp>
+
+#include <rmm/device_uvector.hpp>
+
+#include <cstdint>
+#include <memory>
+#include <vector>
+
+namespace cuvs::core {
+
+/**
+ * @defgroup roaring GPU Roaring Bitmap
+ *
+ * GPU-native Roaring Bitmap for compressed prefiltering in vector search.
+ * Stores attribute bitmaps in compressed Roaring format in GPU memory,
+ * supports bulk set operations (AND/OR/ANDNOT/XOR), and decompresses
+ * to flat bitsets compatible with cuvs::core::bitset for search.
+ *
+ * Typical usage:
+ * @code{.cpp}
+ *   // Build filter bitmaps on CPU, upload to GPU
+ *   auto category = gpu_roaring::from_sorted_ids(res, cat_ids, universe);
+ *   auto price    = gpu_roaring::from_sorted_ids(res, price_ids, universe);
+ *
+ *   // Combine on GPU
+ *   auto combined = gpu_roaring::set_and(res, category, price);
+ *
+ *   // Decompress to bitset for search
+ *   auto bitset = gpu_roaring::to_bitset(res, combined);
+ *   auto filter = cuvs::neighbors::filtering::bitset_filter<uint32_t, int64_t>(bitset.view());
+ *   cuvs::neighbors::cagra::search(res, params, index, queries, neighbors, distances, filter);
+ * @endcode
+ * @{
+ */
+
+/** Container type tags for Roaring containers */
+enum class roaring_container_type : uint8_t { ARRAY = 0, BITMAP = 1, RUN = 2 };
+
+/**
+ * @brief Non-owning device view of a @ref gpu_roaring, usable inside
+ * kernels for per-sample membership tests (mirrors
+ * raft::core::bitset_view::test()).
+ */
+struct roaring_view {
+  const uint16_t* keys                = nullptr;
+  const roaring_container_type* types = nullptr;
+  const uint32_t* offsets             = nullptr;
+  const uint16_t* cardinalities       = nullptr;
+  const uint64_t* bitmap_data         = nullptr;
+  const uint16_t* array_data          = nullptr;
+  const uint16_t* run_data            = nullptr;
+  const uint16_t* key_index           = nullptr;  // optional O(1) key lookup
+  uint32_t n_containers               = 0;
+  uint32_t max_key                    = 0;
+  uint32_t n_rows                     = 0;
+
+#if defined(__CUDACC__)
+  /** Whether sample index `id` is a member of the filter. */
+  __device__ __forceinline__ bool test(uint32_t id) const
+  {
+    uint16_t key = static_cast<uint16_t>(id >> 16);
+    uint16_t low = static_cast<uint16_t>(id & 0xFFFFu);
+
+    int cid = -1;
+    if (key_index != nullptr) {
+      if (key > max_key) return false;
+      uint16_t slot = key_index[key];
+      if (slot == 0xFFFFu) return false;
+      cid = slot;
+    } else {
+      uint32_t lo = 0, hi = n_containers;
+      while (lo < hi) {  // lower_bound on sorted keys
+        uint32_t mid = (lo + hi) / 2;
+        if (keys[mid] < key) {
+          lo = mid + 1;
+        } else {
+          hi = mid;
+        }
+      }
+      if (lo == n_containers || keys[lo] != key) return false;
+      cid = static_cast<int>(lo);
+    }
+
+    uint32_t offset = offsets[cid];
+    switch (types[cid]) {
+      case roaring_container_type::BITMAP: {
+        uint64_t word = bitmap_data[offset / sizeof(uint64_t) + (low >> 6)];
+        return (word >> (low & 63u)) & 1u;
+      }
+      case roaring_container_type::ARRAY: {
+        const uint16_t* arr = array_data + offset / sizeof(uint16_t);
+        uint32_t lo = 0, hi = cardinalities[cid];
+        while (lo < hi) {
+          uint32_t mid = (lo + hi) / 2;
+          if (arr[mid] < low) {
+            lo = mid + 1;
+          } else {
+            hi = mid;
+          }
+        }
+        return lo < cardinalities[cid] && arr[lo] == low;
+      }
+      default: {  // RUN
+        const uint16_t* runs = run_data + offset / sizeof(uint16_t);
+        uint32_t n_runs      = cardinalities[cid];
+        uint32_t lo = 0, hi = n_runs;
+        while (lo < hi) {  // last run with start <= low
+          uint32_t mid = (lo + hi) / 2;
+          if (runs[mid * 2] <= low) {
+            lo = mid + 1;
+          } else {
+            hi = mid;
+          }
+        }
+        if (lo == 0) return false;
+        uint32_t start = runs[(lo - 1) * 2];
+        uint32_t len   = runs[(lo - 1) * 2 + 1];
+        return low >= start && low <= start + len;
+      }
+    }
+  }
+#endif
+};
+
+/**
+ * @brief GPU-resident Roaring bitmap in Structure-of-Arrays layout.
+ *
+ * All device memory is managed via rmm::device_uvector (RAII).
+ * Supports move semantics; not copyable.
+ */
+struct gpu_roaring {
+  // Top-level index (one entry per container, sorted by key)
+  rmm::device_uvector<uint16_t> keys;
+  rmm::device_uvector<roaring_container_type> types;
+  rmm::device_uvector<uint32_t> offsets;
+  rmm::device_uvector<uint16_t> cardinalities;
+  uint32_t n_containers  = 0;
+  uint32_t universe_size = 0;
+
+  // Per-type data pools
+  rmm::device_uvector<uint64_t> bitmap_data;
+  uint32_t n_bitmap_containers = 0;
+
+  rmm::device_uvector<uint16_t> array_data;
+  uint32_t n_array_containers = 0;
+
+  rmm::device_uvector<uint16_t> run_data;
+  uint32_t n_run_containers = 0;
+
+  // Direct-map key index: key_index[high16] = container index, or 0xFFFF.
+  // Replaces O(log n) binary search with O(1) table lookup.
+  rmm::device_uvector<uint16_t> key_index;
+  uint32_t max_key = 0;
+
+  // Complement optimization: when true, the stored set is the complement
+  // of the logical set. contains() results are flipped at query time.
+  // Note: never set by from_sorted_ids; filters require negated == false.
+  bool negated = false;
+
+  // Total logical cardinality (number of set bits, before complement)
+  uint64_t total_cardinality = 0;
+
+  // Host mirrors of the per-container metadata, captured at construction.
+  // These make per-filter cardinalities and CSR indptr available with no
+  // device->host transfer or count kernels (uint32 element counts: a full
+  // 65536-element container wraps to 0 in the device-side uint16 array).
+  std::vector<uint16_t> h_keys;
+  std::vector<roaring_container_type> h_types;
+  std::vector<uint32_t> h_offsets;
+  std::vector<uint32_t> h_element_counts;
+
+  /** Non-owning device view for use inside kernels. */
+  [[nodiscard]] roaring_view view() const
+  {
+    roaring_view v;
+    v.keys          = keys.data();
+    v.types         = types.data();
+    v.offsets       = offsets.data();
+    v.cardinalities = cardinalities.data();
+    v.bitmap_data   = bitmap_data.data();
+    v.array_data    = array_data.data();
+    v.run_data      = run_data.data();
+    v.key_index     = key_index.size() > 0 ? key_index.data() : nullptr;
+    v.n_containers  = n_containers;
+    v.max_key       = max_key;
+    v.n_rows        = universe_size;
+    return v;
+  }
+
+  /** Construct an empty GPU Roaring bitmap */
+  explicit gpu_roaring(rmm::cuda_stream_view stream)
+    : keys(0, stream),
+      types(0, stream),
+      offsets(0, stream),
+      cardinalities(0, stream),
+      bitmap_data(0, stream),
+      array_data(0, stream),
+      run_data(0, stream),
+      key_index(0, stream)
+  {
+  }
+
+  gpu_roaring(gpu_roaring&&)            = default;
+  gpu_roaring& operator=(gpu_roaring&&) = default;
+
+  /** Total device memory used (bytes) */
+  [[nodiscard]] size_t device_memory_bytes() const
+  {
+    return keys.size() * sizeof(uint16_t) + types.size() * sizeof(roaring_container_type) +
+           offsets.size() * sizeof(uint32_t) + cardinalities.size() * sizeof(uint16_t) +
+           bitmap_data.size() * sizeof(uint64_t) + array_data.size() * sizeof(uint16_t) +
+           run_data.size() * sizeof(uint16_t) + key_index.size() * sizeof(uint16_t);
+  }
+
+  /** Equivalent flat bitset size (bytes) */
+  [[nodiscard]] size_t flat_bitset_bytes() const
+  {
+    return (static_cast<size_t>(universe_size) + 31) / 32 * sizeof(uint32_t);
+  }
+
+  /** Compression ratio (flat / compressed) */
+  [[nodiscard]] double compression_ratio() const
+  {
+    auto dev_bytes = device_memory_bytes();
+    return dev_bytes > 0 ? static_cast<double>(flat_bitset_bytes()) / dev_bytes : 0.0;
+  }
+};
+
+/** Set operation types */
+enum class roaring_set_op : uint8_t { AND = 0, OR = 1, ANDNOT = 2, XOR = 3 };
+
+/**
+ * @brief Create a GPU Roaring bitmap from a sorted array of IDs on host.
+ *
+ * The IDs are partitioned into 65536-element containers following the
+ * Roaring bitmap format. Containers with <= 4096 elements use array format;
+ * denser containers use bitmap format.
+ *
+ * @param[in] res RAFT resources (provides CUDA stream)
+ * @param[in] sorted_ids Host pointer to sorted, deduplicated uint32_t IDs
+ * @param[in] n_ids Number of IDs
+ * @param[in] universe_size Maximum representable ID + 1
+ * @return GPU Roaring bitmap
+ */
+gpu_roaring from_sorted_ids(raft::resources const& res,
+                            const uint32_t* sorted_ids,
+                            uint32_t n_ids,
+                            uint32_t universe_size);
+
+/**
+ * @brief Perform a pairwise set operation between two GPU Roaring bitmaps.
+ *
+ * @param[in] res RAFT resources
+ * @param[in] a First operand
+ * @param[in] b Second operand
+ * @param[in] op Set operation (AND, OR, ANDNOT, XOR)
+ * @return Result GPU Roaring bitmap
+ */
+gpu_roaring set_op(raft::resources const& res,
+                   const gpu_roaring& a,
+                   const gpu_roaring& b,
+                   roaring_set_op op);
+
+/** Convenience: result = a AND b */
+inline gpu_roaring set_and(raft::resources const& res, const gpu_roaring& a, const gpu_roaring& b)
+{
+  return set_op(res, a, b, roaring_set_op::AND);
+}
+
+/** Convenience: result = a OR b */
+inline gpu_roaring set_or(raft::resources const& res, const gpu_roaring& a, const gpu_roaring& b)
+{
+  return set_op(res, a, b, roaring_set_op::OR);
+}
+
+/**
+ * @brief Multi-bitmap AND: result = a[0] AND a[1] AND ... AND a[n-1].
+ */
+gpu_roaring multi_and(raft::resources const& res, const gpu_roaring* bitmaps, uint32_t count);
+
+/**
+ * @brief Multi-bitmap OR: result = a[0] OR a[1] OR ... OR a[n-1].
+ */
+gpu_roaring multi_or(raft::resources const& res, const gpu_roaring* bitmaps, uint32_t count);
+
+/**
+ * @brief Decompress a GPU Roaring bitmap to a flat bitset.
+ *
+ * The output is compatible with cuvs::core::bitset<uint32_t, int64_t>
+ * and can be wrapped in a bitset_filter for use with cuVS search.
+ *
+ * @param[in] res RAFT resources
+ * @param[in] bitmap Source GPU Roaring bitmap
+ * @return Flat bitset (device memory, RAII)
+ */
+cuvs::core::bitset<uint32_t, int64_t> to_bitset(raft::resources const& res,
+                                                const gpu_roaring& bitmap);
+
+/**
+ * @brief Decompress into a pre-allocated flat bitset buffer.
+ *
+ * @param[in]  res RAFT resources
+ * @param[in]  bitmap Source GPU Roaring bitmap
+ * @param[out] output Device pointer to uint32_t array, pre-zeroed
+ * @param[in]  output_size_words Number of uint32_t words in output
+ */
+void decompress_to_bitset(raft::resources const& res,
+                          const gpu_roaring& bitmap,
+                          uint32_t* output,
+                          uint32_t output_size_words);
+
+/**
+ * @brief Create a GPU Roaring bitmap from sorted, deduplicated device IDs.
+ *
+ * Equivalent to the host-pointer overload; the IDs are copied to the host
+ * for container construction (the per-container metadata must live on the
+ * host anyway — it is what makes count-free filtered search possible).
+ */
+gpu_roaring from_sorted_ids(raft::resources const& res,
+                            raft::device_vector_view<const uint32_t, int64_t> sorted_ids,
+                            uint32_t universe_size);
+
+/**
+ * @brief Emit the sorted member IDs of one or more Roaring bitmaps into the
+ * column-index array of a CSR structure, one bitmap per CSR row segment.
+ *
+ * Row `i`'s segment is `[indptr[i], indptr[i+1])` where the indptr is the
+ * exclusive prefix sum of `bitmaps[i]->total_cardinality` — available on
+ * the host with no device synchronization. One kernel launch for all
+ * bitmaps (one CTA per Roaring container).
+ *
+ * @param[in]  res RAFT resources
+ * @param[in]  bitmaps Host array of `n_bitmaps` bitmap pointers
+ * @param[in]  n_bitmaps Number of bitmaps (CSR rows)
+ * @param[out] indices Device array of size `sum(cardinalities)`
+ */
+void to_csr_indices(raft::resources const& res,
+                    const gpu_roaring* const* bitmaps,
+                    uint32_t n_bitmaps,
+                    int64_t* indices);
+
+/**
+ * @brief Decompress `n_bitmaps` Roaring bitmaps into one dense bit matrix
+ * of logical shape `[n_bitmaps, n_rows]` (row-contiguous bits), compatible
+ * with cuvs::core::bitmap_view.
+ *
+ * @param[in]  res RAFT resources
+ * @param[in]  bitmaps Host array of `n_bitmaps` bitmap pointers
+ * @param[in]  n_bitmaps Number of bitmap rows
+ * @param[in]  n_rows Number of columns (bits per row)
+ * @param[out] output Device array of `ceil(n_bitmaps * n_rows / 32)` words,
+ *             pre-zeroed by the caller
+ */
+void decompress_to_bitmap(raft::resources const& res,
+                          const gpu_roaring* const* bitmaps,
+                          uint32_t n_bitmaps,
+                          int64_t n_rows,
+                          uint32_t* output);
+
+/** @} */  // end group roaring
+
+}  // namespace cuvs::core
diff --git a/cpp/include/cuvs/neighbors/common.hpp b/cpp/include/cuvs/neighbors/common.hpp
index 2fd804f115..d2ed2a373d 100644
--- a/cpp/include/cuvs/neighbors/common.hpp
+++ b/cpp/include/cuvs/neighbors/common.hpp
@@ -497,7 +497,7 @@ namespace filtering {
  * @{
  */
 
-enum class FilterType { None, Bitmap, Bitset, UDF };
+enum class FilterType { None, Bitmap, Bitset, UDF, Roaring, RoaringMatrix };
 
 struct base_filter {
   ~base_filter()                             = default;
diff --git a/cpp/include/cuvs/neighbors/roaring_filter.hpp b/cpp/include/cuvs/neighbors/roaring_filter.hpp
new file mode 100644
index 0000000000..67cc19486f
--- /dev/null
+++ b/cpp/include/cuvs/neighbors/roaring_filter.hpp
@@ -0,0 +1,116 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#pragma once
+
+#include <cuvs/core/roaring.hpp>
+#include <cuvs/neighbors/common.hpp>
+
+#include <cstdint>
+#include <vector>
+
+namespace cuvs::neighbors::filtering {
+
+/**
+ * @defgroup roaring_filters Roaring bitmap sample filters
+ * @{
+ */
+
+/**
+ * @brief Filter an index with a compressed Roaring bitmap shared by all
+ * queries.
+ *
+ * The semantic counterpart of @ref bitset_filter with a compressed filter
+ * representation: memory scales with filter cardinality and structure
+ * instead of dataset size, and search dispatch can exploit the
+ * host-known cardinality (no count kernels) and the container structure
+ * (direct CSR emission, row gathering) instead of scanning a flat
+ * bitset.
+ *
+ * The filter is non-owning: the referenced `cuvs::core::gpu_roaring`
+ * must outlive any search call using this filter.
+ *
+ * @code{.cpp}
+ *   auto roaring = cuvs::core::from_sorted_ids(res, ids, n_ids, n_rows);
+ *   auto filter  = cuvs::neighbors::filtering::roaring_filter(roaring);
+ *   cuvs::neighbors::brute_force::search(
+ *     res, params, index, queries, neighbors, distances, filter);
+ * @endcode
+ */
+struct roaring_filter : public base_filter {
+  /** The bitmap to filter on (non-owning). */
+  const cuvs::core::gpu_roaring* bitmap_;
+  /** Device view used for per-sample membership tests. */
+  cuvs::core::roaring_view view_;
+
+  explicit roaring_filter(const cuvs::core::gpu_roaring& bitmap)
+    : bitmap_(&bitmap), view_(bitmap.view())
+  {
+  }
+
+  /** \cond */
+#if defined(__CUDACC__)
+  __device__ __forceinline__ bool operator()(
+    // query index (ignored: one filter shared by all queries)
+    const uint32_t query_ix,
+    // the index of the current sample
+    const uint32_t sample_ix) const
+  {
+    (void)query_ix;
+    return view_.test(sample_ix);
+  }
+#endif
+  /** \endcond */
+
+  FilterType get_filter_type() const override { return FilterType::Roaring; }
+};
+
+/**
+ * @brief Filter an index with one compressed Roaring bitmap per query.
+ *
+ * The semantic counterpart of @ref bitmap_filter (logical shape
+ * `[n_queries, index->size()]`) with per-query compressed filters: the
+ * dense `n_queries x n_rows` bit matrix is never materialized. Search
+ * emits the sparse distance structure directly from the per-query
+ * containers (per-query cardinalities are known on the host at filter
+ * construction, so the CSR indptr is free and no count kernels or
+ * device synchronization are needed).
+ *
+ * The filter is non-owning: the array of `gpu_roaring const*` and every
+ * pointee must outlive any search call using this filter.
+ *
+ * @code{.cpp}
+ *   std::vector<const cuvs::core::gpu_roaring*> per_query = {...};
+ *   auto filter = cuvs::neighbors::filtering::roaring_matrix_filter(
+ *     per_query.data(), n_queries);
+ *   cuvs::neighbors::brute_force::search(
+ *     res, params, index, queries, neighbors, distances, filter);
+ * @endcode
+ */
+struct roaring_matrix_filter : public base_filter {
+  /** Per-query bitmaps, `bitmaps_[i]` filters query `i` (non-owning). */
+  const cuvs::core::gpu_roaring* const* bitmaps_;
+  uint32_t n_queries_;
+
+  roaring_matrix_filter(const cuvs::core::gpu_roaring* const* bitmaps, uint32_t n_queries)
+    : bitmaps_(bitmaps), n_queries_(n_queries)
+  {
+  }
+
+  /** Sum of per-query cardinalities (the sparse search nnz). */
+  [[nodiscard]] uint64_t nnz() const
+  {
+    uint64_t total = 0;
+    for (uint32_t i = 0; i < n_queries_; ++i)
+      total += bitmaps_[i]->total_cardinality;
+    return total;
+  }
+
+  FilterType get_filter_type() const override { return FilterType::RoaringMatrix; }
+};
+
+/** @} */
+
+}  // namespace cuvs::neighbors::filtering
diff --git a/cpp/src/core/roaring/roaring.cu b/cpp/src/core/roaring/roaring.cu
new file mode 100644
index 0000000000..6112a14fa0
--- /dev/null
+++ b/cpp/src/core/roaring/roaring.cu
@@ -0,0 +1,1090 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include <cuvs/core/roaring.hpp>
+
+#include <raft/core/bitset.cuh>
+#include <raft/core/resource/cuda_stream.hpp>
+#include <raft/core/resources.hpp>
+
+#include <rmm/device_uvector.hpp>
+
+#include <algorithm>
+#include <cstring>
+#include <vector>
+
+namespace cuvs::core {
+
+// ============================================================================
+// from_sorted_ids: build GPU Roaring from a sorted host ID array
+// ============================================================================
+
+gpu_roaring from_sorted_ids(raft::resources const& res,
+                            const uint32_t* sorted_ids,
+                            uint32_t n_ids,
+                            uint32_t universe_size)
+{
+  auto stream = raft::resource::get_cuda_stream(res);
+  gpu_roaring result(stream);
+  result.universe_size = universe_size;
+
+  if (n_ids == 0) return result;
+
+  // Partition IDs into containers by high 16 bits
+  struct Container {
+    uint16_t key;
+    std::vector<uint16_t> values;  // low 16 bits
+  };
+  std::vector<Container> containers;
+
+  uint16_t cur_key = static_cast<uint16_t>(sorted_ids[0] >> 16);
+  containers.push_back({cur_key, {}});
+
+  for (uint32_t i = 0; i < n_ids; ++i) {
+    uint16_t key = static_cast<uint16_t>(sorted_ids[i] >> 16);
+    uint16_t val = static_cast<uint16_t>(sorted_ids[i] & 0xFFFF);
+    if (key != cur_key) {
+      cur_key = key;
+      containers.push_back({key, {}});
+    }
+    containers.back().values.push_back(val);
+  }
+
+  uint32_t n          = static_cast<uint32_t>(containers.size());
+  result.n_containers = n;
+
+  // Build SoA on host
+  std::vector<uint16_t> h_keys(n);
+  std::vector<roaring_container_type> h_types(n);
+  std::vector<uint32_t> h_offsets(n);
+  std::vector<uint16_t> h_cards(n);
+
+  std::vector<uint64_t> h_bitmap_pool;
+  std::vector<uint16_t> h_array_pool;
+  uint32_t n_bitmap = 0, n_array = 0;
+
+  for (uint32_t i = 0; i < n; ++i) {
+    h_keys[i]     = containers[i].key;
+    uint32_t card = static_cast<uint32_t>(containers[i].values.size());
+    h_cards[i]    = static_cast<uint16_t>(card > 65535 ? 0 : card);
+
+    if (card > 4096) {
+      // Bitmap container
+      h_types[i]   = roaring_container_type::BITMAP;
+      h_offsets[i] = static_cast<uint32_t>(h_bitmap_pool.size() * sizeof(uint64_t));
+      h_bitmap_pool.resize(h_bitmap_pool.size() + 1024, 0);
+      uint64_t* words = h_bitmap_pool.data() + h_bitmap_pool.size() - 1024;
+      for (uint16_t v : containers[i].values) {
+        words[v / 64] |= 1ULL << (v % 64);
+      }
+      ++n_bitmap;
+    } else {
+      // Array container
+      h_types[i]   = roaring_container_type::ARRAY;
+      h_offsets[i] = static_cast<uint32_t>(h_array_pool.size() * sizeof(uint16_t));
+      h_array_pool.insert(
+        h_array_pool.end(), containers[i].values.begin(), containers[i].values.end());
+      ++n_array;
+    }
+  }
+
+  result.n_bitmap_containers = n_bitmap;
+  result.n_array_containers  = n_array;
+  result.n_run_containers    = 0;
+
+  // Upload to device
+  result.keys.resize(n, stream);
+  result.types.resize(n, stream);
+  result.offsets.resize(n, stream);
+  result.cardinalities.resize(n, stream);
+
+  raft::update_device(result.keys.data(), h_keys.data(), n, stream);
+  raft::update_device(result.types.data(),
+                      reinterpret_cast<const roaring_container_type*>(h_types.data()),
+                      n,
+                      stream);
+  raft::update_device(result.offsets.data(), h_offsets.data(), n, stream);
+  raft::update_device(result.cardinalities.data(), h_cards.data(), n, stream);
+
+  if (!h_bitmap_pool.empty()) {
+    result.bitmap_data.resize(h_bitmap_pool.size(), stream);
+    raft::update_device(
+      result.bitmap_data.data(), h_bitmap_pool.data(), h_bitmap_pool.size(), stream);
+  }
+  if (!h_array_pool.empty()) {
+    result.array_data.resize(h_array_pool.size(), stream);
+    raft::update_device(result.array_data.data(), h_array_pool.data(), h_array_pool.size(), stream);
+  }
+
+  // Total cardinality
+  result.total_cardinality = static_cast<uint64_t>(n_ids);
+
+  // Host mirrors (count-free filtered search: indptr and dispatch decisions
+  // come from these, never from count kernels)
+  result.h_keys    = h_keys;
+  result.h_types   = h_types;
+  result.h_offsets = h_offsets;
+  result.h_element_counts.resize(n);
+  for (uint32_t i = 0; i < n; ++i)
+    result.h_element_counts[i] = static_cast<uint32_t>(containers[i].values.size());
+
+  // Build direct-map key index: key_index[key] = container_idx, 0xFFFF = absent
+  if (n > 0) {
+    result.max_key   = h_keys[n - 1];
+    uint32_t ki_size = result.max_key + 1;
+    std::vector<uint16_t> h_key_index(ki_size, 0xFFFF);
+    for (uint32_t i = 0; i < n; ++i) {
+      h_key_index[h_keys[i]] = static_cast<uint16_t>(i);
+    }
+    result.key_index.resize(ki_size, stream);
+    raft::update_device(result.key_index.data(), h_key_index.data(), ki_size, stream);
+  }
+
+  raft::resource::sync_stream(res);
+  return result;
+}
+
+// ============================================================================
+// Decompress kernel
+// ============================================================================
+
+__global__ void decompress_kernel(const uint16_t* keys,
+                                  const roaring_container_type* types,
+                                  const uint32_t* offsets,
+                                  const uint16_t* cardinalities,
+                                  uint32_t n_containers,
+                                  const uint64_t* bitmap_data,
+                                  const uint16_t* array_data,
+                                  const uint16_t* run_data,
+                                  uint32_t* output,
+                                  uint32_t output_size_words)
+{
+  uint32_t cid = blockIdx.x;
+  if (cid >= n_containers) return;
+
+  uint32_t key       = keys[cid];
+  uint32_t base_word = key * 2048u;
+  auto ctype         = types[cid];
+  uint32_t offset    = offsets[cid];
+
+  if (ctype == roaring_container_type::BITMAP) {
+    const uint32_t* src =
+      reinterpret_cast<const uint32_t*>(bitmap_data) + (offset / sizeof(uint32_t));
+    for (uint32_t i = threadIdx.x; i < 2048u; i += blockDim.x) {
+      uint32_t dst_idx = base_word + i;
+      if (dst_idx < output_size_words) { output[dst_idx] = src[i]; }
+    }
+  } else if (ctype == roaring_container_type::ARRAY) {
+    const uint16_t* arr = array_data + (offset / sizeof(uint16_t));
+    uint16_t card       = cardinalities[cid];
+    for (uint32_t i = threadIdx.x; i < card; i += blockDim.x) {
+      uint16_t val      = arr[i];
+      uint32_t abs_bit  = (static_cast<uint32_t>(key) << 16) | val;
+      uint32_t word_idx = abs_bit / 32u;
+      uint32_t bit_pos  = abs_bit % 32u;
+      if (word_idx < output_size_words) { atomicOr(&output[word_idx], 1u << bit_pos); }
+    }
+  } else if (ctype == roaring_container_type::RUN) {
+    const uint16_t* runs = run_data + (offset / sizeof(uint16_t));
+    uint16_t n_runs      = cardinalities[cid];
+    for (uint32_t r = threadIdx.x; r < n_runs; r += blockDim.x) {
+      uint16_t start  = runs[r * 2];
+      uint16_t length = runs[r * 2 + 1];
+      for (uint32_t v = start; v <= static_cast<uint32_t>(start) + length; ++v) {
+        uint32_t abs_bit  = (static_cast<uint32_t>(key) << 16) | v;
+        uint32_t word_idx = abs_bit / 32u;
+        uint32_t bit_pos  = abs_bit % 32u;
+        if (word_idx < output_size_words) { atomicOr(&output[word_idx], 1u << bit_pos); }
+      }
+    }
+  }
+}
+
+void decompress_to_bitset(raft::resources const& res,
+                          const gpu_roaring& bitmap,
+                          uint32_t* output,
+                          uint32_t output_size_words)
+{
+  if (bitmap.n_containers == 0) return;
+  auto stream = raft::resource::get_cuda_stream(res);
+
+  RAFT_CUDA_TRY(cudaMemsetAsync(output, 0, output_size_words * sizeof(uint32_t), stream));
+
+  decompress_kernel<<<bitmap.n_containers, 256, 0, stream>>>(bitmap.keys.data(),
+                                                             bitmap.types.data(),
+                                                             bitmap.offsets.data(),
+                                                             bitmap.cardinalities.data(),
+                                                             bitmap.n_containers,
+                                                             bitmap.bitmap_data.data(),
+                                                             bitmap.array_data.data(),
+                                                             bitmap.run_data.data(),
+                                                             output,
+                                                             output_size_words);
+  RAFT_CUDA_TRY(cudaPeekAtLastError());
+}
+
+cuvs::core::bitset<uint32_t, int64_t> to_bitset(raft::resources const& res,
+                                                const gpu_roaring& bitmap)
+{
+  auto stream    = raft::resource::get_cuda_stream(res);
+  int64_t n_bits = static_cast<int64_t>(bitmap.universe_size);
+
+  // Create a bitset initialized to zero
+  cuvs::core::bitset<uint32_t, int64_t> result(res, n_bits, false);
+
+  uint32_t n_words = static_cast<uint32_t>((n_bits + 31) / 32);
+  if (n_words > 0 && bitmap.n_containers > 0) {
+    decompress_to_bitset(res, bitmap, result.data(), n_words);
+  }
+  return result;
+}
+
+// ============================================================================
+// Set operation kernels (bitmap x bitmap)
+// ============================================================================
+
+struct BitmapBitmapWork {
+  uint16_t key;
+  uint32_t a_offset;  // in uint64_t words
+  uint32_t b_offset;
+};
+
+template <roaring_set_op OP>
+__device__ __forceinline__ uint64_t apply_op(uint64_t a, uint64_t b);
+
+template <>
+__device__ __forceinline__ uint64_t apply_op<roaring_set_op::AND>(uint64_t a, uint64_t b)
+{
+  return a & b;
+}
+template <>
+__device__ __forceinline__ uint64_t apply_op<roaring_set_op::OR>(uint64_t a, uint64_t b)
+{
+  return a | b;
+}
+template <>
+__device__ __forceinline__ uint64_t apply_op<roaring_set_op::ANDNOT>(uint64_t a, uint64_t b)
+{
+  return a & ~b;
+}
+template <>
+__device__ __forceinline__ uint64_t apply_op<roaring_set_op::XOR>(uint64_t a, uint64_t b)
+{
+  return a ^ b;
+}
+
+template <roaring_set_op OP>
+__global__ void bitmap_bitmap_kernel(const BitmapBitmapWork* work,
+                                     uint32_t n_pairs,
+                                     const uint64_t* a_data,
+                                     const uint64_t* b_data,
+                                     uint64_t* out_data,
+                                     uint16_t* out_cards)
+{
+  uint32_t pair_idx = blockIdx.x;
+  if (pair_idx >= n_pairs) return;
+
+  const auto& w     = work[pair_idx];
+  const uint64_t* a = a_data + w.a_offset;
+  const uint64_t* b = b_data + w.b_offset;
+  uint64_t* out     = out_data + pair_idx * 1024;
+
+  uint32_t local_pop = 0;
+  for (uint32_t i = threadIdx.x; i < 1024u; i += blockDim.x) {
+    uint64_t val = apply_op<OP>(a[i], b[i]);
+    out[i]       = val;
+    local_pop += __popcll(val);
+  }
+
+  // Warp reduction
+  for (int off = 16; off > 0; off >>= 1)
+    local_pop += __shfl_down_sync(0xFFFFFFFF, local_pop, off);
+
+  __shared__ uint32_t warp_counts[8];
+  uint32_t warp_id = threadIdx.x / 32;
+  uint32_t lane_id = threadIdx.x % 32;
+  if (lane_id == 0) warp_counts[warp_id] = local_pop;
+  __syncthreads();
+
+  if (threadIdx.x == 0) {
+    uint32_t total = 0;
+    for (uint32_t w = 0; w < blockDim.x / 32; ++w)
+      total += warp_counts[w];
+    out_cards[pair_idx] = static_cast<uint16_t>(total > 65535 ? 0 : total);
+  }
+}
+
+// Expand array/run to bitmap
+struct ExpandWork {
+  uint32_t src_offset;
+  uint16_t cardinality;
+  roaring_container_type type;
+};
+
+__global__ void expand_to_bitmap_kernel(const ExpandWork* work,
+                                        uint32_t n_items,
+                                        const uint16_t* array_pool,
+                                        const uint16_t* run_pool,
+                                        uint64_t* out_bitmaps)
+{
+  uint32_t idx = blockIdx.x;
+  if (idx >= n_items) return;
+
+  const auto& w = work[idx];
+  uint64_t* out = out_bitmaps + idx * 1024;
+
+  for (uint32_t i = threadIdx.x; i < 1024u; i += blockDim.x)
+    out[i] = 0;
+  __syncthreads();
+
+  if (w.type == roaring_container_type::ARRAY) {
+    const uint16_t* arr = array_pool + w.src_offset;
+    for (uint32_t i = threadIdx.x; i < w.cardinality; i += blockDim.x) {
+      uint16_t val      = arr[i];
+      uint32_t word_idx = val / 64u;
+      uint64_t bit_mask = 1ULL << (val % 64u);
+      atomicOr(reinterpret_cast<unsigned long long*>(&out[word_idx]),
+               static_cast<unsigned long long>(bit_mask));
+    }
+  } else if (w.type == roaring_container_type::RUN) {
+    const uint16_t* runs = run_pool + w.src_offset;
+    for (uint32_t r = threadIdx.x; r < w.cardinality; r += blockDim.x) {
+      uint16_t start  = runs[r * 2];
+      uint16_t length = runs[r * 2 + 1];
+      for (uint32_t v = start; v <= static_cast<uint32_t>(start) + length; ++v) {
+        uint32_t word_idx = v / 64u;
+        uint64_t bit_mask = 1ULL << (v % 64u);
+        atomicOr(reinterpret_cast<unsigned long long*>(&out[word_idx]),
+                 static_cast<unsigned long long>(bit_mask));
+      }
+    }
+  }
+}
+
+// Copy bitmap container
+__global__ void copy_bitmap_kernel(const uint64_t* src_pool,
+                                   const uint32_t* src_offsets,
+                                   const bool* from_a,
+                                   uint32_t n_items,
+                                   const uint64_t* a_pool,
+                                   const uint64_t* b_pool,
+                                   uint64_t* dst_pool,
+                                   uint32_t dst_start)
+{
+  uint32_t item = blockIdx.x;
+  if (item >= n_items) return;
+
+  const uint64_t* src = from_a[item] ? (a_pool + src_offsets[item]) : (b_pool + src_offsets[item]);
+  uint64_t* dst       = dst_pool + (dst_start + item) * 1024;
+  for (uint32_t i = threadIdx.x; i < 1024u; i += blockDim.x)
+    dst[i] = src[i];
+}
+
+// ============================================================================
+// Host-side helpers
+// ============================================================================
+
+struct HostIndex {
+  std::vector<uint16_t> keys;
+  std::vector<roaring_container_type> types;
+  std::vector<uint32_t> offsets;
+  std::vector<uint16_t> cardinalities;
+};
+
+static HostIndex download_index(const gpu_roaring& g, cudaStream_t stream)
+{
+  HostIndex h;
+  uint32_t n = g.n_containers;
+  h.keys.resize(n);
+  h.types.resize(n);
+  h.offsets.resize(n);
+  h.cardinalities.resize(n);
+  if (n == 0) return h;
+
+  RAFT_CUDA_TRY(cudaMemcpyAsync(
+    h.keys.data(), g.keys.data(), n * sizeof(uint16_t), cudaMemcpyDeviceToHost, stream));
+  RAFT_CUDA_TRY(cudaMemcpyAsync(h.types.data(),
+                                g.types.data(),
+                                n * sizeof(roaring_container_type),
+                                cudaMemcpyDeviceToHost,
+                                stream));
+  RAFT_CUDA_TRY(cudaMemcpyAsync(
+    h.offsets.data(), g.offsets.data(), n * sizeof(uint32_t), cudaMemcpyDeviceToHost, stream));
+  RAFT_CUDA_TRY(cudaMemcpyAsync(h.cardinalities.data(),
+                                g.cardinalities.data(),
+                                n * sizeof(uint16_t),
+                                cudaMemcpyDeviceToHost,
+                                stream));
+  RAFT_CUDA_TRY(cudaStreamSynchronize(stream));
+  return h;
+}
+
+static void launch_bb(roaring_set_op op,
+                      uint32_t n,
+                      const BitmapBitmapWork* d_work,
+                      const uint64_t* a,
+                      const uint64_t* b,
+                      uint64_t* out,
+                      uint16_t* cards,
+                      cudaStream_t stream)
+{
+  switch (op) {
+    case roaring_set_op::AND:
+      bitmap_bitmap_kernel<roaring_set_op::AND><<<n, 256, 0, stream>>>(d_work, n, a, b, out, cards);
+      break;
+    case roaring_set_op::OR:
+      bitmap_bitmap_kernel<roaring_set_op::OR><<<n, 256, 0, stream>>>(d_work, n, a, b, out, cards);
+      break;
+    case roaring_set_op::ANDNOT:
+      bitmap_bitmap_kernel<roaring_set_op::ANDNOT>
+        <<<n, 256, 0, stream>>>(d_work, n, a, b, out, cards);
+      break;
+    case roaring_set_op::XOR:
+      bitmap_bitmap_kernel<roaring_set_op::XOR><<<n, 256, 0, stream>>>(d_work, n, a, b, out, cards);
+      break;
+  }
+}
+
+// ============================================================================
+// set_op: the main set operation implementation
+// ============================================================================
+
+gpu_roaring set_op(raft::resources const& res,
+                   const gpu_roaring& a,
+                   const gpu_roaring& b,
+                   roaring_set_op op)
+{
+  auto stream = raft::resource::get_cuda_stream(res);
+  auto ha     = download_index(a, stream);
+  auto hb     = download_index(b, stream);
+
+  // Container matching (merge of sorted key arrays)
+  struct WorkItem {
+    uint16_t key;
+    int32_t a_idx, b_idx;
+    roaring_container_type a_type, b_type;
+  };
+  std::vector<WorkItem> work;
+  work.reserve(a.n_containers + b.n_containers);
+
+  uint32_t ia = 0, ib = 0;
+  while (ia < a.n_containers && ib < b.n_containers) {
+    if (ha.keys[ia] < hb.keys[ib]) {
+      if (op == roaring_set_op::OR || op == roaring_set_op::XOR || op == roaring_set_op::ANDNOT)
+        work.push_back({ha.keys[ia], (int32_t)ia, -1, ha.types[ia], roaring_container_type::ARRAY});
+      ++ia;
+    } else if (ha.keys[ia] > hb.keys[ib]) {
+      if (op == roaring_set_op::OR || op == roaring_set_op::XOR)
+        work.push_back({hb.keys[ib], -1, (int32_t)ib, roaring_container_type::ARRAY, hb.types[ib]});
+      ++ib;
+    } else {
+      work.push_back({ha.keys[ia], (int32_t)ia, (int32_t)ib, ha.types[ia], hb.types[ib]});
+      ++ia;
+      ++ib;
+    }
+  }
+  while (ia < a.n_containers) {
+    if (op == roaring_set_op::OR || op == roaring_set_op::XOR || op == roaring_set_op::ANDNOT)
+      work.push_back({ha.keys[ia], (int32_t)ia, -1, ha.types[ia], roaring_container_type::ARRAY});
+    ++ia;
+  }
+  while (ib < b.n_containers) {
+    if (op == roaring_set_op::OR || op == roaring_set_op::XOR)
+      work.push_back({hb.keys[ib], -1, (int32_t)ib, roaring_container_type::ARRAY, hb.types[ib]});
+    ++ib;
+  }
+
+  if (work.empty()) return gpu_roaring(stream);
+
+  // Classify by type pair
+  struct BBW {
+    uint16_t key;
+    uint32_t a_off, b_off;
+  };
+  struct CopyW {
+    uint16_t key;
+    uint32_t offset;
+    uint16_t card;
+    roaring_container_type type;
+    bool from_a;
+  };
+  struct MixedW {
+    uint16_t key;
+    roaring_container_type a_type, b_type;
+    uint32_t a_off, b_off;
+    uint16_t a_card, b_card;
+    bool a_is_bmp, b_is_bmp;
+  };
+
+  std::vector<BBW> bb_work;
+  std::vector<CopyW> copy_bmp, copy_arr, copy_run;
+  std::vector<MixedW> mixed;
+
+  auto elem_off = [](roaring_container_type t, uint32_t byte_off) -> uint32_t {
+    if (t == roaring_container_type::BITMAP) return byte_off / sizeof(uint64_t);
+    return byte_off / sizeof(uint16_t);
+  };
+
+  for (auto& wi : work) {
+    if (wi.a_idx < 0) {
+      auto& ho  = hb;
+      int32_t i = wi.b_idx;
+      CopyW cw  = {wi.key, ho.offsets[i], ho.cardinalities[i], ho.types[i], false};
+      if (ho.types[i] == roaring_container_type::BITMAP)
+        copy_bmp.push_back(cw);
+      else if (ho.types[i] == roaring_container_type::ARRAY)
+        copy_arr.push_back(cw);
+      else
+        copy_run.push_back(cw);
+    } else if (wi.b_idx < 0) {
+      auto& ho  = ha;
+      int32_t i = wi.a_idx;
+      CopyW cw  = {wi.key, ho.offsets[i], ho.cardinalities[i], ho.types[i], true};
+      if (ho.types[i] == roaring_container_type::BITMAP)
+        copy_bmp.push_back(cw);
+      else if (ho.types[i] == roaring_container_type::ARRAY)
+        copy_arr.push_back(cw);
+      else
+        copy_run.push_back(cw);
+    } else {
+      auto ta = ha.types[wi.a_idx];
+      auto tb = hb.types[wi.b_idx];
+      if (ta == roaring_container_type::BITMAP && tb == roaring_container_type::BITMAP) {
+        bb_work.push_back({wi.key,
+                           ha.offsets[wi.a_idx] / (uint32_t)sizeof(uint64_t),
+                           hb.offsets[wi.b_idx] / (uint32_t)sizeof(uint64_t)});
+      } else {
+        // All non-bitmap×bitmap: expand to bitmap
+        mixed.push_back({wi.key,
+                         ta,
+                         tb,
+                         elem_off(ta, ha.offsets[wi.a_idx]),
+                         elem_off(tb, hb.offsets[wi.b_idx]),
+                         ha.cardinalities[wi.a_idx],
+                         hb.cardinalities[wi.b_idx],
+                         ta == roaring_container_type::BITMAP,
+                         tb == roaring_container_type::BITMAP});
+      }
+    }
+  }
+
+  uint32_t out_n_bitmap = static_cast<uint32_t>(bb_work.size() + mixed.size() + copy_bmp.size());
+
+  // Allocate output bitmap pool
+  rmm::device_uvector<uint64_t> d_out_bmp(static_cast<size_t>(out_n_bitmap) * 1024, stream);
+
+  // Execute bitmap×bitmap
+  std::vector<uint16_t> bb_cards(bb_work.size());
+  if (!bb_work.empty()) {
+    uint32_t n_bb = static_cast<uint32_t>(bb_work.size());
+    rmm::device_uvector<BitmapBitmapWork> d_bb(n_bb, stream);
+    rmm::device_uvector<uint16_t> d_bb_cards(n_bb, stream);
+    raft::update_device(
+      d_bb.data(), reinterpret_cast<const BitmapBitmapWork*>(bb_work.data()), n_bb, stream);
+
+    launch_bb(op,
+              n_bb,
+              d_bb.data(),
+              a.bitmap_data.data(),
+              b.bitmap_data.data(),
+              d_out_bmp.data(),
+              d_bb_cards.data(),
+              stream);
+    RAFT_CUDA_TRY(cudaPeekAtLastError());
+
+    raft::update_host(bb_cards.data(), d_bb_cards.data(), n_bb, stream);
+    RAFT_CUDA_TRY(cudaStreamSynchronize(stream));
+  }
+
+  // Execute mixed pairs (expand + bitmap op)
+  std::vector<uint16_t> mixed_cards(mixed.size());
+  rmm::device_uvector<uint64_t> d_tmp_a(0, stream);
+  rmm::device_uvector<uint64_t> d_tmp_b(0, stream);
+  if (!mixed.empty()) {
+    uint32_t n_mx = static_cast<uint32_t>(mixed.size());
+    d_tmp_a.resize(static_cast<size_t>(n_mx) * 1024, stream);
+    d_tmp_b.resize(static_cast<size_t>(n_mx) * 1024, stream);
+
+    // Build expand work
+    std::vector<ExpandWork> exp_a(n_mx), exp_b(n_mx);
+    for (uint32_t i = 0; i < n_mx; ++i) {
+      exp_a[i] = {mixed[i].a_off, mixed[i].a_card, mixed[i].a_type};
+      exp_b[i] = {mixed[i].b_off, mixed[i].b_card, mixed[i].b_type};
+    }
+
+    rmm::device_uvector<ExpandWork> d_exp_a(n_mx, stream);
+    rmm::device_uvector<ExpandWork> d_exp_b(n_mx, stream);
+    raft::update_device(d_exp_a.data(), exp_a.data(), n_mx, stream);
+    raft::update_device(d_exp_b.data(), exp_b.data(), n_mx, stream);
+
+    RAFT_CUDA_TRY(cudaMemsetAsync(d_tmp_a.data(), 0, d_tmp_a.size() * sizeof(uint64_t), stream));
+    RAFT_CUDA_TRY(cudaMemsetAsync(d_tmp_b.data(), 0, d_tmp_b.size() * sizeof(uint64_t), stream));
+
+    expand_to_bitmap_kernel<<<n_mx, 256, 0, stream>>>(
+      d_exp_a.data(), n_mx, a.array_data.data(), a.run_data.data(), d_tmp_a.data());
+    expand_to_bitmap_kernel<<<n_mx, 256, 0, stream>>>(
+      d_exp_b.data(), n_mx, b.array_data.data(), b.run_data.data(), d_tmp_b.data());
+    RAFT_CUDA_TRY(cudaPeekAtLastError());
+
+    // Copy bitmap containers from source pools into temp buffers
+    for (uint32_t i = 0; i < n_mx; ++i) {
+      if (mixed[i].a_is_bmp) {
+        RAFT_CUDA_TRY(cudaMemcpyAsync(d_tmp_a.data() + i * 1024,
+                                      a.bitmap_data.data() + mixed[i].a_off,
+                                      1024 * sizeof(uint64_t),
+                                      cudaMemcpyDeviceToDevice,
+                                      stream));
+      }
+      if (mixed[i].b_is_bmp) {
+        RAFT_CUDA_TRY(cudaMemcpyAsync(d_tmp_b.data() + i * 1024,
+                                      b.bitmap_data.data() + mixed[i].b_off,
+                                      1024 * sizeof(uint64_t),
+                                      cudaMemcpyDeviceToDevice,
+                                      stream));
+      }
+    }
+
+    // Build BitmapBitmapWork for temp pools
+    std::vector<BitmapBitmapWork> mx_bb(n_mx);
+    for (uint32_t i = 0; i < n_mx; ++i)
+      mx_bb[i] = {mixed[i].key, i * 1024, i * 1024};
+
+    rmm::device_uvector<BitmapBitmapWork> d_mx_bb(n_mx, stream);
+    rmm::device_uvector<uint16_t> d_mx_cards(n_mx, stream);
+    raft::update_device(d_mx_bb.data(), mx_bb.data(), n_mx, stream);
+
+    uint32_t bb_sz = static_cast<uint32_t>(bb_work.size());
+    launch_bb(op,
+              n_mx,
+              d_mx_bb.data(),
+              d_tmp_a.data(),
+              d_tmp_b.data(),
+              d_out_bmp.data() + static_cast<size_t>(bb_sz) * 1024,
+              d_mx_cards.data(),
+              stream);
+    RAFT_CUDA_TRY(cudaPeekAtLastError());
+
+    raft::update_host(mixed_cards.data(), d_mx_cards.data(), n_mx, stream);
+    RAFT_CUDA_TRY(cudaStreamSynchronize(stream));
+  }
+
+  // Copy-through bitmap containers
+  if (!copy_bmp.empty()) {
+    uint32_t n_cp  = static_cast<uint32_t>(copy_bmp.size());
+    uint32_t start = static_cast<uint32_t>(bb_work.size() + mixed.size());
+    for (uint32_t i = 0; i < n_cp; ++i) {
+      const uint64_t* src = copy_bmp[i].from_a
+                              ? a.bitmap_data.data() + copy_bmp[i].offset / sizeof(uint64_t)
+                              : b.bitmap_data.data() + copy_bmp[i].offset / sizeof(uint64_t);
+      RAFT_CUDA_TRY(cudaMemcpyAsync(d_out_bmp.data() + (start + i) * 1024,
+                                    src,
+                                    1024 * sizeof(uint64_t),
+                                    cudaMemcpyDeviceToDevice,
+                                    stream));
+    }
+  }
+
+  // Copy-through array containers
+  uint32_t total_arr = 0;
+  std::vector<uint32_t> ca_off(copy_arr.size());
+  for (size_t i = 0; i < copy_arr.size(); ++i) {
+    ca_off[i] = total_arr;
+    total_arr += copy_arr[i].card;
+  }
+  rmm::device_uvector<uint16_t> d_out_arr(total_arr, stream);
+  for (size_t i = 0; i < copy_arr.size(); ++i) {
+    const uint16_t* src = copy_arr[i].from_a
+                            ? a.array_data.data() + copy_arr[i].offset / sizeof(uint16_t)
+                            : b.array_data.data() + copy_arr[i].offset / sizeof(uint16_t);
+    RAFT_CUDA_TRY(cudaMemcpyAsync(d_out_arr.data() + ca_off[i],
+                                  src,
+                                  copy_arr[i].card * sizeof(uint16_t),
+                                  cudaMemcpyDeviceToDevice,
+                                  stream));
+  }
+
+  // Copy-through run containers
+  uint32_t total_run_vals = 0;
+  std::vector<uint32_t> cr_off(copy_run.size());
+  for (size_t i = 0; i < copy_run.size(); ++i) {
+    cr_off[i] = total_run_vals;
+    total_run_vals += copy_run[i].card * 2;
+  }
+  rmm::device_uvector<uint16_t> d_out_run(total_run_vals, stream);
+  for (size_t i = 0; i < copy_run.size(); ++i) {
+    const uint16_t* src = copy_run[i].from_a
+                            ? a.run_data.data() + copy_run[i].offset / sizeof(uint16_t)
+                            : b.run_data.data() + copy_run[i].offset / sizeof(uint16_t);
+    RAFT_CUDA_TRY(cudaMemcpyAsync(d_out_run.data() + cr_off[i],
+                                  src,
+                                  copy_run[i].card * 2 * sizeof(uint16_t),
+                                  cudaMemcpyDeviceToDevice,
+                                  stream));
+  }
+
+  RAFT_CUDA_TRY(cudaStreamSynchronize(stream));
+
+  // Assemble output
+  std::vector<uint16_t> out_keys;
+  std::vector<roaring_container_type> out_types;
+  std::vector<uint32_t> out_offsets;
+  std::vector<uint16_t> out_cards;
+
+  uint32_t bb_idx = 0, mx_idx = 0, cb_idx = 0, ca_idx = 0, cr_idx = 0;
+
+  for (auto& wi : work) {
+    if (wi.a_idx < 0 || wi.b_idx < 0) {
+      // Copy-through
+      auto& ho        = (wi.a_idx < 0) ? hb : ha;
+      int32_t src_idx = (wi.a_idx < 0) ? wi.b_idx : wi.a_idx;
+      auto ct         = ho.types[src_idx];
+      if (ct == roaring_container_type::BITMAP) {
+        uint32_t slot = static_cast<uint32_t>(bb_work.size() + mixed.size()) + cb_idx;
+        out_keys.push_back(wi.key);
+        out_types.push_back(roaring_container_type::BITMAP);
+        out_offsets.push_back(slot * 1024 * sizeof(uint64_t));
+        out_cards.push_back(ho.cardinalities[src_idx]);
+        ++cb_idx;
+      } else if (ct == roaring_container_type::ARRAY) {
+        out_keys.push_back(wi.key);
+        out_types.push_back(roaring_container_type::ARRAY);
+        out_offsets.push_back(ca_off[ca_idx] * sizeof(uint16_t));
+        out_cards.push_back(copy_arr[ca_idx].card);
+        ++ca_idx;
+      } else {
+        out_keys.push_back(wi.key);
+        out_types.push_back(roaring_container_type::RUN);
+        out_offsets.push_back(cr_off[cr_idx] * sizeof(uint16_t));
+        out_cards.push_back(copy_run[cr_idx].card);
+        ++cr_idx;
+      }
+    } else {
+      auto ta = ha.types[wi.a_idx];
+      auto tb = hb.types[wi.b_idx];
+      if (ta == roaring_container_type::BITMAP && tb == roaring_container_type::BITMAP) {
+        uint16_t card = bb_cards[bb_idx];
+        if (card == 0 && op != roaring_set_op::OR) {
+          ++bb_idx;
+          continue;
+        }
+        out_keys.push_back(wi.key);
+        out_types.push_back(roaring_container_type::BITMAP);
+        out_offsets.push_back(bb_idx * 1024 * sizeof(uint64_t));
+        out_cards.push_back(card);
+        ++bb_idx;
+      } else {
+        uint16_t card = mixed_cards[mx_idx];
+        if (card == 0 && op != roaring_set_op::OR) {
+          ++mx_idx;
+          continue;
+        }
+        uint32_t slot = static_cast<uint32_t>(bb_work.size()) + mx_idx;
+        out_keys.push_back(wi.key);
+        out_types.push_back(roaring_container_type::BITMAP);
+        out_offsets.push_back(slot * 1024 * sizeof(uint64_t));
+        out_cards.push_back(card);
+        ++mx_idx;
+      }
+    }
+  }
+
+  // Build result
+  gpu_roaring result(stream);
+  result.n_containers        = static_cast<uint32_t>(out_keys.size());
+  result.universe_size       = std::max(a.universe_size, b.universe_size);
+  result.bitmap_data         = std::move(d_out_bmp);
+  result.n_bitmap_containers = out_n_bitmap;
+  result.array_data          = std::move(d_out_arr);
+  result.n_array_containers  = static_cast<uint32_t>(copy_arr.size());
+  result.run_data            = std::move(d_out_run);
+  result.n_run_containers    = static_cast<uint32_t>(copy_run.size());
+
+  if (result.n_containers > 0) {
+    result.keys.resize(result.n_containers, stream);
+    result.types.resize(result.n_containers, stream);
+    result.offsets.resize(result.n_containers, stream);
+    result.cardinalities.resize(result.n_containers, stream);
+    raft::update_device(result.keys.data(), out_keys.data(), result.n_containers, stream);
+    raft::update_device(result.types.data(), out_types.data(), result.n_containers, stream);
+    raft::update_device(result.offsets.data(), out_offsets.data(), result.n_containers, stream);
+    raft::update_device(result.cardinalities.data(), out_cards.data(), result.n_containers, stream);
+    raft::resource::sync_stream(res);
+  }
+
+  return result;
+}
+
+// ============================================================================
+// Multi-AND / Multi-OR
+// ============================================================================
+
+gpu_roaring multi_and(raft::resources const& res, const gpu_roaring* bitmaps, uint32_t count)
+{
+  if (count == 0) return gpu_roaring(raft::resource::get_cuda_stream(res));
+  if (count == 1) return set_op(res, bitmaps[0], bitmaps[0], roaring_set_op::AND);
+
+  gpu_roaring result = set_op(res, bitmaps[0], bitmaps[1], roaring_set_op::AND);
+  for (uint32_t i = 2; i < count; ++i) {
+    gpu_roaring next = set_op(res, result, bitmaps[i], roaring_set_op::AND);
+    result           = std::move(next);
+  }
+  return result;
+}
+
+gpu_roaring multi_or(raft::resources const& res, const gpu_roaring* bitmaps, uint32_t count)
+{
+  if (count == 0) return gpu_roaring(raft::resource::get_cuda_stream(res));
+  if (count == 1) return set_op(res, bitmaps[0], bitmaps[0], roaring_set_op::AND);
+
+  gpu_roaring result = set_op(res, bitmaps[0], bitmaps[1], roaring_set_op::OR);
+  for (uint32_t i = 2; i < count; ++i) {
+    gpu_roaring next = set_op(res, result, bitmaps[i], roaring_set_op::OR);
+    result           = std::move(next);
+  }
+  return result;
+}
+
+// ============================================================================
+// from_sorted_ids (device input)
+// ============================================================================
+
+gpu_roaring from_sorted_ids(raft::resources const& res,
+                            raft::device_vector_view<const uint32_t, int64_t> sorted_ids,
+                            uint32_t universe_size)
+{
+  auto stream = raft::resource::get_cuda_stream(res);
+  std::vector<uint32_t> h_ids(sorted_ids.extent(0));
+  raft::update_host(h_ids.data(), sorted_ids.data_handle(), h_ids.size(), stream);
+  raft::resource::sync_stream(res);
+  return from_sorted_ids(res, h_ids.data(), static_cast<uint32_t>(h_ids.size()), universe_size);
+}
+
+// ============================================================================
+// Batched CSR emission: every container of every bitmap in one launch.
+// One CTA per container; each container writes its sorted member ids into
+// its precomputed segment of the global indices array (offsets derive from
+// host-side element counts — no count kernels, no syncs).
+// ============================================================================
+
+namespace {
+
+constexpr uint32_t kEmitBlock = 256;
+
+__device__ inline void emit_block_scan(uint32_t* smem, uint32_t tid)
+{
+  for (uint32_t stride = 1; stride < kEmitBlock; stride *= 2) {
+    uint32_t v = (tid >= stride) ? smem[tid - stride] : 0;
+    __syncthreads();
+    smem[tid] += v;
+    __syncthreads();
+  }
+}
+
+__global__ void emit_csr_indices_kernel(const uint32_t* keys,
+                                        const uint8_t* types,
+                                        const uint32_t* elem_counts,
+                                        const void* const* data,
+                                        const int64_t* out_offsets,
+                                        uint32_t n_containers,
+                                        int64_t* indices)
+{
+  uint32_t cid = blockIdx.x;
+  if (cid >= n_containers) return;
+  int64_t base_id   = static_cast<int64_t>(keys[cid]) << 16;
+  int64_t out_start = out_offsets[cid];
+  uint32_t tid      = threadIdx.x;
+  auto ctype        = static_cast<roaring_container_type>(types[cid]);
+
+  if (ctype == roaring_container_type::ARRAY) {
+    const uint16_t* arr = static_cast<const uint16_t*>(data[cid]);
+    for (uint32_t i = tid; i < elem_counts[cid]; i += kEmitBlock)
+      indices[out_start + i] = base_id | arr[i];
+  } else if (ctype == roaring_container_type::BITMAP) {
+    const uint64_t* bmp = static_cast<const uint64_t*>(data[cid]);
+    __shared__ uint32_t counts[kEmitBlock];
+    uint32_t mine = 0;
+    for (uint32_t w = tid * 4; w < tid * 4 + 4; ++w)
+      mine += static_cast<uint32_t>(__popcll(bmp[w]));
+    counts[tid] = mine;
+    __syncthreads();
+    emit_block_scan(counts, tid);
+    uint32_t pos = (tid > 0) ? counts[tid - 1] : 0;
+    for (uint32_t w = tid * 4; w < tid * 4 + 4; ++w) {
+      uint64_t word = bmp[w];
+      while (word != 0) {
+        uint32_t bit = static_cast<uint32_t>(__ffsll(static_cast<long long>(word))) - 1;
+        indices[out_start + pos++] = base_id | (w * 64 + bit);
+        word &= word - 1;
+      }
+    }
+  }
+  // RUN containers are rejected host-side (not produced by v1 construction).
+}
+
+}  // namespace
+
+void to_csr_indices(raft::resources const& res,
+                    const gpu_roaring* const* bitmaps,
+                    uint32_t n_bitmaps,
+                    int64_t* indices)
+{
+  auto stream = raft::resource::get_cuda_stream(res);
+
+  // flatten per-container descriptors from the host mirrors
+  std::vector<uint32_t> keys, counts;
+  std::vector<uint8_t> types;
+  std::vector<const void*> data;
+  std::vector<int64_t> out_offsets;
+  int64_t cursor = 0;
+  for (uint32_t b = 0; b < n_bitmaps; ++b) {
+    const gpu_roaring& r = *bitmaps[b];
+    RAFT_EXPECTS(!r.negated, "to_csr_indices: negated bitmaps are not supported");
+    RAFT_EXPECTS(r.h_keys.size() == r.n_containers,
+                 "to_csr_indices: bitmap lacks host container metadata "
+                 "(construct via from_sorted_ids)");
+    for (uint32_t c = 0; c < r.n_containers; ++c) {
+      keys.push_back(r.h_keys[c]);
+      types.push_back(static_cast<uint8_t>(r.h_types[c]));
+      out_offsets.push_back(cursor);
+      switch (r.h_types[c]) {
+        case roaring_container_type::ARRAY:
+          data.push_back(r.array_data.data() + r.h_offsets[c] / sizeof(uint16_t));
+          counts.push_back(r.h_element_counts[c]);
+          cursor += r.h_element_counts[c];
+          break;
+        case roaring_container_type::BITMAP:
+          data.push_back(r.bitmap_data.data() + r.h_offsets[c] / sizeof(uint64_t));
+          counts.push_back(r.h_element_counts[c]);
+          cursor += r.h_element_counts[c];
+          break;
+        default: RAFT_FAIL("to_csr_indices: RUN containers are not produced by v1 construction");
+      }
+    }
+  }
+  uint32_t n_c = static_cast<uint32_t>(keys.size());
+  if (n_c == 0) return;
+
+  rmm::device_uvector<uint32_t> d_keys(n_c, stream), d_counts(n_c, stream);
+  rmm::device_uvector<uint8_t> d_types(n_c, stream);
+  rmm::device_uvector<const void*> d_data(n_c, stream);
+  rmm::device_uvector<int64_t> d_off(n_c, stream);
+  raft::update_device(d_keys.data(), keys.data(), n_c, stream);
+  raft::update_device(d_counts.data(), counts.data(), n_c, stream);
+  raft::update_device(d_types.data(), types.data(), n_c, stream);
+  raft::update_device(d_data.data(), data.data(), n_c, stream);
+  raft::update_device(d_off.data(), out_offsets.data(), n_c, stream);
+
+  emit_csr_indices_kernel<<<n_c, kEmitBlock, 0, stream>>>(
+    d_keys.data(), d_types.data(), d_counts.data(), d_data.data(), d_off.data(), n_c, indices);
+  RAFT_CUDA_TRY(cudaGetLastError());
+}
+
+// ============================================================================
+// Batched decompression into a dense [n_bitmaps, n_rows] bit matrix
+// ============================================================================
+
+namespace {
+
+__global__ void decompress_bitmap_row_kernel(const uint32_t* keys,
+                                             const uint8_t* types,
+                                             const uint32_t* elem_counts,
+                                             const void* const* data,
+                                             const int64_t* row_of,
+                                             uint32_t n_containers,
+                                             int64_t n_rows,
+                                             uint32_t* output)
+{
+  uint32_t cid = blockIdx.x;
+  if (cid >= n_containers) return;
+  int64_t row_bit_base = row_of[cid] * n_rows;
+  uint32_t key         = keys[cid];
+  uint32_t tid         = threadIdx.x;
+  auto ctype           = static_cast<roaring_container_type>(types[cid]);
+
+  auto set_bit = [&](uint32_t low) {
+    int64_t bit = row_bit_base + ((static_cast<int64_t>(key) << 16) | low);
+    atomicOr(&output[bit >> 5], 1u << (bit & 31));
+  };
+
+  if (ctype == roaring_container_type::ARRAY) {
+    const uint16_t* arr = static_cast<const uint16_t*>(data[cid]);
+    for (uint32_t i = tid; i < elem_counts[cid]; i += blockDim.x)
+      set_bit(arr[i]);
+  } else if (ctype == roaring_container_type::BITMAP) {
+    const uint64_t* bmp = static_cast<const uint64_t*>(data[cid]);
+    for (uint32_t w = tid; w < 1024; w += blockDim.x) {
+      uint64_t word = bmp[w];
+      while (word != 0) {
+        uint32_t bit = static_cast<uint32_t>(__ffsll(static_cast<long long>(word))) - 1;
+        set_bit(w * 64 + bit);
+        word &= word - 1;
+      }
+    }
+  }
+  // RUN containers are rejected host-side (not produced by v1 construction).
+}
+
+}  // namespace
+
+void decompress_to_bitmap(raft::resources const& res,
+                          const gpu_roaring* const* bitmaps,
+                          uint32_t n_bitmaps,
+                          int64_t n_rows,
+                          uint32_t* output)
+{
+  auto stream = raft::resource::get_cuda_stream(res);
+
+  std::vector<uint32_t> keys, counts;
+  std::vector<uint8_t> types;
+  std::vector<const void*> data;
+  std::vector<int64_t> row_of;
+  for (uint32_t b = 0; b < n_bitmaps; ++b) {
+    const gpu_roaring& r = *bitmaps[b];
+    RAFT_EXPECTS(!r.negated, "decompress_to_bitmap: negated bitmaps are not supported");
+    RAFT_EXPECTS(r.h_keys.size() == r.n_containers,
+                 "decompress_to_bitmap: bitmap lacks host container metadata "
+                 "(construct via from_sorted_ids)");
+    for (uint32_t c = 0; c < r.n_containers; ++c) {
+      keys.push_back(r.h_keys[c]);
+      types.push_back(static_cast<uint8_t>(r.h_types[c]));
+      row_of.push_back(static_cast<int64_t>(b));
+      counts.push_back(r.h_element_counts[c]);
+      switch (r.h_types[c]) {
+        case roaring_container_type::ARRAY:
+          data.push_back(r.array_data.data() + r.h_offsets[c] / sizeof(uint16_t));
+          break;
+        case roaring_container_type::BITMAP:
+          data.push_back(r.bitmap_data.data() + r.h_offsets[c] / sizeof(uint64_t));
+          break;
+        default:
+          RAFT_FAIL("decompress_to_bitmap: RUN containers are not produced by v1 construction");
+      }
+    }
+  }
+  uint32_t n_c = static_cast<uint32_t>(keys.size());
+  if (n_c == 0) return;
+
+  rmm::device_uvector<uint32_t> d_keys(n_c, stream), d_counts(n_c, stream);
+  rmm::device_uvector<uint8_t> d_types(n_c, stream);
+  rmm::device_uvector<const void*> d_data(n_c, stream);
+  rmm::device_uvector<int64_t> d_row(n_c, stream);
+  raft::update_device(d_keys.data(), keys.data(), n_c, stream);
+  raft::update_device(d_counts.data(), counts.data(), n_c, stream);
+  raft::update_device(d_types.data(), types.data(), n_c, stream);
+  raft::update_device(d_data.data(), data.data(), n_c, stream);
+  raft::update_device(d_row.data(), row_of.data(), n_c, stream);
+
+  decompress_bitmap_row_kernel<<<n_c, 256, 0, stream>>>(d_keys.data(),
+                                                        d_types.data(),
+                                                        d_counts.data(),
+                                                        d_data.data(),
+                                                        d_row.data(),
+                                                        n_c,
+                                                        n_rows,
+                                                        output);
+  RAFT_CUDA_TRY(cudaGetLastError());
+}
+
+}  // namespace cuvs::core
diff --git a/cpp/src/neighbors/detail/knn_brute_force.cuh b/cpp/src/neighbors/detail/knn_brute_force.cuh
index 34ef8dd937..2b28ada909 100644
--- a/cpp/src/neighbors/detail/knn_brute_force.cuh
+++ b/cpp/src/neighbors/detail/knn_brute_force.cuh
@@ -15,6 +15,7 @@
 #include "./fused_l2_knn.cuh"
 #include "./haversine_distance.cuh"
 #include "./knn_utils.cuh"
+#include "./knn_brute_force_roaring.cuh"
 
 #include <raft/core/bitmap.cuh>
 #include <raft/core/copy.cuh>
@@ -589,7 +590,7 @@ void brute_force_search_filtered(
   const cuvs::neighbors::filtering::base_filter* filter,
   raft::device_matrix_view<IdxT, IdxT, raft::row_major> neighbors,
   raft::device_matrix_view<DistanceT, IdxT, raft::row_major> distances,
-  std::optional<raft::device_vector_view<const DistanceT, IdxT>> query_norms = std::nullopt)
+  std::optional<raft::device_vector_view<const DistanceT, IdxT>> query_norms)
 {
   auto metric = idx.metric();
 
@@ -754,6 +755,29 @@ void search(raft::resources const& res,
   if constexpr (std::is_same_v<LayoutT, raft::col_major>) {
     RAFT_FAIL("filtered search isn't available with col_major queries yet");
   } else {
+    if constexpr (std::is_same_v<T, float> && std::is_same_v<DistT, float>) {
+      try {
+        auto& sample_filter =
+          dynamic_cast<const cuvs::neighbors::filtering::roaring_filter&>(sample_filter_ref);
+        return brute_force_search_roaring(res, idx, queries, sample_filter, neighbors, distances);
+      } catch (const std::bad_cast&) {
+      }
+      try {
+        auto& sample_filter =
+          dynamic_cast<const cuvs::neighbors::filtering::roaring_matrix_filter&>(
+            sample_filter_ref);
+        return brute_force_search_roaring_matrix(
+          res, idx, queries, sample_filter, neighbors, distances);
+      } catch (const std::bad_cast&) {
+      }
+    } else {
+      RAFT_EXPECTS(
+        sample_filter_ref.get_filter_type() != cuvs::neighbors::filtering::FilterType::Roaring &&
+          sample_filter_ref.get_filter_type() !=
+            cuvs::neighbors::filtering::FilterType::RoaringMatrix,
+        "roaring filters currently support float32 data only");
+    }
+
     try {
       auto& sample_filter =
         dynamic_cast<const cuvs::neighbors::filtering::bitmap_filter<uint32_t, int64_t>&>(
diff --git a/cpp/src/neighbors/detail/knn_brute_force_roaring.cuh b/cpp/src/neighbors/detail/knn_brute_force_roaring.cuh
new file mode 100644
index 0000000000..b6be4b6e96
--- /dev/null
+++ b/cpp/src/neighbors/detail/knn_brute_force_roaring.cuh
@@ -0,0 +1,438 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#pragma once
+
+/*
+ * Brute-force prefiltered search with Roaring-bitmap filters.
+ *
+ * Both entry points implement a three-regime dispatch keyed on filter
+ * selectivity s (known on the host from the filters' cardinalities — no
+ * count kernels, no device synchronization):
+ *
+ *  - very sparse:   emit the filter's member ids straight into a CSR
+ *                   structure (one kernel; indptr is free from host-side
+ *                   cardinalities), compute distances at the nnz positions
+ *                   with SDDMM, select_k over the sparse rows.
+ *  - mid (shared filter only): gather the selected rows and run a dense
+ *                   GEMM + select_k over [n_queries, |filter|] — computes
+ *                   |filter| columns instead of n_rows. This is where the
+ *                   bulk of the win lives (up to ~19x vs the dense+mask
+ *                   path at 1-10% selectivity, 10M x 512d).
+ *  - dense:         decompress to a flat bitset / bit matrix and delegate
+ *                   to the existing bitset/bitmap pipeline, which is
+ *                   already optimal above ~40-50% selectivity.
+ *
+ * The sparse/mid threshold is dimension-dependent (cusparse SDDMM
+ * degrades with d, dense GEMM does not): measured crossovers on
+ * RTX 5090 are ~3% at d=128 and ~0.1% at d=512.
+ *
+ * float32 only in this version (the SDDMM/GEMM plumbing is instantiated
+ * for float; half support tracks the existing filtered-path limitation).
+ */
+
+#include <cuvs/core/roaring.hpp>
+#include <cuvs/neighbors/brute_force.hpp>
+#include <cuvs/neighbors/roaring_filter.hpp>
+
+#include "./knn_utils.cuh"
+
+#include <raft/core/bitmap.cuh>
+#include <raft/core/bitset.cuh>
+#include <raft/core/device_mdarray.hpp>
+#include <raft/core/resource/cuda_stream.hpp>
+#include <raft/core/resource/thrust_policy.hpp>
+#include <raft/core/resources.hpp>
+#include <raft/linalg/gemm.cuh>
+#include <raft/linalg/norm.cuh>
+#include <raft/matrix/select_k.cuh>
+#include <raft/sparse/convert/coo.cuh>
+#include <raft/sparse/convert/csr.cuh>
+#include <raft/sparse/linalg/sddmm.hpp>
+#include <raft/sparse/matrix/select_k.cuh>
+
+#include <rmm/device_uvector.hpp>
+
+#include <thrust/fill.h>
+
+#include <cstdint>
+#include <vector>
+
+namespace cuvs::neighbors::detail {
+
+// defined in knn_brute_force.cuh (which includes this header first)
+template <typename T, typename IdxT, typename BitsT, typename DistanceT>
+void brute_force_search_filtered(
+  raft::resources const& res,
+  const cuvs::neighbors::brute_force::index<T, DistanceT>& idx,
+  raft::device_matrix_view<const T, IdxT, raft::row_major> queries,
+  const cuvs::neighbors::filtering::base_filter* filter,
+  raft::device_matrix_view<IdxT, IdxT, raft::row_major> neighbors,
+  raft::device_matrix_view<DistanceT, IdxT, raft::row_major> distances,
+  std::optional<raft::device_vector_view<const DistanceT, IdxT>> query_norms = std::nullopt);
+
+namespace roaring_detail {
+
+/** Selectivity below which the SDDMM path beats gathering + dense GEMM. */
+inline double sparse_threshold(int64_t dim) { return dim >= 256 ? 0.001 : 0.03; }
+/** Selectivity above which the dense masked pipeline wins. */
+constexpr double kDenseThreshold = 0.45;
+
+__global__ inline void replicate_indices_kernel(const int64_t* ids,
+                                                int64_t n_ids,
+                                                int64_t total,
+                                                int64_t* out)
+{
+  int64_t i = blockIdx.x * static_cast<int64_t>(blockDim.x) + threadIdx.x;
+  if (i < total) out[i] = ids[i % n_ids];
+}
+
+__global__ inline void gather_rows_kernel(const float* __restrict__ dataset,
+                                          const int64_t* __restrict__ ids,
+                                          int64_t n_ids,
+                                          int64_t dim,
+                                          float* __restrict__ out)
+{
+  int64_t row = blockIdx.x;
+  if (row >= n_ids) return;
+  const float* src = dataset + ids[row] * dim;
+  float* dst       = out + row * dim;
+  if ((dim & 3) == 0 && (reinterpret_cast<uintptr_t>(src) & 15) == 0) {
+    auto* src4 = reinterpret_cast<const float4*>(src);
+    auto* dst4 = reinterpret_cast<float4*>(dst);
+    for (int64_t c = threadIdx.x; c < dim / 4; c += blockDim.x)
+      dst4[c] = src4[c];
+  } else {
+    for (int64_t c = threadIdx.x; c < dim; c += blockDim.x)
+      dst[c] = src[c];
+  }
+}
+
+__global__ inline void gather_norms_kernel(const float* __restrict__ norms,
+                                           const int64_t* __restrict__ ids,
+                                           int64_t n_ids,
+                                           float* __restrict__ out)
+{
+  int64_t i = blockIdx.x * static_cast<int64_t>(blockDim.x) + threadIdx.x;
+  if (i < n_ids) out[i] = norms[ids[i]];
+}
+
+/** Dense counterpart of epilogue_on_csr: dist[q][u] from raw dots. */
+__global__ inline void dense_epilogue_kernel(float* __restrict__ dist,
+                                             const float* __restrict__ q_norms,
+                                             const float* __restrict__ r_norms,
+                                             int64_t n_queries,
+                                             int64_t n_cols,
+                                             cuvs::distance::DistanceType metric)
+{
+  int64_t i = blockIdx.x * static_cast<int64_t>(blockDim.x) + threadIdx.x;
+  if (i >= n_queries * n_cols) return;
+  int64_t q = i / n_cols;
+  int64_t u = i % n_cols;
+  float dot = dist[i];
+  if (metric == cuvs::distance::DistanceType::L2Expanded) {
+    dist[i] = -2.0f * dot + q_norms[q] + r_norms[u];
+  } else if (metric == cuvs::distance::DistanceType::L2SqrtExpanded) {
+    dist[i] = raft::sqrt(-2.0f * dot + q_norms[q] + r_norms[u]);
+  } else {  // CosineExpanded
+    dist[i] = 1.0f - dot / (q_norms[q] * r_norms[u]);
+  }
+}
+
+__global__ inline void remap_ids_kernel(const int64_t* __restrict__ pos,
+                                        const int64_t* __restrict__ ids,
+                                        int64_t n,
+                                        int64_t* __restrict__ out)
+{
+  int64_t i = blockIdx.x * static_cast<int64_t>(blockDim.x) + threadIdx.x;
+  if (i < n) out[i] = pos[i] < 0 ? int64_t{-1} : ids[pos[i]];
+}
+
+/** Compute query norms the way the existing filtered path does. */
+inline rmm::device_uvector<float> make_query_norms(
+  raft::resources const& res,
+  raft::device_matrix_view<const float, int64_t, raft::row_major> queries,
+  cuvs::distance::DistanceType metric)
+{
+  auto stream = raft::resource::get_cuda_stream(res);
+  rmm::device_uvector<float> q_norms(queries.extent(0), stream);
+  if (metric == cuvs::distance::DistanceType::CosineExpanded) {
+    raft::linalg::rowNorm<raft::linalg::L2Norm, true>(q_norms.data(),
+                                                      queries.data_handle(),
+                                                      queries.extent(1),
+                                                      queries.extent(0),
+                                                      stream,
+                                                      raft::sqrt_op{});
+  } else {
+    raft::linalg::rowNorm<raft::linalg::L2Norm, true>(q_norms.data(),
+                                                      queries.data_handle(),
+                                                      queries.extent(1),
+                                                      queries.extent(0),
+                                                      stream,
+                                                      raft::identity_op{});
+  }
+  return q_norms;
+}
+
+/** Shared SDDMM + epilogue + sparse select_k over a prebuilt CSR pattern. */
+inline void sparse_distances_select_k(
+  raft::resources const& res,
+  const cuvs::neighbors::brute_force::index<float, float>& idx,
+  raft::device_matrix_view<const float, int64_t, raft::row_major> queries,
+  const int64_t* indptr,
+  const int64_t* indices,
+  int64_t nnz,
+  raft::device_matrix_view<int64_t, int64_t, raft::row_major> neighbors,
+  raft::device_matrix_view<float, int64_t, raft::row_major> distances)
+{
+  auto stream       = raft::resource::get_cuda_stream(res);
+  auto metric       = idx.metric();
+  int64_t n_queries = queries.extent(0);
+  int64_t n_rows    = idx.dataset().extent(0);
+  int64_t dim       = idx.dataset().extent(1);
+
+  rmm::device_uvector<float> values(nnz, stream);
+  auto structure = raft::make_device_compressed_structure_view<int64_t, int64_t, int64_t>(
+    const_cast<int64_t*>(indptr), const_cast<int64_t*>(indices), n_queries, n_rows, nnz);
+  auto csr_view =
+    raft::make_device_csr_matrix_view<float, int64_t, int64_t, int64_t>(values.data(), structure);
+
+  auto dataset_view = raft::make_device_matrix_view<const float, int64_t, raft::row_major>(
+    idx.dataset().data_handle(), n_rows, dim);
+  float alpha_v = 1.0f, beta_v = 0.0f;
+  auto alpha = raft::make_host_scalar_view<float>(&alpha_v);
+  auto beta  = raft::make_host_scalar_view<float>(&beta_v);
+  raft::sparse::linalg::sddmm(res,
+                              queries,
+                              dataset_view,
+                              csr_view,
+                              raft::linalg::Operation::NON_TRANSPOSE,
+                              raft::linalg::Operation::TRANSPOSE,
+                              alpha,
+                              beta);
+
+  if (metric == cuvs::distance::DistanceType::L2Expanded ||
+      metric == cuvs::distance::DistanceType::L2SqrtExpanded ||
+      metric == cuvs::distance::DistanceType::CosineExpanded) {
+    rmm::device_uvector<int64_t> rows(nnz, stream);
+    raft::sparse::convert::csr_to_coo(
+      const_cast<int64_t*>(indptr), n_queries, rows.data(), nnz, stream);
+    auto q_norms = make_query_norms(res, queries, metric);
+    cuvs::neighbors::detail::epilogue_on_csr(res,
+                                             values.data(),
+                                             nnz,
+                                             rows.data(),
+                                             const_cast<int64_t*>(indices),
+                                             q_norms.data(),
+                                             idx.norms().data_handle(),
+                                             metric);
+  }
+
+  auto const_csr = raft::make_device_csr_matrix_view<const float, int64_t, int64_t, int64_t>(
+    values.data(), structure);
+  std::optional<raft::device_vector_view<const int64_t, int64_t>> no_opt = std::nullopt;
+  bool select_min = cuvs::distance::is_min_close(metric);
+  raft::sparse::matrix::select_k(res, const_csr, no_opt, distances, neighbors, select_min, true);
+}
+
+}  // namespace roaring_detail
+
+/**
+ * Shared-filter (one Roaring bitmap for all queries) brute-force search.
+ */
+inline void brute_force_search_roaring(
+  raft::resources const& res,
+  const cuvs::neighbors::brute_force::index<float, float>& idx,
+  raft::device_matrix_view<const float, int64_t, raft::row_major> queries,
+  const cuvs::neighbors::filtering::roaring_filter& filter,
+  raft::device_matrix_view<int64_t, int64_t, raft::row_major> neighbors,
+  raft::device_matrix_view<float, int64_t, raft::row_major> distances)
+{
+  using namespace roaring_detail;
+  auto stream       = raft::resource::get_cuda_stream(res);
+  auto metric       = idx.metric();
+  int64_t n_queries = queries.extent(0);
+  int64_t n_rows    = idx.dataset().extent(0);
+  int64_t dim       = idx.dataset().extent(1);
+
+  const cuvs::core::gpu_roaring& bitmap = *filter.bitmap_;
+  RAFT_EXPECTS(!bitmap.negated, "roaring_filter: negated bitmaps are not supported");
+  RAFT_EXPECTS(static_cast<int64_t>(bitmap.universe_size) == n_rows,
+               "roaring_filter universe size must match the index size");
+  int64_t n_selected = static_cast<int64_t>(bitmap.total_cardinality);
+  double s           = static_cast<double>(n_selected) / static_cast<double>(n_rows);
+
+  bool needs_norms = metric == cuvs::distance::DistanceType::L2Expanded ||
+                     metric == cuvs::distance::DistanceType::L2SqrtExpanded ||
+                     metric == cuvs::distance::DistanceType::CosineExpanded;
+  RAFT_EXPECTS(idx.has_norms() || !needs_norms,
+               "roaring_filter: index must carry norms for L2/Cosine metrics");
+
+  if (n_selected == 0) {
+    thrust::fill(raft::resource::get_thrust_policy(res),
+                 neighbors.data_handle(),
+                 neighbors.data_handle() + n_queries * neighbors.extent(1),
+                 int64_t{-1});
+    return;
+  }
+
+  // ---- dense regime: decompress and delegate to the bitset pipeline ----
+  if (s >= kDenseThreshold) {
+    auto bitset = cuvs::core::to_bitset(res, bitmap);
+    auto bf     = cuvs::neighbors::filtering::bitset_filter<uint32_t, int64_t>(bitset.view());
+    brute_force_search_filtered<float, int64_t, uint32_t, float>(
+      res, idx, queries, &bf, neighbors, distances, std::nullopt);
+    return;
+  }
+
+  // member ids, emitted once per search from the containers
+  const cuvs::core::gpu_roaring* bptr = &bitmap;
+  rmm::device_uvector<int64_t> ids(n_selected, stream);
+  cuvs::core::to_csr_indices(res, &bptr, 1, ids.data());
+
+  // ---- very sparse: per-query CSR (replicated indices) + SDDMM ----
+  if (s <= sparse_threshold(dim)) {
+    int64_t nnz = n_queries * n_selected;
+    std::vector<int64_t> h_indptr(n_queries + 1);
+    for (int64_t q = 0; q <= n_queries; ++q)
+      h_indptr[q] = q * n_selected;
+    rmm::device_uvector<int64_t> indptr(n_queries + 1, stream);
+    raft::update_device(indptr.data(), h_indptr.data(), h_indptr.size(), stream);
+    rmm::device_uvector<int64_t> indices(nnz, stream);
+    int64_t blocks = (nnz + 255) / 256;
+    replicate_indices_kernel<<<static_cast<unsigned>(blocks), 256, 0, stream>>>(
+      ids.data(), n_selected, nnz, indices.data());
+    sparse_distances_select_k(
+      res, idx, queries, indptr.data(), indices.data(), nnz, neighbors, distances);
+    return;
+  }
+
+  // ---- mid regime: gather selected rows, dense GEMM, select_k, remap ----
+  constexpr int64_t kChunk = int64_t{1} << 20;
+  rmm::device_uvector<float> gathered(std::min(kChunk, n_selected) * dim, stream);
+  rmm::device_uvector<float> dist(n_queries * n_selected, stream);
+
+  for (int64_t c0 = 0; c0 < n_selected; c0 += kChunk) {
+    int64_t uc = std::min(kChunk, n_selected - c0);
+    gather_rows_kernel<<<static_cast<unsigned>(uc), 128, 0, stream>>>(
+      idx.dataset().data_handle(), ids.data() + c0, uc, dim, gathered.data());
+    // row-major dist[Q, U] tile = queries[Q, dim] x gathered[uc, dim]^T
+    float alpha = 1.0f, beta = 0.0f;
+    raft::linalg::gemm(res,
+                       true,
+                       false,
+                       static_cast<int>(uc),
+                       static_cast<int>(n_queries),
+                       static_cast<int>(dim),
+                       &alpha,
+                       gathered.data(),
+                       static_cast<int>(dim),
+                       queries.data_handle(),
+                       static_cast<int>(dim),
+                       &beta,
+                       dist.data() + c0,
+                       static_cast<int>(n_selected),
+                       stream);
+  }
+
+  if (needs_norms) {
+    rmm::device_uvector<float> r_norms(n_selected, stream);
+    int64_t blocks = (n_selected + 255) / 256;
+    gather_norms_kernel<<<static_cast<unsigned>(blocks), 256, 0, stream>>>(
+      idx.norms().data_handle(), ids.data(), n_selected, r_norms.data());
+    auto q_norms  = make_query_norms(res, queries, metric);
+    int64_t total = n_queries * n_selected;
+    dense_epilogue_kernel<<<static_cast<unsigned>((total + 255) / 256), 256, 0, stream>>>(
+      dist.data(), q_norms.data(), r_norms.data(), n_queries, n_selected, metric);
+  }
+
+  int64_t k = neighbors.extent(1);
+  rmm::device_uvector<int64_t> positions(n_queries * k, stream);
+  bool select_min = cuvs::distance::is_min_close(metric);
+  raft::matrix::select_k<float, int64_t>(
+    res,
+    raft::make_device_matrix_view<const float, int64_t, raft::row_major>(
+      dist.data(), n_queries, n_selected),
+    std::nullopt,
+    distances,
+    raft::make_device_matrix_view<int64_t, int64_t, raft::row_major>(
+      positions.data(), n_queries, k),
+    select_min,
+    true);
+  int64_t total = n_queries * k;
+  remap_ids_kernel<<<static_cast<unsigned>((total + 255) / 256), 256, 0, stream>>>(
+    positions.data(), ids.data(), total, neighbors.data_handle());
+}
+
+/**
+ * Per-query-filter (one Roaring bitmap per query) brute-force search.
+ */
+inline void brute_force_search_roaring_matrix(
+  raft::resources const& res,
+  const cuvs::neighbors::brute_force::index<float, float>& idx,
+  raft::device_matrix_view<const float, int64_t, raft::row_major> queries,
+  const cuvs::neighbors::filtering::roaring_matrix_filter& filter,
+  raft::device_matrix_view<int64_t, int64_t, raft::row_major> neighbors,
+  raft::device_matrix_view<float, int64_t, raft::row_major> distances)
+{
+  using namespace roaring_detail;
+  auto stream       = raft::resource::get_cuda_stream(res);
+  auto metric       = idx.metric();
+  int64_t n_queries = queries.extent(0);
+  int64_t n_rows    = idx.dataset().extent(0);
+
+  RAFT_EXPECTS(static_cast<int64_t>(filter.n_queries_) == n_queries,
+               "roaring_matrix_filter must provide one bitmap per query");
+  bool needs_norms = metric == cuvs::distance::DistanceType::L2Expanded ||
+                     metric == cuvs::distance::DistanceType::L2SqrtExpanded ||
+                     metric == cuvs::distance::DistanceType::CosineExpanded;
+  RAFT_EXPECTS(idx.has_norms() || !needs_norms,
+               "roaring_matrix_filter: index must carry norms for L2/Cosine metrics");
+
+  // indptr comes straight off the host-side cardinalities
+  std::vector<int64_t> h_indptr(n_queries + 1, 0);
+  for (int64_t q = 0; q < n_queries; ++q) {
+    const auto* b = filter.bitmaps_[q];
+    RAFT_EXPECTS(!b->negated, "roaring_matrix_filter: negated bitmaps are not supported");
+    RAFT_EXPECTS(static_cast<int64_t>(b->universe_size) == n_rows,
+                 "roaring_matrix_filter universe size must match the index size");
+    h_indptr[q + 1] = h_indptr[q] + static_cast<int64_t>(b->total_cardinality);
+  }
+  int64_t nnz = h_indptr[n_queries];
+  double s    = static_cast<double>(nnz) / (static_cast<double>(n_queries) * n_rows);
+
+  if (nnz == 0) {
+    thrust::fill(raft::resource::get_thrust_policy(res),
+                 neighbors.data_handle(),
+                 neighbors.data_handle() + n_queries * neighbors.extent(1),
+                 int64_t{-1});
+    return;
+  }
+
+  // ---- dense regime: decompress to a [n_queries, n_rows] bit matrix ----
+  if (s >= kDenseThreshold) {
+    int64_t n_words = (n_queries * n_rows + 31) / 32;
+    rmm::device_uvector<uint32_t> bits(n_words, stream);
+    RAFT_CUDA_TRY(cudaMemsetAsync(bits.data(), 0, n_words * sizeof(uint32_t), stream));
+    cuvs::core::decompress_to_bitmap(
+      res, filter.bitmaps_, static_cast<uint32_t>(n_queries), n_rows, bits.data());
+    auto view = cuvs::core::bitmap_view<uint32_t, int64_t>(bits.data(), n_queries, n_rows);
+    auto bf   = cuvs::neighbors::filtering::bitmap_filter<uint32_t, int64_t>(view);
+    brute_force_search_filtered<float, int64_t, uint32_t, float>(
+      res, idx, queries, &bf, neighbors, distances, std::nullopt);
+    return;
+  }
+
+  // ---- sparse: batched CSR emission (one launch for all filters) ----
+  rmm::device_uvector<int64_t> indptr(n_queries + 1, stream);
+  raft::update_device(indptr.data(), h_indptr.data(), h_indptr.size(), stream);
+  rmm::device_uvector<int64_t> indices(nnz, stream);
+  cuvs::core::to_csr_indices(
+    res, filter.bitmaps_, static_cast<uint32_t>(n_queries), indices.data());
+  sparse_distances_select_k(
+    res, idx, queries, indptr.data(), indices.data(), nnz, neighbors, distances);
+}
+
+}  // namespace cuvs::neighbors::detail
diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt
index 9b96f94bf0..31729afd56 100644
--- a/cpp/tests/CMakeLists.txt
+++ b/cpp/tests/CMakeLists.txt
@@ -104,7 +104,8 @@ endfunction()
 
 ConfigureTest(
   NAME NEIGHBORS_TEST
-  PATH neighbors/brute_force.cu neighbors/brute_force_prefiltered.cu neighbors/sparse_brute_force.cu
+  PATH neighbors/brute_force.cu neighbors/brute_force_prefiltered.cu neighbors/brute_force_roaring.cu
+       neighbors/sparse_brute_force.cu
        neighbors/refine.cu neighbors/distance_nn.cu
   GPUS 1
   PERCENT 100
diff --git a/cpp/tests/neighbors/brute_force_roaring.cu b/cpp/tests/neighbors/brute_force_roaring.cu
new file mode 100644
index 0000000000..5d59cce73c
--- /dev/null
+++ b/cpp/tests/neighbors/brute_force_roaring.cu
@@ -0,0 +1,232 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include <cuvs/core/roaring.hpp>
+#include <cuvs/neighbors/brute_force.hpp>
+#include <cuvs/neighbors/common.hpp>
+#include <cuvs/neighbors/roaring_filter.hpp>
+
+#include <raft/core/bitset.cuh>
+#include <raft/core/device_mdarray.hpp>
+#include <raft/core/resource/cuda_stream.hpp>
+#include <raft/core/resources.hpp>
+#include <raft/random/rng.cuh>
+
+#include <rmm/device_uvector.hpp>
+
+#include <gtest/gtest.h>
+
+#include <algorithm>
+#include <cstdint>
+#include <random>
+#include <vector>
+
+namespace cuvs::neighbors {
+
+namespace {
+
+struct RoaringFilterInputs {
+  int64_t n_rows;
+  int64_t dim;
+  int64_t n_queries;
+  int64_t k;
+  double selectivity;  // chosen to exercise sparse / mid / dense dispatch
+  cuvs::distance::DistanceType metric;
+};
+
+std::ostream& operator<<(std::ostream& os, const RoaringFilterInputs& p)
+{
+  return os << "n_rows=" << p.n_rows << " dim=" << p.dim << " q=" << p.n_queries << " k=" << p.k
+            << " s=" << p.selectivity << " metric=" << (int)p.metric;
+}
+
+std::vector<uint32_t> make_filter_ids(std::mt19937& rng, int64_t n_rows, double s)
+{
+  std::vector<uint32_t> ids;
+  std::uniform_real_distribution<double> u(0.0, 1.0);
+  for (int64_t i = 0; i < n_rows; ++i)
+    if (u(rng) < s) ids.push_back(static_cast<uint32_t>(i));
+  if (ids.size() < 32) {  // keep k satisfiable
+    for (uint32_t i = 0; ids.size() < 32; i += 7)
+      ids.push_back(i % static_cast<uint32_t>(n_rows));
+    std::sort(ids.begin(), ids.end());
+    ids.erase(std::unique(ids.begin(), ids.end()), ids.end());
+  }
+  return ids;
+}
+
+class RoaringFilterTest : public ::testing::TestWithParam<RoaringFilterInputs> {
+ protected:
+  void SetUp() override
+  {
+    auto p      = GetParam();
+    auto stream = raft::resource::get_cuda_stream(res_);
+
+    dataset_.emplace(raft::make_device_matrix<float, int64_t>(res_, p.n_rows, p.dim));
+    queries_.emplace(raft::make_device_matrix<float, int64_t>(res_, p.n_queries, p.dim));
+    raft::random::RngState r1(42), r2(43);
+    raft::random::uniform(
+      res_,
+      r1,
+      raft::make_device_vector_view<float, int64_t>(dataset_->data_handle(), p.n_rows * p.dim),
+      -1.0f,
+      1.0f);
+    raft::random::uniform(
+      res_,
+      r2,
+      raft::make_device_vector_view<float, int64_t>(queries_->data_handle(), p.n_queries * p.dim),
+      -1.0f,
+      1.0f);
+
+    brute_force::index_params ip;
+    ip.metric = p.metric;
+    index_.emplace(brute_force::build(res_, ip, raft::make_const_mdspan(dataset_->view())));
+    raft::resource::sync_stream(res_);
+  }
+
+  // run a search with the given filter into fresh buffers
+  std::pair<std::vector<int64_t>, std::vector<float>> run(const filtering::base_filter& filter)
+  {
+    auto p  = GetParam();
+    auto nb = raft::make_device_matrix<int64_t, int64_t>(res_, p.n_queries, p.k);
+    auto di = raft::make_device_matrix<float, int64_t>(res_, p.n_queries, p.k);
+    brute_force::search(res_,
+                        brute_force::search_params{},
+                        *index_,
+                        raft::make_const_mdspan(queries_->view()),
+                        nb.view(),
+                        di.view(),
+                        filter);
+    std::vector<int64_t> h_nb(p.n_queries * p.k);
+    std::vector<float> h_di(p.n_queries * p.k);
+    raft::update_host(
+      h_nb.data(), nb.data_handle(), h_nb.size(), raft::resource::get_cuda_stream(res_));
+    raft::update_host(
+      h_di.data(), di.data_handle(), h_di.size(), raft::resource::get_cuda_stream(res_));
+    raft::resource::sync_stream(res_);
+    return {std::move(h_nb), std::move(h_di)};
+  }
+
+  // tie-tolerant: sorted per-query distance lists must match elementwise
+  static void expect_same_distances(const std::vector<float>& a,
+                                    const std::vector<float>& b,
+                                    int64_t n_queries,
+                                    int64_t k)
+  {
+    for (int64_t q = 0; q < n_queries; ++q) {
+      std::vector<float> da(a.begin() + q * k, a.begin() + (q + 1) * k);
+      std::vector<float> db(b.begin() + q * k, b.begin() + (q + 1) * k);
+      std::sort(da.begin(), da.end());
+      std::sort(db.begin(), db.end());
+      for (int64_t j = 0; j < k; ++j) {
+        ASSERT_NEAR(da[j], db[j], 1e-3 + 1e-3 * std::abs(da[j])) << "query " << q << " rank " << j;
+      }
+    }
+  }
+
+  raft::resources res_;
+  std::optional<raft::device_matrix<float, int64_t>> dataset_;
+  std::optional<raft::device_matrix<float, int64_t>> queries_;
+  std::optional<brute_force::index<float, float>> index_;
+};
+
+TEST_P(RoaringFilterTest, MatchesBitsetFilter)
+{
+  auto p      = GetParam();
+  auto stream = raft::resource::get_cuda_stream(res_);
+  std::mt19937 rng(123);
+  auto ids = make_filter_ids(rng, p.n_rows, p.selectivity);
+
+  // roaring filter
+  auto roaring =
+    cuvs::core::from_sorted_ids(res_, ids.data(), (uint32_t)ids.size(), (uint32_t)p.n_rows);
+  auto rf = filtering::roaring_filter(roaring);
+
+  // reference: equivalent bitset filter
+  raft::core::bitset<uint32_t, int64_t> bits(res_, p.n_rows, false);
+  std::vector<uint32_t> h_words((p.n_rows + 31) / 32, 0);
+  for (auto id : ids)
+    h_words[id / 32] |= 1u << (id % 32);
+  raft::update_device(bits.data(), h_words.data(), h_words.size(), stream);
+  auto bf = filtering::bitset_filter<uint32_t, int64_t>(bits.view());
+
+  auto [nb_r, di_r] = run(rf);
+  auto [nb_b, di_b] = run(bf);
+
+  expect_same_distances(di_r, di_b, p.n_queries, p.k);
+  // membership: every returned id must be in the filter
+  for (auto id : nb_r) {
+    ASSERT_TRUE(id >= 0 && std::binary_search(ids.begin(), ids.end(), (uint32_t)id))
+      << "non-member id " << id;
+  }
+}
+
+TEST_P(RoaringFilterTest, MatrixMatchesBitmapFilter)
+{
+  auto p      = GetParam();
+  auto stream = raft::resource::get_cuda_stream(res_);
+  std::mt19937 rng(321);
+
+  // one filter per query (share a few to exercise repeated bitmaps)
+  std::vector<std::vector<uint32_t>> per_query(p.n_queries);
+  std::vector<cuvs::core::gpu_roaring> bitmaps;
+  bitmaps.reserve(p.n_queries);
+  std::vector<const cuvs::core::gpu_roaring*> ptrs(p.n_queries);
+  for (int64_t q = 0; q < p.n_queries; ++q) {
+    per_query[q] = make_filter_ids(rng, p.n_rows, p.selectivity);
+    bitmaps.emplace_back(cuvs::core::from_sorted_ids(
+      res_, per_query[q].data(), (uint32_t)per_query[q].size(), (uint32_t)p.n_rows));
+    ptrs[q] = &bitmaps[q];
+  }
+  auto rmf = filtering::roaring_matrix_filter(ptrs.data(), (uint32_t)p.n_queries);
+
+  // reference: dense [n_queries, n_rows] bitmap filter
+  int64_t total_bits = p.n_queries * p.n_rows;
+  rmm::device_uvector<uint32_t> words((total_bits + 31) / 32, stream);
+  std::vector<uint32_t> h_words((total_bits + 31) / 32, 0);
+  for (int64_t q = 0; q < p.n_queries; ++q)
+    for (auto id : per_query[q]) {
+      int64_t bit = q * p.n_rows + id;
+      h_words[bit / 32] |= 1u << (bit % 32);
+    }
+  raft::update_device(words.data(), h_words.data(), h_words.size(), stream);
+  auto bview = cuvs::core::bitmap_view<uint32_t, int64_t>(words.data(), p.n_queries, p.n_rows);
+  auto bmf   = filtering::bitmap_filter<uint32_t, int64_t>(bview);
+
+  auto [nb_r, di_r] = run(rmf);
+  auto [nb_b, di_b] = run(bmf);
+
+  expect_same_distances(di_r, di_b, p.n_queries, p.k);
+  for (int64_t q = 0; q < p.n_queries; ++q)
+    for (int64_t j = 0; j < p.k; ++j) {
+      int64_t id = nb_r[q * p.k + j];
+      ASSERT_TRUE(id >= 0 &&
+                  std::binary_search(per_query[q].begin(), per_query[q].end(), (uint32_t)id))
+        << "query " << q << " non-member id " << id;
+    }
+}
+
+// selectivities chosen to hit the sparse / gather-GEMM / dense regimes at
+// both a low and a high dimension (the sparse/mid threshold is
+// dimension-dependent)
+const std::vector<RoaringFilterInputs> inputs = {
+  // d=64: sparse (<=3%), mid, dense
+  {100000, 64, 17, 10, 0.002, cuvs::distance::DistanceType::InnerProduct},
+  {100000, 64, 17, 10, 0.10, cuvs::distance::DistanceType::InnerProduct},
+  {100000, 64, 17, 10, 0.60, cuvs::distance::DistanceType::InnerProduct},
+  {100000, 64, 17, 10, 0.10, cuvs::distance::DistanceType::L2Expanded},
+  {100000, 64, 17, 10, 0.002, cuvs::distance::DistanceType::L2Expanded},
+  {100000, 64, 17, 10, 0.60, cuvs::distance::DistanceType::L2Expanded},
+  {100000, 64, 17, 10, 0.10, cuvs::distance::DistanceType::CosineExpanded},
+  // d=512: sparse (<=0.1%), mid, dense
+  {60000, 512, 9, 10, 0.0008, cuvs::distance::DistanceType::InnerProduct},
+  {60000, 512, 9, 10, 0.05, cuvs::distance::DistanceType::L2Expanded},
+  {60000, 512, 9, 10, 0.55, cuvs::distance::DistanceType::InnerProduct},
+};
+
+INSTANTIATE_TEST_SUITE_P(RoaringFilter, RoaringFilterTest, ::testing::ValuesIn(inputs));
+
+}  // namespace
+}  // namespace cuvs::neighbors