diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 227c2906cc..f467137d75 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -1142,6 +1142,7 @@ if(NOT BUILD_CPU_ONLY) src/cluster/single_linkage_float.cu src/cluster/spectral.cu src/core/bitset.cu + src/core/roaring/roaring.cu src/core/omp_wrapper.cpp src/util/file_io.cpp src/util/host_memory.cpp diff --git a/cpp/include/cuvs/core/roaring.hpp b/cpp/include/cuvs/core/roaring.hpp new file mode 100644 index 0000000000..0f19d8ebe5 --- /dev/null +++ b/cpp/include/cuvs/core/roaring.hpp @@ -0,0 +1,373 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ + +#pragma once + +#include + +#include +#include +#include + +#include + +#include +#include +#include + +namespace cuvs::core { + +/** + * @defgroup roaring GPU Roaring Bitmap + * + * GPU-native Roaring Bitmap for compressed prefiltering in vector search. + * Stores attribute bitmaps in compressed Roaring format in GPU memory, + * supports bulk set operations (AND/OR/ANDNOT/XOR), and decompresses + * to flat bitsets compatible with cuvs::core::bitset for search. + * + * Typical usage: + * @code{.cpp} + * // Build filter bitmaps on CPU, upload to GPU + * auto category = gpu_roaring::from_sorted_ids(res, cat_ids, universe); + * auto price = gpu_roaring::from_sorted_ids(res, price_ids, universe); + * + * // Combine on GPU + * auto combined = gpu_roaring::set_and(res, category, price); + * + * // Decompress to bitset for search + * auto bitset = gpu_roaring::to_bitset(res, combined); + * auto filter = cuvs::neighbors::filtering::bitset_filter(bitset.view()); + * cuvs::neighbors::cagra::search(res, params, index, queries, neighbors, distances, filter); + * @endcode + * @{ + */ + +/** Container type tags for Roaring containers */ +enum class roaring_container_type : uint8_t { ARRAY = 0, BITMAP = 1, RUN = 2 }; + +/** + * @brief Non-owning device view of a @ref gpu_roaring, usable inside + * kernels for per-sample membership tests (mirrors + * raft::core::bitset_view::test()). + */ +struct roaring_view { + const uint16_t* keys = nullptr; + const roaring_container_type* types = nullptr; + const uint32_t* offsets = nullptr; + const uint16_t* cardinalities = nullptr; + const uint64_t* bitmap_data = nullptr; + const uint16_t* array_data = nullptr; + const uint16_t* run_data = nullptr; + const uint16_t* key_index = nullptr; // optional O(1) key lookup + uint32_t n_containers = 0; + uint32_t max_key = 0; + uint32_t n_rows = 0; + +#if defined(__CUDACC__) + /** Whether sample index `id` is a member of the filter. */ + __device__ __forceinline__ bool test(uint32_t id) const + { + uint16_t key = static_cast(id >> 16); + uint16_t low = static_cast(id & 0xFFFFu); + + int cid = -1; + if (key_index != nullptr) { + if (key > max_key) return false; + uint16_t slot = key_index[key]; + if (slot == 0xFFFFu) return false; + cid = slot; + } else { + uint32_t lo = 0, hi = n_containers; + while (lo < hi) { // lower_bound on sorted keys + uint32_t mid = (lo + hi) / 2; + if (keys[mid] < key) { + lo = mid + 1; + } else { + hi = mid; + } + } + if (lo == n_containers || keys[lo] != key) return false; + cid = static_cast(lo); + } + + uint32_t offset = offsets[cid]; + switch (types[cid]) { + case roaring_container_type::BITMAP: { + uint64_t word = bitmap_data[offset / sizeof(uint64_t) + (low >> 6)]; + return (word >> (low & 63u)) & 1u; + } + case roaring_container_type::ARRAY: { + const uint16_t* arr = array_data + offset / sizeof(uint16_t); + uint32_t lo = 0, hi = cardinalities[cid]; + while (lo < hi) { + uint32_t mid = (lo + hi) / 2; + if (arr[mid] < low) { + lo = mid + 1; + } else { + hi = mid; + } + } + return lo < cardinalities[cid] && arr[lo] == low; + } + default: { // RUN + const uint16_t* runs = run_data + offset / sizeof(uint16_t); + uint32_t n_runs = cardinalities[cid]; + uint32_t lo = 0, hi = n_runs; + while (lo < hi) { // last run with start <= low + uint32_t mid = (lo + hi) / 2; + if (runs[mid * 2] <= low) { + lo = mid + 1; + } else { + hi = mid; + } + } + if (lo == 0) return false; + uint32_t start = runs[(lo - 1) * 2]; + uint32_t len = runs[(lo - 1) * 2 + 1]; + return low >= start && low <= start + len; + } + } + } +#endif +}; + +/** + * @brief GPU-resident Roaring bitmap in Structure-of-Arrays layout. + * + * All device memory is managed via rmm::device_uvector (RAII). + * Supports move semantics; not copyable. + */ +struct gpu_roaring { + // Top-level index (one entry per container, sorted by key) + rmm::device_uvector keys; + rmm::device_uvector types; + rmm::device_uvector offsets; + rmm::device_uvector cardinalities; + uint32_t n_containers = 0; + uint32_t universe_size = 0; + + // Per-type data pools + rmm::device_uvector bitmap_data; + uint32_t n_bitmap_containers = 0; + + rmm::device_uvector array_data; + uint32_t n_array_containers = 0; + + rmm::device_uvector run_data; + uint32_t n_run_containers = 0; + + // Direct-map key index: key_index[high16] = container index, or 0xFFFF. + // Replaces O(log n) binary search with O(1) table lookup. + rmm::device_uvector key_index; + uint32_t max_key = 0; + + // Complement optimization: when true, the stored set is the complement + // of the logical set. contains() results are flipped at query time. + // Note: never set by from_sorted_ids; filters require negated == false. + bool negated = false; + + // Total logical cardinality (number of set bits, before complement) + uint64_t total_cardinality = 0; + + // Host mirrors of the per-container metadata, captured at construction. + // These make per-filter cardinalities and CSR indptr available with no + // device->host transfer or count kernels (uint32 element counts: a full + // 65536-element container wraps to 0 in the device-side uint16 array). + std::vector h_keys; + std::vector h_types; + std::vector h_offsets; + std::vector h_element_counts; + + /** Non-owning device view for use inside kernels. */ + [[nodiscard]] roaring_view view() const + { + roaring_view v; + v.keys = keys.data(); + v.types = types.data(); + v.offsets = offsets.data(); + v.cardinalities = cardinalities.data(); + v.bitmap_data = bitmap_data.data(); + v.array_data = array_data.data(); + v.run_data = run_data.data(); + v.key_index = key_index.size() > 0 ? key_index.data() : nullptr; + v.n_containers = n_containers; + v.max_key = max_key; + v.n_rows = universe_size; + return v; + } + + /** Construct an empty GPU Roaring bitmap */ + explicit gpu_roaring(rmm::cuda_stream_view stream) + : keys(0, stream), + types(0, stream), + offsets(0, stream), + cardinalities(0, stream), + bitmap_data(0, stream), + array_data(0, stream), + run_data(0, stream), + key_index(0, stream) + { + } + + gpu_roaring(gpu_roaring&&) = default; + gpu_roaring& operator=(gpu_roaring&&) = default; + + /** Total device memory used (bytes) */ + [[nodiscard]] size_t device_memory_bytes() const + { + return keys.size() * sizeof(uint16_t) + types.size() * sizeof(roaring_container_type) + + offsets.size() * sizeof(uint32_t) + cardinalities.size() * sizeof(uint16_t) + + bitmap_data.size() * sizeof(uint64_t) + array_data.size() * sizeof(uint16_t) + + run_data.size() * sizeof(uint16_t) + key_index.size() * sizeof(uint16_t); + } + + /** Equivalent flat bitset size (bytes) */ + [[nodiscard]] size_t flat_bitset_bytes() const + { + return (static_cast(universe_size) + 31) / 32 * sizeof(uint32_t); + } + + /** Compression ratio (flat / compressed) */ + [[nodiscard]] double compression_ratio() const + { + auto dev_bytes = device_memory_bytes(); + return dev_bytes > 0 ? static_cast(flat_bitset_bytes()) / dev_bytes : 0.0; + } +}; + +/** Set operation types */ +enum class roaring_set_op : uint8_t { AND = 0, OR = 1, ANDNOT = 2, XOR = 3 }; + +/** + * @brief Create a GPU Roaring bitmap from a sorted array of IDs on host. + * + * The IDs are partitioned into 65536-element containers following the + * Roaring bitmap format. Containers with <= 4096 elements use array format; + * denser containers use bitmap format. + * + * @param[in] res RAFT resources (provides CUDA stream) + * @param[in] sorted_ids Host pointer to sorted, deduplicated uint32_t IDs + * @param[in] n_ids Number of IDs + * @param[in] universe_size Maximum representable ID + 1 + * @return GPU Roaring bitmap + */ +gpu_roaring from_sorted_ids(raft::resources const& res, + const uint32_t* sorted_ids, + uint32_t n_ids, + uint32_t universe_size); + +/** + * @brief Perform a pairwise set operation between two GPU Roaring bitmaps. + * + * @param[in] res RAFT resources + * @param[in] a First operand + * @param[in] b Second operand + * @param[in] op Set operation (AND, OR, ANDNOT, XOR) + * @return Result GPU Roaring bitmap + */ +gpu_roaring set_op(raft::resources const& res, + const gpu_roaring& a, + const gpu_roaring& b, + roaring_set_op op); + +/** Convenience: result = a AND b */ +inline gpu_roaring set_and(raft::resources const& res, const gpu_roaring& a, const gpu_roaring& b) +{ + return set_op(res, a, b, roaring_set_op::AND); +} + +/** Convenience: result = a OR b */ +inline gpu_roaring set_or(raft::resources const& res, const gpu_roaring& a, const gpu_roaring& b) +{ + return set_op(res, a, b, roaring_set_op::OR); +} + +/** + * @brief Multi-bitmap AND: result = a[0] AND a[1] AND ... AND a[n-1]. + */ +gpu_roaring multi_and(raft::resources const& res, const gpu_roaring* bitmaps, uint32_t count); + +/** + * @brief Multi-bitmap OR: result = a[0] OR a[1] OR ... OR a[n-1]. + */ +gpu_roaring multi_or(raft::resources const& res, const gpu_roaring* bitmaps, uint32_t count); + +/** + * @brief Decompress a GPU Roaring bitmap to a flat bitset. + * + * The output is compatible with cuvs::core::bitset + * and can be wrapped in a bitset_filter for use with cuVS search. + * + * @param[in] res RAFT resources + * @param[in] bitmap Source GPU Roaring bitmap + * @return Flat bitset (device memory, RAII) + */ +cuvs::core::bitset to_bitset(raft::resources const& res, + const gpu_roaring& bitmap); + +/** + * @brief Decompress into a pre-allocated flat bitset buffer. + * + * @param[in] res RAFT resources + * @param[in] bitmap Source GPU Roaring bitmap + * @param[out] output Device pointer to uint32_t array, pre-zeroed + * @param[in] output_size_words Number of uint32_t words in output + */ +void decompress_to_bitset(raft::resources const& res, + const gpu_roaring& bitmap, + uint32_t* output, + uint32_t output_size_words); + +/** + * @brief Create a GPU Roaring bitmap from sorted, deduplicated device IDs. + * + * Equivalent to the host-pointer overload; the IDs are copied to the host + * for container construction (the per-container metadata must live on the + * host anyway — it is what makes count-free filtered search possible). + */ +gpu_roaring from_sorted_ids(raft::resources const& res, + raft::device_vector_view sorted_ids, + uint32_t universe_size); + +/** + * @brief Emit the sorted member IDs of one or more Roaring bitmaps into the + * column-index array of a CSR structure, one bitmap per CSR row segment. + * + * Row `i`'s segment is `[indptr[i], indptr[i+1])` where the indptr is the + * exclusive prefix sum of `bitmaps[i]->total_cardinality` — available on + * the host with no device synchronization. One kernel launch for all + * bitmaps (one CTA per Roaring container). + * + * @param[in] res RAFT resources + * @param[in] bitmaps Host array of `n_bitmaps` bitmap pointers + * @param[in] n_bitmaps Number of bitmaps (CSR rows) + * @param[out] indices Device array of size `sum(cardinalities)` + */ +void to_csr_indices(raft::resources const& res, + const gpu_roaring* const* bitmaps, + uint32_t n_bitmaps, + int64_t* indices); + +/** + * @brief Decompress `n_bitmaps` Roaring bitmaps into one dense bit matrix + * of logical shape `[n_bitmaps, n_rows]` (row-contiguous bits), compatible + * with cuvs::core::bitmap_view. + * + * @param[in] res RAFT resources + * @param[in] bitmaps Host array of `n_bitmaps` bitmap pointers + * @param[in] n_bitmaps Number of bitmap rows + * @param[in] n_rows Number of columns (bits per row) + * @param[out] output Device array of `ceil(n_bitmaps * n_rows / 32)` words, + * pre-zeroed by the caller + */ +void decompress_to_bitmap(raft::resources const& res, + const gpu_roaring* const* bitmaps, + uint32_t n_bitmaps, + int64_t n_rows, + uint32_t* output); + +/** @} */ // end group roaring + +} // namespace cuvs::core diff --git a/cpp/include/cuvs/neighbors/common.hpp b/cpp/include/cuvs/neighbors/common.hpp index 2fd804f115..d2ed2a373d 100644 --- a/cpp/include/cuvs/neighbors/common.hpp +++ b/cpp/include/cuvs/neighbors/common.hpp @@ -497,7 +497,7 @@ namespace filtering { * @{ */ -enum class FilterType { None, Bitmap, Bitset, UDF }; +enum class FilterType { None, Bitmap, Bitset, UDF, Roaring, RoaringMatrix }; struct base_filter { ~base_filter() = default; diff --git a/cpp/include/cuvs/neighbors/roaring_filter.hpp b/cpp/include/cuvs/neighbors/roaring_filter.hpp new file mode 100644 index 0000000000..67cc19486f --- /dev/null +++ b/cpp/include/cuvs/neighbors/roaring_filter.hpp @@ -0,0 +1,116 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ + +#pragma once + +#include +#include + +#include +#include + +namespace cuvs::neighbors::filtering { + +/** + * @defgroup roaring_filters Roaring bitmap sample filters + * @{ + */ + +/** + * @brief Filter an index with a compressed Roaring bitmap shared by all + * queries. + * + * The semantic counterpart of @ref bitset_filter with a compressed filter + * representation: memory scales with filter cardinality and structure + * instead of dataset size, and search dispatch can exploit the + * host-known cardinality (no count kernels) and the container structure + * (direct CSR emission, row gathering) instead of scanning a flat + * bitset. + * + * The filter is non-owning: the referenced `cuvs::core::gpu_roaring` + * must outlive any search call using this filter. + * + * @code{.cpp} + * auto roaring = cuvs::core::from_sorted_ids(res, ids, n_ids, n_rows); + * auto filter = cuvs::neighbors::filtering::roaring_filter(roaring); + * cuvs::neighbors::brute_force::search( + * res, params, index, queries, neighbors, distances, filter); + * @endcode + */ +struct roaring_filter : public base_filter { + /** The bitmap to filter on (non-owning). */ + const cuvs::core::gpu_roaring* bitmap_; + /** Device view used for per-sample membership tests. */ + cuvs::core::roaring_view view_; + + explicit roaring_filter(const cuvs::core::gpu_roaring& bitmap) + : bitmap_(&bitmap), view_(bitmap.view()) + { + } + + /** \cond */ +#if defined(__CUDACC__) + __device__ __forceinline__ bool operator()( + // query index (ignored: one filter shared by all queries) + const uint32_t query_ix, + // the index of the current sample + const uint32_t sample_ix) const + { + (void)query_ix; + return view_.test(sample_ix); + } +#endif + /** \endcond */ + + FilterType get_filter_type() const override { return FilterType::Roaring; } +}; + +/** + * @brief Filter an index with one compressed Roaring bitmap per query. + * + * The semantic counterpart of @ref bitmap_filter (logical shape + * `[n_queries, index->size()]`) with per-query compressed filters: the + * dense `n_queries x n_rows` bit matrix is never materialized. Search + * emits the sparse distance structure directly from the per-query + * containers (per-query cardinalities are known on the host at filter + * construction, so the CSR indptr is free and no count kernels or + * device synchronization are needed). + * + * The filter is non-owning: the array of `gpu_roaring const*` and every + * pointee must outlive any search call using this filter. + * + * @code{.cpp} + * std::vector per_query = {...}; + * auto filter = cuvs::neighbors::filtering::roaring_matrix_filter( + * per_query.data(), n_queries); + * cuvs::neighbors::brute_force::search( + * res, params, index, queries, neighbors, distances, filter); + * @endcode + */ +struct roaring_matrix_filter : public base_filter { + /** Per-query bitmaps, `bitmaps_[i]` filters query `i` (non-owning). */ + const cuvs::core::gpu_roaring* const* bitmaps_; + uint32_t n_queries_; + + roaring_matrix_filter(const cuvs::core::gpu_roaring* const* bitmaps, uint32_t n_queries) + : bitmaps_(bitmaps), n_queries_(n_queries) + { + } + + /** Sum of per-query cardinalities (the sparse search nnz). */ + [[nodiscard]] uint64_t nnz() const + { + uint64_t total = 0; + for (uint32_t i = 0; i < n_queries_; ++i) + total += bitmaps_[i]->total_cardinality; + return total; + } + + FilterType get_filter_type() const override { return FilterType::RoaringMatrix; } +}; + +/** @} */ + +} // namespace cuvs::neighbors::filtering diff --git a/cpp/src/core/roaring/roaring.cu b/cpp/src/core/roaring/roaring.cu new file mode 100644 index 0000000000..6112a14fa0 --- /dev/null +++ b/cpp/src/core/roaring/roaring.cu @@ -0,0 +1,1090 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ + +#include + +#include +#include +#include + +#include + +#include +#include +#include + +namespace cuvs::core { + +// ============================================================================ +// from_sorted_ids: build GPU Roaring from a sorted host ID array +// ============================================================================ + +gpu_roaring from_sorted_ids(raft::resources const& res, + const uint32_t* sorted_ids, + uint32_t n_ids, + uint32_t universe_size) +{ + auto stream = raft::resource::get_cuda_stream(res); + gpu_roaring result(stream); + result.universe_size = universe_size; + + if (n_ids == 0) return result; + + // Partition IDs into containers by high 16 bits + struct Container { + uint16_t key; + std::vector values; // low 16 bits + }; + std::vector containers; + + uint16_t cur_key = static_cast(sorted_ids[0] >> 16); + containers.push_back({cur_key, {}}); + + for (uint32_t i = 0; i < n_ids; ++i) { + uint16_t key = static_cast(sorted_ids[i] >> 16); + uint16_t val = static_cast(sorted_ids[i] & 0xFFFF); + if (key != cur_key) { + cur_key = key; + containers.push_back({key, {}}); + } + containers.back().values.push_back(val); + } + + uint32_t n = static_cast(containers.size()); + result.n_containers = n; + + // Build SoA on host + std::vector h_keys(n); + std::vector h_types(n); + std::vector h_offsets(n); + std::vector h_cards(n); + + std::vector h_bitmap_pool; + std::vector h_array_pool; + uint32_t n_bitmap = 0, n_array = 0; + + for (uint32_t i = 0; i < n; ++i) { + h_keys[i] = containers[i].key; + uint32_t card = static_cast(containers[i].values.size()); + h_cards[i] = static_cast(card > 65535 ? 0 : card); + + if (card > 4096) { + // Bitmap container + h_types[i] = roaring_container_type::BITMAP; + h_offsets[i] = static_cast(h_bitmap_pool.size() * sizeof(uint64_t)); + h_bitmap_pool.resize(h_bitmap_pool.size() + 1024, 0); + uint64_t* words = h_bitmap_pool.data() + h_bitmap_pool.size() - 1024; + for (uint16_t v : containers[i].values) { + words[v / 64] |= 1ULL << (v % 64); + } + ++n_bitmap; + } else { + // Array container + h_types[i] = roaring_container_type::ARRAY; + h_offsets[i] = static_cast(h_array_pool.size() * sizeof(uint16_t)); + h_array_pool.insert( + h_array_pool.end(), containers[i].values.begin(), containers[i].values.end()); + ++n_array; + } + } + + result.n_bitmap_containers = n_bitmap; + result.n_array_containers = n_array; + result.n_run_containers = 0; + + // Upload to device + result.keys.resize(n, stream); + result.types.resize(n, stream); + result.offsets.resize(n, stream); + result.cardinalities.resize(n, stream); + + raft::update_device(result.keys.data(), h_keys.data(), n, stream); + raft::update_device(result.types.data(), + reinterpret_cast(h_types.data()), + n, + stream); + raft::update_device(result.offsets.data(), h_offsets.data(), n, stream); + raft::update_device(result.cardinalities.data(), h_cards.data(), n, stream); + + if (!h_bitmap_pool.empty()) { + result.bitmap_data.resize(h_bitmap_pool.size(), stream); + raft::update_device( + result.bitmap_data.data(), h_bitmap_pool.data(), h_bitmap_pool.size(), stream); + } + if (!h_array_pool.empty()) { + result.array_data.resize(h_array_pool.size(), stream); + raft::update_device(result.array_data.data(), h_array_pool.data(), h_array_pool.size(), stream); + } + + // Total cardinality + result.total_cardinality = static_cast(n_ids); + + // Host mirrors (count-free filtered search: indptr and dispatch decisions + // come from these, never from count kernels) + result.h_keys = h_keys; + result.h_types = h_types; + result.h_offsets = h_offsets; + result.h_element_counts.resize(n); + for (uint32_t i = 0; i < n; ++i) + result.h_element_counts[i] = static_cast(containers[i].values.size()); + + // Build direct-map key index: key_index[key] = container_idx, 0xFFFF = absent + if (n > 0) { + result.max_key = h_keys[n - 1]; + uint32_t ki_size = result.max_key + 1; + std::vector h_key_index(ki_size, 0xFFFF); + for (uint32_t i = 0; i < n; ++i) { + h_key_index[h_keys[i]] = static_cast(i); + } + result.key_index.resize(ki_size, stream); + raft::update_device(result.key_index.data(), h_key_index.data(), ki_size, stream); + } + + raft::resource::sync_stream(res); + return result; +} + +// ============================================================================ +// Decompress kernel +// ============================================================================ + +__global__ void decompress_kernel(const uint16_t* keys, + const roaring_container_type* types, + const uint32_t* offsets, + const uint16_t* cardinalities, + uint32_t n_containers, + const uint64_t* bitmap_data, + const uint16_t* array_data, + const uint16_t* run_data, + uint32_t* output, + uint32_t output_size_words) +{ + uint32_t cid = blockIdx.x; + if (cid >= n_containers) return; + + uint32_t key = keys[cid]; + uint32_t base_word = key * 2048u; + auto ctype = types[cid]; + uint32_t offset = offsets[cid]; + + if (ctype == roaring_container_type::BITMAP) { + const uint32_t* src = + reinterpret_cast(bitmap_data) + (offset / sizeof(uint32_t)); + for (uint32_t i = threadIdx.x; i < 2048u; i += blockDim.x) { + uint32_t dst_idx = base_word + i; + if (dst_idx < output_size_words) { output[dst_idx] = src[i]; } + } + } else if (ctype == roaring_container_type::ARRAY) { + const uint16_t* arr = array_data + (offset / sizeof(uint16_t)); + uint16_t card = cardinalities[cid]; + for (uint32_t i = threadIdx.x; i < card; i += blockDim.x) { + uint16_t val = arr[i]; + uint32_t abs_bit = (static_cast(key) << 16) | val; + uint32_t word_idx = abs_bit / 32u; + uint32_t bit_pos = abs_bit % 32u; + if (word_idx < output_size_words) { atomicOr(&output[word_idx], 1u << bit_pos); } + } + } else if (ctype == roaring_container_type::RUN) { + const uint16_t* runs = run_data + (offset / sizeof(uint16_t)); + uint16_t n_runs = cardinalities[cid]; + for (uint32_t r = threadIdx.x; r < n_runs; r += blockDim.x) { + uint16_t start = runs[r * 2]; + uint16_t length = runs[r * 2 + 1]; + for (uint32_t v = start; v <= static_cast(start) + length; ++v) { + uint32_t abs_bit = (static_cast(key) << 16) | v; + uint32_t word_idx = abs_bit / 32u; + uint32_t bit_pos = abs_bit % 32u; + if (word_idx < output_size_words) { atomicOr(&output[word_idx], 1u << bit_pos); } + } + } + } +} + +void decompress_to_bitset(raft::resources const& res, + const gpu_roaring& bitmap, + uint32_t* output, + uint32_t output_size_words) +{ + if (bitmap.n_containers == 0) return; + auto stream = raft::resource::get_cuda_stream(res); + + RAFT_CUDA_TRY(cudaMemsetAsync(output, 0, output_size_words * sizeof(uint32_t), stream)); + + decompress_kernel<<>>(bitmap.keys.data(), + bitmap.types.data(), + bitmap.offsets.data(), + bitmap.cardinalities.data(), + bitmap.n_containers, + bitmap.bitmap_data.data(), + bitmap.array_data.data(), + bitmap.run_data.data(), + output, + output_size_words); + RAFT_CUDA_TRY(cudaPeekAtLastError()); +} + +cuvs::core::bitset to_bitset(raft::resources const& res, + const gpu_roaring& bitmap) +{ + auto stream = raft::resource::get_cuda_stream(res); + int64_t n_bits = static_cast(bitmap.universe_size); + + // Create a bitset initialized to zero + cuvs::core::bitset result(res, n_bits, false); + + uint32_t n_words = static_cast((n_bits + 31) / 32); + if (n_words > 0 && bitmap.n_containers > 0) { + decompress_to_bitset(res, bitmap, result.data(), n_words); + } + return result; +} + +// ============================================================================ +// Set operation kernels (bitmap x bitmap) +// ============================================================================ + +struct BitmapBitmapWork { + uint16_t key; + uint32_t a_offset; // in uint64_t words + uint32_t b_offset; +}; + +template +__device__ __forceinline__ uint64_t apply_op(uint64_t a, uint64_t b); + +template <> +__device__ __forceinline__ uint64_t apply_op(uint64_t a, uint64_t b) +{ + return a & b; +} +template <> +__device__ __forceinline__ uint64_t apply_op(uint64_t a, uint64_t b) +{ + return a | b; +} +template <> +__device__ __forceinline__ uint64_t apply_op(uint64_t a, uint64_t b) +{ + return a & ~b; +} +template <> +__device__ __forceinline__ uint64_t apply_op(uint64_t a, uint64_t b) +{ + return a ^ b; +} + +template +__global__ void bitmap_bitmap_kernel(const BitmapBitmapWork* work, + uint32_t n_pairs, + const uint64_t* a_data, + const uint64_t* b_data, + uint64_t* out_data, + uint16_t* out_cards) +{ + uint32_t pair_idx = blockIdx.x; + if (pair_idx >= n_pairs) return; + + const auto& w = work[pair_idx]; + const uint64_t* a = a_data + w.a_offset; + const uint64_t* b = b_data + w.b_offset; + uint64_t* out = out_data + pair_idx * 1024; + + uint32_t local_pop = 0; + for (uint32_t i = threadIdx.x; i < 1024u; i += blockDim.x) { + uint64_t val = apply_op(a[i], b[i]); + out[i] = val; + local_pop += __popcll(val); + } + + // Warp reduction + for (int off = 16; off > 0; off >>= 1) + local_pop += __shfl_down_sync(0xFFFFFFFF, local_pop, off); + + __shared__ uint32_t warp_counts[8]; + uint32_t warp_id = threadIdx.x / 32; + uint32_t lane_id = threadIdx.x % 32; + if (lane_id == 0) warp_counts[warp_id] = local_pop; + __syncthreads(); + + if (threadIdx.x == 0) { + uint32_t total = 0; + for (uint32_t w = 0; w < blockDim.x / 32; ++w) + total += warp_counts[w]; + out_cards[pair_idx] = static_cast(total > 65535 ? 0 : total); + } +} + +// Expand array/run to bitmap +struct ExpandWork { + uint32_t src_offset; + uint16_t cardinality; + roaring_container_type type; +}; + +__global__ void expand_to_bitmap_kernel(const ExpandWork* work, + uint32_t n_items, + const uint16_t* array_pool, + const uint16_t* run_pool, + uint64_t* out_bitmaps) +{ + uint32_t idx = blockIdx.x; + if (idx >= n_items) return; + + const auto& w = work[idx]; + uint64_t* out = out_bitmaps + idx * 1024; + + for (uint32_t i = threadIdx.x; i < 1024u; i += blockDim.x) + out[i] = 0; + __syncthreads(); + + if (w.type == roaring_container_type::ARRAY) { + const uint16_t* arr = array_pool + w.src_offset; + for (uint32_t i = threadIdx.x; i < w.cardinality; i += blockDim.x) { + uint16_t val = arr[i]; + uint32_t word_idx = val / 64u; + uint64_t bit_mask = 1ULL << (val % 64u); + atomicOr(reinterpret_cast(&out[word_idx]), + static_cast(bit_mask)); + } + } else if (w.type == roaring_container_type::RUN) { + const uint16_t* runs = run_pool + w.src_offset; + for (uint32_t r = threadIdx.x; r < w.cardinality; r += blockDim.x) { + uint16_t start = runs[r * 2]; + uint16_t length = runs[r * 2 + 1]; + for (uint32_t v = start; v <= static_cast(start) + length; ++v) { + uint32_t word_idx = v / 64u; + uint64_t bit_mask = 1ULL << (v % 64u); + atomicOr(reinterpret_cast(&out[word_idx]), + static_cast(bit_mask)); + } + } + } +} + +// Copy bitmap container +__global__ void copy_bitmap_kernel(const uint64_t* src_pool, + const uint32_t* src_offsets, + const bool* from_a, + uint32_t n_items, + const uint64_t* a_pool, + const uint64_t* b_pool, + uint64_t* dst_pool, + uint32_t dst_start) +{ + uint32_t item = blockIdx.x; + if (item >= n_items) return; + + const uint64_t* src = from_a[item] ? (a_pool + src_offsets[item]) : (b_pool + src_offsets[item]); + uint64_t* dst = dst_pool + (dst_start + item) * 1024; + for (uint32_t i = threadIdx.x; i < 1024u; i += blockDim.x) + dst[i] = src[i]; +} + +// ============================================================================ +// Host-side helpers +// ============================================================================ + +struct HostIndex { + std::vector keys; + std::vector types; + std::vector offsets; + std::vector cardinalities; +}; + +static HostIndex download_index(const gpu_roaring& g, cudaStream_t stream) +{ + HostIndex h; + uint32_t n = g.n_containers; + h.keys.resize(n); + h.types.resize(n); + h.offsets.resize(n); + h.cardinalities.resize(n); + if (n == 0) return h; + + RAFT_CUDA_TRY(cudaMemcpyAsync( + h.keys.data(), g.keys.data(), n * sizeof(uint16_t), cudaMemcpyDeviceToHost, stream)); + RAFT_CUDA_TRY(cudaMemcpyAsync(h.types.data(), + g.types.data(), + n * sizeof(roaring_container_type), + cudaMemcpyDeviceToHost, + stream)); + RAFT_CUDA_TRY(cudaMemcpyAsync( + h.offsets.data(), g.offsets.data(), n * sizeof(uint32_t), cudaMemcpyDeviceToHost, stream)); + RAFT_CUDA_TRY(cudaMemcpyAsync(h.cardinalities.data(), + g.cardinalities.data(), + n * sizeof(uint16_t), + cudaMemcpyDeviceToHost, + stream)); + RAFT_CUDA_TRY(cudaStreamSynchronize(stream)); + return h; +} + +static void launch_bb(roaring_set_op op, + uint32_t n, + const BitmapBitmapWork* d_work, + const uint64_t* a, + const uint64_t* b, + uint64_t* out, + uint16_t* cards, + cudaStream_t stream) +{ + switch (op) { + case roaring_set_op::AND: + bitmap_bitmap_kernel<<>>(d_work, n, a, b, out, cards); + break; + case roaring_set_op::OR: + bitmap_bitmap_kernel<<>>(d_work, n, a, b, out, cards); + break; + case roaring_set_op::ANDNOT: + bitmap_bitmap_kernel + <<>>(d_work, n, a, b, out, cards); + break; + case roaring_set_op::XOR: + bitmap_bitmap_kernel<<>>(d_work, n, a, b, out, cards); + break; + } +} + +// ============================================================================ +// set_op: the main set operation implementation +// ============================================================================ + +gpu_roaring set_op(raft::resources const& res, + const gpu_roaring& a, + const gpu_roaring& b, + roaring_set_op op) +{ + auto stream = raft::resource::get_cuda_stream(res); + auto ha = download_index(a, stream); + auto hb = download_index(b, stream); + + // Container matching (merge of sorted key arrays) + struct WorkItem { + uint16_t key; + int32_t a_idx, b_idx; + roaring_container_type a_type, b_type; + }; + std::vector work; + work.reserve(a.n_containers + b.n_containers); + + uint32_t ia = 0, ib = 0; + while (ia < a.n_containers && ib < b.n_containers) { + if (ha.keys[ia] < hb.keys[ib]) { + if (op == roaring_set_op::OR || op == roaring_set_op::XOR || op == roaring_set_op::ANDNOT) + work.push_back({ha.keys[ia], (int32_t)ia, -1, ha.types[ia], roaring_container_type::ARRAY}); + ++ia; + } else if (ha.keys[ia] > hb.keys[ib]) { + if (op == roaring_set_op::OR || op == roaring_set_op::XOR) + work.push_back({hb.keys[ib], -1, (int32_t)ib, roaring_container_type::ARRAY, hb.types[ib]}); + ++ib; + } else { + work.push_back({ha.keys[ia], (int32_t)ia, (int32_t)ib, ha.types[ia], hb.types[ib]}); + ++ia; + ++ib; + } + } + while (ia < a.n_containers) { + if (op == roaring_set_op::OR || op == roaring_set_op::XOR || op == roaring_set_op::ANDNOT) + work.push_back({ha.keys[ia], (int32_t)ia, -1, ha.types[ia], roaring_container_type::ARRAY}); + ++ia; + } + while (ib < b.n_containers) { + if (op == roaring_set_op::OR || op == roaring_set_op::XOR) + work.push_back({hb.keys[ib], -1, (int32_t)ib, roaring_container_type::ARRAY, hb.types[ib]}); + ++ib; + } + + if (work.empty()) return gpu_roaring(stream); + + // Classify by type pair + struct BBW { + uint16_t key; + uint32_t a_off, b_off; + }; + struct CopyW { + uint16_t key; + uint32_t offset; + uint16_t card; + roaring_container_type type; + bool from_a; + }; + struct MixedW { + uint16_t key; + roaring_container_type a_type, b_type; + uint32_t a_off, b_off; + uint16_t a_card, b_card; + bool a_is_bmp, b_is_bmp; + }; + + std::vector bb_work; + std::vector copy_bmp, copy_arr, copy_run; + std::vector mixed; + + auto elem_off = [](roaring_container_type t, uint32_t byte_off) -> uint32_t { + if (t == roaring_container_type::BITMAP) return byte_off / sizeof(uint64_t); + return byte_off / sizeof(uint16_t); + }; + + for (auto& wi : work) { + if (wi.a_idx < 0) { + auto& ho = hb; + int32_t i = wi.b_idx; + CopyW cw = {wi.key, ho.offsets[i], ho.cardinalities[i], ho.types[i], false}; + if (ho.types[i] == roaring_container_type::BITMAP) + copy_bmp.push_back(cw); + else if (ho.types[i] == roaring_container_type::ARRAY) + copy_arr.push_back(cw); + else + copy_run.push_back(cw); + } else if (wi.b_idx < 0) { + auto& ho = ha; + int32_t i = wi.a_idx; + CopyW cw = {wi.key, ho.offsets[i], ho.cardinalities[i], ho.types[i], true}; + if (ho.types[i] == roaring_container_type::BITMAP) + copy_bmp.push_back(cw); + else if (ho.types[i] == roaring_container_type::ARRAY) + copy_arr.push_back(cw); + else + copy_run.push_back(cw); + } else { + auto ta = ha.types[wi.a_idx]; + auto tb = hb.types[wi.b_idx]; + if (ta == roaring_container_type::BITMAP && tb == roaring_container_type::BITMAP) { + bb_work.push_back({wi.key, + ha.offsets[wi.a_idx] / (uint32_t)sizeof(uint64_t), + hb.offsets[wi.b_idx] / (uint32_t)sizeof(uint64_t)}); + } else { + // All non-bitmap×bitmap: expand to bitmap + mixed.push_back({wi.key, + ta, + tb, + elem_off(ta, ha.offsets[wi.a_idx]), + elem_off(tb, hb.offsets[wi.b_idx]), + ha.cardinalities[wi.a_idx], + hb.cardinalities[wi.b_idx], + ta == roaring_container_type::BITMAP, + tb == roaring_container_type::BITMAP}); + } + } + } + + uint32_t out_n_bitmap = static_cast(bb_work.size() + mixed.size() + copy_bmp.size()); + + // Allocate output bitmap pool + rmm::device_uvector d_out_bmp(static_cast(out_n_bitmap) * 1024, stream); + + // Execute bitmap×bitmap + std::vector bb_cards(bb_work.size()); + if (!bb_work.empty()) { + uint32_t n_bb = static_cast(bb_work.size()); + rmm::device_uvector d_bb(n_bb, stream); + rmm::device_uvector d_bb_cards(n_bb, stream); + raft::update_device( + d_bb.data(), reinterpret_cast(bb_work.data()), n_bb, stream); + + launch_bb(op, + n_bb, + d_bb.data(), + a.bitmap_data.data(), + b.bitmap_data.data(), + d_out_bmp.data(), + d_bb_cards.data(), + stream); + RAFT_CUDA_TRY(cudaPeekAtLastError()); + + raft::update_host(bb_cards.data(), d_bb_cards.data(), n_bb, stream); + RAFT_CUDA_TRY(cudaStreamSynchronize(stream)); + } + + // Execute mixed pairs (expand + bitmap op) + std::vector mixed_cards(mixed.size()); + rmm::device_uvector d_tmp_a(0, stream); + rmm::device_uvector d_tmp_b(0, stream); + if (!mixed.empty()) { + uint32_t n_mx = static_cast(mixed.size()); + d_tmp_a.resize(static_cast(n_mx) * 1024, stream); + d_tmp_b.resize(static_cast(n_mx) * 1024, stream); + + // Build expand work + std::vector exp_a(n_mx), exp_b(n_mx); + for (uint32_t i = 0; i < n_mx; ++i) { + exp_a[i] = {mixed[i].a_off, mixed[i].a_card, mixed[i].a_type}; + exp_b[i] = {mixed[i].b_off, mixed[i].b_card, mixed[i].b_type}; + } + + rmm::device_uvector d_exp_a(n_mx, stream); + rmm::device_uvector d_exp_b(n_mx, stream); + raft::update_device(d_exp_a.data(), exp_a.data(), n_mx, stream); + raft::update_device(d_exp_b.data(), exp_b.data(), n_mx, stream); + + RAFT_CUDA_TRY(cudaMemsetAsync(d_tmp_a.data(), 0, d_tmp_a.size() * sizeof(uint64_t), stream)); + RAFT_CUDA_TRY(cudaMemsetAsync(d_tmp_b.data(), 0, d_tmp_b.size() * sizeof(uint64_t), stream)); + + expand_to_bitmap_kernel<<>>( + d_exp_a.data(), n_mx, a.array_data.data(), a.run_data.data(), d_tmp_a.data()); + expand_to_bitmap_kernel<<>>( + d_exp_b.data(), n_mx, b.array_data.data(), b.run_data.data(), d_tmp_b.data()); + RAFT_CUDA_TRY(cudaPeekAtLastError()); + + // Copy bitmap containers from source pools into temp buffers + for (uint32_t i = 0; i < n_mx; ++i) { + if (mixed[i].a_is_bmp) { + RAFT_CUDA_TRY(cudaMemcpyAsync(d_tmp_a.data() + i * 1024, + a.bitmap_data.data() + mixed[i].a_off, + 1024 * sizeof(uint64_t), + cudaMemcpyDeviceToDevice, + stream)); + } + if (mixed[i].b_is_bmp) { + RAFT_CUDA_TRY(cudaMemcpyAsync(d_tmp_b.data() + i * 1024, + b.bitmap_data.data() + mixed[i].b_off, + 1024 * sizeof(uint64_t), + cudaMemcpyDeviceToDevice, + stream)); + } + } + + // Build BitmapBitmapWork for temp pools + std::vector mx_bb(n_mx); + for (uint32_t i = 0; i < n_mx; ++i) + mx_bb[i] = {mixed[i].key, i * 1024, i * 1024}; + + rmm::device_uvector d_mx_bb(n_mx, stream); + rmm::device_uvector d_mx_cards(n_mx, stream); + raft::update_device(d_mx_bb.data(), mx_bb.data(), n_mx, stream); + + uint32_t bb_sz = static_cast(bb_work.size()); + launch_bb(op, + n_mx, + d_mx_bb.data(), + d_tmp_a.data(), + d_tmp_b.data(), + d_out_bmp.data() + static_cast(bb_sz) * 1024, + d_mx_cards.data(), + stream); + RAFT_CUDA_TRY(cudaPeekAtLastError()); + + raft::update_host(mixed_cards.data(), d_mx_cards.data(), n_mx, stream); + RAFT_CUDA_TRY(cudaStreamSynchronize(stream)); + } + + // Copy-through bitmap containers + if (!copy_bmp.empty()) { + uint32_t n_cp = static_cast(copy_bmp.size()); + uint32_t start = static_cast(bb_work.size() + mixed.size()); + for (uint32_t i = 0; i < n_cp; ++i) { + const uint64_t* src = copy_bmp[i].from_a + ? a.bitmap_data.data() + copy_bmp[i].offset / sizeof(uint64_t) + : b.bitmap_data.data() + copy_bmp[i].offset / sizeof(uint64_t); + RAFT_CUDA_TRY(cudaMemcpyAsync(d_out_bmp.data() + (start + i) * 1024, + src, + 1024 * sizeof(uint64_t), + cudaMemcpyDeviceToDevice, + stream)); + } + } + + // Copy-through array containers + uint32_t total_arr = 0; + std::vector ca_off(copy_arr.size()); + for (size_t i = 0; i < copy_arr.size(); ++i) { + ca_off[i] = total_arr; + total_arr += copy_arr[i].card; + } + rmm::device_uvector d_out_arr(total_arr, stream); + for (size_t i = 0; i < copy_arr.size(); ++i) { + const uint16_t* src = copy_arr[i].from_a + ? a.array_data.data() + copy_arr[i].offset / sizeof(uint16_t) + : b.array_data.data() + copy_arr[i].offset / sizeof(uint16_t); + RAFT_CUDA_TRY(cudaMemcpyAsync(d_out_arr.data() + ca_off[i], + src, + copy_arr[i].card * sizeof(uint16_t), + cudaMemcpyDeviceToDevice, + stream)); + } + + // Copy-through run containers + uint32_t total_run_vals = 0; + std::vector cr_off(copy_run.size()); + for (size_t i = 0; i < copy_run.size(); ++i) { + cr_off[i] = total_run_vals; + total_run_vals += copy_run[i].card * 2; + } + rmm::device_uvector d_out_run(total_run_vals, stream); + for (size_t i = 0; i < copy_run.size(); ++i) { + const uint16_t* src = copy_run[i].from_a + ? a.run_data.data() + copy_run[i].offset / sizeof(uint16_t) + : b.run_data.data() + copy_run[i].offset / sizeof(uint16_t); + RAFT_CUDA_TRY(cudaMemcpyAsync(d_out_run.data() + cr_off[i], + src, + copy_run[i].card * 2 * sizeof(uint16_t), + cudaMemcpyDeviceToDevice, + stream)); + } + + RAFT_CUDA_TRY(cudaStreamSynchronize(stream)); + + // Assemble output + std::vector out_keys; + std::vector out_types; + std::vector out_offsets; + std::vector out_cards; + + uint32_t bb_idx = 0, mx_idx = 0, cb_idx = 0, ca_idx = 0, cr_idx = 0; + + for (auto& wi : work) { + if (wi.a_idx < 0 || wi.b_idx < 0) { + // Copy-through + auto& ho = (wi.a_idx < 0) ? hb : ha; + int32_t src_idx = (wi.a_idx < 0) ? wi.b_idx : wi.a_idx; + auto ct = ho.types[src_idx]; + if (ct == roaring_container_type::BITMAP) { + uint32_t slot = static_cast(bb_work.size() + mixed.size()) + cb_idx; + out_keys.push_back(wi.key); + out_types.push_back(roaring_container_type::BITMAP); + out_offsets.push_back(slot * 1024 * sizeof(uint64_t)); + out_cards.push_back(ho.cardinalities[src_idx]); + ++cb_idx; + } else if (ct == roaring_container_type::ARRAY) { + out_keys.push_back(wi.key); + out_types.push_back(roaring_container_type::ARRAY); + out_offsets.push_back(ca_off[ca_idx] * sizeof(uint16_t)); + out_cards.push_back(copy_arr[ca_idx].card); + ++ca_idx; + } else { + out_keys.push_back(wi.key); + out_types.push_back(roaring_container_type::RUN); + out_offsets.push_back(cr_off[cr_idx] * sizeof(uint16_t)); + out_cards.push_back(copy_run[cr_idx].card); + ++cr_idx; + } + } else { + auto ta = ha.types[wi.a_idx]; + auto tb = hb.types[wi.b_idx]; + if (ta == roaring_container_type::BITMAP && tb == roaring_container_type::BITMAP) { + uint16_t card = bb_cards[bb_idx]; + if (card == 0 && op != roaring_set_op::OR) { + ++bb_idx; + continue; + } + out_keys.push_back(wi.key); + out_types.push_back(roaring_container_type::BITMAP); + out_offsets.push_back(bb_idx * 1024 * sizeof(uint64_t)); + out_cards.push_back(card); + ++bb_idx; + } else { + uint16_t card = mixed_cards[mx_idx]; + if (card == 0 && op != roaring_set_op::OR) { + ++mx_idx; + continue; + } + uint32_t slot = static_cast(bb_work.size()) + mx_idx; + out_keys.push_back(wi.key); + out_types.push_back(roaring_container_type::BITMAP); + out_offsets.push_back(slot * 1024 * sizeof(uint64_t)); + out_cards.push_back(card); + ++mx_idx; + } + } + } + + // Build result + gpu_roaring result(stream); + result.n_containers = static_cast(out_keys.size()); + result.universe_size = std::max(a.universe_size, b.universe_size); + result.bitmap_data = std::move(d_out_bmp); + result.n_bitmap_containers = out_n_bitmap; + result.array_data = std::move(d_out_arr); + result.n_array_containers = static_cast(copy_arr.size()); + result.run_data = std::move(d_out_run); + result.n_run_containers = static_cast(copy_run.size()); + + if (result.n_containers > 0) { + result.keys.resize(result.n_containers, stream); + result.types.resize(result.n_containers, stream); + result.offsets.resize(result.n_containers, stream); + result.cardinalities.resize(result.n_containers, stream); + raft::update_device(result.keys.data(), out_keys.data(), result.n_containers, stream); + raft::update_device(result.types.data(), out_types.data(), result.n_containers, stream); + raft::update_device(result.offsets.data(), out_offsets.data(), result.n_containers, stream); + raft::update_device(result.cardinalities.data(), out_cards.data(), result.n_containers, stream); + raft::resource::sync_stream(res); + } + + return result; +} + +// ============================================================================ +// Multi-AND / Multi-OR +// ============================================================================ + +gpu_roaring multi_and(raft::resources const& res, const gpu_roaring* bitmaps, uint32_t count) +{ + if (count == 0) return gpu_roaring(raft::resource::get_cuda_stream(res)); + if (count == 1) return set_op(res, bitmaps[0], bitmaps[0], roaring_set_op::AND); + + gpu_roaring result = set_op(res, bitmaps[0], bitmaps[1], roaring_set_op::AND); + for (uint32_t i = 2; i < count; ++i) { + gpu_roaring next = set_op(res, result, bitmaps[i], roaring_set_op::AND); + result = std::move(next); + } + return result; +} + +gpu_roaring multi_or(raft::resources const& res, const gpu_roaring* bitmaps, uint32_t count) +{ + if (count == 0) return gpu_roaring(raft::resource::get_cuda_stream(res)); + if (count == 1) return set_op(res, bitmaps[0], bitmaps[0], roaring_set_op::AND); + + gpu_roaring result = set_op(res, bitmaps[0], bitmaps[1], roaring_set_op::OR); + for (uint32_t i = 2; i < count; ++i) { + gpu_roaring next = set_op(res, result, bitmaps[i], roaring_set_op::OR); + result = std::move(next); + } + return result; +} + +// ============================================================================ +// from_sorted_ids (device input) +// ============================================================================ + +gpu_roaring from_sorted_ids(raft::resources const& res, + raft::device_vector_view sorted_ids, + uint32_t universe_size) +{ + auto stream = raft::resource::get_cuda_stream(res); + std::vector h_ids(sorted_ids.extent(0)); + raft::update_host(h_ids.data(), sorted_ids.data_handle(), h_ids.size(), stream); + raft::resource::sync_stream(res); + return from_sorted_ids(res, h_ids.data(), static_cast(h_ids.size()), universe_size); +} + +// ============================================================================ +// Batched CSR emission: every container of every bitmap in one launch. +// One CTA per container; each container writes its sorted member ids into +// its precomputed segment of the global indices array (offsets derive from +// host-side element counts — no count kernels, no syncs). +// ============================================================================ + +namespace { + +constexpr uint32_t kEmitBlock = 256; + +__device__ inline void emit_block_scan(uint32_t* smem, uint32_t tid) +{ + for (uint32_t stride = 1; stride < kEmitBlock; stride *= 2) { + uint32_t v = (tid >= stride) ? smem[tid - stride] : 0; + __syncthreads(); + smem[tid] += v; + __syncthreads(); + } +} + +__global__ void emit_csr_indices_kernel(const uint32_t* keys, + const uint8_t* types, + const uint32_t* elem_counts, + const void* const* data, + const int64_t* out_offsets, + uint32_t n_containers, + int64_t* indices) +{ + uint32_t cid = blockIdx.x; + if (cid >= n_containers) return; + int64_t base_id = static_cast(keys[cid]) << 16; + int64_t out_start = out_offsets[cid]; + uint32_t tid = threadIdx.x; + auto ctype = static_cast(types[cid]); + + if (ctype == roaring_container_type::ARRAY) { + const uint16_t* arr = static_cast(data[cid]); + for (uint32_t i = tid; i < elem_counts[cid]; i += kEmitBlock) + indices[out_start + i] = base_id | arr[i]; + } else if (ctype == roaring_container_type::BITMAP) { + const uint64_t* bmp = static_cast(data[cid]); + __shared__ uint32_t counts[kEmitBlock]; + uint32_t mine = 0; + for (uint32_t w = tid * 4; w < tid * 4 + 4; ++w) + mine += static_cast(__popcll(bmp[w])); + counts[tid] = mine; + __syncthreads(); + emit_block_scan(counts, tid); + uint32_t pos = (tid > 0) ? counts[tid - 1] : 0; + for (uint32_t w = tid * 4; w < tid * 4 + 4; ++w) { + uint64_t word = bmp[w]; + while (word != 0) { + uint32_t bit = static_cast(__ffsll(static_cast(word))) - 1; + indices[out_start + pos++] = base_id | (w * 64 + bit); + word &= word - 1; + } + } + } + // RUN containers are rejected host-side (not produced by v1 construction). +} + +} // namespace + +void to_csr_indices(raft::resources const& res, + const gpu_roaring* const* bitmaps, + uint32_t n_bitmaps, + int64_t* indices) +{ + auto stream = raft::resource::get_cuda_stream(res); + + // flatten per-container descriptors from the host mirrors + std::vector keys, counts; + std::vector types; + std::vector data; + std::vector out_offsets; + int64_t cursor = 0; + for (uint32_t b = 0; b < n_bitmaps; ++b) { + const gpu_roaring& r = *bitmaps[b]; + RAFT_EXPECTS(!r.negated, "to_csr_indices: negated bitmaps are not supported"); + RAFT_EXPECTS(r.h_keys.size() == r.n_containers, + "to_csr_indices: bitmap lacks host container metadata " + "(construct via from_sorted_ids)"); + for (uint32_t c = 0; c < r.n_containers; ++c) { + keys.push_back(r.h_keys[c]); + types.push_back(static_cast(r.h_types[c])); + out_offsets.push_back(cursor); + switch (r.h_types[c]) { + case roaring_container_type::ARRAY: + data.push_back(r.array_data.data() + r.h_offsets[c] / sizeof(uint16_t)); + counts.push_back(r.h_element_counts[c]); + cursor += r.h_element_counts[c]; + break; + case roaring_container_type::BITMAP: + data.push_back(r.bitmap_data.data() + r.h_offsets[c] / sizeof(uint64_t)); + counts.push_back(r.h_element_counts[c]); + cursor += r.h_element_counts[c]; + break; + default: RAFT_FAIL("to_csr_indices: RUN containers are not produced by v1 construction"); + } + } + } + uint32_t n_c = static_cast(keys.size()); + if (n_c == 0) return; + + rmm::device_uvector d_keys(n_c, stream), d_counts(n_c, stream); + rmm::device_uvector d_types(n_c, stream); + rmm::device_uvector d_data(n_c, stream); + rmm::device_uvector d_off(n_c, stream); + raft::update_device(d_keys.data(), keys.data(), n_c, stream); + raft::update_device(d_counts.data(), counts.data(), n_c, stream); + raft::update_device(d_types.data(), types.data(), n_c, stream); + raft::update_device(d_data.data(), data.data(), n_c, stream); + raft::update_device(d_off.data(), out_offsets.data(), n_c, stream); + + emit_csr_indices_kernel<<>>( + d_keys.data(), d_types.data(), d_counts.data(), d_data.data(), d_off.data(), n_c, indices); + RAFT_CUDA_TRY(cudaGetLastError()); +} + +// ============================================================================ +// Batched decompression into a dense [n_bitmaps, n_rows] bit matrix +// ============================================================================ + +namespace { + +__global__ void decompress_bitmap_row_kernel(const uint32_t* keys, + const uint8_t* types, + const uint32_t* elem_counts, + const void* const* data, + const int64_t* row_of, + uint32_t n_containers, + int64_t n_rows, + uint32_t* output) +{ + uint32_t cid = blockIdx.x; + if (cid >= n_containers) return; + int64_t row_bit_base = row_of[cid] * n_rows; + uint32_t key = keys[cid]; + uint32_t tid = threadIdx.x; + auto ctype = static_cast(types[cid]); + + auto set_bit = [&](uint32_t low) { + int64_t bit = row_bit_base + ((static_cast(key) << 16) | low); + atomicOr(&output[bit >> 5], 1u << (bit & 31)); + }; + + if (ctype == roaring_container_type::ARRAY) { + const uint16_t* arr = static_cast(data[cid]); + for (uint32_t i = tid; i < elem_counts[cid]; i += blockDim.x) + set_bit(arr[i]); + } else if (ctype == roaring_container_type::BITMAP) { + const uint64_t* bmp = static_cast(data[cid]); + for (uint32_t w = tid; w < 1024; w += blockDim.x) { + uint64_t word = bmp[w]; + while (word != 0) { + uint32_t bit = static_cast(__ffsll(static_cast(word))) - 1; + set_bit(w * 64 + bit); + word &= word - 1; + } + } + } + // RUN containers are rejected host-side (not produced by v1 construction). +} + +} // namespace + +void decompress_to_bitmap(raft::resources const& res, + const gpu_roaring* const* bitmaps, + uint32_t n_bitmaps, + int64_t n_rows, + uint32_t* output) +{ + auto stream = raft::resource::get_cuda_stream(res); + + std::vector keys, counts; + std::vector types; + std::vector data; + std::vector row_of; + for (uint32_t b = 0; b < n_bitmaps; ++b) { + const gpu_roaring& r = *bitmaps[b]; + RAFT_EXPECTS(!r.negated, "decompress_to_bitmap: negated bitmaps are not supported"); + RAFT_EXPECTS(r.h_keys.size() == r.n_containers, + "decompress_to_bitmap: bitmap lacks host container metadata " + "(construct via from_sorted_ids)"); + for (uint32_t c = 0; c < r.n_containers; ++c) { + keys.push_back(r.h_keys[c]); + types.push_back(static_cast(r.h_types[c])); + row_of.push_back(static_cast(b)); + counts.push_back(r.h_element_counts[c]); + switch (r.h_types[c]) { + case roaring_container_type::ARRAY: + data.push_back(r.array_data.data() + r.h_offsets[c] / sizeof(uint16_t)); + break; + case roaring_container_type::BITMAP: + data.push_back(r.bitmap_data.data() + r.h_offsets[c] / sizeof(uint64_t)); + break; + default: + RAFT_FAIL("decompress_to_bitmap: RUN containers are not produced by v1 construction"); + } + } + } + uint32_t n_c = static_cast(keys.size()); + if (n_c == 0) return; + + rmm::device_uvector d_keys(n_c, stream), d_counts(n_c, stream); + rmm::device_uvector d_types(n_c, stream); + rmm::device_uvector d_data(n_c, stream); + rmm::device_uvector d_row(n_c, stream); + raft::update_device(d_keys.data(), keys.data(), n_c, stream); + raft::update_device(d_counts.data(), counts.data(), n_c, stream); + raft::update_device(d_types.data(), types.data(), n_c, stream); + raft::update_device(d_data.data(), data.data(), n_c, stream); + raft::update_device(d_row.data(), row_of.data(), n_c, stream); + + decompress_bitmap_row_kernel<<>>(d_keys.data(), + d_types.data(), + d_counts.data(), + d_data.data(), + d_row.data(), + n_c, + n_rows, + output); + RAFT_CUDA_TRY(cudaGetLastError()); +} + +} // namespace cuvs::core diff --git a/cpp/src/neighbors/detail/knn_brute_force.cuh b/cpp/src/neighbors/detail/knn_brute_force.cuh index 34ef8dd937..2b28ada909 100644 --- a/cpp/src/neighbors/detail/knn_brute_force.cuh +++ b/cpp/src/neighbors/detail/knn_brute_force.cuh @@ -15,6 +15,7 @@ #include "./fused_l2_knn.cuh" #include "./haversine_distance.cuh" #include "./knn_utils.cuh" +#include "./knn_brute_force_roaring.cuh" #include #include @@ -589,7 +590,7 @@ void brute_force_search_filtered( const cuvs::neighbors::filtering::base_filter* filter, raft::device_matrix_view neighbors, raft::device_matrix_view distances, - std::optional> query_norms = std::nullopt) + std::optional> query_norms) { auto metric = idx.metric(); @@ -754,6 +755,29 @@ void search(raft::resources const& res, if constexpr (std::is_same_v) { RAFT_FAIL("filtered search isn't available with col_major queries yet"); } else { + if constexpr (std::is_same_v && std::is_same_v) { + try { + auto& sample_filter = + dynamic_cast(sample_filter_ref); + return brute_force_search_roaring(res, idx, queries, sample_filter, neighbors, distances); + } catch (const std::bad_cast&) { + } + try { + auto& sample_filter = + dynamic_cast( + sample_filter_ref); + return brute_force_search_roaring_matrix( + res, idx, queries, sample_filter, neighbors, distances); + } catch (const std::bad_cast&) { + } + } else { + RAFT_EXPECTS( + sample_filter_ref.get_filter_type() != cuvs::neighbors::filtering::FilterType::Roaring && + sample_filter_ref.get_filter_type() != + cuvs::neighbors::filtering::FilterType::RoaringMatrix, + "roaring filters currently support float32 data only"); + } + try { auto& sample_filter = dynamic_cast&>( diff --git a/cpp/src/neighbors/detail/knn_brute_force_roaring.cuh b/cpp/src/neighbors/detail/knn_brute_force_roaring.cuh new file mode 100644 index 0000000000..b6be4b6e96 --- /dev/null +++ b/cpp/src/neighbors/detail/knn_brute_force_roaring.cuh @@ -0,0 +1,438 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ + +#pragma once + +/* + * Brute-force prefiltered search with Roaring-bitmap filters. + * + * Both entry points implement a three-regime dispatch keyed on filter + * selectivity s (known on the host from the filters' cardinalities — no + * count kernels, no device synchronization): + * + * - very sparse: emit the filter's member ids straight into a CSR + * structure (one kernel; indptr is free from host-side + * cardinalities), compute distances at the nnz positions + * with SDDMM, select_k over the sparse rows. + * - mid (shared filter only): gather the selected rows and run a dense + * GEMM + select_k over [n_queries, |filter|] — computes + * |filter| columns instead of n_rows. This is where the + * bulk of the win lives (up to ~19x vs the dense+mask + * path at 1-10% selectivity, 10M x 512d). + * - dense: decompress to a flat bitset / bit matrix and delegate + * to the existing bitset/bitmap pipeline, which is + * already optimal above ~40-50% selectivity. + * + * The sparse/mid threshold is dimension-dependent (cusparse SDDMM + * degrades with d, dense GEMM does not): measured crossovers on + * RTX 5090 are ~3% at d=128 and ~0.1% at d=512. + * + * float32 only in this version (the SDDMM/GEMM plumbing is instantiated + * for float; half support tracks the existing filtered-path limitation). + */ + +#include +#include +#include + +#include "./knn_utils.cuh" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +#include +#include + +namespace cuvs::neighbors::detail { + +// defined in knn_brute_force.cuh (which includes this header first) +template +void brute_force_search_filtered( + raft::resources const& res, + const cuvs::neighbors::brute_force::index& idx, + raft::device_matrix_view queries, + const cuvs::neighbors::filtering::base_filter* filter, + raft::device_matrix_view neighbors, + raft::device_matrix_view distances, + std::optional> query_norms = std::nullopt); + +namespace roaring_detail { + +/** Selectivity below which the SDDMM path beats gathering + dense GEMM. */ +inline double sparse_threshold(int64_t dim) { return dim >= 256 ? 0.001 : 0.03; } +/** Selectivity above which the dense masked pipeline wins. */ +constexpr double kDenseThreshold = 0.45; + +__global__ inline void replicate_indices_kernel(const int64_t* ids, + int64_t n_ids, + int64_t total, + int64_t* out) +{ + int64_t i = blockIdx.x * static_cast(blockDim.x) + threadIdx.x; + if (i < total) out[i] = ids[i % n_ids]; +} + +__global__ inline void gather_rows_kernel(const float* __restrict__ dataset, + const int64_t* __restrict__ ids, + int64_t n_ids, + int64_t dim, + float* __restrict__ out) +{ + int64_t row = blockIdx.x; + if (row >= n_ids) return; + const float* src = dataset + ids[row] * dim; + float* dst = out + row * dim; + if ((dim & 3) == 0 && (reinterpret_cast(src) & 15) == 0) { + auto* src4 = reinterpret_cast(src); + auto* dst4 = reinterpret_cast(dst); + for (int64_t c = threadIdx.x; c < dim / 4; c += blockDim.x) + dst4[c] = src4[c]; + } else { + for (int64_t c = threadIdx.x; c < dim; c += blockDim.x) + dst[c] = src[c]; + } +} + +__global__ inline void gather_norms_kernel(const float* __restrict__ norms, + const int64_t* __restrict__ ids, + int64_t n_ids, + float* __restrict__ out) +{ + int64_t i = blockIdx.x * static_cast(blockDim.x) + threadIdx.x; + if (i < n_ids) out[i] = norms[ids[i]]; +} + +/** Dense counterpart of epilogue_on_csr: dist[q][u] from raw dots. */ +__global__ inline void dense_epilogue_kernel(float* __restrict__ dist, + const float* __restrict__ q_norms, + const float* __restrict__ r_norms, + int64_t n_queries, + int64_t n_cols, + cuvs::distance::DistanceType metric) +{ + int64_t i = blockIdx.x * static_cast(blockDim.x) + threadIdx.x; + if (i >= n_queries * n_cols) return; + int64_t q = i / n_cols; + int64_t u = i % n_cols; + float dot = dist[i]; + if (metric == cuvs::distance::DistanceType::L2Expanded) { + dist[i] = -2.0f * dot + q_norms[q] + r_norms[u]; + } else if (metric == cuvs::distance::DistanceType::L2SqrtExpanded) { + dist[i] = raft::sqrt(-2.0f * dot + q_norms[q] + r_norms[u]); + } else { // CosineExpanded + dist[i] = 1.0f - dot / (q_norms[q] * r_norms[u]); + } +} + +__global__ inline void remap_ids_kernel(const int64_t* __restrict__ pos, + const int64_t* __restrict__ ids, + int64_t n, + int64_t* __restrict__ out) +{ + int64_t i = blockIdx.x * static_cast(blockDim.x) + threadIdx.x; + if (i < n) out[i] = pos[i] < 0 ? int64_t{-1} : ids[pos[i]]; +} + +/** Compute query norms the way the existing filtered path does. */ +inline rmm::device_uvector make_query_norms( + raft::resources const& res, + raft::device_matrix_view queries, + cuvs::distance::DistanceType metric) +{ + auto stream = raft::resource::get_cuda_stream(res); + rmm::device_uvector q_norms(queries.extent(0), stream); + if (metric == cuvs::distance::DistanceType::CosineExpanded) { + raft::linalg::rowNorm(q_norms.data(), + queries.data_handle(), + queries.extent(1), + queries.extent(0), + stream, + raft::sqrt_op{}); + } else { + raft::linalg::rowNorm(q_norms.data(), + queries.data_handle(), + queries.extent(1), + queries.extent(0), + stream, + raft::identity_op{}); + } + return q_norms; +} + +/** Shared SDDMM + epilogue + sparse select_k over a prebuilt CSR pattern. */ +inline void sparse_distances_select_k( + raft::resources const& res, + const cuvs::neighbors::brute_force::index& idx, + raft::device_matrix_view queries, + const int64_t* indptr, + const int64_t* indices, + int64_t nnz, + raft::device_matrix_view neighbors, + raft::device_matrix_view distances) +{ + auto stream = raft::resource::get_cuda_stream(res); + auto metric = idx.metric(); + int64_t n_queries = queries.extent(0); + int64_t n_rows = idx.dataset().extent(0); + int64_t dim = idx.dataset().extent(1); + + rmm::device_uvector values(nnz, stream); + auto structure = raft::make_device_compressed_structure_view( + const_cast(indptr), const_cast(indices), n_queries, n_rows, nnz); + auto csr_view = + raft::make_device_csr_matrix_view(values.data(), structure); + + auto dataset_view = raft::make_device_matrix_view( + idx.dataset().data_handle(), n_rows, dim); + float alpha_v = 1.0f, beta_v = 0.0f; + auto alpha = raft::make_host_scalar_view(&alpha_v); + auto beta = raft::make_host_scalar_view(&beta_v); + raft::sparse::linalg::sddmm(res, + queries, + dataset_view, + csr_view, + raft::linalg::Operation::NON_TRANSPOSE, + raft::linalg::Operation::TRANSPOSE, + alpha, + beta); + + if (metric == cuvs::distance::DistanceType::L2Expanded || + metric == cuvs::distance::DistanceType::L2SqrtExpanded || + metric == cuvs::distance::DistanceType::CosineExpanded) { + rmm::device_uvector rows(nnz, stream); + raft::sparse::convert::csr_to_coo( + const_cast(indptr), n_queries, rows.data(), nnz, stream); + auto q_norms = make_query_norms(res, queries, metric); + cuvs::neighbors::detail::epilogue_on_csr(res, + values.data(), + nnz, + rows.data(), + const_cast(indices), + q_norms.data(), + idx.norms().data_handle(), + metric); + } + + auto const_csr = raft::make_device_csr_matrix_view( + values.data(), structure); + std::optional> no_opt = std::nullopt; + bool select_min = cuvs::distance::is_min_close(metric); + raft::sparse::matrix::select_k(res, const_csr, no_opt, distances, neighbors, select_min, true); +} + +} // namespace roaring_detail + +/** + * Shared-filter (one Roaring bitmap for all queries) brute-force search. + */ +inline void brute_force_search_roaring( + raft::resources const& res, + const cuvs::neighbors::brute_force::index& idx, + raft::device_matrix_view queries, + const cuvs::neighbors::filtering::roaring_filter& filter, + raft::device_matrix_view neighbors, + raft::device_matrix_view distances) +{ + using namespace roaring_detail; + auto stream = raft::resource::get_cuda_stream(res); + auto metric = idx.metric(); + int64_t n_queries = queries.extent(0); + int64_t n_rows = idx.dataset().extent(0); + int64_t dim = idx.dataset().extent(1); + + const cuvs::core::gpu_roaring& bitmap = *filter.bitmap_; + RAFT_EXPECTS(!bitmap.negated, "roaring_filter: negated bitmaps are not supported"); + RAFT_EXPECTS(static_cast(bitmap.universe_size) == n_rows, + "roaring_filter universe size must match the index size"); + int64_t n_selected = static_cast(bitmap.total_cardinality); + double s = static_cast(n_selected) / static_cast(n_rows); + + bool needs_norms = metric == cuvs::distance::DistanceType::L2Expanded || + metric == cuvs::distance::DistanceType::L2SqrtExpanded || + metric == cuvs::distance::DistanceType::CosineExpanded; + RAFT_EXPECTS(idx.has_norms() || !needs_norms, + "roaring_filter: index must carry norms for L2/Cosine metrics"); + + if (n_selected == 0) { + thrust::fill(raft::resource::get_thrust_policy(res), + neighbors.data_handle(), + neighbors.data_handle() + n_queries * neighbors.extent(1), + int64_t{-1}); + return; + } + + // ---- dense regime: decompress and delegate to the bitset pipeline ---- + if (s >= kDenseThreshold) { + auto bitset = cuvs::core::to_bitset(res, bitmap); + auto bf = cuvs::neighbors::filtering::bitset_filter(bitset.view()); + brute_force_search_filtered( + res, idx, queries, &bf, neighbors, distances, std::nullopt); + return; + } + + // member ids, emitted once per search from the containers + const cuvs::core::gpu_roaring* bptr = &bitmap; + rmm::device_uvector ids(n_selected, stream); + cuvs::core::to_csr_indices(res, &bptr, 1, ids.data()); + + // ---- very sparse: per-query CSR (replicated indices) + SDDMM ---- + if (s <= sparse_threshold(dim)) { + int64_t nnz = n_queries * n_selected; + std::vector h_indptr(n_queries + 1); + for (int64_t q = 0; q <= n_queries; ++q) + h_indptr[q] = q * n_selected; + rmm::device_uvector indptr(n_queries + 1, stream); + raft::update_device(indptr.data(), h_indptr.data(), h_indptr.size(), stream); + rmm::device_uvector indices(nnz, stream); + int64_t blocks = (nnz + 255) / 256; + replicate_indices_kernel<<(blocks), 256, 0, stream>>>( + ids.data(), n_selected, nnz, indices.data()); + sparse_distances_select_k( + res, idx, queries, indptr.data(), indices.data(), nnz, neighbors, distances); + return; + } + + // ---- mid regime: gather selected rows, dense GEMM, select_k, remap ---- + constexpr int64_t kChunk = int64_t{1} << 20; + rmm::device_uvector gathered(std::min(kChunk, n_selected) * dim, stream); + rmm::device_uvector dist(n_queries * n_selected, stream); + + for (int64_t c0 = 0; c0 < n_selected; c0 += kChunk) { + int64_t uc = std::min(kChunk, n_selected - c0); + gather_rows_kernel<<(uc), 128, 0, stream>>>( + idx.dataset().data_handle(), ids.data() + c0, uc, dim, gathered.data()); + // row-major dist[Q, U] tile = queries[Q, dim] x gathered[uc, dim]^T + float alpha = 1.0f, beta = 0.0f; + raft::linalg::gemm(res, + true, + false, + static_cast(uc), + static_cast(n_queries), + static_cast(dim), + &alpha, + gathered.data(), + static_cast(dim), + queries.data_handle(), + static_cast(dim), + &beta, + dist.data() + c0, + static_cast(n_selected), + stream); + } + + if (needs_norms) { + rmm::device_uvector r_norms(n_selected, stream); + int64_t blocks = (n_selected + 255) / 256; + gather_norms_kernel<<(blocks), 256, 0, stream>>>( + idx.norms().data_handle(), ids.data(), n_selected, r_norms.data()); + auto q_norms = make_query_norms(res, queries, metric); + int64_t total = n_queries * n_selected; + dense_epilogue_kernel<<((total + 255) / 256), 256, 0, stream>>>( + dist.data(), q_norms.data(), r_norms.data(), n_queries, n_selected, metric); + } + + int64_t k = neighbors.extent(1); + rmm::device_uvector positions(n_queries * k, stream); + bool select_min = cuvs::distance::is_min_close(metric); + raft::matrix::select_k( + res, + raft::make_device_matrix_view( + dist.data(), n_queries, n_selected), + std::nullopt, + distances, + raft::make_device_matrix_view( + positions.data(), n_queries, k), + select_min, + true); + int64_t total = n_queries * k; + remap_ids_kernel<<((total + 255) / 256), 256, 0, stream>>>( + positions.data(), ids.data(), total, neighbors.data_handle()); +} + +/** + * Per-query-filter (one Roaring bitmap per query) brute-force search. + */ +inline void brute_force_search_roaring_matrix( + raft::resources const& res, + const cuvs::neighbors::brute_force::index& idx, + raft::device_matrix_view queries, + const cuvs::neighbors::filtering::roaring_matrix_filter& filter, + raft::device_matrix_view neighbors, + raft::device_matrix_view distances) +{ + using namespace roaring_detail; + auto stream = raft::resource::get_cuda_stream(res); + auto metric = idx.metric(); + int64_t n_queries = queries.extent(0); + int64_t n_rows = idx.dataset().extent(0); + + RAFT_EXPECTS(static_cast(filter.n_queries_) == n_queries, + "roaring_matrix_filter must provide one bitmap per query"); + bool needs_norms = metric == cuvs::distance::DistanceType::L2Expanded || + metric == cuvs::distance::DistanceType::L2SqrtExpanded || + metric == cuvs::distance::DistanceType::CosineExpanded; + RAFT_EXPECTS(idx.has_norms() || !needs_norms, + "roaring_matrix_filter: index must carry norms for L2/Cosine metrics"); + + // indptr comes straight off the host-side cardinalities + std::vector h_indptr(n_queries + 1, 0); + for (int64_t q = 0; q < n_queries; ++q) { + const auto* b = filter.bitmaps_[q]; + RAFT_EXPECTS(!b->negated, "roaring_matrix_filter: negated bitmaps are not supported"); + RAFT_EXPECTS(static_cast(b->universe_size) == n_rows, + "roaring_matrix_filter universe size must match the index size"); + h_indptr[q + 1] = h_indptr[q] + static_cast(b->total_cardinality); + } + int64_t nnz = h_indptr[n_queries]; + double s = static_cast(nnz) / (static_cast(n_queries) * n_rows); + + if (nnz == 0) { + thrust::fill(raft::resource::get_thrust_policy(res), + neighbors.data_handle(), + neighbors.data_handle() + n_queries * neighbors.extent(1), + int64_t{-1}); + return; + } + + // ---- dense regime: decompress to a [n_queries, n_rows] bit matrix ---- + if (s >= kDenseThreshold) { + int64_t n_words = (n_queries * n_rows + 31) / 32; + rmm::device_uvector bits(n_words, stream); + RAFT_CUDA_TRY(cudaMemsetAsync(bits.data(), 0, n_words * sizeof(uint32_t), stream)); + cuvs::core::decompress_to_bitmap( + res, filter.bitmaps_, static_cast(n_queries), n_rows, bits.data()); + auto view = cuvs::core::bitmap_view(bits.data(), n_queries, n_rows); + auto bf = cuvs::neighbors::filtering::bitmap_filter(view); + brute_force_search_filtered( + res, idx, queries, &bf, neighbors, distances, std::nullopt); + return; + } + + // ---- sparse: batched CSR emission (one launch for all filters) ---- + rmm::device_uvector indptr(n_queries + 1, stream); + raft::update_device(indptr.data(), h_indptr.data(), h_indptr.size(), stream); + rmm::device_uvector indices(nnz, stream); + cuvs::core::to_csr_indices( + res, filter.bitmaps_, static_cast(n_queries), indices.data()); + sparse_distances_select_k( + res, idx, queries, indptr.data(), indices.data(), nnz, neighbors, distances); +} + +} // namespace cuvs::neighbors::detail diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 9b96f94bf0..31729afd56 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -104,7 +104,8 @@ endfunction() ConfigureTest( NAME NEIGHBORS_TEST - PATH neighbors/brute_force.cu neighbors/brute_force_prefiltered.cu neighbors/sparse_brute_force.cu + PATH neighbors/brute_force.cu neighbors/brute_force_prefiltered.cu neighbors/brute_force_roaring.cu + neighbors/sparse_brute_force.cu neighbors/refine.cu neighbors/distance_nn.cu GPUS 1 PERCENT 100 diff --git a/cpp/tests/neighbors/brute_force_roaring.cu b/cpp/tests/neighbors/brute_force_roaring.cu new file mode 100644 index 0000000000..5d59cce73c --- /dev/null +++ b/cpp/tests/neighbors/brute_force_roaring.cu @@ -0,0 +1,232 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ + +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include + +#include + +#include +#include +#include +#include + +namespace cuvs::neighbors { + +namespace { + +struct RoaringFilterInputs { + int64_t n_rows; + int64_t dim; + int64_t n_queries; + int64_t k; + double selectivity; // chosen to exercise sparse / mid / dense dispatch + cuvs::distance::DistanceType metric; +}; + +std::ostream& operator<<(std::ostream& os, const RoaringFilterInputs& p) +{ + return os << "n_rows=" << p.n_rows << " dim=" << p.dim << " q=" << p.n_queries << " k=" << p.k + << " s=" << p.selectivity << " metric=" << (int)p.metric; +} + +std::vector make_filter_ids(std::mt19937& rng, int64_t n_rows, double s) +{ + std::vector ids; + std::uniform_real_distribution u(0.0, 1.0); + for (int64_t i = 0; i < n_rows; ++i) + if (u(rng) < s) ids.push_back(static_cast(i)); + if (ids.size() < 32) { // keep k satisfiable + for (uint32_t i = 0; ids.size() < 32; i += 7) + ids.push_back(i % static_cast(n_rows)); + std::sort(ids.begin(), ids.end()); + ids.erase(std::unique(ids.begin(), ids.end()), ids.end()); + } + return ids; +} + +class RoaringFilterTest : public ::testing::TestWithParam { + protected: + void SetUp() override + { + auto p = GetParam(); + auto stream = raft::resource::get_cuda_stream(res_); + + dataset_.emplace(raft::make_device_matrix(res_, p.n_rows, p.dim)); + queries_.emplace(raft::make_device_matrix(res_, p.n_queries, p.dim)); + raft::random::RngState r1(42), r2(43); + raft::random::uniform( + res_, + r1, + raft::make_device_vector_view(dataset_->data_handle(), p.n_rows * p.dim), + -1.0f, + 1.0f); + raft::random::uniform( + res_, + r2, + raft::make_device_vector_view(queries_->data_handle(), p.n_queries * p.dim), + -1.0f, + 1.0f); + + brute_force::index_params ip; + ip.metric = p.metric; + index_.emplace(brute_force::build(res_, ip, raft::make_const_mdspan(dataset_->view()))); + raft::resource::sync_stream(res_); + } + + // run a search with the given filter into fresh buffers + std::pair, std::vector> run(const filtering::base_filter& filter) + { + auto p = GetParam(); + auto nb = raft::make_device_matrix(res_, p.n_queries, p.k); + auto di = raft::make_device_matrix(res_, p.n_queries, p.k); + brute_force::search(res_, + brute_force::search_params{}, + *index_, + raft::make_const_mdspan(queries_->view()), + nb.view(), + di.view(), + filter); + std::vector h_nb(p.n_queries * p.k); + std::vector h_di(p.n_queries * p.k); + raft::update_host( + h_nb.data(), nb.data_handle(), h_nb.size(), raft::resource::get_cuda_stream(res_)); + raft::update_host( + h_di.data(), di.data_handle(), h_di.size(), raft::resource::get_cuda_stream(res_)); + raft::resource::sync_stream(res_); + return {std::move(h_nb), std::move(h_di)}; + } + + // tie-tolerant: sorted per-query distance lists must match elementwise + static void expect_same_distances(const std::vector& a, + const std::vector& b, + int64_t n_queries, + int64_t k) + { + for (int64_t q = 0; q < n_queries; ++q) { + std::vector da(a.begin() + q * k, a.begin() + (q + 1) * k); + std::vector db(b.begin() + q * k, b.begin() + (q + 1) * k); + std::sort(da.begin(), da.end()); + std::sort(db.begin(), db.end()); + for (int64_t j = 0; j < k; ++j) { + ASSERT_NEAR(da[j], db[j], 1e-3 + 1e-3 * std::abs(da[j])) << "query " << q << " rank " << j; + } + } + } + + raft::resources res_; + std::optional> dataset_; + std::optional> queries_; + std::optional> index_; +}; + +TEST_P(RoaringFilterTest, MatchesBitsetFilter) +{ + auto p = GetParam(); + auto stream = raft::resource::get_cuda_stream(res_); + std::mt19937 rng(123); + auto ids = make_filter_ids(rng, p.n_rows, p.selectivity); + + // roaring filter + auto roaring = + cuvs::core::from_sorted_ids(res_, ids.data(), (uint32_t)ids.size(), (uint32_t)p.n_rows); + auto rf = filtering::roaring_filter(roaring); + + // reference: equivalent bitset filter + raft::core::bitset bits(res_, p.n_rows, false); + std::vector h_words((p.n_rows + 31) / 32, 0); + for (auto id : ids) + h_words[id / 32] |= 1u << (id % 32); + raft::update_device(bits.data(), h_words.data(), h_words.size(), stream); + auto bf = filtering::bitset_filter(bits.view()); + + auto [nb_r, di_r] = run(rf); + auto [nb_b, di_b] = run(bf); + + expect_same_distances(di_r, di_b, p.n_queries, p.k); + // membership: every returned id must be in the filter + for (auto id : nb_r) { + ASSERT_TRUE(id >= 0 && std::binary_search(ids.begin(), ids.end(), (uint32_t)id)) + << "non-member id " << id; + } +} + +TEST_P(RoaringFilterTest, MatrixMatchesBitmapFilter) +{ + auto p = GetParam(); + auto stream = raft::resource::get_cuda_stream(res_); + std::mt19937 rng(321); + + // one filter per query (share a few to exercise repeated bitmaps) + std::vector> per_query(p.n_queries); + std::vector bitmaps; + bitmaps.reserve(p.n_queries); + std::vector ptrs(p.n_queries); + for (int64_t q = 0; q < p.n_queries; ++q) { + per_query[q] = make_filter_ids(rng, p.n_rows, p.selectivity); + bitmaps.emplace_back(cuvs::core::from_sorted_ids( + res_, per_query[q].data(), (uint32_t)per_query[q].size(), (uint32_t)p.n_rows)); + ptrs[q] = &bitmaps[q]; + } + auto rmf = filtering::roaring_matrix_filter(ptrs.data(), (uint32_t)p.n_queries); + + // reference: dense [n_queries, n_rows] bitmap filter + int64_t total_bits = p.n_queries * p.n_rows; + rmm::device_uvector words((total_bits + 31) / 32, stream); + std::vector h_words((total_bits + 31) / 32, 0); + for (int64_t q = 0; q < p.n_queries; ++q) + for (auto id : per_query[q]) { + int64_t bit = q * p.n_rows + id; + h_words[bit / 32] |= 1u << (bit % 32); + } + raft::update_device(words.data(), h_words.data(), h_words.size(), stream); + auto bview = cuvs::core::bitmap_view(words.data(), p.n_queries, p.n_rows); + auto bmf = filtering::bitmap_filter(bview); + + auto [nb_r, di_r] = run(rmf); + auto [nb_b, di_b] = run(bmf); + + expect_same_distances(di_r, di_b, p.n_queries, p.k); + for (int64_t q = 0; q < p.n_queries; ++q) + for (int64_t j = 0; j < p.k; ++j) { + int64_t id = nb_r[q * p.k + j]; + ASSERT_TRUE(id >= 0 && + std::binary_search(per_query[q].begin(), per_query[q].end(), (uint32_t)id)) + << "query " << q << " non-member id " << id; + } +} + +// selectivities chosen to hit the sparse / gather-GEMM / dense regimes at +// both a low and a high dimension (the sparse/mid threshold is +// dimension-dependent) +const std::vector inputs = { + // d=64: sparse (<=3%), mid, dense + {100000, 64, 17, 10, 0.002, cuvs::distance::DistanceType::InnerProduct}, + {100000, 64, 17, 10, 0.10, cuvs::distance::DistanceType::InnerProduct}, + {100000, 64, 17, 10, 0.60, cuvs::distance::DistanceType::InnerProduct}, + {100000, 64, 17, 10, 0.10, cuvs::distance::DistanceType::L2Expanded}, + {100000, 64, 17, 10, 0.002, cuvs::distance::DistanceType::L2Expanded}, + {100000, 64, 17, 10, 0.60, cuvs::distance::DistanceType::L2Expanded}, + {100000, 64, 17, 10, 0.10, cuvs::distance::DistanceType::CosineExpanded}, + // d=512: sparse (<=0.1%), mid, dense + {60000, 512, 9, 10, 0.0008, cuvs::distance::DistanceType::InnerProduct}, + {60000, 512, 9, 10, 0.05, cuvs::distance::DistanceType::L2Expanded}, + {60000, 512, 9, 10, 0.55, cuvs::distance::DistanceType::InnerProduct}, +}; + +INSTANTIATE_TEST_SUITE_P(RoaringFilter, RoaringFilterTest, ::testing::ValuesIn(inputs)); + +} // namespace +} // namespace cuvs::neighbors