Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
66 changes: 62 additions & 4 deletions cpp/src/neighbors/detail/cagra/cagra_helpers.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -84,14 +84,60 @@ std::tuple<size_t, size_t, size_t, size_t> optimize_workspace_size(size_t n_rows
}
size_t combine_dev = combine_dev_fixed;

size_t total_host = mst_host + combine_host;
size_t debug_host_size = 0;
if (raft::default_logger().should_log(rapids_logger::level_enum::debug)) {
debug_host_size = n_rows * graph_degree * sizeof(uint32_t) // host_copy_output_graph
+ n_rows * sizeof(uint32_t) // in_edge_count
+ graph_degree * sizeof(uint32_t); // hist
}

size_t total_host = mst_host + combine_host + debug_host_size;
size_t total_host_fixed = mst_host_fixed + combine_host_fixed;
size_t total_dev = std::max(prune_dev, rev_dev + combine_dev);
size_t total_dev_fixed = std::max(prune_dev_fixed, combine_dev_fixed);

return std::make_tuple(total_host, total_dev, total_host_fixed, total_dev_fixed);
}
Comment thread
huuanhhuyn marked this conversation as resolved.

/**
 * @brief Estimate the device memory, in bytes, used while extending an IVF-PQ index.
 *
 * The result is the per-batch workspace plus the larger of two list allocations
 * (placeholder lists vs. resized lists), each sized with a worst-case padding term
 * proportional to the number of clusters.
 *
 * @param dataset    dataset extents; extent(0) is read as the row count and extent(1) as the
 *                   dimensionality
 * @param params     IVF-PQ parameters; `build_params.pq_dim`, `build_params.pq_bits` and
 *                   `build_params.n_lists` are read
 * @param dtype_size size in bytes of one dataset element
 * @return estimated number of device bytes
 */
inline size_t ivf_pq_extend_mem_usage(raft::matrix_extent<int64_t> dataset,
                                      cuvs::neighbors::graph_build_params::ivf_pq_params params,
                                      size_t dtype_size)
{
  constexpr size_t kReasonableMaxBatchSize = 65536;
  constexpr size_t kSpecAlignMax           = 1024;
  constexpr size_t kBitsPerByte            = 8;

  const size_t n_rows     = dataset.extent(0);
  const size_t dim        = dataset.extent(1);
  const size_t pq_dim     = params.build_params.pq_dim;
  const size_t pq_bits    = params.build_params.pq_bits;
  const size_t n_clusters = params.build_params.n_lists;

  // Validate pq_dim *before* it is used as the rounding modulus below; rounding to a multiple
  // of zero is not meaningful and must not be attempted.
  RAFT_EXPECTS(pq_dim > 0, "pq_dim should not be 0");
  const size_t rot_dim = raft::round_up_safe<size_t>(dim, pq_dim);

  const size_t max_batch_size = std::min<size_t>(n_rows, kReasonableMaxBatchSize);
  const size_t workspace_size =
    max_batch_size * dim * dtype_size          // vec_batches
    + max_batch_size * rot_dim * sizeof(float) // new_vectors_residual
    + max_batch_size * dim * sizeof(float);    // flat_compute_residuals_tmp

  // Each row contains pq codes and index.
  // Round the code size up to whole bytes: truncating would under-estimate the footprint
  // whenever pq_dim * pq_bits is not a multiple of 8.
  const size_t code_bytes_per_vec = (pq_dim * pq_bits + kBitsPerByte - 1) / kBitsPerByte;
  const size_t bytes_per_row      = code_bytes_per_vec + sizeof(uint32_t);

  // Estimate the "worst-case" for the number of placeholder rows and resize rows.
  // The worst-case (i.e. max) happens for INTERLEAVED (as opposed to FLAT) and when each row
  // wastes n_cluster * alignment_size.
  const size_t n_rows_placeholder  = n_rows + ivf_pq::kIndexGroupSize * n_clusters;
  const size_t placeholder_dev     = n_rows_placeholder * bytes_per_row;
  const size_t n_rows_resize_lists = n_rows + kSpecAlignMax * n_clusters;
  const size_t resize_lists_dev    = n_rows_resize_lists * bytes_per_row;

  // Placeholder freed before resize_list, so only the larger of the two is live at once.
  const size_t device_size = std::max(placeholder_dev, resize_lists_dev);

  return device_size + workspace_size;
}
Comment thread
huuanhhuyn marked this conversation as resolved.

// All sizes are in bytes
inline std::pair<size_t, size_t> ivf_pq_build_mem_usage(
raft::resources const& res,
Expand All @@ -100,7 +146,8 @@ inline std::pair<size_t, size_t> ivf_pq_build_mem_usage(
cuvs::neighbors::graph_build_params::ivf_pq_params params,
size_t graph_degree,
size_t intermediate_graph_degree,
bool guarantee_connectivity)
bool guarantee_connectivity,
bool attach_dataset_on_build)
{
size_t dtype_size = cuda_data_type_size(dtype);
bool input_is_float = (dtype == CUDA_R_32F);
Expand All @@ -125,6 +172,10 @@ inline std::pair<size_t, size_t> ivf_pq_build_mem_usage(
params.build_params.n_lists));
size_t kmeans_n_rows = n_rows / kmeans_trainset_ratio;
size_t kmeans_gpu_mem = kmeans_n_rows * dim * sizeof(float);
if (dtype != CUDA_R_32F) {
// kmeans trainset tmp allocation
kmeans_gpu_mem += kmeans_n_rows * dim * dtype_size;
}

// For non-float input, ivf_pq::build first samples into a temporary trainset of type T
if (!input_is_float) { kmeans_gpu_mem += kmeans_n_rows * dim * dtype_size; }
Expand All @@ -137,6 +188,12 @@ inline std::pair<size_t, size_t> ivf_pq_build_mem_usage(
size_t kmeans_pinned_host = 2 * pinned_rows * dim * dtype_size; // two staging double-buffers
size_t kmeans_host_mem = kmeans_indices_host + kmeans_pinned_host;

// Extend phase
size_t extend_gpu_mem = params.build_params.add_data_on_build ? ivf_pq_extend_mem_usage(dataset, params, dtype_size) : 0;

// Add graph to index on GPU
size_t attach_graph_gpu_mem = attach_dataset_on_build ? n_rows * graph_degree * sizeof(uint32_t) : 0;

// Search phase (build_knn_graph):
constexpr size_t kWorkspaceRatio = 5;
size_t top_k = intermediate_graph_degree + 1;
Expand All @@ -155,7 +212,7 @@ inline std::pair<size_t, size_t> ivf_pq_build_mem_usage(
+ (sizeof(float) + sizeof(int64_t)) * top_k); // refined_*

// Phases run sequentially (train/extend -> search -> optimize)
size_t total_dev = std::max({kmeans_gpu_mem, search_phase_dev, gpu_workspace_size});
size_t total_dev = std::max({kmeans_gpu_mem, extend_gpu_mem, attach_graph_gpu_mem, search_phase_dev, gpu_workspace_size});

// The graph (and its optimize workspace) stays resident across phases
size_t total_host =
Expand Down Expand Up @@ -209,7 +266,8 @@ std::pair<size_t, size_t> cagra_build_mem_usage(raft::resources const& res,
pq_params,
cparams.graph_degree,
cparams.intermediate_graph_degree,
cparams.guarantee_connectivity);
cparams.guarantee_connectivity,
cparams.attach_dataset_on_build);
} else if (std::holds_alternative<graph_build_params::nn_descent_params>(
cparams.graph_build_params)) {
RAFT_LOG_INFO("Considering CAGRA in memory build with NN-descent");
Expand Down
Loading