Skip to content
Open
49 changes: 38 additions & 11 deletions python/cuvs_bench/cuvs_bench/generate_groundtruth/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
import warnings

from .utils import (
add_jitter,
groundtruth_neighbors_filename,
memmap_bin_file,
offset_neighbor_indices,
Expand Down Expand Up @@ -111,6 +112,22 @@ def choose_random_queries(dataset, n_queries):
return dataset[query_idx, :]


def choose_random_queries_with_jitter(dataset, n_queries, seed=12345):
    """Draw distinct rows from ``dataset`` and perturb them with noise.

    ``n_queries`` row indices are sampled without replacement using a
    generator seeded with ``seed``. The selected rows are copied to
    float32 and passed to ``add_jitter`` (without re-normalization),
    which adds Gaussian noise at scale ``0.1 * std(sample)``.

    Parameters
    ----------
    dataset : array-like with ``shape`` and 2-D fancy indexing
        Source vectors, one per row.
    n_queries : int
        Number of rows to sample.
    seed : int, optional
        Seed for the random generator used for both the row selection
        and the jitter.

    Returns
    -------
    The jittered query vectors as returned by ``add_jitter``.
    """
    import numpy as _np

    print("Choosing random vectors from dataset and jittering with noise")
    generator = _np.random.default_rng(seed)
    picked_rows = generator.choice(
        dataset.shape[0], size=n_queries, replace=False
    )
    # Ascending order turns the memmap read into a sequential pass
    # instead of random access.
    ordered_rows = _np.sort(picked_rows)
    base_vectors = dataset[ordered_rows, :].astype(_np.float32, copy=True)
    return add_jitter(base_vectors, generator, normalize=False)

Comment thread
jinsolp marked this conversation as resolved.

def cpu_search(dataset, queries, k, metric="squeclidean"):
"""
Find the k nearest neighbors for each query point in the dataset using the
Expand Down Expand Up @@ -235,18 +252,22 @@ def main():
"The input and output files are in big-ann-benchmark's binary format.",
epilog="""Example usage
# With existing query file
python -m cuvs_bench.generate_groundtruth --dataset /dataset/base.\
fbin --output=groundtruth_dir --queries=/dataset/query.public.10K.fbin
python -m cuvs_bench.generate_groundtruth /dataset/base.fbin \
--output=groundtruth_dir --queries=/dataset/query.public.10K.fbin

# With randomly generated queries
python -m cuvs_bench.generate_groundtruth --dataset /dataset/base.\
fbin --output=groundtruth_dir --queries=random --n_queries=10000
python -m cuvs_bench.generate_groundtruth /dataset/base.fbin \
--output=groundtruth_dir --queries=random --n_queries=10000

# Using only a subset of the dataset. Define queries by randomly
# selecting vectors from the (subset of the) dataset.
python -m cuvs_bench.generate_groundtruth --dataset /dataset/base.\
fbin --nrows=2000000 --cols=128 --output=groundtruth_dir \
python -m cuvs_bench.generate_groundtruth /dataset/base.fbin \
--rows=2000000 --cols=128 --output=groundtruth_dir \
--queries=random-choice --n_queries=10000

# Jittered queries (following the logic of cuvs_bench.synthesize_dataset)
python -m cuvs_bench.generate_groundtruth /dataset/base.fbin \
--output=groundtruth_dir --queries=random-jitter --n_queries=10000
""",
formatter_class=argparse.RawDescriptionHelpFormatter,
)
Expand All @@ -256,9 +277,11 @@ def main():
"--queries",
type=str,
default="random",
help="Queries file name, or one of 'random-choice' or 'random' "
"(default). 'random-choice': select n_queries vectors from the input "
"dataset. 'random': generate n_queries as uniform random numbers.",
help="Queries file name, or one of 'random-choice', 'random-jitter', "
"or 'random' (default). 'random-choice': select n_queries vectors "
"from the input dataset. 'random-jitter': same as 'random-choice', "
"but add std-relative Gaussian noise to each query. 'random': generate "
"n_queries as uniform random numbers.",
)
parser.add_argument(
"--output",
Expand Down Expand Up @@ -341,7 +364,7 @@ def main():
if len(args.output) > 0:
os.makedirs(args.output, exist_ok=True)

if args.queries == "random" or args.queries == "random-choice":
if args.queries in {"random", "random-choice", "random-jitter"}:
if args.n_queries is None:
raise RuntimeError(
"n_queries must be given to generate random queries"
Expand All @@ -352,9 +375,13 @@ def main():
)
elif args.queries == "random-choice":
queries = choose_random_queries(dataset, args.n_queries)
elif args.queries == "random-jitter":
queries = choose_random_queries_with_jitter(
dataset, args.n_queries
)

queries_filename = os.path.join(
args.output, "queries" + suffix_from_dtype(dtype)
args.output, "queries" + suffix_from_dtype(queries.dtype)
)
print("Writing queries file", queries_filename)
write_bin(queries_filename, queries)
Expand Down
16 changes: 16 additions & 0 deletions python/cuvs_bench/cuvs_bench/generate_groundtruth/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,22 @@
from cuvs_bench._bin_format import read_bin_header, write_bin_header


def add_jitter(
    queries: np.ndarray,
    rng: np.random.Generator,
    normalize: bool,
    *,
    noise_ratio: float = 0.1,
) -> np.ndarray:
    """Add Gaussian jitter to query vectors and optionally re-normalize.

    The noise is zero-mean Gaussian with standard deviation
    ``noise_ratio * std(queries)``, where the standard deviation is taken
    over all elements of ``queries``. The input array is not modified.

    Parameters
    ----------
    queries : np.ndarray
        Query vectors, one per row.
    rng : np.random.Generator
        Generator the noise is drawn from.
    normalize : bool
        If True, divide each jittered row by its L2 norm (norms are
        clamped to at least ``1e-8`` to avoid division by zero).
    noise_ratio : float, optional
        Noise standard deviation as a fraction of ``std(queries)``.
        Defaults to ``0.1``.

    Returns
    -------
    np.ndarray
        A new float32 array with the same shape as ``queries``.
    """
    if queries.size == 0:
        # np.std of an empty array is NaN and emits a RuntimeWarning;
        # there is nothing to perturb, so return an empty float32 array.
        return queries.astype(np.float32)

    noise_scale = float(np.std(queries)) * noise_ratio
    # Generator.normal yields float64; cast so float32 input stays float32.
    noise = rng.normal(0, noise_scale, queries.shape).astype(np.float32)
    jittered = queries + noise
    if normalize:
        norms = np.linalg.norm(jittered, axis=1, keepdims=True)
        jittered = jittered / np.maximum(norms, 1e-8)
    return jittered.astype(np.float32)


def dtype_from_filename(filename):
ext = os.path.splitext(filename)[1]
if ext == ".fbin":
Expand Down
Loading
Loading