Skip to content
54 changes: 35 additions & 19 deletions build.zig
Original file line number Diff line number Diff line change
Expand Up @@ -53,28 +53,44 @@ pub fn build(b: *std.Build) void {
const run_step = b.step("run", "Run codedb daemon");
run_step.dependOn(&run_cmd.step);

// ── Tests ──
// ── Tests (split into independent binaries for faster compilation) ──
const test_filter = b.option([]const u8, "test-filter", "Only run tests whose name contains this substring");
const tests = b.addTest(.{
.root_module = b.createModule(.{
.root_source_file = b.path("src/tests.zig"),
.target = target,
.optimize = optimize,
.link_libc = true,
}),
});
tests.root_module.addImport("mcp", mcp_dep.module("mcp"));
tests.root_module.addImport("nanoregex", nanoregex_dep.module("nanoregex"));
if (test_filter) |f| {
const filters = b.allocator.alloc([]const u8, 1) catch @panic("oom");
filters[0] = f;
tests.filters = filters;
const test_step = b.step("test", "Run all tests");

const test_files = [_]struct { name: []const u8, path: []const u8, needs_mcp: bool, needs_nanoregex: bool }{
.{ .name = "test-core", .path = "src/test_core.zig", .needs_mcp = false, .needs_nanoregex = false },
.{ .name = "test-explore", .path = "src/test_explore.zig", .needs_mcp = false, .needs_nanoregex = true },
.{ .name = "test-index", .path = "src/test_index.zig", .needs_mcp = true, .needs_nanoregex = true },
.{ .name = "test-parser", .path = "src/test_parser.zig", .needs_mcp = false, .needs_nanoregex = true },
.{ .name = "test-search", .path = "src/test_search.zig", .needs_mcp = true, .needs_nanoregex = true },
.{ .name = "test-snapshot", .path = "src/test_snapshot.zig", .needs_mcp = false, .needs_nanoregex = true },
.{ .name = "test-mcp", .path = "src/test_mcp.zig", .needs_mcp = true, .needs_nanoregex = true },
.{ .name = "test-query", .path = "src/test_query.zig", .needs_mcp = true, .needs_nanoregex = true },
};

for (test_files) |tf| {
const t = b.addTest(.{
.root_module = b.createModule(.{
.root_source_file = b.path(tf.path),
.target = target,
.optimize = optimize,
.link_libc = true,
}),
});
if (tf.needs_mcp) t.root_module.addImport("mcp", mcp_dep.module("mcp"));
if (tf.needs_nanoregex) t.root_module.addImport("nanoregex", nanoregex_dep.module("nanoregex"));
if (test_filter) |f| {
const filters = b.allocator.alloc([]const u8, 1) catch @panic("oom");
filters[0] = f;
t.filters = filters;
}
const run = b.addRunArtifact(t);
test_step.dependOn(&run.step);

const individual_step = b.step(tf.name, b.fmt("Run {s}", .{tf.name}));
individual_step.dependOn(&run.step);
}

const test_step = b.step("test", "Run tests");
const tests_run = b.addRunArtifact(tests);
test_step.dependOn(&tests_run.step);


// ── Library tests (verify the module root compiles) ──
const lib_tests = b.addTest(.{
Expand Down
Binary file modified codedb.snapshot
Binary file not shown.
9 changes: 9 additions & 0 deletions src/cio.zig
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,15 @@ extern "c" fn clock_gettime(id: c_int, ts: *std.c.timespec) c_int;
extern "c" fn pipe(fds: *[2]c_int) c_int;
extern "c" fn close(fd: c_int) c_int;

/// Set the process-wide disposition of SIGPIPE to "ignore".
///
/// Installs `SIG.IGN` as the SIGPIPE handler through `std.posix.sigaction`.
/// The previous disposition is not saved (the old-action argument is null),
/// so this is intended to be called once at startup.
/// NOTE(review): presumably this lets writes to a closed pipe/socket surface
/// as ordinary write errors instead of terminating the process — confirm
/// against the callers' error handling.
pub fn ignoreSigpipe() void {
    const act: std.posix.Sigaction = .{
        .handler = .{ .handler = std.posix.SIG.IGN },
        // `sigset_t` is a plain integer on some targets (e.g. macOS) and an
        // array of words on others (e.g. Linux). A literal `0` only
        // type-checks for the former; an all-zero value is the empty signal
        // set in both representations.
        .mask = std.mem.zeroes(std.posix.sigset_t),
        .flags = 0,
    };
    std.posix.sigaction(std.posix.SIG.PIPE, &act, null);
}

const CLOCK_REALTIME: c_int = 0;
const CLOCK_MONOTONIC: c_int = if (builtin.os.tag == .macos) 6 else 1;

Expand Down
111 changes: 80 additions & 31 deletions src/explore.zig
Original file line number Diff line number Diff line change
Expand Up @@ -181,6 +181,20 @@ pub const SearchResult = struct {
score: f32 = 0.0,
};

/// Per-call timing and counting telemetry for a tiered content search.
///
/// The search routine fills a local instance and stores it into
/// `Explorer.last_search_breakdown` when it returns. All `*_ns` fields are
/// differences between two `cio.nanoTimestamp()` readings taken around the
/// corresponding section; a field keeps its default of 0 when the search
/// returned before that section was timed.
pub const SearchBreakdown = struct {
/// Time spent in Tier 0 (exact word-index hits).
tier0_ns: i128 = 0,
/// Time spent in Tier 0.5 (prefix expansion over indexed keys).
tier05_ns: i128 = 0,
/// Time spent in Tier 1; the timer starts before the trigram candidate
/// lookup, so it covers both the lookup and the candidate scan.
tier1_ns: i128 = 0,
/// Time spent in Tier 2 (sparse n-gram candidates).
tier2_ns: i128 = 0,
/// Time spent in Tier 3 (skip-trigram files not already searched).
tier3_ns: i128 = 0,
/// Time spent in Tier 4 (word-index scan over files not yet searched).
tier4_ns: i128 = 0,
/// Time measured around the Tier 5 (full-scan fallback) section.
tier5_ns: i128 = 0,
/// Time spent in the final `rerankAndFinalize` call.
rerank_ns: i128 = 0,
/// Numeric code assigned by the search routine to identify the tier at
/// which the search finished. NOTE(review): the encoding is defined only by
/// the assignments inside the search routine — confirm there before relying
/// on specific values.
tier_reached: u8 = 0,
/// Number of paths returned by the trigram index candidate lookup
/// (0 when the lookup returned null or was never reached).
candidate_count: u32 = 0,
/// Number of results collected before the rerank step.
result_count: u32 = 0,
};

pub const DependencyGraph = struct {
forward: std.StringHashMap(std.ArrayList([]const u8)),
reverse: std.StringHashMap(std.StringHashMap(void)),
Expand Down Expand Up @@ -522,6 +536,7 @@ pub const Explorer = struct {
/// assert the short-circuit holds (issue: negative-query slow path).
/// Production code does not read this field.
search_tier5_count: u64 = 0,
last_search_breakdown: SearchBreakdown = .{},

pub const DEFAULT_CONTENT_CACHE_CAPACITY: u32 = 16384;

Expand Down Expand Up @@ -646,8 +661,8 @@ pub const Explorer = struct {

persistent_outline.path = stable_path;

const prior_content = self.contents.get(stable_path);
try self.contents.put(stable_path, content);
const prior_content: ?[]const u8 = null;

if (full_index) {
if (!self.word_index_complete) {
Expand Down Expand Up @@ -1520,6 +1535,9 @@ pub const Explorer = struct {

if (max_results == 0) return try allocator.alloc(SearchResult, 0);

var breakdown: SearchBreakdown = .{};
defer self.last_search_breakdown = breakdown;

var result_list: std.ArrayList(SearchResult) = .empty;
errdefer result_list.deinit(allocator);

Expand All @@ -1533,6 +1551,7 @@ pub const Explorer = struct {
// docs, and files with more exact word hits are considered first so
// popular identifiers and skip-trigram canonical files are not hidden
// behind earlier low-signal posting-list entries.
const t0_start = cio.nanoTimestamp();
const word_hits = self.word_index.search(query);
if (word_hits.len > 0) {
const Tier0File = struct {
Expand Down Expand Up @@ -1587,13 +1606,19 @@ pub const Explorer = struct {
searched.put(stats.path, {}) catch {};
try searchInContent(stats.path, ref.data, query, allocator, tier0_per_file_cap, max_results, &result_list);
}
if (result_list.items.len >= max_results)
return self.rerankAndFinalize(&result_list, query, allocator);
if (result_list.items.len >= max_results) {
breakdown.tier0_ns = cio.nanoTimestamp() - t0_start;
breakdown.tier_reached = 0;
breakdown.result_count = @intCast(result_list.items.len);
const t_rerank = cio.nanoTimestamp();
const res = self.rerankAndFinalize(&result_list, query, allocator);
breakdown.rerank_ns = cio.nanoTimestamp() - t_rerank;
return res;
}
}
breakdown.tier0_ns = cio.nanoTimestamp() - t0_start;

// Tier 0.5: prefix expansion — find all indexed keys that begin with the query.
// Activates when Tier 0 found nothing and query is ≥3 chars, catching partial
// identifier queries like "searchC" that match "searchContent" in the word index.
const t05_start = cio.nanoTimestamp();
if (result_list.items.len == 0 and query.len >= 3) {
const prefix_hits = try self.word_index.searchPrefix(query, allocator, max_results);
defer allocator.free(prefix_hits);
Expand All @@ -1616,14 +1641,23 @@ pub const Explorer = struct {
searched.put(hit_path, {}) catch {};
if (result_list.items.len >= max_results) break;
}
if (result_list.items.len >= max_results)
return self.rerankAndFinalize(&result_list, query, allocator);
if (result_list.items.len >= max_results) {
breakdown.tier05_ns = cio.nanoTimestamp() - t05_start;
breakdown.tier_reached = 1;
breakdown.result_count = @intCast(result_list.items.len);
const t_rerank = cio.nanoTimestamp();
const res = self.rerankAndFinalize(&result_list, query, allocator);
breakdown.rerank_ns = cio.nanoTimestamp() - t_rerank;
return res;
}
}
breakdown.tier05_ns = cio.nanoTimestamp() - t05_start;

const t1_start = cio.nanoTimestamp();
const candidate_paths = self.trigram_index.candidates(query, allocator);
defer if (candidate_paths) |cp| allocator.free(cp);
if (candidate_paths) |cp| breakdown.candidate_count = @intCast(cp.len);

// Tier 1: trigram candidates — fast path, skips files already found by Tier 0.
if (candidate_paths) |cp| {
if (cp.len > 0) {
// Issue #427: rank candidates by per-file word-index hit count
Expand Down Expand Up @@ -1662,18 +1696,25 @@ pub const Explorer = struct {
const ref = self.readContentForSearch(path, allocator) orelse continue;
defer ref.deinit();
try searchInContent(path, ref.data, query, allocator, max_per_file, max_results, &result_list);
if (result_list.items.len >= max_results)
return self.rerankAndFinalize(&result_list, query, allocator);
if (result_list.items.len >= max_results) {
breakdown.tier1_ns = cio.nanoTimestamp() - t1_start;
breakdown.tier_reached = 2;
breakdown.result_count = @intCast(result_list.items.len);
const t_rerank = cio.nanoTimestamp();
const res = self.rerankAndFinalize(&result_list, query, allocator);
breakdown.rerank_ns = cio.nanoTimestamp() - t_rerank;
return res;
}
}
}
}

// Mark all Tier 1 candidates as searched.
if (candidate_paths) |cp| {
for (cp) |p| searched.put(p, {}) catch {};
}
breakdown.tier1_ns = cio.nanoTimestamp() - t1_start;

// Tier 2: sparse candidates — LAZY, only computed when Tier 1 found nothing.
const t2_start = cio.nanoTimestamp();
if (result_list.items.len == 0) {
const sparse_paths = self.sparse_ngram_index.candidates(query, allocator);
defer if (sparse_paths) |sp| allocator.free(sp);
Expand All @@ -1688,8 +1729,9 @@ pub const Explorer = struct {
}
}
}
breakdown.tier2_ns = cio.nanoTimestamp() - t2_start;

// Tier 3: skip_trigram_files not already searched.
const t3_start = cio.nanoTimestamp();
if (result_list.items.len < max_results) {
var skip_iter = self.skip_trigram_files.keyIterator();
while (skip_iter.next()) |key_ptr| {
Expand All @@ -1701,8 +1743,9 @@ pub const Explorer = struct {
if (result_list.items.len >= max_results) break;
}
}
breakdown.tier3_ns = cio.nanoTimestamp() - t3_start;

// Tier 4: word index scan — for files not yet searched.
const t4_start = cio.nanoTimestamp();
if (result_list.items.len < max_results) {
const tier4_hits = self.word_index.search(query);
if (tier4_hits.len > 0) {
Expand All @@ -1720,21 +1763,9 @@ pub const Explorer = struct {
}
}
}
breakdown.tier4_ns = cio.nanoTimestamp() - t4_start;

// Tier 5: full scan fallback — only when NO results from any tier.
// Avoids 100ms+ scans on large repos when indices already found matches.
//
// Short-circuit Tier 5 whenever the trigram index was consulted with
// a query long enough to fully cover it (query.len >= 3). The trigram
// filter returns a SUPERSET of files containing the substring (every
// file containing the substring necessarily contains all its
// trigrams). If Tier 1 scanned that superset and found 0 results, no
// other trigram-indexed file can match either; skip_trigram_files
// were handled separately by Tier 3. Tier 5 would otherwise re-scan
// every indexed file for nothing — a measurable 2–3 ms p50 cost on
// queries whose constituent trigrams are common-but-not-co-occurring
// syllables (e.g. `Suspense` on a Rust corpus). The cp.len == 0
// sub-case of this was already short-circuited before this change.
const t5_start = cio.nanoTimestamp();
const trigram_ruled_out = if (candidate_paths) |_|
(query.len >= 3)
else
Expand All @@ -1750,7 +1781,23 @@ pub const Explorer = struct {
if (result_list.items.len >= max_results) break;
}
}
return self.rerankAndFinalize(&result_list, query, allocator);
breakdown.tier5_ns = cio.nanoTimestamp() - t5_start;

if (result_list.items.len > 0) {
breakdown.tier_reached = if (breakdown.tier5_ns > 0 and result_list.items.len > 0) 7
else if (breakdown.tier4_ns > 0 and result_list.items.len > 0) 6
Comment on lines +1787 to +1788
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 Badge Derive tier_reached from actual executed search tier

tier_reached is computed from tier5_ns > 0, but tier5_ns is always positive because timing is recorded even when Tier 5 is skipped. Any non-early-return search with results therefore reports tier 7, even if matches were found in earlier tiers, making the new per-tier telemetry misleading for performance/debugging decisions.

Useful? React with 👍 / 👎.

else if (breakdown.tier3_ns > 0) 5
else if (breakdown.tier2_ns > 0) 4
else if (breakdown.tier1_ns > 0) 3
else if (breakdown.tier05_ns > 0) 1
else 0;
}
breakdown.result_count = @intCast(result_list.items.len);

const t_rerank = cio.nanoTimestamp();
const res = self.rerankAndFinalize(&result_list, query, allocator);
breakdown.rerank_ns = cio.nanoTimestamp() - t_rerank;
return res;
}

/// Run the multi-signal rerank in place, then transfer ownership of
Expand All @@ -1770,7 +1817,9 @@ pub const Explorer = struct {
if (result_list.items.len > 1) {
std.sort.block(SearchResult, result_list.items, {}, struct {
pub fn lessThan(_: void, a: SearchResult, b: SearchResult) bool {
if (a.score != b.score) return a.score > b.score;
const sa = if (a.score == a.score) a.score else 0;
const sb = if (b.score == b.score) b.score else 0;
if (sa != sb) return sa > sb;
const ord = std.mem.order(u8, a.path, b.path);
if (ord != .eq) return ord == .lt;
return a.line_num < b.line_num;
Expand Down
3 changes: 2 additions & 1 deletion src/index.zig
Original file line number Diff line number Diff line change
Expand Up @@ -351,7 +351,8 @@ pub const WordIndex = struct {
pub fn avgDocLength(self: *const WordIndex) f32 {
const n = self.doc_lengths.count();
if (n == 0) return 1.0;
return @as(f32, @floatFromInt(self.total_tokens)) / @as(f32, @floatFromInt(n));
const avg = @as(f32, @floatFromInt(self.total_tokens)) / @as(f32, @floatFromInt(n));
return if (avg > 0) avg else 1.0;
}

/// Shrink all hit lists and per-file word sets to release excess capacity.
Expand Down
3 changes: 2 additions & 1 deletion src/main.zig
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,7 @@ fn mainInner() void {
fn mainImpl() !void {
// Use c_allocator (libc malloc) — better page reclamation than GPA
const allocator = std.heap.c_allocator;
cio.ignoreSigpipe();

// 0.16: single Threaded I/O instance passed down through every subsystem
// that touches fs/subprocess. See issue #282. `io` flows into mcp.run,
Expand Down Expand Up @@ -1048,7 +1049,7 @@ fn mainImpl() !void {

std.log.info("codedb mcp: root={s} files={d} data={s} scan={s}", .{ abs_root, store.currentSeq(), data_dir, mcp_server.getScanState().name() });

mcp_server.run(io, allocator, &store, &explorer, &agents, abs_root, cfg.max_cached, &telem, maybe_deferred);
mcp_server.run(io, allocator, &store, &explorer, &agents, abs_root, cfg.max_cached, &telem, maybe_deferred, &shutdown);

shutdown.store(true, .release);
if (scan_thread) |st| st.join();
Expand Down
Loading
Loading