From ec7b06b972065902a901a20394528b22fe091bdd Mon Sep 17 00:00:00 2001 From: jdubdevs Date: Mon, 1 Jun 2026 16:42:52 -0700 Subject: [PATCH] fix(chunker): stop overlap-stride crawl that shattered long-prose files MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit smart_chunk applied the overlap window even when the emitted chunk was smaller than the overlap. cut_offset - overlap_chars then landed before the chunk's own start, and the .max(start_offset + 1) guard advanced the start by a single character — re-selecting the same nearby high-score break point and crawling forward one char at a time. Long, heading-dense prose files were shattered into hundreds of near-duplicate empty-heading micro-chunks (a 4.5k-word note produced 923 chunks; 907 of them 1-char-offset shrapnel), which (a) made the file unretrievable — its signal split below threshold so it never entered the candidate set — and (b) bloated the index ~10x (451 files held 91% of all chunks). Fix: only step back by the overlap window when the chunk is larger than it; otherwise advance fully to the cut. Guarantees forward progress, no crawl. Validated: the note re-chunks 923 -> 28; all chunks vectorize; a unique-phrase search returns it at rank 1. Adds test_smart_chunk_no_overlap_crawl regression. Co-Authored-By: Claude Opus 4.8 (1M context) --- src/chunker.rs | 50 +++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 47 insertions(+), 3 deletions(-) diff --git a/src/chunker.rs b/src/chunker.rs index 833b7e8..c34df4c 100644 --- a/src/chunker.rs +++ b/src/chunker.rs @@ -296,12 +296,21 @@ pub fn smart_chunk(content: &str, target_tokens: usize, overlap_pct: usize) -> V }); } - // Move start forward, applying overlap + // Move start forward, applying overlap. 
if cut_offset >= content.len() { break; } - start_offset = if overlap_chars > 0 && cut_offset > overlap_chars { - (cut_offset - overlap_chars).max(start_offset + 1) + // Only step back by the overlap window when the chunk we just emitted is + // LARGER than that window. If the chunk is at or below overlap size, + // `cut_offset - overlap_chars` lands before this chunk began; the old + // `.max(start_offset + 1)` guard then advanced the start by a single + // character, re-selected the same nearby high-score break point, and + // crawled forward one char at a time — emitting hundreds of near-duplicate + // sub-chunks per file (observed: a 4.5k-word note shattered into 900+ + // empty-heading chunks). Advancing fully to the cut guarantees real + // forward progress and eliminates the crawl. + start_offset = if overlap_chars > 0 && cut_offset > start_offset + overlap_chars { + cut_offset - overlap_chars } else { cut_offset }; @@ -696,6 +705,41 @@ mod tests { } } + #[test] + fn test_smart_chunk_no_overlap_crawl() { + // Regression for the overlap-vs-stride crawl: when a high-score break + // (heading) lands within the overlap window of a chunk start, the old + // advance logic stepped the start forward one character at a time, + // re-selecting the same break and emitting hundreds of near-duplicate + // sub-chunks. A doc of many short headed sections must still produce a + // bounded, sane chunk count with no degenerate micro-chunks. + let mut content = String::new(); + content.push_str("# Title\n\n"); + for i in 0..40 { + content.push_str(&format!( + "## Section {i}\nA moderate paragraph of prose for section {i} that carries \ + real content but is shorter than the target chunk size, so the following \ + heading falls inside the overlap window.\n\n" + )); + } + let chunks = smart_chunk(&content, 512, 15); + // ~7.5k chars at a 2048-char target → a healthy handful of chunks, never hundreds.
+ assert!( + chunks.len() < 40, + "overlap-crawl regression: expected a bounded chunk count, got {}", + chunks.len() + ); + // The crawl produced ~1-token fragments; real chunks are substantial. + for c in &chunks { + assert!( + c.text.len() > 20, + "degenerate micro-chunk produced ({} chars): {:?}", + c.text.len(), + c.text + ); + } + } + #[test] fn test_smart_chunk_empty() { let chunks = smart_chunk("", 512, 15);