Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
50 changes: 47 additions & 3 deletions src/chunker.rs
Original file line number Diff line number Diff line change
Expand Up @@ -296,12 +296,21 @@ pub fn smart_chunk(content: &str, target_tokens: usize, overlap_pct: usize) -> V
});
}

// Move start forward, applying overlap
// Move start forward, applying overlap.
if cut_offset >= content.len() {
break;
}
start_offset = if overlap_chars > 0 && cut_offset > overlap_chars {
(cut_offset - overlap_chars).max(start_offset + 1)
// Only step back by the overlap window when the chunk we just emitted is
// LARGER than that window. If the chunk is at or below overlap size,
// `cut_offset - overlap_chars` lands before this chunk began; the old
// `.max(start_offset + 1)` guard then advanced the start by a single
// character, re-selected the same nearby high-score break point, and
// crawled forward one char at a time — emitting hundreds of near-duplicate
// sub-chunks per file (observed: a 4.5k-word note shattered into 900+
// empty-heading chunks). Advancing fully to the cut guarantees real
// forward progress and eliminates the crawl.
start_offset = if overlap_chars > 0 && cut_offset > start_offset + overlap_chars {
cut_offset - overlap_chars
} else {
cut_offset
};
Expand Down Expand Up @@ -696,6 +705,41 @@ mod tests {
}
}

#[test]
fn test_smart_chunk_no_overlap_crawl() {
    // Regression test for the overlap-vs-stride crawl: when a high-score break
    // (a heading) lands within the overlap window of a chunk start, the old
    // advance logic was reported to step the start forward one character at a
    // time, re-selecting the same break and emitting hundreds of near-duplicate
    // sub-chunks. A document made of many short headed sections must produce a
    // bounded chunk count with no degenerate micro-chunks.
    const SECTIONS: usize = 40;

    // Fixture: a title followed by SECTIONS short headed paragraphs, roughly
    // 7.5k characters in total. The trailing `\` continuations join the literal
    // into a single line of prose per section.
    let mut content = String::from("# Title\n\n");
    content.extend((0..SECTIONS).map(|i| {
        format!(
            "## Section {i}\nA moderate paragraph of prose for section {i} that carries \
             real content but is shorter than the target chunk size, so the following \
             heading falls inside the overlap window.\n\n"
        )
    }));

    let chunks = smart_chunk(&content, 512, 15);

    // Guard against a vacuous pass: with no chunks at all, the upper bound and
    // the per-chunk loop below would both succeed without checking anything.
    assert!(
        !chunks.is_empty(),
        "expected at least one chunk for non-empty content"
    );

    // A crawl yields far more chunks than there are sections, so the section
    // count is a generous upper bound for healthy output.
    assert!(
        chunks.len() < SECTIONS,
        "overlap-crawl regression: expected a bounded chunk count, got {}",
        chunks.len()
    );

    // Every emitted chunk must carry real content rather than a tiny fragment.
    for c in &chunks {
        assert!(
            c.text.len() > 20,
            "degenerate micro-chunk produced ({} chars): {:?}",
            c.text.len(),
            c.text
        );
    }
}

#[test]
fn test_smart_chunk_empty() {
let chunks = smart_chunk("", 512, 15);
Expand Down
Loading