From 1fff451b2279642f926b6509205ad062a51c01a8 Mon Sep 17 00:00:00 2001 From: Jacob Fu <141651335+FuJacob@users.noreply.github.com> Date: Wed, 10 Jun 2026 21:40:59 -0700 Subject: [PATCH 1/3] fix(acceptance): teach phrase boundaries and decode stop CJK punctuation A Japanese phrase accept arrived as one giant chunk: phrase mode only knew the ASCII terminators (. ! ?), so the ideographic full stop never ended a phrase and the clause comma was no boundary at all, and the same ASCII-only assumption in SentenceBoundaryClassifier meant the decode stop policy never fired for CJK text, so generations always ran to the token budget. A flat Japanese tail also had a punctuation cliff: a chunk that starts with CJK punctuation skips ICU word segmentation (punctuation does not begin a space-less-script word) and swallowed everything up to the next whitespace in a single accept, in word mode too. Phrase boundaries now include the CJK sentence terminators and treat the ideographic and fullwidth commas as clause stops, so Tab advances clause by clause the way Japanese prose reads. Word chunking binds a trailing CJK punctuation run to the word it follows and peels a punctuation-led run as its own chunk, removing the cliff. The classifier recognizes the CJK terminators (which, unlike the ASCII period, are unambiguous) and the CJK closing brackets for its walk-back, so generation stops at the end of a Japanese sentence like it does for English. All added codepoints occur only in CJK text and ASCII "," stays a non-boundary, so space-delimited scripts are byte-for-byte unchanged. 
--- Cotabby/Models/SuggestionEngineModels.swift | 4 +- .../Support/SentenceBoundaryClassifier.swift | 15 ++- .../Support/SuggestionSessionReconciler.swift | 101 ++++++++++++++---- .../SentenceBoundaryClassifierTests.swift | 19 ++++ .../SuggestionSessionReconcilerTests.swift | 56 ++++++++++ 5 files changed, 172 insertions(+), 23 deletions(-) diff --git a/Cotabby/Models/SuggestionEngineModels.swift b/Cotabby/Models/SuggestionEngineModels.swift index 2f3cf42f..d8fe42a7 100644 --- a/Cotabby/Models/SuggestionEngineModels.swift +++ b/Cotabby/Models/SuggestionEngineModels.swift @@ -71,7 +71,9 @@ struct DisabledApplicationRule: Codable, Equatable, Identifiable, Sendable { enum AcceptanceGranularity: String, CaseIterable, Codable, Sendable { /// One word (with the existing trailing-punctuation policy applied per chunk). case word - /// Words accumulated until a sentence terminator (`.`, `!`, `?`, `\n`) or the tail runs out. + /// Words accumulated until a phrase boundary or the tail runs out: a sentence terminator + /// (`.`, `!`, `?`, CJK `。！？｡`, `\n`) or a CJK clause comma (`、，`), so space-less scripts + /// advance clause by clause instead of a whole sentence per press. case phrase } diff --git a/Cotabby/Support/SentenceBoundaryClassifier.swift b/Cotabby/Support/SentenceBoundaryClassifier.swift index 7437f46d..6126d4dc 100644 --- a/Cotabby/Support/SentenceBoundaryClassifier.swift +++ b/Cotabby/Support/SentenceBoundaryClassifier.swift @@ -39,6 +39,13 @@ enum SentenceBoundaryClassifier { switch text[lastIndex] { case "!", "?": return true + // CJK sentence terminators: the ideographic full stop, fullwidth `！` `？`, and the halfwidth + // ideographic stop. Unlike the ASCII period these are unambiguous (they never mark decimals, + // list numbers, or abbreviations), so they are terminal without classifier disambiguation. 
+ // Without these a Japanese completion never registers a sentence end and generation always + // runs to the token budget, which is why CJK suggestions came out so long. + case "\u{3002}", "\u{FF01}", "\u{FF1F}", "\u{FF61}": + return true case ".": return isTerminalPeriod(in: text, at: lastIndex) default: @@ -97,10 +104,14 @@ enum SentenceBoundaryClassifier { private extension Character { /// Closing punctuation that may follow a sentence terminator: straight and curly quotes, - /// parentheses, square brackets, and braces. `endsSentence` walks back past a run of these to find - /// the real terminator underneath, so `"done."` and `(stop!)` register as sentence ends. + /// parentheses, square brackets, and braces, plus the CJK closers (corner brackets, fullwidth + /// parenthesis, lenticular and angle brackets). `endsSentence` walks back past a run of these to + /// find the real terminator underneath, so `"done."`, `(stop!)`, and `終わり。」` register as + /// sentence ends. var isSentenceClosingPunctuation: Bool { self == "\"" || self == "'" || self == ")" || self == "]" || self == "}" || self == "\u{201D}" || self == "\u{2019}" + || self == "\u{300D}" || self == "\u{300F}" || self == "\u{FF09}" + || self == "\u{3011}" || self == "\u{3009}" || self == "\u{300B}" } } diff --git a/Cotabby/Support/SuggestionSessionReconciler.swift b/Cotabby/Support/SuggestionSessionReconciler.swift index 8ad25aca..1cb395b1 100644 --- a/Cotabby/Support/SuggestionSessionReconciler.swift +++ b/Cotabby/Support/SuggestionSessionReconciler.swift @@ -255,7 +255,16 @@ enum SuggestionSessionReconciler { if tokenStart < index, remainingText[tokenStart].beginsSpacelessScriptWord, let wordEnd = firstSegmentedWordEnd(in: remainingText, from: tokenStart, notPast: index) { - index = wordEnd + // Bind an immediately following CJK punctuation run to the word so one Tab accepts + // "読み、" as a unit. 
Without this the punctuation would lead the *next* token, and a + // punctuation-led token skips ICU segmentation entirely, so in flat text it would swallow + // everything up to the next whitespace in a single accept. + index = endOfCJKPunctuationRun(in: remainingText, from: wordEnd, notPast: index) + } else if tokenStart < index, remainingText[tokenStart].bindsToPrecedingSpacelessWord { + // A token can still begin with CJK punctuation when the previous chunk ended exactly at + // the word (a typed-through advance, or a pre-fix session). Peel the punctuation run as + // its own chunk instead of falling through to the whitespace scan's whole-run cliff. + index = endOfCJKPunctuationRun(in: remainingText, from: tokenStart, notPast: index) } if !autoAcceptTrailingPunctuation, @@ -286,10 +295,27 @@ enum SuggestionSessionReconciler { return min(wordEnd, limit) } - /// Accepts a full phrase up to the next sentence terminator (`.`, `!`, `?`, `\n`) or the end - /// of the buffered suggestion tail. Composes over `nextAcceptanceChunk` so word-boundary, - /// internal-punctuation, and leading-whitespace policy stay identical across the seams of a - /// multi-word accept. + /// The index just past the contiguous run of word-binding CJK punctuation starting at `start`, + /// clamped to `limit`. Returns `start` unchanged when the character there is not such punctuation, + /// so the word-binding call site degrades to "no extension". + private static func endOfCJKPunctuationRun( + in text: String, + from start: String.Index, + notPast limit: String.Index + ) -> String.Index { + var cursor = start + while cursor < limit, text[cursor].bindsToPrecedingSpacelessWord { + cursor = text.index(after: cursor) + } + return cursor + } + + /// Accepts a full phrase up to the next phrase boundary or the end of the buffered suggestion + /// tail. 
Boundaries are sentence terminators (`.`, `!`, `?`, their CJK forms `。！？｡`, `\n`) + /// and the CJK clause commas (`、，`), so Japanese/Chinese phrase accepts advance clause by + /// clause instead of swallowing a whole space-less sentence in one Tab. Composes over + /// `nextAcceptanceChunk` so word-boundary, internal-punctuation, and leading-whitespace policy + /// stay identical across the seams of a multi-word accept. /// /// Newlines need an extra rule: `nextAcceptanceChunk` returns leading whitespace as part of /// the next chunk, so a tail like `Hello\nworld` would surface `\n` as the leading character @@ -339,7 +365,7 @@ enum SuggestionSessionReconciler { accumulated += chunk working = String(working.dropFirst(chunk.count)) - if endsInSentenceTerminator(accumulated) { + if endsAtPhraseBoundary(accumulated) { return accumulated } } @@ -347,11 +373,12 @@ enum SuggestionSessionReconciler { return accumulated } - /// Tail-end check for sentence terminators that survives closing quotes and brackets, so - /// `"done."` and `(yes!)` are recognized as phrase ends even though their final character is - /// a closer rather than `.!?`. Walks back past any run of closing punctuation, then checks - /// whether the character immediately before that run is a sentence terminator. - private static func endsInSentenceTerminator(_ text: String) -> Bool { + /// Tail-end check for phrase boundaries that survives closing quotes and brackets, so + /// `"done."`, `(yes!)`, and `終わり。」` are recognized as phrase ends even though their final + /// character is a closer rather than the terminator itself. Walks back past any run of closing + /// punctuation, then checks whether the character immediately before that run ends a sentence or + /// a CJK clause. 
+ private static func endsAtPhraseBoundary(_ text: String) -> Bool { var index = text.endIndex while index > text.startIndex { let prev = text.index(before: index) @@ -365,12 +392,20 @@ enum SuggestionSessionReconciler { return false } let prev = text.index(before: index) + // The ideographic / fullwidth comma marks a clause boundary in CJK prose. Space-less scripts + // have no whitespace rhythm, so without this stop a Japanese phrase accept swallows an entire + // sentence in one Tab; with it, Tab advances clause by clause. ASCII "," is deliberately NOT + // a boundary, so English phrase cadence is unchanged. + if text[prev].isPhraseClauseBoundary { + return true + } guard text[prev].isPhraseSentenceTerminator else { return false } - // `!` and `?` always end a sentence. A period is ambiguous: decimals, list/ordinal numbers, - // single-letter initials, and common abbreviations are not sentence ends, so consult the - // classifier rather than treating every "." as terminal. + // `!`/`?` and the CJK terminators always end a sentence. An ASCII period is ambiguous: + // decimals, list/ordinal numbers, single-letter initials, and common abbreviations are not + // sentence ends, so consult the classifier rather than treating every "." as terminal. The + // ideographic `。` has no such ambiguity (it never marks decimals or abbreviations). if text[prev] == "." { return SentenceBoundaryClassifier.isTerminalPeriod(in: text, at: prev) } @@ -544,19 +579,45 @@ private extension Character { isLetter || isNumber } - /// Sentence-ending punctuation for phrase mode. `\n` is handled separately because it can - /// appear inside a leading-whitespace prefix of a composed chunk rather than at the chunk's - /// tail end. + /// Sentence-ending punctuation for phrase mode, in both ASCII and CJK forms: `.` `!` `?` plus the + /// ideographic full stop `。`, fullwidth `!` `?`, and the halfwidth ideographic stop `。`. 
`\n` is + /// handled separately because it can appear inside a leading-whitespace prefix of a composed chunk + /// rather than at the chunk's tail end. var isPhraseSentenceTerminator: Bool { self == "." || self == "!" || self == "?" + || self == "\u{3002}" || self == "\u{FF01}" || self == "\u{FF1F}" || self == "\u{FF61}" + } + + /// Clause-boundary punctuation for phrase mode: the ideographic comma `、` and fullwidth comma + /// `,`. CJK prose marks its natural pause points with these rather than whitespace, so phrase + /// acceptance treats them as boundaries to advance clause by clause instead of swallowing a whole + /// sentence per Tab. Both codepoints occur only in CJK text, and ASCII "," is deliberately + /// excluded, so space-delimited scripts never stop at a comma. + var isPhraseClauseBoundary: Bool { + self == "\u{3001}" || self == "\u{FF0C}" } /// Closing punctuation that may follow a sentence terminator in prose: straight + curly - /// quotes, parentheses, square brackets, and braces. The phrase scanner walks back past a - /// run of these to find the real sentence terminator underneath, so `"done."` stops as a - /// complete sentence even though its final character is the closing quote. + /// quotes, parentheses, square brackets, and braces, plus the CJK closers (corner brackets, + /// fullwidth parenthesis, lenticular and angle brackets). The phrase scanner walks back past a + /// run of these to find the real sentence terminator underneath, so `"done."` and `終わり。」` + /// stop as complete sentences even though their final character is the closer. 
var isPhraseClosingPunctuation: Bool { self == "\"" || self == "'" || self == ")" || self == "]" || self == "}" || self == "\u{201D}" || self == "\u{2019}" + || self == "\u{300D}" || self == "\u{300F}" || self == "\u{FF09}" + || self == "\u{3011}" || self == "\u{3009}" || self == "\u{300B}" + } + + /// CJK punctuation that binds to the space-less word it follows for acceptance chunking: clause + /// commas, sentence terminators, and closing brackets/quotes. One Tab then accepts `読み、` as a + /// unit, and a chunk can never start at a punctuation cliff that would swallow the rest of the + /// run. Opening brackets are excluded because they belong to the next word, and ASCII punctuation + /// is excluded so this set can never affect space-delimited text. + var bindsToPrecedingSpacelessWord: Bool { + isPhraseClauseBoundary + || self == "\u{3002}" || self == "\u{FF01}" || self == "\u{FF1F}" || self == "\u{FF61}" + || self == "\u{300D}" || self == "\u{300F}" || self == "\u{FF09}" + || self == "\u{3011}" || self == "\u{3009}" || self == "\u{300B}" } } diff --git a/CotabbyTests/SentenceBoundaryClassifierTests.swift b/CotabbyTests/SentenceBoundaryClassifierTests.swift index f947a48e..eb845e6b 100644 --- a/CotabbyTests/SentenceBoundaryClassifierTests.swift +++ b/CotabbyTests/SentenceBoundaryClassifierTests.swift @@ -76,4 +76,23 @@ final class SentenceBoundaryClassifierTests: XCTestCase { func test_endsSentence_falseForEmptyString() { XCTAssertFalse(SentenceBoundaryClassifier.endsSentence("")) } + + /// CJK terminators are unambiguous sentence ends. Without these the decode stop policy never + /// fires for Japanese/Chinese text and generation always runs to the token budget, which is why + /// CJK suggestions came out so long. 
+ func test_endsSentence_trueForCJKTerminators() { + XCTAssertTrue(SentenceBoundaryClassifier.endsSentence("資料を読む。")) + XCTAssertTrue(SentenceBoundaryClassifier.endsSentence("すごい！")) + XCTAssertTrue(SentenceBoundaryClassifier.endsSentence("いいですか？")) + } + + func test_endsSentence_walksPastCJKClosingPunctuation() { + XCTAssertTrue(SentenceBoundaryClassifier.endsSentence("終わり。」")) + } + + /// The ideographic comma is a clause boundary, not a sentence end: generation should keep going + /// past `、` and only stop at a real terminator. + func test_endsSentence_falseForIdeographicComma() { + XCTAssertFalse(SentenceBoundaryClassifier.endsSentence("資料を読み、")) + } } diff --git a/CotabbyTests/SuggestionSessionReconcilerTests.swift b/CotabbyTests/SuggestionSessionReconcilerTests.swift index 87d0cc9e..2eb40f36 100644 --- a/CotabbyTests/SuggestionSessionReconcilerTests.swift +++ b/CotabbyTests/SuggestionSessionReconcilerTests.swift @@ -257,6 +257,62 @@ final class SuggestionSessionReconcilerTests: XCTestCase { ) } + // MARK: - CJK phrase boundaries + + /// The reported case: a space-less Japanese sentence must not arrive as one giant Tab. The + /// ideographic comma is a clause boundary, so phrase accepts advance clause by clause. 
+ func test_nextAcceptancePhrase_stopsAtIdeographicComma() { + XCTAssertEqual( + SuggestionSessionReconciler.nextAcceptancePhrase(from: "理解し、その内容を自分の言葉で表現する。"), + "理解し、" + ) + } + + func test_nextAcceptancePhrase_stopsAtIdeographicFullStop() { + XCTAssertEqual( + SuggestionSessionReconciler.nextAcceptancePhrase(from: "その内容を自分の言葉で表現する。次の文"), + "その内容を自分の言葉で表現する。" + ) + } + + func test_nextAcceptancePhrase_stopsAtFullwidthExclamationAndQuestion() { + XCTAssertEqual(SuggestionSessionReconciler.nextAcceptancePhrase(from: "すごい！次へ"), "すごい！") + XCTAssertEqual(SuggestionSessionReconciler.nextAcceptancePhrase(from: "いいですか？はい"), "いいですか？") + } + + /// The closer-walk must work for CJK quotes too: the accumulated tail is `」`, and the + /// terminator underneath is the ideographic full stop. + func test_nextAcceptancePhrase_walksPastCJKClosingQuote() { + XCTAssertEqual( + SuggestionSessionReconciler.nextAcceptancePhrase(from: "終わり。」次の文"), + "終わり。」" + ) + } + + /// ASCII commas must stay non-boundaries so English phrase cadence is unchanged by the CJK rules. + func test_nextAcceptancePhrase_doesNotStopAtAsciiComma() { + XCTAssertEqual( + SuggestionSessionReconciler.nextAcceptancePhrase(from: "hello, world. next"), + "hello, world." + ) + } + + // MARK: - CJK punctuation binding in word chunks + + /// Trailing CJK punctuation binds to the word it follows, so one Tab accepts the word and its + /// comma as a unit instead of stranding the comma to lead the next chunk. + func test_nextAcceptanceChunk_bindsTrailingIdeographicCommaToWord() { + XCTAssertEqual(SuggestionSessionReconciler.nextAcceptanceChunk(from: "資料、内容"), "資料、") + } + + /// A punctuation-led tail peels the punctuation run as its own chunk. Before this rule the token + /// skipped ICU segmentation (punctuation does not begin a space-less-script word) and the accept + /// swallowed everything up to the next whitespace in one chunk. 
+ func test_nextAcceptanceChunk_peelsLeadingCJKPunctuationRunInsteadOfSwallowingTheTail() { + XCTAssertEqual(SuggestionSessionReconciler.nextAcceptanceChunk(from: "、理解し、その内容"), "、") + XCTAssertEqual(SuggestionSessionReconciler.nextAcceptanceChunk(from: "。」次の文"), "。」") + } + func test_nextAcceptancePhrase_walksPastDottedInitialsToRealSentenceEnd() { // "U.S.A." is a run of single-letter initials, so its interior periods are not sentence // ends. SentenceBoundaryClassifier keeps phrase acceptance going until the real terminator From 2acd2c6df7d6d96754dfdd9a920fff621e85e4d1 Mon Sep 17 00:00:00 2001 From: Jacob Fu <141651335+FuJacob@users.noreply.github.com> Date: Wed, 10 Jun 2026 23:11:45 -0700 Subject: [PATCH 2/3] fix(acceptance): peel CJK opening brackets and cover halfwidth kana punctuation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Auditing the CJK chunking surfaced one remaining cliff: a chunk starting at a CJK opening bracket neither begins a space-less-script word nor binds to the preceding one, so a flat quoted run like the tail of 彼は「分かった」と言った was still swallowed whole by the whitespace scan. The punctuation-led peel now takes opening brackets too (the trailing binding still stops before them, since an opener belongs to the next word), and the halfwidth kana forms ､ ｣ ｢ join their fullwidth counterparts in the clause, closer, and opener sets, with the halfwidth corner also added to the classifier's closer walk. ASCII brackets and quotes are untouched by the peel (the sets stay CJK-only), locked in by regression tests alongside the opener, mixed-run, and halfwidth cases. Full unit bundle: 993 tests, 0 failures. 
--- .../Support/SentenceBoundaryClassifier.swift | 2 +- .../Support/SuggestionSessionReconciler.swift | 99 +++++++++++++------ .../SentenceBoundaryClassifierTests.swift | 7 ++ .../SuggestionSessionReconcilerTests.swift | 68 +++++++++++++ 4 files changed, 146 insertions(+), 30 deletions(-) diff --git a/Cotabby/Support/SentenceBoundaryClassifier.swift b/Cotabby/Support/SentenceBoundaryClassifier.swift index 6126d4dc..0e33c36b 100644 --- a/Cotabby/Support/SentenceBoundaryClassifier.swift +++ b/Cotabby/Support/SentenceBoundaryClassifier.swift @@ -112,6 +112,6 @@ private extension Character { self == "\"" || self == "'" || self == ")" || self == "]" || self == "}" || self == "\u{201D}" || self == "\u{2019}" || self == "\u{300D}" || self == "\u{300F}" || self == "\u{FF09}" - || self == "\u{3011}" || self == "\u{3009}" || self == "\u{300B}" + || self == "\u{3011}" || self == "\u{3009}" || self == "\u{300B}" || self == "\u{FF63}" } } diff --git a/Cotabby/Support/SuggestionSessionReconciler.swift b/Cotabby/Support/SuggestionSessionReconciler.swift index 1cb395b1..6958d062 100644 --- a/Cotabby/Support/SuggestionSessionReconciler.swift +++ b/Cotabby/Support/SuggestionSessionReconciler.swift @@ -260,13 +260,23 @@ enum SuggestionSessionReconciler { // punctuation-led token skips ICU segmentation entirely, so in flat text it would swallow // everything up to the next whitespace in a single accept. index = endOfCJKPunctuationRun(in: remainingText, from: wordEnd, notPast: index) - } else if tokenStart < index, remainingText[tokenStart].bindsToPrecedingSpacelessWord { - // A token can still begin with CJK punctuation when the previous chunk ended exactly at - // the word (a typed-through advance, or a pre-fix session). Peel the punctuation run as - // its own chunk instead of falling through to the whitespace scan's whole-run cliff. 
- index = endOfCJKPunctuationRun(in: remainingText, from: tokenStart, notPast: index) - } - + } else if tokenStart < index, + remainingText[tokenStart].bindsToPrecedingSpacelessWord + || remainingText[tokenStart].isCJKOpeningBracket { + // A token can also begin with CJK punctuation: closers/commas when the previous chunk + // ended exactly at the word (a typed-through advance), and opening brackets always, + // because an opener belongs to the *next* word so the trailing-binding above never + // consumes it. Peel the punctuation run as its own chunk instead of falling through to + // the whitespace scan, which would swallow everything up to the next whitespace. + index = endOfCJKPunctuationRun(in: remainingText, from: tokenStart, notPast: index, includingOpeners: true) + } + + // With trailing-punctuation auto-accept off, peel any trailing punctuation (including a CJK + // run just bound above) back off the chunk, so `資料、` accepts as `資料` and the comma waits + // for the next Tab. This intentionally overrides the binding for word granularity; the phrase + // walker re-accumulates the comma regardless, so phrase output is unchanged either way. A + // punctuation-only token survives whole because `wordEndTrimmingTrailingPunctuation` returns + // nil when there is no word character to trim back to, so the peeled chunk is never empty. if !autoAcceptTrailingPunctuation, let wordEnd = wordEndTrimmingTrailingPunctuation(in: remainingText, from: tokenStart, to: index) { index = wordEnd @@ -295,16 +305,24 @@ enum SuggestionSessionReconciler { return min(wordEnd, limit) } - /// The index just past the contiguous run of word-binding CJK punctuation starting at `start`, - /// clamped to `limit`. Returns `start` unchanged when the character there is not such punctuation, - /// so the word-binding call site degrades to "no extension". + /// The index just past the contiguous run of CJK punctuation starting at `start`, clamped to + /// `limit`. 
Returns `start` unchanged when the character there is not such punctuation, so the + /// word-binding call site degrades to "no extension". `includingOpeners` is true only for the + /// peel path: a trailing extension must stop before an opening bracket (it belongs to the next + /// word), while a punctuation-led peel takes the whole mixed run. private static func endOfCJKPunctuationRun( in text: String, from start: String.Index, - notPast limit: String.Index + notPast limit: String.Index, + includingOpeners: Bool = false ) -> String.Index { var cursor = start - while cursor < limit, text[cursor].bindsToPrecedingSpacelessWord { + while cursor < limit { + let character = text[cursor] + guard character.bindsToPrecedingSpacelessWord + || (includingOpeners && character.isCJKOpeningBracket) else { + break + } cursor = text.index(after: cursor) } return cursor @@ -579,22 +597,49 @@ private extension Character { isLetter || isNumber } + /// The CJK sentence terminators: ideographic full stop `。`, fullwidth `!` `?`, and the halfwidth + /// ideographic stop `。`. Declared once so `isPhraseSentenceTerminator` (phrase ends) and + /// `bindsToPrecedingSpacelessWord` (chunk binding) share one list instead of each restating the + /// four codepoints and silently drifting when one is updated. + var isCJKSentenceTerminator: Bool { + self == "\u{3002}" || self == "\u{FF01}" || self == "\u{FF1F}" || self == "\u{FF61}" + } + + /// The CJK closing punctuation: corner brackets `」` `』` (and the halfwidth corner `」`), + /// fullwidth parenthesis `)`, lenticular bracket `】`, and angle brackets `〉` `》`. Declared + /// once so `isPhraseClosingPunctuation` (the closer walk-back) and + /// `bindsToPrecedingSpacelessWord` (chunk binding) share one list instead of each restating the + /// codepoints. 
+ var isCJKClosingPunctuation: Bool { + self == "\u{300D}" || self == "\u{300F}" || self == "\u{FF09}" + || self == "\u{3011}" || self == "\u{3009}" || self == "\u{300B}" || self == "\u{FF63}" + } + + /// The CJK opening brackets: corner brackets `「` `『` (and the halfwidth corner `「`), fullwidth + /// parenthesis `(`, lenticular bracket `【`, and angle brackets `〈` `《`. These lead the word + /// they quote, so the trailing-binding rule stops before them while the punctuation-led peel + /// takes them; without the peel a chunk starting at `「` would skip ICU segmentation and swallow + /// the rest of a flat quoted run to the next whitespace. + var isCJKOpeningBracket: Bool { + self == "\u{300C}" || self == "\u{300E}" || self == "\u{FF08}" + || self == "\u{3010}" || self == "\u{3008}" || self == "\u{300A}" || self == "\u{FF62}" + } + /// Sentence-ending punctuation for phrase mode, in both ASCII and CJK forms: `.` `!` `?` plus the /// ideographic full stop `。`, fullwidth `!` `?`, and the halfwidth ideographic stop `。`. `\n` is /// handled separately because it can appear inside a leading-whitespace prefix of a composed chunk /// rather than at the chunk's tail end. var isPhraseSentenceTerminator: Bool { - self == "." || self == "!" || self == "?" - || self == "\u{3002}" || self == "\u{FF01}" || self == "\u{FF1F}" || self == "\u{FF61}" + self == "." || self == "!" || self == "?" || isCJKSentenceTerminator } - /// Clause-boundary punctuation for phrase mode: the ideographic comma `、` and fullwidth comma - /// `,`. CJK prose marks its natural pause points with these rather than whitespace, so phrase - /// acceptance treats them as boundaries to advance clause by clause instead of swallowing a whole - /// sentence per Tab. Both codepoints occur only in CJK text, and ASCII "," is deliberately - /// excluded, so space-delimited scripts never stop at a comma. 
+ /// Clause-boundary punctuation for phrase mode: the ideographic comma `、` (and its halfwidth + /// form `、`) and the fullwidth comma `,`. CJK prose marks its natural pause points with these + /// rather than whitespace, so phrase acceptance treats them as boundaries to advance clause by + /// clause instead of swallowing a whole sentence per Tab. All three codepoints occur only in CJK + /// text, and ASCII "," is deliberately excluded, so space-delimited scripts never stop at a comma. var isPhraseClauseBoundary: Bool { - self == "\u{3001}" || self == "\u{FF0C}" + self == "\u{3001}" || self == "\u{FF0C}" || self == "\u{FF64}" } /// Closing punctuation that may follow a sentence terminator in prose: straight + curly @@ -604,20 +649,16 @@ private extension Character { /// stop as complete sentences even though their final character is the closer. var isPhraseClosingPunctuation: Bool { self == "\"" || self == "'" || self == ")" || self == "]" || self == "}" - || self == "\u{201D}" || self == "\u{2019}" - || self == "\u{300D}" || self == "\u{300F}" || self == "\u{FF09}" - || self == "\u{3011}" || self == "\u{3009}" || self == "\u{300B}" + || self == "\u{201D}" || self == "\u{2019}" || isCJKClosingPunctuation } /// CJK punctuation that binds to the space-less word it follows for acceptance chunking: clause /// commas, sentence terminators, and closing brackets/quotes. One Tab then accepts `読み、` as a /// unit, and a chunk can never start at a punctuation cliff that would swallow the rest of the - /// run. Opening brackets are excluded because they belong to the next word, and ASCII punctuation - /// is excluded so this set can never affect space-delimited text. + /// run. Opening brackets are excluded because they belong to the next word, and every contributing + /// set is CJK-only (ASCII punctuation is never a member), so this can never affect space-delimited + /// text. 
var bindsToPrecedingSpacelessWord: Bool { - isPhraseClauseBoundary - || self == "\u{3002}" || self == "\u{FF01}" || self == "\u{FF1F}" || self == "\u{FF61}" - || self == "\u{300D}" || self == "\u{300F}" || self == "\u{FF09}" - || self == "\u{3011}" || self == "\u{3009}" || self == "\u{300B}" + isPhraseClauseBoundary || isCJKSentenceTerminator || isCJKClosingPunctuation } } diff --git a/CotabbyTests/SentenceBoundaryClassifierTests.swift b/CotabbyTests/SentenceBoundaryClassifierTests.swift index eb845e6b..d0f368be 100644 --- a/CotabbyTests/SentenceBoundaryClassifierTests.swift +++ b/CotabbyTests/SentenceBoundaryClassifierTests.swift @@ -90,6 +90,13 @@ final class SentenceBoundaryClassifierTests: XCTestCase { XCTAssertTrue(SentenceBoundaryClassifier.endsSentence("終わり。」")) } + /// Halfwidth kana punctuation (legacy SJIS contexts) terminates like its fullwidth counterparts, + /// including the walk past a halfwidth corner bracket. + func test_endsSentence_trueForHalfwidthTerminatorAndCloser() { + XCTAssertTrue(SentenceBoundaryClassifier.endsSentence("終わり。")) + XCTAssertTrue(SentenceBoundaryClassifier.endsSentence("終わり。」")) + } + /// The ideographic comma is a clause boundary, not a sentence end: generation should keep going /// past `、` and only stop at a real terminator. 
func test_endsSentence_falseForIdeographicComma() { diff --git a/CotabbyTests/SuggestionSessionReconcilerTests.swift b/CotabbyTests/SuggestionSessionReconcilerTests.swift index 2eb40f36..4f133967 100644 --- a/CotabbyTests/SuggestionSessionReconcilerTests.swift +++ b/CotabbyTests/SuggestionSessionReconcilerTests.swift @@ -313,6 +313,74 @@ final class SuggestionSessionReconcilerTests: XCTestCase { XCTAssertEqual(SuggestionSessionReconciler.nextAcceptanceChunk(from: "。」次の文"), "。」") } + /// CJK opening brackets are peeled too: `「` leads the word it quotes, so it neither begins a + /// space-less-script word nor binds to the preceding one, and without the peel a quoted run in + /// flat text would be swallowed whole (`「分かった」と言った` after `は` in one Tab). + func test_nextAcceptanceChunk_peelsLeadingCJKOpeningBracket() { + XCTAssertEqual(SuggestionSessionReconciler.nextAcceptanceChunk(from: "「分かった」と言った"), "「") + XCTAssertEqual(SuggestionSessionReconciler.nextAcceptanceChunk(from: "【内容】次"), "【") + } + + /// A mixed close-then-open run (`。」「`) peels as one punctuation chunk, so back-to-back quotes + /// never strand the walker. + func test_nextAcceptanceChunk_peelsMixedCloserOpenerRunAsOneChunk() { + XCTAssertEqual(SuggestionSessionReconciler.nextAcceptanceChunk(from: "。」「次の文"), "。」「") + } + + /// The trailing binding must stop before an opening bracket: the closer and full stop belong to + /// the word, but the next quote's opener belongs to the next word. + func test_nextAcceptanceChunk_trailingBindingStopsBeforeOpeningBracket() { + XCTAssertEqual(SuggestionSessionReconciler.nextAcceptanceChunk(from: "内容。「次"), "内容。") + } + + /// Halfwidth kana punctuation (legacy SJIS contexts) behaves like its fullwidth counterparts: + /// the halfwidth comma is a clause boundary and the halfwidth corner bracket binds and walks. 
+ func test_halfwidthKanaPunctuation_matchesFullwidthBehavior() { + XCTAssertEqual(SuggestionSessionReconciler.nextAcceptancePhrase(from: "資料を読み、次へ"), "資料を読み、") + XCTAssertEqual(SuggestionSessionReconciler.nextAcceptancePhrase(from: "終わり。」次の文"), "終わり。」") + } + + /// ASCII brackets and quotes must keep their existing whole-token behavior: the CJK opener peel + /// is scoped to CJK codepoints, so space-delimited scripts stay byte-for-byte unchanged. + func test_nextAcceptanceChunk_asciiBracketsUnchangedByOpenerPeel() { + XCTAssertEqual(SuggestionSessionReconciler.nextAcceptanceChunk(from: "(hello) world"), "(hello)") + XCTAssertEqual(SuggestionSessionReconciler.nextAcceptanceChunk(from: "\"quote\" next"), "\"quote\"") + } + + // MARK: - CJK punctuation under trailing-punctuation policy + + /// With trailing-punctuation auto-accept off, the CJK binding is intentionally re-peeled: the word + /// accepts on its own and the clause comma waits for the next Tab, exactly how ASCII trailing + /// punctuation behaves under the same setting. The binding is a no-op in this path by design. + func test_nextAcceptanceChunk_autoAcceptOff_trimsBoundCJKCommaBackOffTheWord() { + XCTAssertEqual( + SuggestionSessionReconciler.nextAcceptanceChunk(from: "資料、内容", autoAcceptTrailingPunctuation: false), + "資料" + ) + } + + /// A punctuation-led peel must stay non-empty with auto-accept off. Trimming would otherwise strip + /// the whole chunk and stall the phrase walker, but `wordEndTrimmingTrailingPunctuation` returns + /// nil for a punctuation-only token, so the comma survives as its own chunk. 
+ func test_nextAcceptanceChunk_autoAcceptOff_keepsPunctuationOnlyPeelNonEmpty() { + XCTAssertEqual( + SuggestionSessionReconciler.nextAcceptanceChunk(from: "、内容", autoAcceptTrailingPunctuation: false), + "、" + ) + } + + /// The flag never changes phrase output: with auto-accept off the word and comma arrive as separate + /// chunks, but they accumulate to the same clause the flag-on path returns in one binding. + func test_nextAcceptancePhrase_autoAcceptOff_stillStopsAtIdeographicComma() { + XCTAssertEqual( + SuggestionSessionReconciler.nextAcceptancePhrase( + from: "理解し、その内容を自分の言葉で表現する。", + autoAcceptTrailingPunctuation: false + ), + "理解し、" + ) + } + func test_nextAcceptancePhrase_walksPastDottedInitialsToRealSentenceEnd() { // "U.S.A." is a run of single-letter initials, so its interior periods are not sentence // ends. SentenceBoundaryClassifier keeps phrase acceptance going until the real terminator From 2ab9bb34574f5e162e261bb4294c8b40c7b4e81d Mon Sep 17 00:00:00 2001 From: Jacob Fu <141651335+FuJacob@users.noreply.github.com> Date: Thu, 11 Jun 2026 07:37:58 -0700 Subject: [PATCH 3/3] Address Greptile review on #669: single-source the CJK punctuation sets The CJK terminator and closer codepoint lists were restated in the reconciler's phrase policy and again in SentenceBoundaryClassifier's closer walk, so adding a codepoint required parallel edits with no compiler enforcement. The two primitive sets are now one internal Character extension in the reconciler file, and both the phrase policy and the classifier compose from them. 
--- .../Support/SentenceBoundaryClassifier.swift | 21 +++++----- .../Support/SuggestionSessionReconciler.swift | 40 ++++++++++--------- 2 files changed, 31 insertions(+), 30 deletions(-) diff --git a/Cotabby/Support/SentenceBoundaryClassifier.swift b/Cotabby/Support/SentenceBoundaryClassifier.swift index 0e33c36b..11315f67 100644 --- a/Cotabby/Support/SentenceBoundaryClassifier.swift +++ b/Cotabby/Support/SentenceBoundaryClassifier.swift @@ -39,12 +39,11 @@ enum SentenceBoundaryClassifier { switch text[lastIndex] { case "!", "?": return true - // CJK sentence terminators: the ideographic full stop, fullwidth `！` `？`, and the halfwidth - // ideographic stop. Unlike the ASCII period these are unambiguous (they never mark decimals, - // list numbers, or abbreviations), so they are terminal without classifier disambiguation. - // Without these a Japanese completion never registers a sentence end and generation always - // runs to the token budget, which is why CJK suggestions came out so long. - case "\u{3002}", "\u{FF01}", "\u{FF1F}", "\u{FF61}": + // The shared CJK terminator set (see `Character.isCJKSentenceTerminator`): unambiguous, so + // terminal without the period disambiguation below. Without these a Japanese completion never + // registers a sentence end and generation always runs to the token budget, which is why CJK + // suggestions came out so long. + case let character where character.isCJKSentenceTerminator: return true case ".": return isTerminalPeriod(in: text, at: lastIndex) @@ -104,14 +103,12 @@ enum SentenceBoundaryClassifier { private extension Character { /// Closing punctuation that may follow a sentence terminator: straight and curly quotes, - /// parentheses, square brackets, and braces, plus the CJK closers (corner brackets, fullwidth - /// parenthesis, lenticular and angle brackets). 
`endsSentence` walks back past a run of these to - /// find the real terminator underneath, so `"done."`, `(stop!)`, and `終わり。」` register as + /// parentheses, square brackets, and braces, plus the shared CJK closer set (see + /// `Character.isCJKClosingPunctuation`). `endsSentence` walks back past a run of these to find + /// the real terminator underneath, so `"done."`, `(stop!)`, and `終わり。」` register as /// sentence ends. var isSentenceClosingPunctuation: Bool { self == "\"" || self == "'" || self == ")" || self == "]" || self == "}" - || self == "\u{201D}" || self == "\u{2019}" - || self == "\u{300D}" || self == "\u{300F}" || self == "\u{FF09}" - || self == "\u{3011}" || self == "\u{3009}" || self == "\u{300B}" || self == "\u{FF63}" + || self == "\u{201D}" || self == "\u{2019}" || isCJKClosingPunctuation } } diff --git a/Cotabby/Support/SuggestionSessionReconciler.swift b/Cotabby/Support/SuggestionSessionReconciler.swift index 6958d062..9ea574e3 100644 --- a/Cotabby/Support/SuggestionSessionReconciler.swift +++ b/Cotabby/Support/SuggestionSessionReconciler.swift @@ -562,6 +562,28 @@ private extension String { } } +/// The CJK punctuation primitives, internal because they are the single source of truth shared by +/// this file's acceptance policy and `SentenceBoundaryClassifier`'s sentence-end detection. Adding a +/// codepoint here updates phrase boundaries, chunk binding, and the generation stop in one edit. +extension Character { + /// The CJK sentence terminators: ideographic full stop `。`, fullwidth `！` `？`, and the halfwidth + /// ideographic stop `｡`. Unlike the ASCII period these are unambiguous (they never mark decimals, + /// list numbers, or abbreviations), so every consumer treats them as terminal without classifier + /// disambiguation. 
+ var isCJKSentenceTerminator: Bool { + self == "\u{3002}" || self == "\u{FF01}" || self == "\u{FF1F}" || self == "\u{FF61}" + } + + /// The CJK closing punctuation: corner brackets `」` `』` (and the halfwidth corner `｣`), + /// fullwidth parenthesis `）`, lenticular bracket `】`, and angle brackets `〉` `》`. Walk-backs + /// skip a run of these to find the real terminator underneath, and chunk binding attaches them to + /// the word they close. + var isCJKClosingPunctuation: Bool { + self == "\u{300D}" || self == "\u{300F}" || self == "\u{FF09}" + || self == "\u{3011}" || self == "\u{3009}" || self == "\u{300B}" || self == "\u{FF63}" + } +} + private extension Character { /// True when the character begins a word of a space-less script (Han, Hiragana, Katakana, Hangul, /// Thai, Lao, Khmer, Myanmar, ...). These scripts write words without separating spaces, so the @@ -597,24 +619,6 @@ private extension Character { isLetter || isNumber } - /// The CJK sentence terminators: ideographic full stop `。`, fullwidth `！` `？`, and the halfwidth - /// ideographic stop `｡`. Declared once so `isPhraseSentenceTerminator` (phrase ends) and - /// `bindsToPrecedingSpacelessWord` (chunk binding) share one list instead of each restating the - /// four codepoints and silently drifting when one is updated. - var isCJKSentenceTerminator: Bool { - self == "\u{3002}" || self == "\u{FF01}" || self == "\u{FF1F}" || self == "\u{FF61}" - } - - /// The CJK closing punctuation: corner brackets `」` `』` (and the halfwidth corner `｣`), - /// fullwidth parenthesis `）`, lenticular bracket `】`, and angle brackets `〉` `》`. Declared - /// once so `isPhraseClosingPunctuation` (the closer walk-back) and - /// `bindsToPrecedingSpacelessWord` (chunk binding) share one list instead of each restating the - /// codepoints. 
- var isCJKClosingPunctuation: Bool { - self == "\u{300D}" || self == "\u{300F}" || self == "\u{FF09}" - || self == "\u{3011}" || self == "\u{3009}" || self == "\u{300B}" || self == "\u{FF63}" - } - /// The CJK opening brackets: corner brackets `「` `『` (and the halfwidth corner `｢`), fullwidth /// parenthesis `（`, lenticular bracket `【`, and angle brackets `〈` `《`. These lead the word /// they quote, so the trailing-binding rule stops before them while the punctuation-led peel