diff --git a/Cotabby/Models/SuggestionEngineModels.swift b/Cotabby/Models/SuggestionEngineModels.swift index 2f3cf42f..d8fe42a7 100644 --- a/Cotabby/Models/SuggestionEngineModels.swift +++ b/Cotabby/Models/SuggestionEngineModels.swift @@ -71,7 +71,9 @@ struct DisabledApplicationRule: Codable, Equatable, Identifiable, Sendable { enum AcceptanceGranularity: String, CaseIterable, Codable, Sendable { /// One word (with the existing trailing-punctuation policy applied per chunk). case word - /// Words accumulated until a sentence terminator (`.`, `!`, `?`, `\n`) or the tail runs out. + /// Words accumulated until a phrase boundary or the tail runs out: a sentence terminator + /// (`.`, `!`, `?`, `\n`, CJK U+3002/U+FF01/U+FF1F/U+FF61) or a CJK clause comma (U+3001/U+FF0C/U+FF64), + /// so space-less scripts advance clause by clause instead of a whole sentence per press. case phrase } diff --git a/Cotabby/Support/SentenceBoundaryClassifier.swift b/Cotabby/Support/SentenceBoundaryClassifier.swift index 7437f46d..11315f67 100644 --- a/Cotabby/Support/SentenceBoundaryClassifier.swift +++ b/Cotabby/Support/SentenceBoundaryClassifier.swift @@ -39,6 +39,12 @@ enum SentenceBoundaryClassifier { switch text[lastIndex] { case "!", "?": return true + // The shared CJK terminator set (see `Character.isCJKSentenceTerminator`): unambiguous, so + // terminal without the period disambiguation below. Without these a Japanese completion never + // registers a sentence end and generation always runs to the token budget, which is why CJK + // suggestions came out so long. + case let character where character.isCJKSentenceTerminator: + return true case ".": return isTerminalPeriod(in: text, at: lastIndex) default: @@ -97,10 +103,12 @@ enum SentenceBoundaryClassifier { private extension Character { /// Closing punctuation that may follow a sentence terminator: straight and curly quotes, - /// parentheses, square brackets, and braces. 
`endsSentence` walks back past a run of these to find - /// the real terminator underneath, so `"done."` and `(stop!)` register as sentence ends. + /// parentheses, square brackets, and braces, plus the shared CJK closer set (see + /// `Character.isCJKClosingPunctuation`). `endsSentence` walks back past a run of these to find + /// the real terminator underneath, so `"done."`, `(stop!)`, and `終わり。」` register as + /// sentence ends. var isSentenceClosingPunctuation: Bool { self == "\"" || self == "'" || self == ")" || self == "]" || self == "}" - || self == "\u{201D}" || self == "\u{2019}" + || self == "\u{201D}" || self == "\u{2019}" || isCJKClosingPunctuation } } diff --git a/Cotabby/Support/SuggestionSessionReconciler.swift b/Cotabby/Support/SuggestionSessionReconciler.swift index 8ad25aca..9ea574e3 100644 --- a/Cotabby/Support/SuggestionSessionReconciler.swift +++ b/Cotabby/Support/SuggestionSessionReconciler.swift @@ -255,9 +255,28 @@ enum SuggestionSessionReconciler { if tokenStart < index, remainingText[tokenStart].beginsSpacelessScriptWord, let wordEnd = firstSegmentedWordEnd(in: remainingText, from: tokenStart, notPast: index) { - index = wordEnd - } - + // Bind an immediately following CJK punctuation run to the word so one Tab accepts + // "読み、" as a unit. Without this the punctuation would lead the *next* token, and a + // punctuation-led token skips ICU segmentation entirely, so in flat text it would swallow + // everything up to the next whitespace in a single accept. 
+ index = endOfCJKPunctuationRun(in: remainingText, from: wordEnd, notPast: index) + } else if tokenStart < index, + remainingText[tokenStart].bindsToPrecedingSpacelessWord + || remainingText[tokenStart].isCJKOpeningBracket { + // A token can also begin with CJK punctuation: closers/commas when the previous chunk + // ended exactly at the word (a typed-through advance), and opening brackets always, + // because an opener belongs to the *next* word so the trailing-binding above never + // consumes it. Peel the punctuation run as its own chunk instead of falling through to + // the whitespace scan, which would swallow everything up to the next whitespace. + index = endOfCJKPunctuationRun(in: remainingText, from: tokenStart, notPast: index, includingOpeners: true) + } + + // With trailing-punctuation auto-accept off, peel any trailing punctuation (including a CJK + // run just bound above) back off the chunk, so `資料、` accepts as `資料` and the comma waits + // for the next Tab. This intentionally overrides the binding for word granularity; the phrase + // walker re-accumulates the comma regardless, so phrase output is unchanged either way. A + // punctuation-only token survives whole because `wordEndTrimmingTrailingPunctuation` returns + // nil when there is no word character to trim back to, so the peeled chunk is never empty. if !autoAcceptTrailingPunctuation, let wordEnd = wordEndTrimmingTrailingPunctuation(in: remainingText, from: tokenStart, to: index) { index = wordEnd @@ -286,10 +305,35 @@ enum SuggestionSessionReconciler { return min(wordEnd, limit) } - /// Accepts a full phrase up to the next sentence terminator (`.`, `!`, `?`, `\n`) or the end - /// of the buffered suggestion tail. Composes over `nextAcceptanceChunk` so word-boundary, - /// internal-punctuation, and leading-whitespace policy stay identical across the seams of a - /// multi-word accept. 
+ /// The index just past the contiguous run of CJK punctuation starting at `start`, clamped to + /// `limit`. Returns `start` unchanged when the character there is not such punctuation, so the + /// word-binding call site degrades to "no extension". `includingOpeners` is true only for the + /// peel path: a trailing extension must stop before an opening bracket (it belongs to the next + /// word), while a punctuation-led peel takes the whole mixed run. + private static func endOfCJKPunctuationRun( + in text: String, + from start: String.Index, + notPast limit: String.Index, + includingOpeners: Bool = false + ) -> String.Index { + var cursor = start + while cursor < limit { + let character = text[cursor] + guard character.bindsToPrecedingSpacelessWord + || (includingOpeners && character.isCJKOpeningBracket) else { + break + } + cursor = text.index(after: cursor) + } + return cursor + } + + /// Accepts a full phrase up to the next phrase boundary or the end of the buffered suggestion + /// tail. Boundaries are sentence terminators (`.`, `!`, `?`, `\n`, and their CJK forms + /// U+3002/U+FF01/U+FF1F/U+FF61) and the CJK clause commas (U+3001/U+FF0C/U+FF64), so Japanese/Chinese + /// phrase accepts advance clause by clause instead of swallowing a whole space-less sentence in one + /// Tab. Composes over `nextAcceptanceChunk` so word-boundary, internal-punctuation, and + /// leading-whitespace policy stay identical across the seams of a multi-word accept. 
/// /// Newlines need an extra rule: `nextAcceptanceChunk` returns leading whitespace as part of /// the next chunk, so a tail like `Hello\nworld` would surface `\n` as the leading character @@ -339,7 +383,7 @@ enum SuggestionSessionReconciler { accumulated += chunk working = String(working.dropFirst(chunk.count)) - if endsInSentenceTerminator(accumulated) { + if endsAtPhraseBoundary(accumulated) { return accumulated } } @@ -347,11 +391,12 @@ enum SuggestionSessionReconciler { return accumulated } - /// Tail-end check for sentence terminators that survives closing quotes and brackets, so - /// `"done."` and `(yes!)` are recognized as phrase ends even though their final character is - /// a closer rather than `.!?`. Walks back past any run of closing punctuation, then checks - /// whether the character immediately before that run is a sentence terminator. - private static func endsInSentenceTerminator(_ text: String) -> Bool { + /// Tail-end check for phrase boundaries that survives closing quotes and brackets, so + /// `"done."`, `(yes!)`, and `終わり。」` are recognized as phrase ends even though their final + /// character is a closer rather than the terminator itself. Walks back past any run of closing + /// punctuation, then checks whether the character immediately before that run ends a sentence or + /// a CJK clause. + private static func endsAtPhraseBoundary(_ text: String) -> Bool { var index = text.endIndex while index > text.startIndex { let prev = text.index(before: index) @@ -365,12 +410,20 @@ enum SuggestionSessionReconciler { return false } let prev = text.index(before: index) + // The ideographic / fullwidth comma marks a clause boundary in CJK prose. Space-less scripts + // have no whitespace rhythm, so without this stop a Japanese phrase accept swallows an entire + // sentence in one Tab; with it, Tab advances clause by clause. ASCII "," is deliberately NOT + // a boundary, so English phrase cadence is unchanged. 
+ if text[prev].isPhraseClauseBoundary { + return true + } guard text[prev].isPhraseSentenceTerminator else { return false } - // `!` and `?` always end a sentence. A period is ambiguous: decimals, list/ordinal numbers, - // single-letter initials, and common abbreviations are not sentence ends, so consult the - // classifier rather than treating every "." as terminal. + // `!`/`?` and the CJK terminators always end a sentence. An ASCII period is ambiguous: + // decimals, list/ordinal numbers, single-letter initials, and common abbreviations are not + // sentence ends, so consult the classifier rather than treating every "." as terminal. The + // ideographic `。` has no such ambiguity (it never marks decimals or abbreviations). if text[prev] == "." { return SentenceBoundaryClassifier.isTerminalPeriod(in: text, at: prev) } @@ -509,6 +562,28 @@ private extension String { } } +/// The CJK punctuation primitives, internal because they are the single source of truth shared by +/// this file's acceptance policy and `SentenceBoundaryClassifier`'s sentence-end detection. Adding a +/// codepoint here updates phrase boundaries, chunk binding, and the generation stop in one edit. +extension Character { + /// The CJK sentence terminators: ideographic full stop U+3002 `。`, fullwidth exclamation U+FF01 and + /// question mark U+FF1F, and halfwidth ideographic stop U+FF61. Unlike the ASCII period these are + /// unambiguous (they never mark decimals, list numbers, or abbreviations), so every consumer treats + /// them as terminal without classifier disambiguation. + var isCJKSentenceTerminator: Bool { + self == "\u{3002}" || self == "\u{FF01}" || self == "\u{FF1F}" || self == "\u{FF61}" + } + + /// The CJK closing punctuation: corner brackets U+300D `」` / U+300F `』` (halfwidth corner U+FF63), + /// fullwidth parenthesis U+FF09, lenticular bracket U+3011 `】`, and angle brackets U+3009 `〉` / + /// U+300B `》`. Walk-backs skip a run of these to find the real terminator underneath, and chunk + /// binding attaches them to the word they close. 
+ var isCJKClosingPunctuation: Bool { + self == "\u{300D}" || self == "\u{300F}" || self == "\u{FF09}" + || self == "\u{3011}" || self == "\u{3009}" || self == "\u{300B}" || self == "\u{FF63}" + } +} + private extension Character { /// True when the character begins a word of a space-less script (Han, Hiragana, Katakana, Hangul, /// Thai, Lao, Khmer, Myanmar, ...). These scripts write words without separating spaces, so the @@ -544,19 +619,50 @@ private extension Character { isLetter || isNumber } - /// Sentence-ending punctuation for phrase mode. `\n` is handled separately because it can - /// appear inside a leading-whitespace prefix of a composed chunk rather than at the chunk's - /// tail end. + /// The CJK opening brackets: corner brackets U+300C `「` / U+300E `『` (halfwidth corner U+FF62), + /// fullwidth parenthesis U+FF08, lenticular bracket U+3010 `【`, and angle brackets U+3008 `〈` / + /// U+300A `《`. These lead the word they quote, so the trailing-binding rule stops before them while + /// the punctuation-led peel takes them; without the peel a chunk starting at `「` would skip ICU + /// segmentation and swallow the rest of a flat quoted run to the next whitespace. + var isCJKOpeningBracket: Bool { + self == "\u{300C}" || self == "\u{300E}" || self == "\u{FF08}" + || self == "\u{3010}" || self == "\u{3008}" || self == "\u{300A}" || self == "\u{FF62}" + } + + /// Sentence-ending punctuation for phrase mode, in both ASCII and CJK forms: `.` `!` `?` plus the + /// ideographic full stop U+3002 `。`, fullwidth U+FF01 / U+FF1F, and the halfwidth ideographic stop + /// U+FF61. `\n` is handled separately because it can appear inside a leading-whitespace prefix of a + /// composed chunk rather than at the chunk's tail end. var isPhraseSentenceTerminator: Bool { - self == "." || self == "!" || self == "?" + self == "." || self == "!" || self == "?" || isCJKSentenceTerminator + } + + /// Clause-boundary punctuation for phrase mode: the ideographic comma U+3001 `、` (and its halfwidth + /// form U+FF64) and the fullwidth comma U+FF0C. 
CJK prose marks its natural pause points with these + /// rather than whitespace, so phrase acceptance treats them as boundaries to advance clause by + /// clause instead of swallowing a whole sentence per Tab. All three codepoints occur only in CJK + /// text, and ASCII "," is deliberately excluded, so space-delimited scripts never stop at a comma. + var isPhraseClauseBoundary: Bool { + self == "\u{3001}" || self == "\u{FF0C}" || self == "\u{FF64}" } /// Closing punctuation that may follow a sentence terminator in prose: straight + curly - /// quotes, parentheses, square brackets, and braces. The phrase scanner walks back past a - /// run of these to find the real sentence terminator underneath, so `"done."` stops as a - /// complete sentence even though its final character is the closing quote. + /// quotes, parentheses, square brackets, and braces, plus the CJK closers (corner brackets, + /// fullwidth parenthesis, lenticular and angle brackets). The phrase scanner walks back past a + /// run of these to find the real sentence terminator underneath, so `"done."` and `終わり。」` + /// stop as complete sentences even though their final character is the closer. var isPhraseClosingPunctuation: Bool { self == "\"" || self == "'" || self == ")" || self == "]" || self == "}" - || self == "\u{201D}" || self == "\u{2019}" + || self == "\u{201D}" || self == "\u{2019}" || isCJKClosingPunctuation + } + + /// CJK punctuation that binds to the space-less word it follows for acceptance chunking: clause + /// commas, sentence terminators, and closing brackets/quotes. One Tab then accepts `読み、` as a + /// unit, and a chunk can never start at a punctuation cliff that would swallow the rest of the + /// run. Opening brackets are excluded because they belong to the next word, and every contributing + /// set is CJK-only (ASCII punctuation is never a member), so this can never affect space-delimited + /// text. 
+ var bindsToPrecedingSpacelessWord: Bool { + isPhraseClauseBoundary || isCJKSentenceTerminator || isCJKClosingPunctuation } } diff --git a/CotabbyTests/SentenceBoundaryClassifierTests.swift b/CotabbyTests/SentenceBoundaryClassifierTests.swift index f947a48e..d0f368be 100644 --- a/CotabbyTests/SentenceBoundaryClassifierTests.swift +++ b/CotabbyTests/SentenceBoundaryClassifierTests.swift @@ -76,4 +76,30 @@ final class SentenceBoundaryClassifierTests: XCTestCase { func test_endsSentence_falseForEmptyString() { XCTAssertFalse(SentenceBoundaryClassifier.endsSentence("")) } + + /// CJK terminators are unambiguous sentence ends. Without these the decode stop policy never + /// fires for Japanese/Chinese text and generation always runs to the token budget, which is why + /// CJK suggestions came out so long. Fullwidth forms are escaped so they cannot fold to ASCII `!`/`?`. + func test_endsSentence_trueForCJKTerminators() { + XCTAssertTrue(SentenceBoundaryClassifier.endsSentence("資料を読む。")) + XCTAssertTrue(SentenceBoundaryClassifier.endsSentence("すごい\u{FF01}")) + XCTAssertTrue(SentenceBoundaryClassifier.endsSentence("いいですか\u{FF1F}")) + } + + func test_endsSentence_walksPastCJKClosingPunctuation() { + XCTAssertTrue(SentenceBoundaryClassifier.endsSentence("終わり。」")) + } + + /// Halfwidth kana punctuation (legacy SJIS contexts) terminates like its fullwidth counterparts, + /// including the walk past a halfwidth corner bracket. Escapes pin U+FF61 / U+FF63 exactly. + func test_endsSentence_trueForHalfwidthTerminatorAndCloser() { + XCTAssertTrue(SentenceBoundaryClassifier.endsSentence("終わり\u{FF61}")) + XCTAssertTrue(SentenceBoundaryClassifier.endsSentence("終わり\u{FF61}\u{FF63}")) + } + + /// The ideographic comma is a clause boundary, not a sentence end: generation should keep going + /// past `、` and only stop at a real terminator. 
+ func test_endsSentence_falseForIdeographicComma() { + XCTAssertFalse(SentenceBoundaryClassifier.endsSentence("資料を読み、")) + } } diff --git a/CotabbyTests/SuggestionSessionReconcilerTests.swift b/CotabbyTests/SuggestionSessionReconcilerTests.swift index 87d0cc9e..4f133967 100644 --- a/CotabbyTests/SuggestionSessionReconcilerTests.swift +++ b/CotabbyTests/SuggestionSessionReconcilerTests.swift @@ -257,6 +257,130 @@ final class SuggestionSessionReconcilerTests: XCTestCase { ) } + // MARK: - CJK phrase boundaries + + /// The reported case: a space-less Japanese sentence must not arrive as one giant Tab. The + /// ideographic comma is a clause boundary, so phrase accepts advance clause by clause. + func test_nextAcceptancePhrase_stopsAtIdeographicComma() { + XCTAssertEqual( + SuggestionSessionReconciler.nextAcceptancePhrase(from: "理解し、その内容を自分の言葉で表現する。"), + "理解し、" + ) + } + + func test_nextAcceptancePhrase_stopsAtIdeographicFullStop() { + XCTAssertEqual( + SuggestionSessionReconciler.nextAcceptancePhrase(from: "その内容を自分の言葉で表現する。次の文"), + "その内容を自分の言葉で表現する。" + ) + } + + func test_nextAcceptancePhrase_stopsAtFullwidthExclamationAndQuestion() { + XCTAssertEqual(SuggestionSessionReconciler.nextAcceptancePhrase(from: "すごい\u{FF01}次へ"), "すごい\u{FF01}") + XCTAssertEqual(SuggestionSessionReconciler.nextAcceptancePhrase(from: "いいですか\u{FF1F}はい"), "いいですか\u{FF1F}") + } + + /// The closer-walk must work for CJK quotes too: the accumulated tail is `」`, and the + /// terminator underneath is the ideographic full stop. + func test_nextAcceptancePhrase_walksPastCJKClosingQuote() { + XCTAssertEqual( + SuggestionSessionReconciler.nextAcceptancePhrase(from: "終わり。」次の文"), + "終わり。」" + ) + } + + /// ASCII commas must stay non-boundaries so English phrase cadence is unchanged by the CJK rules. + func test_nextAcceptancePhrase_doesNotStopAtAsciiComma() { + XCTAssertEqual( + SuggestionSessionReconciler.nextAcceptancePhrase(from: "hello, world. next"), + "hello, world." 
+ ) + } + + // MARK: - CJK punctuation binding in word chunks + + /// Trailing CJK punctuation binds to the word it follows, so one Tab accepts the word and its + /// comma as a unit instead of stranding the comma to lead the next chunk. + func test_nextAcceptanceChunk_bindsTrailingIdeographicCommaToWord() { + XCTAssertEqual(SuggestionSessionReconciler.nextAcceptanceChunk(from: "資料、内容"), "資料、") + } + + /// A punctuation-led tail peels the punctuation run as its own chunk. Before this rule the token + /// skipped ICU segmentation (punctuation does not begin a space-less-script word) and the accept + /// swallowed everything up to the next whitespace in one chunk. + func test_nextAcceptanceChunk_peelsLeadingCJKPunctuationRunInsteadOfSwallowingTheTail() { + XCTAssertEqual(SuggestionSessionReconciler.nextAcceptanceChunk(from: "、理解し、その内容"), "、") + XCTAssertEqual(SuggestionSessionReconciler.nextAcceptanceChunk(from: "。」次の文"), "。」") + } + + /// CJK opening brackets are peeled too: `「` leads the word it quotes, so it neither begins a + /// space-less-script word nor binds to the preceding one, and without the peel a quoted run in + /// flat text would be swallowed whole (`「分かった」と言った` after `は` in one Tab). + func test_nextAcceptanceChunk_peelsLeadingCJKOpeningBracket() { + XCTAssertEqual(SuggestionSessionReconciler.nextAcceptanceChunk(from: "「分かった」と言った"), "「") + XCTAssertEqual(SuggestionSessionReconciler.nextAcceptanceChunk(from: "【内容】次"), "【") + } + + /// A mixed close-then-open run (`。」「`) peels as one punctuation chunk, so back-to-back quotes + /// never strand the walker. + func test_nextAcceptanceChunk_peelsMixedCloserOpenerRunAsOneChunk() { + XCTAssertEqual(SuggestionSessionReconciler.nextAcceptanceChunk(from: "。」「次の文"), "。」「") + } + + /// The trailing binding must stop before an opening bracket: the closer and full stop belong to + /// the word, but the next quote's opener belongs to the next word. 
+ func test_nextAcceptanceChunk_trailingBindingStopsBeforeOpeningBracket() { + XCTAssertEqual(SuggestionSessionReconciler.nextAcceptanceChunk(from: "内容。「次"), "内容。") + } + + /// Halfwidth kana punctuation (legacy SJIS contexts) behaves like its fullwidth counterparts: the + /// halfwidth comma (U+FF64) is a clause boundary and the halfwidth corner bracket (U+FF63) binds and walks. + func test_halfwidthKanaPunctuation_matchesFullwidthBehavior() { + XCTAssertEqual(SuggestionSessionReconciler.nextAcceptancePhrase(from: "資料を読み\u{FF64}次へ"), "資料を読み\u{FF64}") + XCTAssertEqual(SuggestionSessionReconciler.nextAcceptancePhrase(from: "終わり\u{FF61}\u{FF63}次の文"), "終わり\u{FF61}\u{FF63}") + } + + /// ASCII brackets and quotes must keep their existing whole-token behavior: the CJK opener peel + /// is scoped to CJK codepoints, so space-delimited scripts stay byte-for-byte unchanged. + func test_nextAcceptanceChunk_asciiBracketsUnchangedByOpenerPeel() { + XCTAssertEqual(SuggestionSessionReconciler.nextAcceptanceChunk(from: "(hello) world"), "(hello)") + XCTAssertEqual(SuggestionSessionReconciler.nextAcceptanceChunk(from: "\"quote\" next"), "\"quote\"") + } + + // MARK: - CJK punctuation under trailing-punctuation policy + + /// With trailing-punctuation auto-accept off, the CJK binding is intentionally re-peeled: the word + /// accepts on its own and the clause comma waits for the next Tab, exactly how ASCII trailing + /// punctuation behaves under the same setting. The binding is a no-op in this path by design. + func test_nextAcceptanceChunk_autoAcceptOff_trimsBoundCJKCommaBackOffTheWord() { + XCTAssertEqual( + SuggestionSessionReconciler.nextAcceptanceChunk(from: "資料、内容", autoAcceptTrailingPunctuation: false), + "資料" + ) + } + + /// A punctuation-led peel must stay non-empty with auto-accept off. Trimming would otherwise strip + /// the whole chunk and stall the phrase walker, but `wordEndTrimmingTrailingPunctuation` returns + /// nil for a punctuation-only token, so the comma survives as its own chunk. 
+ func test_nextAcceptanceChunk_autoAcceptOff_keepsPunctuationOnlyPeelNonEmpty() { + XCTAssertEqual( + SuggestionSessionReconciler.nextAcceptanceChunk(from: "、内容", autoAcceptTrailingPunctuation: false), + "、" + ) + } + + /// The flag never changes phrase output: with auto-accept off the word and comma arrive as separate + /// chunks, but they accumulate to the same clause the flag-on path returns in one binding. + func test_nextAcceptancePhrase_autoAcceptOff_stillStopsAtIdeographicComma() { + XCTAssertEqual( + SuggestionSessionReconciler.nextAcceptancePhrase( + from: "理解し、その内容を自分の言葉で表現する。", + autoAcceptTrailingPunctuation: false + ), + "理解し、" + ) + } + func test_nextAcceptancePhrase_walksPastDottedInitialsToRealSentenceEnd() { // "U.S.A." is a run of single-letter initials, so its interior periods are not sentence // ends. SentenceBoundaryClassifier keeps phrase acceptance going until the real terminator