Report characters that have no atom as MTParseErrorInvalidCharacter

Summary of what the hunks below change:

  * MTMathListBuilder.m (buildInternal:stopChar:): when
    +[MTMathAtomFactory atomForCharacter:] returns nil the builder no longer
    always skips the character. Space, tab, LF, CR and NUL are still skipped;
    any other character sets MTParseErrorInvalidCharacter and the build
    returns nil. '~' now gets its own branch and produces the same atom as
    the " " symbol.
  * MTMathListBuilder.h: MTParseErrorInvalidCharacter is appended after
    MTParseErrorNestingTooDeep in MTParseErrors.
  * MTMathAtomFactory.m: comment-only change in atomForCharacter:.
  * MTMathListBuilderTest.m: adds an "x~y" data row, seven error rows
    expecting MTParseErrorInvalidCharacter, and testIgnoredWhitespaceCharacters.

NOTE(review): the line structure of this patch was restored from a copy whose
newlines and indentation had been collapsed. Line counts match every hunk
header, but the indentation depth of the hunk lines follows the usual 4-space
convention and should be confirmed against the target files before applying.

diff --git a/iosMath/lib/MTMathAtomFactory.m b/iosMath/lib/MTMathAtomFactory.m
index 727d86e..f3945ed 100644
--- a/iosMath/lib/MTMathAtomFactory.m
+++ b/iosMath/lib/MTMathAtomFactory.m
@@ -98,11 +98,15 @@ + (nullable MTMathAtom *)atomForCharacter:(unichar)ch
 {
     NSString *chStr = [NSString stringWithCharacters:&ch length:1];
     if (ch < 0x21 || ch > 0x7E) {
-        // skip non ascii characters and spaces. Non-Latin text must be
-        // wrapped in \text*, \textbf{...}, etc.
+        // No atom for control characters, spaces, or non-ASCII literals. The
+        // builder decides what to do with these: whitespace is silently ignored,
+        // everything else raises MTParseErrorInvalidCharacter. Non-Latin text
+        // must be wrapped in \text*, \textbf{...}, etc.
         return nil;
     } else if (ch == '$' || ch == '%' || ch == '#' || ch == '&' || ch == '~' || ch == '\'') {
-        // These are latex control characters that have special meanings. We don't support them.
+        // LaTeX control characters with special meanings. They have no atom of
+        // their own; the builder handles them (& / ~ / ' are consumed before
+        // reaching here, while $ % # raise MTParseErrorInvalidCharacter).
         return nil;
     } else if (ch == '^' || ch == '_' || ch == '{' || ch == '}' || ch == '\\') {
         // more special characters for Latex.
diff --git a/iosMath/lib/MTMathListBuilder.h b/iosMath/lib/MTMathListBuilder.h
index 69a69e7..15245b7 100644
--- a/iosMath/lib/MTMathListBuilder.h
+++ b/iosMath/lib/MTMathListBuilder.h
@@ -89,6 +89,10 @@ typedef NS_ENUM(NSUInteger, MTParseErrors) {
     MTParseErrorInvalidLimits,
     /// The LaTeX nesting depth exceeded the safe parsing limit.
     MTParseErrorNestingTooDeep,
+    /// A character in the string is not a valid LaTeX input character in math
+    /// mode (e.g. a non-ASCII literal like π, or a special character such as
+    /// %, #, $ that has no meaning here).
+    MTParseErrorInvalidCharacter,
 };
 
 @end
diff --git a/iosMath/lib/MTMathListBuilder.m b/iosMath/lib/MTMathListBuilder.m
index e87288c..fb51fc4 100644
--- a/iosMath/lib/MTMathListBuilder.m
+++ b/iosMath/lib/MTMathListBuilder.m
@@ -358,11 +358,30 @@ - (MTMathList*)buildInternal:(BOOL) oneCharOnly stopChar:(unichar) stop
         } else if (_spacesAllowed && ch == ' ') {
             // If spaces are allowed then spaces do not need escaping with a \ before being used.
             atom = [MTMathAtomFactory atomForLatexSymbolName:@" "];
+        } else if (ch == '~') {
+            // Tilde is a non-breaking space in LaTeX; render it as an ordinary space.
+            atom = [MTMathAtomFactory atomForLatexSymbolName:@" "];
         } else {
             atom = [MTMathAtomFactory atomForCharacter:ch];
             if (!atom) {
-                // Not a recognized character
-                continue;
+                // Characters TeX silently discards: whitespace (catcode 10/5,
+                // ignored in math mode) and NUL (catcode 9). Note that other
+                // control characters are *not* spaces in TeX (form feed is \par,
+                // vertical tab is an ordinary "other" character), so they fall
+                // through to the error below, as they should.
+                if (ch == ' ' || ch == '\t' || ch == '\n' || ch == '\r' || ch == '\0') {
+                    continue;
+                }
+                // Any other unrecognized character is an error: a non-ASCII literal
+                // (e.g. π, ×, ≤) or a special character with no meaning in math mode
+                // (% is a comment, # a macro parameter, $ toggles math mode). Callers
+                // should use the corresponding LaTeX command (e.g. \pi, \%, \#).
+                // ch is a single UTF-16 code unit; we just report its value (an
+                // above-BMP character reports its leading surrogate, which is fine
+                // for an error message).
+                [self setError:MTParseErrorInvalidCharacter
+                       message:[NSString stringWithFormat:@"Unknown character U+%04X is not a valid LaTeX input character in math mode. Use the corresponding LaTeX command instead.", ch]];
+                return nil;
             }
         }
         NSAssert(atom != nil, @"Atom shouldn't be nil");
diff --git a/iosMathTests/MTMathListBuilderTest.m b/iosMathTests/MTMathListBuilderTest.m
index 4bfe8c1..24c2b95 100644
--- a/iosMathTests/MTMathListBuilderTest.m
+++ b/iosMathTests/MTMathListBuilderTest.m
@@ -69,6 +69,8 @@ - (void)tearDown
              @[ @"x \\ y", @[ @(kMTMathAtomVariable), @(kMTMathAtomOrdinary), @(kMTMathAtomVariable)], @"x\\ y"],
              // spacing
              @[ @"x \\quad y \\; z \\! q", @[ @(kMTMathAtomVariable), @(kMTMathAtomSpace), @(kMTMathAtomVariable),@(kMTMathAtomSpace), @(kMTMathAtomVariable),@(kMTMathAtomSpace), @(kMTMathAtomVariable)], @"x\\quad y\\; z\\! q"],
+             // tilde is a non-breaking space (renders as an ordinary space, same as a literal space)
+             @[ @"x~y", @[ @(kMTMathAtomVariable), @(kMTMathAtomOrdinary), @(kMTMathAtomVariable)], @"x\\ y"],
              ];
 }
 
@@ -1488,6 +1490,16 @@ - (void) testDisplayLines
              @[@"x^\\choose y", @(MTParseErrorInvalidCommand)],
              @[@"x^\\brack y", @(MTParseErrorInvalidCommand)],
              @[@"x^\\brace y", @(MTParseErrorInvalidCommand)],
+             // REN-5: non-ASCII literal characters should produce MTParseErrorInvalidCharacter
+             @[@"π", @(MTParseErrorInvalidCharacter)], // π (U+03C0)
+             @[@"3 × 4", @(MTParseErrorInvalidCharacter)], // 3 × 4
+             @[@"x ≤ y", @(MTParseErrorInvalidCharacter)], // x ≤ y
+             @[@"x 𝑎 y", @(MTParseErrorInvalidCharacter)], // above-BMP literal (U+1D44E, surrogate pair)
+             // Special characters with no meaning in math mode are errors (match LaTeX:
+             // % is a comment, # is a macro parameter, $ toggles math mode - none valid here).
+             @[@"a % b", @(MTParseErrorInvalidCharacter)],
+             @[@"a # b", @(MTParseErrorInvalidCharacter)],
+             @[@"a $ b", @(MTParseErrorInvalidCharacter)],
              ];
 };
 
@@ -1508,6 +1520,26 @@ - (void) testErrors
     }
 }
 
+// REN-5: characters TeX silently discards (whitespace catcode 10/5 and NUL
+// catcode 9) must continue to parse without error. Guards against the error
+// path swallowing legitimate whitespace.
+- (void) testIgnoredWhitespaceCharacters
+{
+    unichar nulChars[3] = { 'x', 0x0000, 'y' };
+    NSString* withNul = [NSString stringWithCharacters:nulChars length:3];
+    NSArray* inputs = @[ @"x\ty", @"x\ny", @"x\ry", withNul ];
+    for (NSString* str in inputs) {
+        NSError* error = nil;
+        MTMathList* list = [MTMathListBuilder buildFromString:str error:&error];
+        NSString* desc = [NSString stringWithFormat:@"whitespace input %@", str];
+        XCTAssertNotNil(list, @"%@", desc);
+        XCTAssertNil(error, @"%@", desc);
+        XCTAssertEqual(list.atoms.count, 2u, @"%@", desc);
+        XCTAssertEqual([list.atoms[0] type], kMTMathAtomVariable, @"%@", desc);
+        XCTAssertEqual([list.atoms[1] type], kMTMathAtomVariable, @"%@", desc);
+    }
+}
+
 // REN-6: \over inside an explicit-brace script group must still parse correctly.
 - (void) testOverInScriptBraces
 {