diff --git a/opennlp-api/src/main/java/opennlp/tools/util/normalizer/AlignedText.java b/opennlp-api/src/main/java/opennlp/tools/util/normalizer/AlignedText.java
new file mode 100644
index 000000000..dc68b1f09
--- /dev/null
+++ b/opennlp-api/src/main/java/opennlp/tools/util/normalizer/AlignedText.java
@@ -0,0 +1,58 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.tools.util.normalizer;
+
+import opennlp.tools.util.Span;
+
+/**
+ * The result of a normalization that keeps the original text alongside the normalized form and a
+ * full {@link Alignment} between them.
+ *
+ * <p>The original is the source of truth, the normalized form is the derived view tuned for
+ * matching and search, and the alignment maps spans between them through deletions, collapses, and
+ * expansions. Use
+ * {@link #toOriginalSpan(int, int)} to report a match found in the normalized form against the
+ * original.</p>
+ *
+ * @param original The untouched source text.
+ * @param normalized The normalized text.
+ * @param alignment The alignment between the normalized and original text.
+ */
+public record AlignedText(CharSequence original, String normalized, Alignment alignment) {
+
+  /**
+   * Maps a span of the normalized text back to the tightest span of the original text.
+   *
+   * @param normalizedStart The inclusive start offset in the normalized text.
+   * @param normalizedEnd The exclusive end offset in the normalized text.
+   * @return The corresponding original span.
+   */
+  public Span toOriginalSpan(int normalizedStart, int normalizedEnd) {
+    return alignment.toOriginalSpan(normalizedStart, normalizedEnd);
+  }
+
+  /**
+   * Maps a span of the original text forward to the normalized text.
+   *
+   * @param originalStart The inclusive start offset in the original text.
+   * @param originalEnd The exclusive end offset in the original text.
+   * @return The corresponding normalized span.
+   */
+  public Span toNormalizedSpan(int originalStart, int originalEnd) {
+    return alignment.toNormalizedSpan(originalStart, originalEnd);
+  }
+}
diff --git a/opennlp-api/src/main/java/opennlp/tools/util/normalizer/Alignment.java b/opennlp-api/src/main/java/opennlp/tools/util/normalizer/Alignment.java
new file mode 100644
index 000000000..0f1d47a6a
--- /dev/null
+++ b/opennlp-api/src/main/java/opennlp/tools/util/normalizer/Alignment.java
@@ -0,0 +1,293 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.tools.util.normalizer;
+
+import java.util.Arrays;
+
+import opennlp.tools.util.Span;
+
+/**
+ * A bidirectional alignment between an original text and a normalized form of it.
+ *
+ * <p>Normalization edits text in ways that move character offsets: a run of whitespace collapses to
+ * one space, a supplementary dash folds to a single ASCII hyphen, a case fold can grow text
+ * (German {@code eszett} to {@code ss}), and trimming or stripping deletes characters outright. An
+ * {@code Alignment} records those edits as a sequence of equal runs (text copied through
+ * unchanged in length) and replace runs (a block of original characters that produced a
+ * block of normalized characters), so any span in either form can be mapped to the other.</p>
+ *
+ * <p>Because it represents deletions as gaps and expansions as shared blocks (rather than storing a
+ * single original offset per normalized character, which would assume the normalized text
+ * contiguously covers the original), mapping is done
+ * span to span ({@link #toOriginalSpan(int, int)} / {@link #toNormalizedSpan(int, int)}) so a match
+ * that ends next to deleted text reports a tight span rather than over-covering the deletion. Two
+ * alignments compose with {@link #andThen(Alignment)}, which is what lets a multi-stage
+ * normalization pipeline still map a result all the way back to the original.</p>
+ *
+ * <p>Instances are immutable and thread-safe; build one with {@link Builder}.</p>
+ */
+public final class Alignment {
+
+  // For normalized character k, originalStart[k]/originalEnd[k] are the half-open original range it
+  // was produced from. Characters copied unchanged map one to one; characters from a collapse or
+  // expansion share their run's whole original range (it cannot be subdivided); deleted original
+  // characters appear as a gap that no normalized character covers.
+  private final int[] originalStart;
+  private final int[] originalEnd;
+  private final int originalLength;
+
+  private Alignment(int[] originalStart, int[] originalEnd, int originalLength) {
+    this.originalStart = originalStart;
+    this.originalEnd = originalEnd;
+    this.originalLength = originalLength;
+  }
+
+  /** {@return the length of the normalized text this alignment was built for} */
+  public int normalizedLength() {
+    return originalStart.length;
+  }
+
+  /** {@return the length of the original text this alignment was built for} */
+  public int originalLength() {
+    return originalLength;
+  }
+
+  /**
+   * Maps a half-open span of the normalized text to the tightest half-open span of the original
+   * text that produced it.
+   *
+   * @param normalizedStart The inclusive start offset, in {@code [0, normalizedLength()]}.
+   * @param normalizedEnd The exclusive end offset, in {@code [normalizedStart, normalizedLength()]}.
+   * @return The corresponding original span.
+   * @throws IndexOutOfBoundsException Thrown if the offsets are out of range or inverted.
+   */
+  public Span toOriginalSpan(int normalizedStart, int normalizedEnd) {
+    checkRange(normalizedStart, normalizedEnd, normalizedLength());
+    if (normalizedStart == normalizedEnd) {
+      final int at = normalizedStart < normalizedLength()
+          ? originalStart[normalizedStart] : originalLength;
+      return new Span(at, at);
+    }
+    return new Span(originalStart[normalizedStart], originalEnd[normalizedEnd - 1]);
+  }
+
+  /**
+   * Maps a half-open span of the original text to the half-open span of the normalized text that
+   * covers it. Original characters that were deleted map to an empty span at the point where they
+   * were removed.
+   *
+   * @param originalStartOffset The inclusive start offset, in {@code [0, originalLength()]}.
+   * @param originalEndOffset The exclusive end offset, in {@code [originalStartOffset, originalLength()]}.
+   * @return The corresponding normalized span.
+   * @throws IndexOutOfBoundsException Thrown if the offsets are out of range or inverted.
+   */
+  public Span toNormalizedSpan(int originalStartOffset, int originalEndOffset) {
+    checkRange(originalStartOffset, originalEndOffset, originalLength);
+    final int start = firstIndexEndingAfter(originalStartOffset);
+    final int end = firstIndexStartingAtOrAfter(originalEndOffset);
+    return new Span(start, Math.max(start, end));
+  }
+
+  /**
+   * Maps a normalized offset to the original offset where its character begins (start semantics).
+   * Prefer {@link #toOriginalSpan(int, int)} for mapping a match, since a single offset cannot
+   * distinguish the start and end of a span across a deletion.
+   *
+   * @param normalizedOffset An offset in {@code [0, normalizedLength()]}.
+   * @return The corresponding original offset.
+   * @throws IndexOutOfBoundsException Thrown if {@code normalizedOffset} is out of range.
+   */
+  public int toOriginalOffset(int normalizedOffset) {
+    if (normalizedOffset < 0 || normalizedOffset > normalizedLength()) {
+      throw new IndexOutOfBoundsException("normalized offset " + normalizedOffset
+          + " is outside [0, " + normalizedLength() + "]");
+    }
+    return normalizedOffset < normalizedLength() ? originalStart[normalizedOffset] : originalLength;
+  }
+
+  /**
+   * Composes this alignment with one that further normalizes this alignment's normalized text.
+   *
+   * <p>If this maps {@code original -> middle} and {@code next} maps {@code middle -> final}, the
+   * result maps {@code original -> final} directly, so a span found in the final text can be mapped
+   * straight back to the original without keeping the intermediate stages.</p>
+   *
+   * @param next The next stage, whose original side is this stage's normalized text.
+   * @return The composed alignment.
+   * @throws IllegalArgumentException Thrown if {@code next.originalLength()} does not equal this
+   *     {@code normalizedLength()} (the stages do not line up).
+   */
+  public Alignment andThen(Alignment next) {
+    if (next.originalLength != normalizedLength()) {
+      throw new IllegalArgumentException("stages do not line up: this normalizedLength="
+          + normalizedLength() + " but next originalLength=" + next.originalLength);
+    }
+    final int finalLength = next.normalizedLength();
+    final int[] starts = new int[finalLength];
+    final int[] ends = new int[finalLength];
+    for (int f = 0; f < finalLength; f++) {
+      final int middleStart = next.originalStart[f];
+      final int middleEnd = next.originalEnd[f];
+      final int start = middleStart < normalizedLength() ? originalStart[middleStart] : originalLength;
+      final int end = middleEnd > 0 ? originalEnd[middleEnd - 1] : 0;
+      starts[f] = start;
+      // Math.max keeps the original span non-inverted. When next inserted this final character
+      // (a zero-width middle range, middleStart == middleEnd) the max collapses it to a zero-width
+      // original span -- correct for every insertion except one landing strictly inside an
+      // expansion this stage produced, where the characters on either side share one atomic
+      // original block (originalEnd[middleEnd - 1] > originalStart[middleStart]) that has no
+      // interior offset to point at. There the insertion is attributed to that whole block, the
+      // only choice that keeps originalStart/originalEnd sorted so toOriginalSpan/toNormalizedSpan
+      // keep their O(log n) search; forcing it to zero-width would push originalEnd below its
+      // predecessor and corrupt the reverse mapping.
+      ends[f] = Math.max(start, end);
+    }
+    return new Alignment(starts, ends, originalLength);
+  }
+
+  // First normalized index whose original coverage ends strictly after offset (so it covers or
+  // follows offset); normalizedLength() when offset is at or past the last covered original char.
+  private int firstIndexEndingAfter(int offset) {
+    int low = 0;
+    int high = originalEnd.length;
+    while (low < high) {
+      final int mid = (low + high) >>> 1;
+      if (originalEnd[mid] > offset) {
+        high = mid;
+      } else {
+        low = mid + 1;
+      }
+    }
+    return low;
+  }
+
+  // First normalized index whose original coverage starts at or after offset.
+  private int firstIndexStartingAtOrAfter(int offset) {
+    int low = 0;
+    int high = originalStart.length;
+    while (low < high) {
+      final int mid = (low + high) >>> 1;
+      if (originalStart[mid] >= offset) {
+        high = mid;
+      } else {
+        low = mid + 1;
+      }
+    }
+    return low;
+  }
+
+  private static void checkRange(int start, int end, int length) {
+    if (start < 0 || end > length || start > end) {
+      throw new IndexOutOfBoundsException("span [" + start + ", " + end + ") is outside [0, "
+          + length + "]");
+    }
+  }
+
+  /**
+   * Builds an {@link Alignment} as the normalized text is produced, by recording each edit in order.
+   * Call {@link #equal(int)} for characters copied through unchanged and {@link #replace(int, int)}
+   * for a block that was rewritten (including deletions and insertions), then {@link #build(int)}.
+   */
+  public static final class Builder {
+
+    private static final int MAX_ARRAY_SIZE = Integer.MAX_VALUE - 8;
+
+    private int[] starts = new int[16];
+    private int[] ends = new int[16];
+    private int count;
+    private int originalCursor;
+
+    /**
+     * Records {@code charCount} characters copied through unchanged (a one to one run).
+     *
+     * @param charCount The number of UTF-16 characters; must not be negative.
+     * @return This builder.
+     */
+    public Builder equal(int charCount) {
+      if (charCount < 0) {
+        throw new IllegalArgumentException("charCount must not be negative: " + charCount);
+      }
+      for (int i = 0; i < charCount; i++) {
+        append(originalCursor, originalCursor + 1);
+        originalCursor++;
+      }
+      return this;
+    }
+
+    /**
+     * Records a rewritten block: {@code originalCount} original characters that produced
+     * {@code normalizedCount} normalized characters. Each produced character is attributed to the
+     * whole original block, since a collapse or expansion cannot be subdivided. {@code 0} for
+     * {@code normalizedCount} is a deletion; {@code 0} for {@code originalCount} is an insertion.
+     *
+     * @param originalCount The number of original characters consumed; must not be negative.
+     * @param normalizedCount The number of normalized characters produced; must not be negative.
+     * @return This builder.
+     */
+    public Builder replace(int originalCount, int normalizedCount) {
+      if (originalCount < 0 || normalizedCount < 0) {
+        throw new IllegalArgumentException("counts must not be negative: " + originalCount
+            + ", " + normalizedCount);
+      }
+      final int blockEnd = originalCursor + originalCount;
+      for (int i = 0; i < normalizedCount; i++) {
+        append(originalCursor, blockEnd);
+      }
+      originalCursor = blockEnd;
+      return this;
+    }
+
+    /**
+     * Finalizes the alignment.
+     *
+     * @param originalLength The full length of the original text.
+     * @return The immutable {@link Alignment}.
+     * @throws IllegalStateException Thrown if the recorded edits do not consume exactly
+     *     {@code originalLength} original characters (a sign that some input was not accounted for).
+     */
+    public Alignment build(int originalLength) {
+      if (originalCursor != originalLength) {
+        throw new IllegalStateException("edits consumed " + originalCursor
+            + " original characters but originalLength is " + originalLength);
+      }
+      return new Alignment(Arrays.copyOf(starts, count), Arrays.copyOf(ends, count), originalLength);
+    }
+
+    private void append(int start, int end) {
+      if (count == starts.length) {
+        grow();
+      }
+      starts[count] = start;
+      ends[count] = end;
+      count++;
+    }
+
+    // Overflow-aware 1.5x growth: never wraps to a negative capacity, degrades to a clean
+    // OutOfMemoryError at the array-size ceiling instead of NegativeArraySizeException.
+    private void grow() {
+      int newCapacity = starts.length + (starts.length >> 1);
+      if (newCapacity < 0 || newCapacity > MAX_ARRAY_SIZE) {
+        newCapacity = MAX_ARRAY_SIZE;
+      }
+      if (newCapacity <= count) {
+        throw new OutOfMemoryError("Alignment exceeds maximum size");
+      }
+      starts = Arrays.copyOf(starts, newCapacity);
+      ends = Arrays.copyOf(ends, newCapacity);
+    }
+  }
+}
diff --git a/opennlp-api/src/main/java/opennlp/tools/util/normalizer/CharClass.java b/opennlp-api/src/main/java/opennlp/tools/util/normalizer/CharClass.java
index 64e924b0c..a10751bc8 100644
--- a/opennlp-api/src/main/java/opennlp/tools/util/normalizer/CharClass.java
+++ b/opennlp-api/src/main/java/opennlp/tools/util/normalizer/CharClass.java
@@ -300,6 +300,175 @@ public String removeAll(CharSequence text) {
     return out.toString();
   }
 
+  /**
+   * Like {@link #normalize(CharSequence)} but also produces the {@link Alignment} back to the
+   * original text.
+   *
+   * @param text The text to normalize.
+   * @return The normalized text and its alignment.
+   */
+  public AlignedText normalizeAligned(CharSequence text) {
+    Objects.requireNonNull(text, "text");
+    final StringBuilder out = new StringBuilder(text.length());
+    final Alignment.Builder alignment = new Alignment.Builder();
+    final int length = text.length();
+    int i = 0;
+    while (i < length) {
+      final int codePoint = Character.codePointAt(text, i);
+      final int charCount = Character.charCount(codePoint);
+      if (members.contains(codePoint)) {
+        out.appendCodePoint(replacement);
+        alignment.replace(charCount, Character.charCount(replacement));
+      } else {
+        out.appendCodePoint(codePoint);
+        alignment.equal(charCount);
+      }
+      i += charCount;
+    }
+    return new AlignedText(text, out.toString(), alignment.build(length));
+  }
+
+  /**
+   * Like {@link #collapse(CharSequence)} but also produces the {@link Alignment} back to the
+   * original text. Each collapsed run maps to the run's whole original extent.
+   *
+   * @param text The text to collapse.
+   * @return The collapsed text and its alignment.
+   */
+  public AlignedText collapseAligned(CharSequence text) {
+    Objects.requireNonNull(text, "text");
+    final StringBuilder out = new StringBuilder(text.length());
+    final Alignment.Builder alignment = new Alignment.Builder();
+    final int length = text.length();
+    int i = 0;
+    while (i < length) {
+      final int codePoint = Character.codePointAt(text, i);
+      if (members.contains(codePoint)) {
+        final int runEnd = skipRun(text, i);
+        out.appendCodePoint(replacement);
+        alignment.replace(runEnd - i, Character.charCount(replacement));
+        i = runEnd;
+      } else {
+        final int charCount = Character.charCount(codePoint);
+        out.appendCodePoint(codePoint);
+        alignment.equal(charCount);
+        i += charCount;
+      }
+    }
+    return new AlignedText(text, out.toString(), alignment.build(length));
+  }
+
+  /**
+   * Like {@link #collapsePreserving(CharSequence, CodePointSet, int)} but also produces the
+   * {@link Alignment} back to the original text.
+   *
+   * @param text The text to collapse.
+   * @param keep The member code points whose presence in a run preserves structure.
+   * @param keepReplacement The replacement emitted for a run that contains a {@code keep} member.
+   * @return The collapsed text and its alignment.
+   * @throws IllegalArgumentException Thrown if {@code keepReplacement} is not a valid code point.
+   */
+  public AlignedText collapsePreservingAligned(CharSequence text, CodePointSet keep,
+      int keepReplacement) {
+    Objects.requireNonNull(text, "text");
+    Objects.requireNonNull(keep, "keep");
+    requireValidCodePoint(keepReplacement);
+    final StringBuilder out = new StringBuilder(text.length());
+    final Alignment.Builder alignment = new Alignment.Builder();
+    final int length = text.length();
+    int i = 0;
+    while (i < length) {
+      final int codePoint = Character.codePointAt(text, i);
+      if (members.contains(codePoint)) {
+        boolean preserve = keep.contains(codePoint);
+        int j = i + Character.charCount(codePoint);
+        while (j < length) {
+          final int next = Character.codePointAt(text, j);
+          if (!members.contains(next)) {
+            break;
+          }
+          preserve |= keep.contains(next);
+          j += Character.charCount(next);
+        }
+        final int emitted = preserve ? keepReplacement : replacement;
+        out.appendCodePoint(emitted);
+        alignment.replace(j - i, Character.charCount(emitted));
+        i = j;
+      } else {
+        final int charCount = Character.charCount(codePoint);
+        out.appendCodePoint(codePoint);
+        alignment.equal(charCount);
+        i += charCount;
+      }
+    }
+    return new AlignedText(text, out.toString(), alignment.build(length));
+  }
+
+  /**
+   * Like {@link #trim(CharSequence)} but also produces the {@link Alignment} back to the original
+   * text. The trimmed leading and trailing members appear as deletions, so a span never reports
+   * through them.
+   *
+   * @param text The text to trim.
+   * @return The trimmed text and its alignment.
+   */
+  public AlignedText trimAligned(CharSequence text) {
+    Objects.requireNonNull(text, "text");
+    final int length = text.length();
+    int start = 0;
+    while (start < length) {
+      final int codePoint = Character.codePointAt(text, start);
+      if (!members.contains(codePoint)) {
+        break;
+      }
+      start += Character.charCount(codePoint);
+    }
+    int end = length;
+    while (end > start) {
+      final int codePoint = Character.codePointBefore(text, end);
+      if (!members.contains(codePoint)) {
+        break;
+      }
+      end -= Character.charCount(codePoint);
+    }
+    final Alignment.Builder alignment = new Alignment.Builder();
+    if (start > 0) {
+      alignment.replace(start, 0);
+    }
+    alignment.equal(end - start);
+    if (end < length) {
+      alignment.replace(length - end, 0);
+    }
+    return new AlignedText(text, text.subSequence(start, end).toString(), alignment.build(length));
+  }
+
+  /**
+   * Like {@link #removeAll(CharSequence)} but also produces the {@link Alignment} back to the
+   * original text. Every removed member appears as a deletion, so a span never reports through one.
+   *
+   * @param text The text to filter.
+   * @return The filtered text and its alignment.
+   */
+  public AlignedText removeAllAligned(CharSequence text) {
+    Objects.requireNonNull(text, "text");
+    final StringBuilder out = new StringBuilder(text.length());
+    final Alignment.Builder alignment = new Alignment.Builder();
+    final int length = text.length();
+    int i = 0;
+    while (i < length) {
+      final int codePoint = Character.codePointAt(text, i);
+      final int charCount = Character.charCount(codePoint);
+      if (members.contains(codePoint)) {
+        alignment.replace(charCount, 0);
+      } else {
+        out.appendCodePoint(codePoint);
+        alignment.equal(charCount);
+      }
+      i += charCount;
+    }
+    return new AlignedText(text, out.toString(), alignment.build(length));
+  }
+
   /**
    * Applies a per-code-point substitution: each code point for which {@code substitution} returns a
    * non-null string is replaced by that string, and the rest are copied through. This is the shared,
@@ -329,6 +498,37 @@ public static String substitute(CharSequence text, IntFunction<String> substitut
     return out.toString();
   }
 
+  /**
+   * Like {@link #substitute(CharSequence, IntFunction)} but also produces the {@link Alignment} back
+   * to the original text. Each replaced code point maps to its replacement string as one block.
+   *
+   * @param text The text to transform.
+   * @param substitution The replacement for a code point, or {@code null} to copy it through.
+   * @return The transformed text and its alignment.
+   */
+  public static AlignedText substituteAligned(CharSequence text, IntFunction<String> substitution) {
+    Objects.requireNonNull(text, "text");
+    Objects.requireNonNull(substitution, "substitution");
+    final StringBuilder out = new StringBuilder(text.length());
+    final Alignment.Builder alignment = new Alignment.Builder();
+    final int length = text.length();
+    int i = 0;
+    while (i < length) {
+      final int codePoint = Character.codePointAt(text, i);
+      final int charCount = Character.charCount(codePoint);
+      final String replacement = substitution.apply(codePoint);
+      if (replacement != null) {
+        out.append(replacement);
+        alignment.replace(charCount, replacement.length());
+      } else {
+        out.appendCodePoint(codePoint);
+        alignment.equal(charCount);
+      }
+      i += charCount;
+    }
+    return new AlignedText(text, out.toString(), alignment.build(length));
+  }
+
   // Returns the offset just past the maximal run of members starting at runStart.
   private int skipRun(CharSequence text, int runStart) {
     final int length = text.length();
diff --git a/opennlp-api/src/main/java/opennlp/tools/util/normalizer/OffsetAwareNormalizer.java b/opennlp-api/src/main/java/opennlp/tools/util/normalizer/OffsetAwareNormalizer.java
new file mode 100644
index 000000000..e812d2864
--- /dev/null
+++ b/opennlp-api/src/main/java/opennlp/tools/util/normalizer/OffsetAwareNormalizer.java
@@ -0,0 +1,49 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.tools.util.normalizer;
+
+/**
+ * A {@link CharSequenceNormalizer} that can additionally report the {@link Alignment} from its
+ * normalized output back to the input, so a span found in the normalized text maps to the exact
+ * character offsets of the original.
+ *
+ * <p>Length-changing folds move offsets: collapsing a run of whitespace, folding a supplementary
+ * dash to one ASCII hyphen, or stripping invisible controls all shift every later character. A rung
+ * that performs such a fold over the cursor-based {@link CharClass} engine can record those edits
+ * and expose them through {@link #normalizeAligned(CharSequence)}. A rung that delegates to
+ * {@link java.text.Normalizer} (NFC/NFKC) or to a stemmer cannot report its edits, so it does not
+ * implement this interface; that is a deliberate capability split rather than an oversight.</p>
+ *
+ * <p>{@code TextNormalizer.Builder.buildAligned()} composes a chain of these into a single
+ * offset-aware pipeline whose {@link AlignedText} maps a match all the way back to the original
+ * input. An interface-typed caller tests for the capability
+ * ({@code normalizer instanceof OffsetAwareNormalizer}) instead of depending on a concrete rung,
+ * the same plain {@code instanceof} pattern used by
+ * {@link opennlp.tools.namefind.OffsetMappingNameFinder} rather than reflection.</p>
+ */
+public interface OffsetAwareNormalizer extends CharSequenceNormalizer {
+
+  /**
+   * Normalizes {@code text} and returns the result together with the {@link Alignment} back to the
+   * input. The normalized text is identical to {@link #normalize(CharSequence)}: that is,
+   * {@code normalizeAligned(text).normalized()} equals {@code normalize(text).toString()}.
+   *
+   * @param text The {@link CharSequence} to normalize.
+   * @return The normalized text paired with its alignment to {@code text}.
+   */
+  AlignedText normalizeAligned(CharSequence text);
+}
diff --git a/opennlp-api/src/test/java/opennlp/tools/util/normalizer/AlignmentTest.java b/opennlp-api/src/test/java/opennlp/tools/util/normalizer/AlignmentTest.java
new file mode 100644
index 000000000..07c92de0f
--- /dev/null
+++ b/opennlp-api/src/test/java/opennlp/tools/util/normalizer/AlignmentTest.java
@@ -0,0 +1,258 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ +package opennlp.tools.util.normalizer; + +import org.junit.jupiter.api.Test; + +import opennlp.tools.util.Span; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertThrows; + +public class AlignmentTest { + + private static void assertSpan(int start, int end, Span span) { + assertEquals(start, span.getStart(), "start"); + assertEquals(end, span.getEnd(), "end"); + } + + @Test + void testIdentityMapsOneToOne() { + final Alignment a = new Alignment.Builder().equal(3).build(3); // "abc" unchanged + assertEquals(3, a.normalizedLength()); + assertEquals(3, a.originalLength()); + assertSpan(0, 3, a.toOriginalSpan(0, 3)); + assertSpan(1, 2, a.toOriginalSpan(1, 2)); + } + + @Test + void testCollapsedRunMapsToWholeRun() { + // "ab " -> "ab " : keep "ab", collapse two spaces into one. + final Alignment a = new Alignment.Builder().equal(2).replace(2, 1).build(4); + assertSpan(0, 2, a.toOriginalSpan(0, 2)); // "ab" + assertSpan(2, 4, a.toOriginalSpan(2, 3)); // the collapsed space covers both originals + assertSpan(0, 4, a.toOriginalSpan(0, 3)); + } + + @Test + void testInteriorDeletionDoesNotOverCover() { + // "a b c" -> "abc" : the two spaces are deleted. A per-character offset map over-covers here. + final Alignment a = new Alignment.Builder() + .equal(1).replace(1, 0).equal(1).replace(1, 0).equal(1).build(5); + assertEquals(3, a.normalizedLength()); + assertEquals(5, a.originalLength()); + assertSpan(0, 1, a.toOriginalSpan(0, 1)); // "a" + assertSpan(2, 3, a.toOriginalSpan(1, 2)); // "b" -> [2,3), NOT [2,4) + assertSpan(4, 5, a.toOriginalSpan(2, 3)); // "c" + assertSpan(0, 5, a.toOriginalSpan(0, 3)); // whole text + } + + @Test + void testTrailingDeletionDoesNotOverCover() { + // "ab " -> "ab" : strip trailing spaces. A match at the end must not absorb them. 
+ final Alignment a = new Alignment.Builder().equal(2).replace(2, 0).build(4); + assertSpan(0, 2, a.toOriginalSpan(0, 2)); // "ab" -> [0,2), NOT [0,4) + assertSpan(1, 2, a.toOriginalSpan(1, 2)); // "b" -> [1,2) + } + + @Test + void testExpansionSharesTheSingleSource() { + // "aßb" -> "assb" : the eszett expands to two characters that both come from it. + final Alignment a = new Alignment.Builder().equal(1).replace(1, 2).equal(1).build(3); + assertEquals(4, a.normalizedLength()); + assertSpan(1, 2, a.toOriginalSpan(1, 3)); // "ss" -> the single "ß" + assertSpan(1, 2, a.toOriginalSpan(1, 2)); // first "s" + assertSpan(1, 2, a.toOriginalSpan(2, 3)); // second "s" + assertSpan(2, 3, a.toOriginalSpan(3, 4)); // "b" + } + + @Test + void testReverseMappingAndDeletionsMapToEmptySpans() { + final Alignment a = new Alignment.Builder() + .equal(1).replace(1, 0).equal(1).replace(1, 0).equal(1).build(5); // "a b c" -> "abc" + assertSpan(1, 2, a.toNormalizedSpan(2, 3)); // original "b" -> normalized "b" + assertSpan(1, 1, a.toNormalizedSpan(1, 2)); // deleted space -> empty normalized span + assertSpan(0, 3, a.toNormalizedSpan(0, 5)); // whole original -> whole normalized + } + + @Test + void testAndThenComposesTwoStages() { + // Stage 1: "a b" -> "a b" (collapse two spaces). Stage 2: "a b" -> "a-b" (space to dash). 
+ final Alignment whitespace = new Alignment.Builder().equal(1).replace(2, 1).equal(1).build(4); + final Alignment dash = new Alignment.Builder().equal(1).replace(1, 1).equal(1).build(3); + final Alignment composed = whitespace.andThen(dash); + + assertEquals(4, composed.originalLength()); + assertEquals(3, composed.normalizedLength()); + assertSpan(0, 1, composed.toOriginalSpan(0, 1)); // "a" + assertSpan(1, 3, composed.toOriginalSpan(1, 2)); // "-" maps back to the original " " + assertSpan(3, 4, composed.toOriginalSpan(2, 3)); // "b" + assertSpan(0, 4, composed.toOriginalSpan(0, 3)); + } + + @Test + void testAndThenRejectsMismatchedStages() { + final Alignment first = new Alignment.Builder().equal(2).build(2); // normalizedLength 2 + final Alignment second = new Alignment.Builder().equal(3).build(3); // originalLength 3 + assertThrows(IllegalArgumentException.class, () -> first.andThen(second)); + } + + @Test + void testAllDeletedProducesEmptyNormalized() { + final Alignment a = new Alignment.Builder().replace(2, 0).build(2); // " " -> "" + assertEquals(0, a.normalizedLength()); + assertEquals(2, a.originalLength()); + assertSpan(0, 0, a.toNormalizedSpan(0, 2)); // all original deleted -> empty normalized span + } + + @Test + void testBuilderRejectsWrongOriginalLength() { + assertThrows(IllegalStateException.class, () -> new Alignment.Builder().equal(2).build(3)); + } + + @Test + void testBuilderRejectsNegativeCounts() { + assertThrows(IllegalArgumentException.class, () -> new Alignment.Builder().equal(-1)); + assertThrows(IllegalArgumentException.class, () -> new Alignment.Builder().replace(-1, 0)); + } + + @Test + void testToOriginalSpanRejectsOutOfRange() { + final Alignment a = new Alignment.Builder().equal(2).build(2); + assertThrows(IndexOutOfBoundsException.class, () -> a.toOriginalSpan(-1, 1)); + assertThrows(IndexOutOfBoundsException.class, () -> a.toOriginalSpan(0, 3)); + assertThrows(IndexOutOfBoundsException.class, () -> a.toOriginalSpan(2, 1)); + } 
+
+  @Test
+  void testToOriginalOffsetConvenience() {
+    final Alignment a = new Alignment.Builder().equal(2).replace(2, 1).build(4); // "ab  "->"ab "
+    assertEquals(0, a.toOriginalOffset(0));
+    assertEquals(2, a.toOriginalOffset(2)); // start of the collapsed space
+    assertEquals(4, a.toOriginalOffset(3)); // end sentinel -> original length
+  }
+
+  @Test
+  void testBuilderGrowsBeyondInitialCapacity() {
+    // 20 equal chars force the builder past its initial 16-entry buffers (exercises grow()).
+    final Alignment a = new Alignment.Builder().equal(20).build(20);
+    assertEquals(20, a.normalizedLength());
+    assertEquals(20, a.originalLength());
+    assertSpan(0, 20, a.toOriginalSpan(0, 20));
+    assertSpan(17, 18, a.toOriginalSpan(17, 18));
+  }
+
+  @Test
+  void testAndThenChainsThreeStages() {
+    // "a  b" -> "a b" (collapse) -> "a-b" (space->dash) -> "a_b" (dash->underscore).
+    final Alignment s1 = new Alignment.Builder().equal(1).replace(2, 1).equal(1).build(4);
+    final Alignment s2 = new Alignment.Builder().equal(1).replace(1, 1).equal(1).build(3);
+    final Alignment s3 = new Alignment.Builder().equal(1).replace(1, 1).equal(1).build(3);
+    final Alignment composed = s1.andThen(s2).andThen(s3);
+
+    assertEquals(4, composed.originalLength());
+    assertEquals(3, composed.normalizedLength());
+    assertSpan(0, 1, composed.toOriginalSpan(0, 1)); // a
+    assertSpan(1, 3, composed.toOriginalSpan(1, 2)); // "_" maps all the way back to the "  "
+    assertSpan(3, 4, composed.toOriginalSpan(2, 3)); // b
+  }
+
+  @Test
+  void testAndThenHandlesLeadingInsertionInNextStage() {
+    // Exercises the andThen branch where the next stage's character covers zero middle characters
+    // at offset 0 (a leading insertion: originalEnd == 0). The result must be a zero-width original
+    // span at 0, and the rest of the mapping must stay correct.
+    final Alignment first = new Alignment.Builder().equal(2).build(2); // "ab" unchanged
+    final Alignment next = new Alignment.Builder().replace(0, 1).equal(2).build(2); // "ab" -> "Xab"
+    final Alignment composed = first.andThen(next);
+
+    assertEquals(2, composed.originalLength());
+    assertEquals(3, composed.normalizedLength());
+    assertSpan(0, 0, composed.toOriginalSpan(0, 1)); // inserted "X" -> zero-width span at original 0
+    assertSpan(0, 1, composed.toOriginalSpan(1, 2)); // "a"
+    assertSpan(1, 2, composed.toOriginalSpan(2, 3)); // "b"
+    assertSpan(0, 2, composed.toOriginalSpan(0, 3)); // whole normalized -> whole original
+  }
+
+  @Test
+  void testAndThenHandlesInteriorInsertionInCopiedRegion() {
+    // An insertion in the next stage that is NOT at offset 0 and lands in a one-to-one (copied)
+    // region must still map to a zero-width original span at the insertion point: the andThen branch
+    // where middleStart == middleEnd with middleEnd > 0. Without correct handling this is exactly the
+    // case that would misattribute the inserted character to a neighbouring original character.
+    final Alignment first = new Alignment.Builder().equal(3).build(3); // "abc"
+    final Alignment next = new Alignment.Builder().equal(1).replace(0, 1).equal(2).build(3); // "abc"->"aXbc"
+    final Alignment composed = first.andThen(next);
+
+    assertEquals(3, composed.originalLength());
+    assertEquals(4, composed.normalizedLength());
+    assertSpan(0, 1, composed.toOriginalSpan(0, 1)); // "a"
+    assertSpan(1, 1, composed.toOriginalSpan(1, 2)); // inserted "X" -> zero-width span at original 1
+    assertSpan(1, 2, composed.toOriginalSpan(2, 3)); // "b"
+    assertSpan(2, 3, composed.toOriginalSpan(3, 4)); // "c"
+    assertSpan(0, 3, composed.toOriginalSpan(0, 4)); // whole normalized -> whole original
+  }
+
+  @Test
+  void testAndThenInsertionInsideExpansionStaysConsistent() {
+    // The hard case: stage 1 expands "ss" from one original character, then stage 2 inserts a
+    // character BETWEEN the two produced characters. The two halves of an expansion share one atomic
+    // original block ([1, 2)), which has no interior offset, so the inserted character is attributed
+    // to that whole block rather than a zero-width point. That is the only mapping that keeps
+    // originalStart/originalEnd sorted, so BOTH directions still resolve correctly -- a zero-width
+    // mapping here would push originalEnd below its predecessor and corrupt the reverse search.
+    // stage 1: "aXb" -> "assb" (X expands to "ss"); stage 2: "assb" -> "asYsb" (insert Y between).
+    final Alignment expand = new Alignment.Builder().equal(1).replace(1, 2).equal(1).build(3);
+    final Alignment insert = new Alignment.Builder().equal(2).replace(0, 1).equal(2).build(4);
+    final Alignment composed = expand.andThen(insert);
+
+    assertEquals(3, composed.originalLength());
+    assertEquals(5, composed.normalizedLength());
+    assertSpan(0, 1, composed.toOriginalSpan(0, 1)); // "a"
+    assertSpan(1, 2, composed.toOriginalSpan(1, 2)); // first "s" -> the expanded original char
+    assertSpan(1, 2, composed.toOriginalSpan(2, 3)); // inserted char -> attributed to the atomic block
+    assertSpan(1, 2, composed.toOriginalSpan(3, 4)); // second "s" -> the expanded original char
+    assertSpan(2, 3, composed.toOriginalSpan(4, 5)); // "b"
+    assertSpan(0, 3, composed.toOriginalSpan(0, 5)); // whole normalized -> whole original
+
+    // Reverse direction stays correct because the start/end arrays remain sorted: the expanded
+    // original character maps to its full normalized footprint (the two halves plus the insertion).
+    assertSpan(1, 4, composed.toNormalizedSpan(1, 2)); // expanded char -> "sYs"
+    assertSpan(0, 1, composed.toNormalizedSpan(0, 1)); // "a"
+    assertSpan(4, 5, composed.toNormalizedSpan(2, 3)); // "b"
+  }
+
+  @Test
+  void testToNormalizedSpanDoesNotOverCoverAcrossDeletions() {
+    // "a  b" -> "ab" : the two interior spaces are deleted. Forward mapping a span that ends inside
+    // the deleted run must stop at the last kept character rather than over-covering into "b".
+    final Alignment a = new Alignment.Builder().equal(1).replace(2, 0).equal(1).build(4);
+    assertEquals(2, a.normalizedLength());
+    assertSpan(0, 1, a.toNormalizedSpan(0, 3)); // "a" plus the two deleted spaces -> just "a"
+    assertSpan(1, 1, a.toNormalizedSpan(1, 3)); // only the deleted spaces -> empty normalized span
+    assertSpan(0, 2, a.toNormalizedSpan(0, 4)); // whole original -> whole normalized
+    assertSpan(1, 2, a.toNormalizedSpan(3, 4)); // "b"
+  }
+
+  @Test
+  void testToNormalizedSpanAcrossExpansion() {
+    final Alignment a = new Alignment.Builder().equal(1).replace(1, 2).equal(1).build(3); // ß->ss
+    assertSpan(1, 3, a.toNormalizedSpan(1, 2)); // original "ß" -> the two-char "ss"
+    assertSpan(0, 1, a.toNormalizedSpan(0, 1)); // a
+    assertSpan(3, 4, a.toNormalizedSpan(2, 3)); // b
+  }
+}
diff --git a/opennlp-api/src/test/java/opennlp/tools/util/normalizer/CharClassTest.java b/opennlp-api/src/test/java/opennlp/tools/util/normalizer/CharClassTest.java
index 76911a34d..052350d12 100644
--- a/opennlp-api/src/test/java/opennlp/tools/util/normalizer/CharClassTest.java
+++ b/opennlp-api/src/test/java/opennlp/tools/util/normalizer/CharClassTest.java
@@ -212,4 +212,187 @@ void testOfRejectsInvalidReplacement() {
         () -> CharClass.of(CodePointSet.of(0x20), Character.MAX_CODE_POINT + 1));
   }
 
+  // --- aligned variants (Alignment / AlignedText) ------------------------------------------
+
+  private static void assertSpan(int start, int end, Span span) {
+    assertEquals(start, span.getStart(), "start");
+    assertEquals(end, span.getEnd(), "end");
+  }
+
+  @Test
+  void testCollapseAlignedMapsRunToWholeExtent() {
+    // The input needs a two-space run: the assertions below place the collapsed space over
+    // original [1,3) and "b" at original [3,4).
+    final AlignedText at = WS.collapseAligned("a  b");
+    assertEquals("a b", at.normalized());
+    assertSpan(0, 1, at.toOriginalSpan(0, 1)); // a
+    assertSpan(1, 3, at.toOriginalSpan(1, 2)); // the collapsed space covers both originals
+    assertSpan(3, 4, at.toOriginalSpan(2, 3)); // b
+  }
+
+  @Test
+  void testRemoveAllAlignedDoesNotOverCover() {
+    final AlignedText at = WS.removeAllAligned("a b c");
+    assertEquals("abc", at.normalized());
+    assertSpan(2, 3, at.toOriginalSpan(1, 2)); // "b" -> [2,3), not [2,4)
+    assertSpan(0, 5, at.toOriginalSpan(0, 3));
+  }
+
+  @Test
+  void testTrimAlignedDropsEdgesWithoutOverCovering() {
+    // Two spaces on each side: the assertions expect an original length of 6 and "ab" at [2,4).
+    final AlignedText at = WS.trimAligned("  ab  ");
+    assertEquals("ab", at.normalized());
+    assertEquals(6, at.alignment().originalLength());
+    assertSpan(2, 4, at.toOriginalSpan(0, 2)); // "ab" sits at original [2,4)
+    assertSpan(3, 4, at.toOriginalSpan(1, 2)); // "b"
+  }
+
+  @Test
+  void testCollapsePreservingAlignedKeepsLineBreak() {
+    final AlignedText at = WS.collapsePreservingAligned("a\n\n\t\tb", lineBreaks(), '\n');
+    assertEquals("a\nb", at.normalized());
+    assertSpan(1, 5, at.toOriginalSpan(1, 2)); // the preserved newline covers the whole run
+  }
+
+  @Test
+  void testNormalizeAlignedAcrossSupplementaryDash() {
+    final AlignedText at = DASH.normalizeAligned("x" + YEZIDI_HYPHEN + "y");
+    assertEquals("x-y", at.normalized());
+    assertSpan(0, 1, at.toOriginalSpan(0, 1)); // x
+    assertSpan(1, 3, at.toOriginalSpan(1, 2)); // "-" maps back to the two-char Yezidi hyphen
+    assertSpan(3, 4, at.toOriginalSpan(2, 3)); // y
+  }
+
+  // --- aligned edge cases (restore + extend the deleted *Mapped coverage) ------------------
+
+  @Test
+  void testCollapseAlignedAcrossMixedUnicodeWhitespaceRun() {
+    final AlignedText at = WS.collapseAligned("a" + NBSP + IDEOGRAPHIC + cp(0x2002) + "b");
+    assertEquals("a b", at.normalized());
+    assertSpan(1, 4, at.toOriginalSpan(1, 2)); // the one space covers the three-char ws run
+    assertSpan(4, 5, at.toOriginalSpan(2, 3)); // b
+  }
+
+  @Test
+  void testCollapseAlignedAcrossTabRun() {
+    final AlignedText at = WS.collapseAligned("a\t\t\t\t\tb");
+    assertEquals("a b", at.normalized());
+    assertSpan(1, 6, at.toOriginalSpan(1, 2)); // five tabs collapse to one space
+    assertSpan(6, 7, at.toOriginalSpan(2, 3));
+  }
+
+  @Test
+  void testCollapseAlignedAcrossNewlineRun() {
+    final AlignedText at = WS.collapseAligned("a\r\n\tb");
+    assertEquals("a b", at.normalized());
+    assertSpan(1, 4, at.toOriginalSpan(1, 2));
+  }
+
+  @Test
+  void testCollapseAlignedEmptySingleAndAllWhitespace() {
+    assertEquals("", WS.collapseAligned("").normalized());
+    assertEquals(0, WS.collapseAligned("").alignment().normalizedLength());
+
+    final AlignedText single = WS.collapseAligned("a");
+    assertEquals("a", single.normalized());
+    assertSpan(0, 1, single.toOriginalSpan(0, 1));
+
+    final AlignedText allWs = WS.collapseAligned("\t\t\t");
+    assertEquals(" ", allWs.normalized()); // all whitespace collapses to one space, not empty
+    assertSpan(0, 3, allWs.toOriginalSpan(0, 1));
+  }
+
+  @Test
+  void testCollapseAlignedKeepsSurrogatePairOffsets() {
+    final AlignedText at = WS.collapseAligned(GRINNING_FACE + "\t\tb");
+    assertEquals(GRINNING_FACE + " b", at.normalized());
+    assertSpan(0, 2, at.toOriginalSpan(0, 2)); // the emoji occupies two original chars
+    assertSpan(2, 4, at.toOriginalSpan(2, 3)); // the collapsed tabs
+    assertSpan(4, 5, at.toOriginalSpan(3, 4)); // b
+  }
+
+  @Test
+  void testNormalizeAlignedIsIdentityWhenNothingMatches() {
+    final AlignedText at = WS.normalizeAligned("abc");
+    assertEquals("abc", at.normalized());
+    for (int i = 0; i < 3; i++) {
+      assertSpan(i, i + 1, at.toOriginalSpan(i, i + 1));
+    }
+  }
+
+  @Test
+  void testNormalizeAlignedPreservesSupplementaryNonMember() {
+    final AlignedText at = WS.normalizeAligned("a" + GRINNING_FACE + "b");
+    assertEquals("a" + GRINNING_FACE + "b", at.normalized());
+    assertSpan(1, 3, at.toOriginalSpan(1, 3)); // the emoji passes through unchanged
+  }
+
+  @Test
+  void testNormalizeAlignedExpandsToSupplementaryReplacement() {
+    // A BMP member replaced by a supplementary code point grows by one char (1 -> 2).
+    final CharClass toPenguin = CharClass.of(CodePointSet.of(' '), 0x1F427);
+    final AlignedText at = toPenguin.normalizeAligned("a b");
+    assertEquals("a" + cp(0x1F427) + "b", at.normalized());
+    assertSpan(0, 1, at.toOriginalSpan(0, 1)); // a
+    assertSpan(1, 2, at.toOriginalSpan(1, 3)); // both penguin halves come from the one space
+    assertSpan(2, 3, at.toOriginalSpan(3, 4)); // b
+  }
+
+  @Test
+  void testRemoveAllAlignedLeadingAndTrailingDeletions() {
+    final AlignedText at = WS.removeAllAligned(" a b ");
+    assertEquals("ab", at.normalized());
+    assertSpan(1, 2, at.toOriginalSpan(0, 1)); // a (leading space deleted)
+    assertSpan(3, 4, at.toOriginalSpan(1, 2)); // b (trailing space deleted, not over-covered)
+  }
+
+  @Test
+  void testTrimAlignedAllWhitespaceIsEmpty() {
+    final AlignedText at = WS.trimAligned("\t\t");
+    assertEquals("", at.normalized());
+    assertEquals(0, at.alignment().normalizedLength());
+    assertEquals(2, at.alignment().originalLength());
+  }
+
+  @Test
+  void testCollapsePreservingAlignedRunWithoutKeepCollapsesToReplacement() {
+    final AlignedText at = WS.collapsePreservingAligned("a \t b", lineBreaks(), '\n');
+    assertEquals("a b", at.normalized()); // no line break in the run -> plain space
+    assertSpan(1, 4, at.toOriginalSpan(1, 2));
+  }
+
+  // Every aligned operation must produce exactly the same string as its plain counterpart; only the
+  // alignment is extra. This pins that contract across a battery of inputs so the two code paths
+  // cannot drift apart.
+  @Test
+  void testAlignedOperationsAgreeWithPlainOutput() {
+    final CodePointSet keep = lineBreaks(); // keep-set handed to both collapsePreserving variants
+    final String[] inputs = {
+        "", // empty input
+        "abc", // no whitespace and no dash
+        " a b ", // leading, interior and trailing whitespace
+        "a" + NBSP + IDEOGRAPHIC + "b", // run built from the NBSP and IDEOGRAPHIC constants
+        "a\t\t\t\t\tb", // tab run
+        "a\r\n\tb", // CR, LF and tab in one run
+        "\n\nabc", // leading line feeds
+        " ", // whitespace only
+        GRINNING_FACE + "\t\tb", // GRINNING_FACE constant ahead of a tab run
+        "x" + YEZIDI_HYPHEN + YEZIDI_HYPHEN + "y", // two adjacent YEZIDI_HYPHEN constants
+        "well" + EM_DASH + EN_DASH + "known", // EM_DASH directly followed by EN_DASH
+        "5" + MINUS_SIGN + "3", // MINUS_SIGN constant between digits
+    };
+    // Both classes see every input; each aligned variant is compared with its plain counterpart.
+    for (final CharClass charClass : new CharClass[] {WS, DASH}) {
+      for (final String input : inputs) {
+        assertEquals(charClass.normalize(input), charClass.normalizeAligned(input).normalized(),
+            "normalize vs normalizeAligned for [" + input + "]");
+        assertEquals(charClass.collapse(input), charClass.collapseAligned(input).normalized(),
+            "collapse vs collapseAligned for [" + input + "]");
+        assertEquals(charClass.trim(input), charClass.trimAligned(input).normalized(),
+            "trim vs trimAligned for [" + input + "]");
+        assertEquals(charClass.removeAll(input), charClass.removeAllAligned(input).normalized(),
+            "removeAll vs removeAllAligned for [" + input + "]");
+        assertEquals(charClass.collapsePreserving(input, keep, '\n'),
+            charClass.collapsePreservingAligned(input, keep, '\n').normalized(),
+            "collapsePreserving vs collapsePreservingAligned for [" + input + "]");
+      }
+    }
+  }
 }
diff --git a/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/AlignedAggregateCharSequenceNormalizer.java b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/AlignedAggregateCharSequenceNormalizer.java
new file mode 100644
index 000000000..f57ddc29b
--- /dev/null
+++ b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/AlignedAggregateCharSequenceNormalizer.java
@@ -0,0 +1,67 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.tools.util.normalizer;
+
+/**
+ * An {@link OffsetAwareNormalizer} that applies a chain of offset-aware rungs in order and composes
+ * their per-stage {@link Alignment}s with {@link Alignment#andThen(Alignment)}, so the result maps a
+ * span found in the fully normalized text back to the original input through every stage.
+ *
+ * <p>Produced by {@code TextNormalizer.Builder.buildAligned()}, which validates that every rung is
+ * offset-aware before constructing this.</p>
+ */
+final class AlignedAggregateCharSequenceNormalizer implements OffsetAwareNormalizer {
+
+  private static final long serialVersionUID = 3056944120186103477L;
+
+  private final OffsetAwareNormalizer[] steps; // rungs in application order; never written after construction
+
+  AlignedAggregateCharSequenceNormalizer(OffsetAwareNormalizer[] steps) {
+    this.steps = steps.clone(); // defensive copy: later writes to the caller's array cannot alter the chain
+  }
+
+  @Override
+  public CharSequence normalize(CharSequence text) {
+    CharSequence result = text;
+    for (final OffsetAwareNormalizer step : steps) {
+      result = step.normalize(result); // each rung consumes the previous rung's output
+    }
+    return result;
+  }
+
+  @Override
+  public AlignedText normalizeAligned(CharSequence text) {
+    if (steps.length == 0) {
+      // Identity pipeline: use one String for both sides so the alignment's lengths cannot diverge
+      // from the stored original for a CharSequence whose length() differs from its toString().
+      final String identity = text.toString();
+      return new AlignedText(identity, identity,
+          new Alignment.Builder().equal(identity.length()).build(identity.length()));
+    }
+    // Normalize the input to a String once so the stored original and the per-stage alignment
+    // lengths agree even for a CharSequence whose length() differs from its toString().
+    final String input = text.toString();
+    AlignedText stage = steps[0].normalizeAligned(input);
+    Alignment alignment = stage.alignment();
+    for (int i = 1; i < steps.length; i++) {
+      final AlignedText next = steps[i].normalizeAligned(stage.normalized());
+      alignment = alignment.andThen(next.alignment()); // extend the running mapping by one more stage
+      stage = next;
+    }
+    return new AlignedText(input, stage.normalized(), alignment);
+  }
+}
diff --git a/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/BulletCharSequenceNormalizer.java b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/BulletCharSequenceNormalizer.java
index 9d1d63304..84476bf81 100644
--- a/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/BulletCharSequenceNormalizer.java
+++ b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/BulletCharSequenceNormalizer.java
@@ -25,7 +25,7 @@
  * because it is a letter in Catalan ({@code l..l}) and other orthographies; only characters that
  * are unambiguously list bullets are replaced.

  */
-public class BulletCharSequenceNormalizer implements CharSequenceNormalizer {
+public class BulletCharSequenceNormalizer implements OffsetAwareNormalizer {
 
   private static final long serialVersionUID = 5521093348871625541L;
 
@@ -49,4 +49,8 @@ public CharSequence normalize(CharSequence text) {
     return BULLETS.normalize(text);
   }
 
+  @Override
+  public AlignedText normalizeAligned(CharSequence text) {
+    return BULLETS.normalizeAligned(text); // same BULLETS class that normalize(CharSequence) uses
+  }
 }
diff --git a/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/DashCharSequenceNormalizer.java b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/DashCharSequenceNormalizer.java
index 21c25873b..308c4cfaf 100644
--- a/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/DashCharSequenceNormalizer.java
+++ b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/DashCharSequenceNormalizer.java
@@ -25,7 +25,7 @@
  * regardless of which dash the source used. The mathematical minus signs are left untouched by
  * default, and {@code U+00AD} SOFT HYPHEN (a format character) is not treated as a dash.

  */
-public class DashCharSequenceNormalizer implements CharSequenceNormalizer {
+public class DashCharSequenceNormalizer implements OffsetAwareNormalizer {
 
   private static final long serialVersionUID = 6620885194730155303L;
 
@@ -43,4 +43,8 @@ public CharSequence normalize(CharSequence text) {
     return DASHES.normalize(text);
   }
 
+  @Override
+  public AlignedText normalizeAligned(CharSequence text) {
+    return DASHES.normalizeAligned(text); // same DASHES class that normalize(CharSequence) uses
+  }
 }
diff --git a/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/DigitCharSequenceNormalizer.java b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/DigitCharSequenceNormalizer.java
index 10bb882fe..68039c1ab 100644
--- a/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/DigitCharSequenceNormalizer.java
+++ b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/DigitCharSequenceNormalizer.java
@@ -26,7 +26,7 @@
  * left unchanged. Scanning is a single O(1)-per-code-point cursor pass with no regular
  * expression.

  */
-public class DigitCharSequenceNormalizer implements CharSequenceNormalizer {
+public class DigitCharSequenceNormalizer implements OffsetAwareNormalizer {
 
   private static final long serialVersionUID = 8451270936618204413L;
 
@@ -48,4 +48,8 @@ private static String toAscii(int codePoint) {
     return value >= 0 ? String.valueOf((char) ('0' + value)) : null;
   }
 
+  @Override
+  public AlignedText normalizeAligned(CharSequence text) {
+    return CharClass.substituteAligned(text, DigitCharSequenceNormalizer::toAscii);
+  }
 }
diff --git a/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/EllipsisCharSequenceNormalizer.java b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/EllipsisCharSequenceNormalizer.java
index e4971aa40..e5c692d73 100644
--- a/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/EllipsisCharSequenceNormalizer.java
+++ b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/EllipsisCharSequenceNormalizer.java
@@ -24,7 +24,7 @@
  * <p>Scanning is a single O(1)-per-code-point cursor pass with no regular expression. ASCII dot
  * runs are left unchanged.</p>

*/ -public class EllipsisCharSequenceNormalizer implements CharSequenceNormalizer { +public class EllipsisCharSequenceNormalizer implements OffsetAwareNormalizer { private static final long serialVersionUID = 2298647015583729167L; @@ -41,6 +41,10 @@ public CharSequence normalize(CharSequence text) { return CharClass.substitute(text, EllipsisCharSequenceNormalizer::expansion); } + @Override + public AlignedText normalizeAligned(CharSequence text) { + return CharClass.substituteAligned(text, EllipsisCharSequenceNormalizer::expansion); + } // The ASCII expansion for an ellipsis or leader code point, or null to copy the code point through. private static String expansion(int codePoint) { diff --git a/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/GermanUmlautCharSequenceNormalizer.java b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/GermanUmlautCharSequenceNormalizer.java index 79d4e71b7..d4c2c4645 100644 --- a/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/GermanUmlautCharSequenceNormalizer.java +++ b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/GermanUmlautCharSequenceNormalizer.java @@ -32,7 +32,7 @@ * (for example {@code a} + U+0308) is not a member and passes through unchanged, so apply NFC * composition first if the input may contain decomposed forms.

  */
-public class GermanUmlautCharSequenceNormalizer implements CharSequenceNormalizer {
+public class GermanUmlautCharSequenceNormalizer implements OffsetAwareNormalizer {
 
   private static final long serialVersionUID = 7106934482250176835L;
 
@@ -61,6 +61,11 @@ public CharSequence normalize(CharSequence text) {
     return CharClass.substitute(text, GermanUmlautCharSequenceNormalizer::expansion);
   }
 
+  @Override
+  public AlignedText normalizeAligned(CharSequence text) { // same expansion mapper as normalize()
+    return CharClass.substituteAligned(text, GermanUmlautCharSequenceNormalizer::expansion);
+  }
+
   // The DIN 5007-2 transliteration for an umlaut or eszett, or null to copy the code point through.
   // All members are in the BMP, so a code point equals its char; supplementary code points miss every
   // case and pass through.
diff --git a/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/InvisibleCharSequenceNormalizer.java b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/InvisibleCharSequenceNormalizer.java
index 5e0465f73..91c7f7c75 100644
--- a/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/InvisibleCharSequenceNormalizer.java
+++ b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/InvisibleCharSequenceNormalizer.java
@@ -28,7 +28,7 @@
  * sequences; so are variation selectors. Use this only for a matching/search form, not for
  * display.

  */
-public class InvisibleCharSequenceNormalizer implements CharSequenceNormalizer {
+public class InvisibleCharSequenceNormalizer implements OffsetAwareNormalizer {
 
   private static final long serialVersionUID = 4837512098664301927L;
 
@@ -69,4 +69,8 @@ public CharSequence normalize(CharSequence text) {
     return INVISIBLE.removeAll(text);
   }
 
+  @Override
+  public AlignedText normalizeAligned(CharSequence text) {
+    return INVISIBLE.removeAllAligned(text); // aligned counterpart of the removeAll call in normalize()
+  }
 }
diff --git a/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/LineBreakPreservingWhitespaceCharSequenceNormalizer.java b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/LineBreakPreservingWhitespaceCharSequenceNormalizer.java
new file mode 100644
index 000000000..ec198fda1
--- /dev/null
+++ b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/LineBreakPreservingWhitespaceCharSequenceNormalizer.java
@@ -0,0 +1,72 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.tools.util.normalizer;
+
+/**
+ * An {@link OffsetAwareNormalizer} that collapses runs of Unicode whitespace like
+ * {@link WhitespaceCharSequenceNormalizer}, but keeps line and paragraph structure: any whitespace
+ * run that contains a line break collapses to a single newline ({@code U+000A}) instead of a space,
+ * and leading and trailing whitespace is trimmed.
+ *
+ * <p>This is the form wanted for readable snippets and display: horizontal runs of spaces and tabs
+ * become a single space, yet a blank line between paragraphs survives as one newline rather than
+ * being flattened into the surrounding text. It reuses the cursor based
+ * {@link CharClass#collapsePreserving(CharSequence, CodePointSet, int)} engine, so it recognizes the
+ * full Unicode {@code White_Space} set with no regular expression.</p>
+ */
+public class LineBreakPreservingWhitespaceCharSequenceNormalizer implements OffsetAwareNormalizer {
+
+  private static final long serialVersionUID = 5471829006633512874L;
+
+  private static final int NEWLINE = 0x000A; // replacement passed to both collapsePreserving variants
+
+  private static final CharClass WHITESPACE = CharClass.whitespace();
+
+  // The Unicode mandatory break code points (UAX #14 classes BK/CR/LF/NL): line feed, vertical tab,
+  // form feed, carriage return, next line, line separator, and paragraph separator. A whitespace run
+  // that contains any of these collapses to a single newline rather than a space, so line and
+  // paragraph structure survives while horizontal runs are squished.
+  private static final CodePointSet LINE_BREAKS = CodePointSet.of(
+      0x000A, // line feed
+      0x000B, // vertical tab
+      0x000C, // form feed
+      0x000D, // carriage return
+      0x0085, // next line
+      0x2028, // line separator
+      0x2029); // paragraph separator
+
+  private static final LineBreakPreservingWhitespaceCharSequenceNormalizer INSTANCE =
+      new LineBreakPreservingWhitespaceCharSequenceNormalizer();
+
+  /** {@return the shared, stateless instance} */
+  public static LineBreakPreservingWhitespaceCharSequenceNormalizer getInstance() {
+    return INSTANCE;
+  }
+
+  @Override
+  public CharSequence normalize(CharSequence text) {
+    return WHITESPACE.trim(WHITESPACE.collapsePreserving(text, LINE_BREAKS, NEWLINE));
+  }
+
+  @Override
+  public AlignedText normalizeAligned(CharSequence text) { // same two passes as normalize(): collapse, trim
+    final AlignedText collapsed = WHITESPACE.collapsePreservingAligned(text, LINE_BREAKS, NEWLINE);
+    final AlignedText trimmed = WHITESPACE.trimAligned(collapsed.normalized());
+    return new AlignedText(text, trimmed.normalized(),
+        collapsed.alignment().andThen(trimmed.alignment())); // collapse stage first, trim stage second
+  }
+}
diff --git a/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/QuoteCharSequenceNormalizer.java b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/QuoteCharSequenceNormalizer.java
index ec86e4fa6..f4551d21d 100644
--- a/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/QuoteCharSequenceNormalizer.java
+++ b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/QuoteCharSequenceNormalizer.java
@@ -25,7 +25,7 @@
  * two {@link CharClass} sets, so membership is O(1) and scanning is a single cursor pass with no
  * regular expression. ASCII quotes are left unchanged.

*/ -public class QuoteCharSequenceNormalizer implements CharSequenceNormalizer { +public class QuoteCharSequenceNormalizer implements OffsetAwareNormalizer { private static final long serialVersionUID = 3415829076651283471L; @@ -67,4 +67,11 @@ public CharSequence normalize(CharSequence text) { return DOUBLE.normalize(SINGLE.normalize(text)); } + @Override + public AlignedText normalizeAligned(CharSequence text) { + final AlignedText single = SINGLE.normalizeAligned(text); + final AlignedText both = DOUBLE.normalizeAligned(single.normalized()); + return new AlignedText(text, both.normalized(), + single.alignment().andThen(both.alignment())); + } } diff --git a/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/TextNormalizer.java b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/TextNormalizer.java index c7d877ecc..a5f1bb8de 100644 --- a/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/TextNormalizer.java +++ b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/TextNormalizer.java @@ -93,6 +93,15 @@ public Builder whitespace() { return add(Dimension.WHITESPACE.defaultNormalizer()); } + /** + * {@return this builder with whitespace collapsing that preserves line and paragraph breaks + * appended} Horizontal runs collapse to a single space, but a run containing a line break + * collapses to a single newline, so paragraph structure survives. + */ + public Builder whitespacePreservingLineBreaks() { + return add(LineBreakPreservingWhitespaceCharSequenceNormalizer.getInstance()); + } + /** {@return this builder with quotation-mark folding appended} */ public Builder quotes() { return add(QuoteCharSequenceNormalizer.getInstance()); @@ -143,6 +152,38 @@ public CharSequenceNormalizer build() { return new AggregateCharSequenceNormalizer(steps.toArray(new CharSequenceNormalizer[0])); } + /** + * {@return an offset-aware composition of the rungs added so far} + * + *

Every rung must be an {@link OffsetAwareNormalizer}. Each per-code-point fold is one; + * the folds that delegate to {@link java.text.Normalizer} or to JDK case mapping (NFC, NFKC, + * accent folding, confusable folding, and case folding) cannot report their per-character edits + * and so are rejected here. The returned normalizer's + * {@link OffsetAwareNormalizer#normalizeAligned(CharSequence)} maps a span found in the fully + * normalized text back to the original input through every stage, so a match in a normalized + * document reports its true offsets in the source.

+ * + * @throws IllegalStateException Thrown if any rung cannot report an alignment (for example NFC, + * NFKC, accent folding, confusable folding, or case folding, which delegate to + * {@link java.text.Normalizer} or to JDK case mapping); the message names the offending + * rung. + */ + public OffsetAwareNormalizer buildAligned() { + final OffsetAwareNormalizer[] aligned = new OffsetAwareNormalizer[steps.size()]; + for (int i = 0; i < steps.size(); i++) { + final CharSequenceNormalizer step = steps.get(i); + if (!(step instanceof OffsetAwareNormalizer)) { + throw new IllegalStateException("rung at 0-based index " + i + " (" + step.getClass().getName() + + ") is not offset-aware and cannot be composed into an aligned pipeline; the " + + "per-code-point folds report an alignment, while folds that delegate to " + + "java.text.Normalizer or JDK case mapping (such as NFC, NFKC, accent, confusable, " + + "or case folding) do not"); + } + aligned[i] = (OffsetAwareNormalizer) step; + } + return new AlignedAggregateCharSequenceNormalizer(aligned); + } + private Builder add(CharSequenceNormalizer normalizer) { steps.add(normalizer); return this; diff --git a/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/WhitespaceCharSequenceNormalizer.java b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/WhitespaceCharSequenceNormalizer.java index 6aa267d39..a61ffed9c 100644 --- a/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/WhitespaceCharSequenceNormalizer.java +++ b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/WhitespaceCharSequenceNormalizer.java @@ -25,7 +25,7 @@ * and so on), so spacing copied from the web, PDFs, or non-Latin sources normalizes consistently. * It is the Unicode-aware, regex-free counterpart to {@link ShrinkCharSequenceNormalizer}.

*/ -public class WhitespaceCharSequenceNormalizer implements CharSequenceNormalizer { +public class WhitespaceCharSequenceNormalizer implements OffsetAwareNormalizer { private static final long serialVersionUID = 6748290315562094783L; @@ -44,4 +44,11 @@ public CharSequence normalize(CharSequence text) { return WHITESPACE.trim(WHITESPACE.collapse(text)); } + @Override + public AlignedText normalizeAligned(CharSequence text) { + final AlignedText collapsed = WHITESPACE.collapseAligned(text); + final AlignedText trimmed = WHITESPACE.trimAligned(collapsed.normalized()); + return new AlignedText(text, trimmed.normalized(), + collapsed.alignment().andThen(trimmed.alignment())); + } } diff --git a/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/util/normalizer/AlignedNormalizerPipelineTest.java b/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/util/normalizer/AlignedNormalizerPipelineTest.java new file mode 100644 index 000000000..7813babe5 --- /dev/null +++ b/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/util/normalizer/AlignedNormalizerPipelineTest.java @@ -0,0 +1,342 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package opennlp.tools.util.normalizer; + +import org.junit.jupiter.api.Test; + +import opennlp.tools.util.Span; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertSame; +import static org.junit.jupiter.api.Assertions.assertThrows; +import static org.junit.jupiter.api.Assertions.assertTrue; + +/** + * Exercises {@link OffsetAwareNormalizer} and {@code TextNormalizer.Builder.buildAligned()}: the + * cursor-based rungs report alignments, an aligned pipeline composes them with + * {@link Alignment#andThen(Alignment)} so a span found in the fully normalized text maps back to the + * original input, and a non-alignable rung is rejected loudly. + */ +public class AlignedNormalizerPipelineTest { + + private static final int ZERO_WIDTH_SPACE = 0x200B; + private static final int EM_DASH = 0x2014; + private static final int YEZIDI_HYPHEN = 0x10EAD; // a supplementary (non-BMP) dash + private static final int MATH_BOLD_DIGIT_ZERO = 0x1D7CE; // a supplementary decimal digit + + private static String cp(int codePoint) { + return new String(Character.toChars(codePoint)); + } + + private static String covered(AlignedText aligned, int normalizedStart, int normalizedEnd) { + final Span span = aligned.toOriginalSpan(normalizedStart, normalizedEnd); + return aligned.original().subSequence(span.getStart(), span.getEnd()).toString(); + } + + // The aligned form must always reproduce exactly what the plain form produces. 
+ @Test + void alignedNormalizedTextMatchesPlainForEveryRung() { + final OffsetAwareNormalizer[] rungs = { + WhitespaceCharSequenceNormalizer.getInstance(), + LineBreakPreservingWhitespaceCharSequenceNormalizer.getInstance(), + DashCharSequenceNormalizer.getInstance(), + InvisibleCharSequenceNormalizer.getInstance(), + QuoteCharSequenceNormalizer.getInstance(), + DigitCharSequenceNormalizer.getInstance(), + EllipsisCharSequenceNormalizer.getInstance(), + BulletCharSequenceNormalizer.getInstance(), + GermanUmlautCharSequenceNormalizer.getInstance() + }; + final String[] inputs = { + "", + "plain", + " lots of\tspace ", + "\n\n para one\n\n\tpara two \n", + "a" + cp(ZERO_WIDTH_SPACE) + "b" + cp(YEZIDI_HYPHEN) + "c" + cp(EM_DASH) + "d", + cp(ZERO_WIDTH_SPACE) + " " + cp(ZERO_WIDTH_SPACE), + // quotes, ellipsis, eszett, bullet, fullwidth and supplementary digits in one string + cp(0x201C) + "don" + cp(0x2019) + "t " + cp(0x2026) + " Stra" + cp(0x00DF) + "e " + + cp(0x2022) + " " + cp(0xFF15) + cp(MATH_BOLD_DIGIT_ZERO) + }; + for (final OffsetAwareNormalizer rung : rungs) { + for (final String input : inputs) { + assertEquals(rung.normalize(input).toString(), rung.normalizeAligned(input).normalized(), + rung.getClass().getSimpleName() + " on [" + input + "]"); + } + } + } + + @Test + void whitespaceCollapseAndTrimMapsSpanBackToOriginal() { + final String original = " hello world "; + final AlignedText aligned = WhitespaceCharSequenceNormalizer.getInstance() + .normalizeAligned(original); + assertEquals("hello world", aligned.normalized()); + // "world" sits at [6, 11) in the collapsed/trimmed form. 
+ final Span span = aligned.toOriginalSpan(6, 11); + assertEquals(original.indexOf("world"), span.getStart()); + assertEquals("world", covered(aligned, 6, 11)); + } + + @Test + void dashFoldOfSupplementaryDashMapsSpanBackToOriginal() { + final String original = "a" + cp(YEZIDI_HYPHEN) + "b"; + final AlignedText aligned = DashCharSequenceNormalizer.getInstance().normalizeAligned(original); + assertEquals("a-b", aligned.normalized()); + // The two-unit supplementary dash folded to one ASCII hyphen, so 'b' moved from 3 to 2. + assertEquals("b", covered(aligned, 2, 3)); + assertEquals(cp(YEZIDI_HYPHEN), covered(aligned, 1, 2)); + } + + @Test + void invisibleStripMapsSpanBackAcrossDeletion() { + final String original = "a" + cp(ZERO_WIDTH_SPACE) + "b"; + final AlignedText aligned = InvisibleCharSequenceNormalizer.getInstance() + .normalizeAligned(original); + assertEquals("ab", aligned.normalized()); + // 'b' is at index 1 in "ab" but index 2 in the original; the deleted ZWSP must not be covered. + assertEquals("b", covered(aligned, 1, 2)); + assertEquals(2, aligned.toOriginalSpan(1, 2).getStart()); + } + + @Test + void pipelineComposesStripInvisibleWhitespaceAndDashesBackToOriginal() { + // 'a', zero-width space, two spaces, 'b', em dash, 'c'. + final String original = "a" + cp(ZERO_WIDTH_SPACE) + " b" + cp(EM_DASH) + "c"; + final OffsetAwareNormalizer pipeline = TextNormalizer.builder() + .stripInvisible().whitespace().dashes().buildAligned(); + + final AlignedText aligned = pipeline.normalizeAligned(original); + assertEquals("a b-c", aligned.normalized()); + assertEquals(pipeline.normalize(original).toString(), aligned.normalized()); + // "b-c" at [2, 5) maps back across a deletion, a collapse, and a dash fold to "b—c".
+ assertEquals("b" + cp(EM_DASH) + "c", covered(aligned, 2, 5)); + } + + @Test + void emptyAlignedPipelineIsIdentity() { + final AlignedText aligned = TextNormalizer.builder().buildAligned().normalizeAligned("Hello"); + assertEquals("Hello", aligned.normalized()); + assertEquals("Hello", covered(aligned, 0, 5)); + } + + @Test + void buildAlignedRejectsNonAlignableRungLoudly() { + final IllegalStateException ex = assertThrows(IllegalStateException.class, + () -> TextNormalizer.builder().nfc().whitespace().buildAligned()); + assertTrue(ex.getMessage().contains("Nfc"), ex.getMessage()); + assertTrue(ex.getMessage().contains("offset-aware"), ex.getMessage()); + } + + @Test + void buildAlignedReportsTheOffendingRungIndexWhenItIsNotFirst() { + // A non-alignable rung after several offset-aware ones must still be rejected, and the message + // must name its 0-based position (index 2) and type so the failure points at the right fold. + final IllegalStateException ex = assertThrows(IllegalStateException.class, + () -> TextNormalizer.builder().whitespace().dashes().caseFold().buildAligned()); + assertTrue(ex.getMessage().contains("rung at 0-based index 2"), ex.getMessage()); + assertTrue(ex.getMessage().contains("CaseFold"), ex.getMessage()); + } + + @Test + void buildAlignedRejectsEachKindOfNonAlignableRung() { + // Every fold that routes through java.text.Normalizer or JDK case mapping is rejected, named. 
+ assertThrows(IllegalStateException.class, + () -> TextNormalizer.builder().nfkc().buildAligned()); + assertThrows(IllegalStateException.class, + () -> TextNormalizer.builder().accentFold().buildAligned()); + assertThrows(IllegalStateException.class, + () -> TextNormalizer.builder().caseFold().buildAligned()); + } + + @Test + void capabilityIsDetectableByInstanceOf() { + assertTrue(WhitespaceCharSequenceNormalizer.getInstance() instanceof OffsetAwareNormalizer); + assertTrue(DashCharSequenceNormalizer.getInstance() instanceof OffsetAwareNormalizer); + assertTrue(InvisibleCharSequenceNormalizer.getInstance() instanceof OffsetAwareNormalizer); + assertFalse(NfcCharSequenceNormalizer.getInstance() instanceof OffsetAwareNormalizer); + assertTrue(TextNormalizer.builder().whitespace().dashes().buildAligned() + instanceof OffsetAwareNormalizer); + // The per-code-point substitution folds are offset-aware too. + assertTrue(QuoteCharSequenceNormalizer.getInstance() instanceof OffsetAwareNormalizer); + assertTrue(DigitCharSequenceNormalizer.getInstance() instanceof OffsetAwareNormalizer); + assertTrue(EllipsisCharSequenceNormalizer.getInstance() instanceof OffsetAwareNormalizer); + assertTrue(BulletCharSequenceNormalizer.getInstance() instanceof OffsetAwareNormalizer); + assertTrue(GermanUmlautCharSequenceNormalizer.getInstance() instanceof OffsetAwareNormalizer); + // The folds that route through java.text.Normalizer or JDK case mapping cannot, by design. 
+ assertFalse(NfkcCharSequenceNormalizer.getInstance() instanceof OffsetAwareNormalizer); + assertFalse(CaseFoldCharSequenceNormalizer.getInstance() instanceof OffsetAwareNormalizer); + assertFalse(AccentFoldCharSequenceNormalizer.getInstance() instanceof OffsetAwareNormalizer); + assertFalse(ConfusableSkeletonCharSequenceNormalizer.getInstance() + instanceof OffsetAwareNormalizer); + } + + @Test + void roundTripOfAFullySpanningMatchReturnsTheWholeOriginal() { + final String original = " the quick "; + final AlignedText aligned = WhitespaceCharSequenceNormalizer.getInstance() + .normalizeAligned(original); + final String normalized = aligned.normalized(); + assertEquals("the quick", normalized); + final Span whole = aligned.toOriginalSpan(0, normalized.length()); + assertSame(original, aligned.original()); + // The match spanning the whole normalized text covers the original from first to last kept char. + assertEquals("the quick", original.subSequence(whole.getStart(), whole.getEnd()).toString()); + } + + @Test + void lineBreakPreservingCollapsesHorizontalRunsButKeepsBreaks() { + final LineBreakPreservingWhitespaceCharSequenceNormalizer rung = + LineBreakPreservingWhitespaceCharSequenceNormalizer.getInstance(); + final String original = "Hello world\n\n\tfoo bar"; + assertEquals("Hello world\nfoo bar", rung.normalize(original).toString()); + + // The plain whitespace rung instead flattens the blank line into a single space. + assertEquals("Hello world foo bar", + WhitespaceCharSequenceNormalizer.getInstance().normalize(original).toString()); + + final AlignedText aligned = rung.normalizeAligned(original); + assertEquals(rung.normalize(original).toString(), aligned.normalized()); + // "bar" sits at [16, 19) in the collapsed form and at [21, 24) in the original. 
+ assertEquals(original.indexOf("bar"), aligned.toOriginalSpan(16, 19).getStart()); + assertEquals("bar", covered(aligned, 16, 19)); + // The preserved newline at index 11 maps back to the whole "\n\n\t" run it came from. + assertEquals("\n\n\t", covered(aligned, 11, 12)); + } + + @Test + void lineBreakPreservingTrimsLeadingAndTrailingBreaks() { + final LineBreakPreservingWhitespaceCharSequenceNormalizer rung = + LineBreakPreservingWhitespaceCharSequenceNormalizer.getInstance(); + final String original = "\n\nHello\n\n"; + final AlignedText aligned = rung.normalizeAligned(original); + assertEquals("Hello", aligned.normalized()); + assertEquals("Hello", covered(aligned, 0, 5)); + assertEquals(original.indexOf("Hello"), aligned.toOriginalSpan(0, 5).getStart()); + } + + @Test + void lineBreakPreservingComposesInAnAlignedPipeline() { + assertTrue(LineBreakPreservingWhitespaceCharSequenceNormalizer.getInstance() + instanceof OffsetAwareNormalizer); + final String original = "a" + cp(ZERO_WIDTH_SPACE) + " b\n\nc" + cp(EM_DASH) + "d"; + final OffsetAwareNormalizer pipeline = TextNormalizer.builder() + .stripInvisible().whitespacePreservingLineBreaks().dashes().buildAligned(); + + final AlignedText aligned = pipeline.normalizeAligned(original); + assertEquals("a b\nc-d", aligned.normalized()); + assertEquals(pipeline.normalize(original).toString(), aligned.normalized()); + // "c-d" at [4, 7) maps back across a deletion, a break-preserving collapse, and a dash fold. + assertEquals("c" + cp(EM_DASH) + "d", covered(aligned, 4, 7)); + } + + @Test + void pipelineMapsAnOriginalSpanForwardToTheNormalizedText() { + final String original = "a" + cp(ZERO_WIDTH_SPACE) + " b" + cp(EM_DASH) + "c"; + final AlignedText aligned = TextNormalizer.builder() + .stripInvisible().whitespace().dashes().buildAligned().normalizeAligned(original); + assertEquals("a b-c", aligned.normalized()); + // 'b' is at original index 4 and normalized index 2; the forward mapping must agree. 
+ final Span forward = aligned.toNormalizedSpan(4, 5); + assertEquals(2, forward.getStart()); + assertEquals("b", aligned.normalized().substring(forward.getStart(), forward.getEnd())); + } + + @Test + void lineBreakPreservingNormalizesCrLfAndUnicodeSeparators() { + final LineBreakPreservingWhitespaceCharSequenceNormalizer rung = + LineBreakPreservingWhitespaceCharSequenceNormalizer.getInstance(); + assertEquals("a\nb", rung.normalize("a\r\nb").toString()); // CRLF -> one newline + assertEquals("a\nb", rung.normalize("a\n\n\n\nb").toString()); // blank lines -> one newline + assertEquals("x\ny", rung.normalize("x" + cp(0x2028) + "y").toString()); // line separator + assertEquals("p\nq", rung.normalize("p" + cp(0x2029) + "q").toString()); // paragraph separator + // A horizontal run still collapses to a space even when mixed with a break-bearing run. + assertEquals("a b\nc", rung.normalize("a b \n c").toString()); + } + + @Test + void whitespaceRungCollapsesAllWhitespaceToEmptyWithAValidSpan() { + final AlignedText aligned = + WhitespaceCharSequenceNormalizer.getInstance().normalizeAligned(" "); + assertEquals("", aligned.normalized()); + // Mapping the empty match must yield a valid empty span rather than throwing. + final Span empty = aligned.toOriginalSpan(0, 0); + assertEquals(empty.getStart(), empty.getEnd()); + } + + @Test + void ellipsisExpansionMapsSpanBackToOriginal() { + final String original = "a" + cp(0x2026) + "b"; + final AlignedText aligned = EllipsisCharSequenceNormalizer.getInstance() + .normalizeAligned(original); + assertEquals("a...b", aligned.normalized()); + // The single ellipsis expanded to three dots, so 'b' moved from index 2 to index 4. + assertEquals("b", covered(aligned, 4, 5)); + // The whole expansion, and any sub-span of it, maps back to the one source ellipsis. 
+ assertEquals(cp(0x2026), covered(aligned, 1, 4)); + assertEquals(cp(0x2026), covered(aligned, 2, 3)); + } + + @Test + void germanUmlautExpansionMapsSpanBackToOriginal() { + final String original = "Stra" + cp(0x00DF) + "e"; // "Strasse" from the eszett form + final AlignedText aligned = GermanUmlautCharSequenceNormalizer.getInstance() + .normalizeAligned(original); + assertEquals("Strasse", aligned.normalized()); + // The eszett expanded to "ss", so the trailing 'e' moved from index 5 to index 6. + assertEquals("e", covered(aligned, 6, 7)); + // Both halves of "ss" map back to the single source eszett. + assertEquals(cp(0x00DF), covered(aligned, 4, 6)); + assertEquals(cp(0x00DF), covered(aligned, 5, 6)); + } + + @Test + void digitFoldOfSupplementaryDigitMapsSpanBackToOriginal() { + final String original = "a" + cp(MATH_BOLD_DIGIT_ZERO) + "b"; + final AlignedText aligned = DigitCharSequenceNormalizer.getInstance() + .normalizeAligned(original); + assertEquals("a0b", aligned.normalized()); + // The two-unit supplementary digit folded to one ASCII '0', so 'b' moved from 3 to 2. + assertEquals("b", covered(aligned, 2, 3)); + assertEquals(cp(MATH_BOLD_DIGIT_ZERO), covered(aligned, 1, 2)); + } + + @Test + void quoteFoldMapsSpanBackToOriginal() { + final String original = cp(0x201C) + "hi" + cp(0x201D); // curly double quotes + final AlignedText aligned = QuoteCharSequenceNormalizer.getInstance() + .normalizeAligned(original); + assertEquals("\"hi\"", aligned.normalized()); + assertEquals("hi", covered(aligned, 1, 3)); + // A one-for-one fold, so the opening quote maps straight back to the curly source quote. 
+ assertEquals(cp(0x201C), covered(aligned, 0, 1)); + } + + @Test + void substitutionFoldsComposeInAnAlignedPipeline() { + final String original = "say " + cp(0x201C) + "hi" + cp(0x201D) + cp(0x2026); + final OffsetAwareNormalizer pipeline = TextNormalizer.builder() + .quotes().ellipsis().buildAligned(); + final AlignedText aligned = pipeline.normalizeAligned(original); + assertEquals("say \"hi\"...", aligned.normalized()); + assertEquals(pipeline.normalize(original).toString(), aligned.normalized()); + // The expanded "..." maps back across the quote fold to the single source ellipsis. + assertEquals(cp(0x2026), covered(aligned, 8, 11)); + assertEquals("hi", covered(aligned, 5, 7)); + } +} diff --git a/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/util/normalizer/GermanUmlautCharSequenceNormalizerTest.java b/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/util/normalizer/GermanUmlautCharSequenceNormalizerTest.java index c4752fdad..ac9abdabd 100644 --- a/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/util/normalizer/GermanUmlautCharSequenceNormalizerTest.java +++ b/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/util/normalizer/GermanUmlautCharSequenceNormalizerTest.java @@ -60,6 +60,17 @@ void testCapitalEszett() { assertEquals("STRASSE", fold("STRA" + cp(0x1E9E) + "E")); // STRASSE } + @Test + void testCapitalEszettOffsets() { + // The capital eszett expands one source character into two, so the aligned fold reports a + // 1->2 replacement and a span over the two produced characters maps back to the single source. + final AlignedText aligned = FOLD.normalizeAligned("A" + cp(0x1E9E) + "B"); // AẞB + assertEquals("ASSB", aligned.normalized().toString()); + final var source = aligned.alignment().toOriginalSpan(1, 3); // the produced "SS" + assertEquals(1, source.getStart()); + assertEquals(2, source.getEnd()); + } + @Test void testAsciiAndOtherCharactersUnchanged() { assertEquals("hello world 123", fold("hello world 123"));