From b24c9ee3df90d01cfafc87a0bbebfd4b1e30066d Mon Sep 17 00:00:00 2001
From: Kristian Rickert
Date: Tue, 23 Jun 2026 09:50:34 -0400
Subject: [PATCH] OPENNLP-1850 Offset/alignment layer: Alignment, AlignedText,
buildAligned, *Aligned (1b)
The conceptually hard half of the former foundation PR, split out on review request: the
bidirectional Alignment edit-sequence and AlignedText, the OffsetAwareNormalizer capability
interface, TextNormalizer.buildAligned(), the *Aligned CharClass variants and the offset-aware
rungs, the line-break-preserving rung, and the dense span-mapping tests (binary-search span
mapping, expansion/deletion edge cases, andThen composition including the insertion-in-expansion
case). Builds on the engine in 1a.
---
.../tools/util/normalizer/AlignedText.java | 58 +++
.../tools/util/normalizer/Alignment.java | 293 +++++++++++++++
.../tools/util/normalizer/CharClass.java | 200 ++++++++++
.../normalizer/OffsetAwareNormalizer.java | 49 +++
.../tools/util/normalizer/AlignmentTest.java | 258 +++++++++++++
.../tools/util/normalizer/CharClassTest.java | 183 ++++++++++
...lignedAggregateCharSequenceNormalizer.java | 67 ++++
.../BulletCharSequenceNormalizer.java | 6 +-
.../DashCharSequenceNormalizer.java | 6 +-
.../DigitCharSequenceNormalizer.java | 6 +-
.../EllipsisCharSequenceNormalizer.java | 6 +-
.../GermanUmlautCharSequenceNormalizer.java | 7 +-
.../InvisibleCharSequenceNormalizer.java | 6 +-
...rvingWhitespaceCharSequenceNormalizer.java | 72 ++++
.../QuoteCharSequenceNormalizer.java | 9 +-
.../tools/util/normalizer/TextNormalizer.java | 41 +++
.../WhitespaceCharSequenceNormalizer.java | 9 +-
.../AlignedNormalizerPipelineTest.java | 342 ++++++++++++++++++
...ermanUmlautCharSequenceNormalizerTest.java | 11 +
19 files changed, 1621 insertions(+), 8 deletions(-)
create mode 100644 opennlp-api/src/main/java/opennlp/tools/util/normalizer/AlignedText.java
create mode 100644 opennlp-api/src/main/java/opennlp/tools/util/normalizer/Alignment.java
create mode 100644 opennlp-api/src/main/java/opennlp/tools/util/normalizer/OffsetAwareNormalizer.java
create mode 100644 opennlp-api/src/test/java/opennlp/tools/util/normalizer/AlignmentTest.java
create mode 100644 opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/AlignedAggregateCharSequenceNormalizer.java
create mode 100644 opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/LineBreakPreservingWhitespaceCharSequenceNormalizer.java
create mode 100644 opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/util/normalizer/AlignedNormalizerPipelineTest.java
diff --git a/opennlp-api/src/main/java/opennlp/tools/util/normalizer/AlignedText.java b/opennlp-api/src/main/java/opennlp/tools/util/normalizer/AlignedText.java
new file mode 100644
index 000000000..dc68b1f09
--- /dev/null
+++ b/opennlp-api/src/main/java/opennlp/tools/util/normalizer/AlignedText.java
@@ -0,0 +1,58 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.tools.util.normalizer;
+
+import opennlp.tools.util.Span;
+
+/**
+ * The result of a normalization that keeps the original text alongside the normalized form and a
+ * full {@link Alignment} between them.
+ *
+ * <p>The original is the source of truth, the normalized form is the derived view tuned for
+ * matching and search, and the alignment maps spans between them through deletions, collapses, and
+ * expansions. Use
+ * {@link #toOriginalSpan(int, int)} to report a match found in the normalized form against the
+ * original.
+ *
+ * @param original The untouched source text.
+ * @param normalized The normalized text.
+ * @param alignment The alignment between the normalized and original text.
+ */
+public record AlignedText(CharSequence original, String normalized, Alignment alignment) {
+
+ /**
+ * Maps a span of the normalized text back to the tightest span of the original text.
+ *
+ * @param normalizedStart The inclusive start offset in the normalized text.
+ * @param normalizedEnd The exclusive end offset in the normalized text.
+ * @return The corresponding original span.
+ */
+ public Span toOriginalSpan(int normalizedStart, int normalizedEnd) {
+ return alignment.toOriginalSpan(normalizedStart, normalizedEnd);
+ }
+
+ /**
+ * Maps a span of the original text forward to the normalized text.
+ *
+ * @param originalStart The inclusive start offset in the original text.
+ * @param originalEnd The exclusive end offset in the original text.
+ * @return The corresponding normalized span.
+ */
+ public Span toNormalizedSpan(int originalStart, int originalEnd) {
+ return alignment.toNormalizedSpan(originalStart, originalEnd);
+ }
+}
diff --git a/opennlp-api/src/main/java/opennlp/tools/util/normalizer/Alignment.java b/opennlp-api/src/main/java/opennlp/tools/util/normalizer/Alignment.java
new file mode 100644
index 000000000..0f1d47a6a
--- /dev/null
+++ b/opennlp-api/src/main/java/opennlp/tools/util/normalizer/Alignment.java
@@ -0,0 +1,293 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.tools.util.normalizer;
+
+import java.util.Arrays;
+
+import opennlp.tools.util.Span;
+
+/**
+ * A bidirectional alignment between an original text and a normalized form of it.
+ *
+ * <p>Normalization edits text in ways that move character offsets: a run of whitespace collapses to
+ * one space, a supplementary dash folds to a single ASCII hyphen, a case fold can grow text
+ * (German {@code eszett} to {@code ss}), and trimming or stripping deletes characters outright. An
+ * {@code Alignment} records those edits as a sequence of equal runs (text copied through
+ * unchanged in length) and replace runs (a block of original characters that produced a
+ * block of normalized characters), so any span in either form can be mapped to the other.
+ *
+ * <p>Because it represents deletions as gaps and expansions as shared blocks (rather than storing a
+ * single original offset per normalized character, which would assume the normalized text
+ * contiguously covers the original), mapping is done
+ * span to span ({@link #toOriginalSpan(int, int)} / {@link #toNormalizedSpan(int, int)}) so a match
+ * that ends next to deleted text reports a tight span rather than over-covering the deletion. Two
+ * alignments compose with {@link #andThen(Alignment)}, which is what lets a multi-stage
+ * normalization pipeline still map a result all the way back to the original.
+ *
+ * <p>Instances are immutable and thread-safe; build one with {@link Builder}.
+ */
+public final class Alignment {
+
+ // For normalized character k, originalStart[k]/originalEnd[k] are the half-open original range it
+ // was produced from. Characters copied unchanged map one to one; characters from a collapse or
+ // expansion share their run's whole original range (it cannot be subdivided); deleted original
+ // characters appear as a gap that no normalized character covers.
+ private final int[] originalStart;
+ private final int[] originalEnd;
+ private final int originalLength;
+
+ private Alignment(int[] originalStart, int[] originalEnd, int originalLength) {
+ this.originalStart = originalStart;
+ this.originalEnd = originalEnd;
+ this.originalLength = originalLength;
+ }
+
+ /** {@return the length of the normalized text this alignment was built for} */
+ public int normalizedLength() {
+ return originalStart.length;
+ }
+
+ /** {@return the length of the original text this alignment was built for} */
+ public int originalLength() {
+ return originalLength;
+ }
+
+ /**
+ * Maps a half-open span of the normalized text to the tightest half-open span of the original
+ * text that produced it.
+ *
+ * @param normalizedStart The inclusive start offset, in {@code [0, normalizedLength()]}.
+ * @param normalizedEnd The exclusive end offset, in {@code [normalizedStart, normalizedLength()]}.
+ * @return The corresponding original span.
+ * @throws IndexOutOfBoundsException Thrown if the offsets are out of range or inverted.
+ */
+ public Span toOriginalSpan(int normalizedStart, int normalizedEnd) {
+ checkRange(normalizedStart, normalizedEnd, normalizedLength());
+ if (normalizedStart == normalizedEnd) {
+ final int at = normalizedStart < normalizedLength()
+ ? originalStart[normalizedStart] : originalLength;
+ return new Span(at, at);
+ }
+ return new Span(originalStart[normalizedStart], originalEnd[normalizedEnd - 1]);
+ }
+
+ /**
+ * Maps a half-open span of the original text to the half-open span of the normalized text that
+ * covers it. Original characters that were deleted map to an empty span at the point where they
+ * were removed.
+ *
+ * @param originalStartOffset The inclusive start offset, in {@code [0, originalLength()]}.
+ * @param originalEndOffset The exclusive end offset, in {@code [originalStartOffset, originalLength()]}.
+ * @return The corresponding normalized span.
+ * @throws IndexOutOfBoundsException Thrown if the offsets are out of range or inverted.
+ */
+ public Span toNormalizedSpan(int originalStartOffset, int originalEndOffset) {
+ checkRange(originalStartOffset, originalEndOffset, originalLength);
+ final int start = firstIndexEndingAfter(originalStartOffset);
+ final int end = firstIndexStartingAtOrAfter(originalEndOffset);
+ return new Span(start, Math.max(start, end));
+ }
+
+ /**
+ * Maps a normalized offset to the original offset where its character begins (start semantics).
+ * Prefer {@link #toOriginalSpan(int, int)} for mapping a match, since a single offset cannot
+ * distinguish the start and end of a span across a deletion.
+ *
+ * @param normalizedOffset An offset in {@code [0, normalizedLength()]}.
+ * @return The corresponding original offset.
+ * @throws IndexOutOfBoundsException Thrown if {@code normalizedOffset} is out of range.
+ */
+ public int toOriginalOffset(int normalizedOffset) {
+ if (normalizedOffset < 0 || normalizedOffset > normalizedLength()) {
+ throw new IndexOutOfBoundsException("normalized offset " + normalizedOffset
+ + " is outside [0, " + normalizedLength() + "]");
+ }
+ return normalizedOffset < normalizedLength() ? originalStart[normalizedOffset] : originalLength;
+ }
+
+ /**
+ * Composes this alignment with one that further normalizes this alignment's normalized text.
+ *
+ * <p>If this maps {@code original -> middle} and {@code next} maps {@code middle -> final}, the
+ * result maps {@code original -> final} directly, so a span found in the final text can be mapped
+ * straight back to the original without keeping the intermediate stages.
+ *
+ * @param next The next stage, whose original side is this stage's normalized text.
+ * @return The composed alignment.
+ * @throws IllegalArgumentException Thrown if {@code next.originalLength()} does not equal this
+ * {@code normalizedLength()} (the stages do not line up).
+ */
+ public Alignment andThen(Alignment next) {
+ if (next.originalLength != normalizedLength()) {
+ throw new IllegalArgumentException("stages do not line up: this normalizedLength="
+ + normalizedLength() + " but next originalLength=" + next.originalLength);
+ }
+ final int finalLength = next.normalizedLength();
+ final int[] starts = new int[finalLength];
+ final int[] ends = new int[finalLength];
+ for (int f = 0; f < finalLength; f++) {
+ final int middleStart = next.originalStart[f];
+ final int middleEnd = next.originalEnd[f];
+ final int start = middleStart < normalizedLength() ? originalStart[middleStart] : originalLength;
+ final int end = middleEnd > 0 ? originalEnd[middleEnd - 1] : 0;
+ starts[f] = start;
+ // Math.max keeps the original span non-inverted. When next inserted this final character
+ // (a zero-width middle range, middleStart == middleEnd) the max collapses it to a zero-width
+ // original span -- correct for every insertion except one landing strictly inside an
+ // expansion this stage produced, where the characters on either side share one atomic
+ // original block (originalEnd[middleEnd - 1] > originalStart[middleStart]) that has no
+ // interior offset to point at. There the insertion is attributed to that whole block, the
+ // only choice that keeps originalStart/originalEnd sorted so toOriginalSpan/toNormalizedSpan
+ // keep their O(log n) search; forcing it to zero-width would push originalEnd below its
+ // predecessor and corrupt the reverse mapping.
+ ends[f] = Math.max(start, end);
+ }
+ return new Alignment(starts, ends, originalLength);
+ }
+
+ // First normalized index whose original coverage ends strictly after offset (so it covers or
+ // follows offset); normalizedLength() when offset is at or past the last covered original char.
+ private int firstIndexEndingAfter(int offset) {
+ int low = 0;
+ int high = originalEnd.length;
+ while (low < high) {
+ final int mid = (low + high) >>> 1;
+ if (originalEnd[mid] > offset) {
+ high = mid;
+ } else {
+ low = mid + 1;
+ }
+ }
+ return low;
+ }
+
+ // First normalized index whose original coverage starts at or after offset.
+ private int firstIndexStartingAtOrAfter(int offset) {
+ int low = 0;
+ int high = originalStart.length;
+ while (low < high) {
+ final int mid = (low + high) >>> 1;
+ if (originalStart[mid] >= offset) {
+ high = mid;
+ } else {
+ low = mid + 1;
+ }
+ }
+ return low;
+ }
+
+ private static void checkRange(int start, int end, int length) {
+ if (start < 0 || end > length || start > end) {
+ throw new IndexOutOfBoundsException("span [" + start + ", " + end + ") is outside [0, "
+ + length + "]");
+ }
+ }
+
+ /**
+ * Builds an {@link Alignment} as the normalized text is produced, by recording each edit in order.
+ * Call {@link #equal(int)} for characters copied through unchanged and {@link #replace(int, int)}
+ * for a block that was rewritten (including deletions and insertions), then {@link #build(int)}.
+ */
+ public static final class Builder {
+
+ private static final int MAX_ARRAY_SIZE = Integer.MAX_VALUE - 8;
+
+ private int[] starts = new int[16];
+ private int[] ends = new int[16];
+ private int count;
+ private int originalCursor;
+
+ /**
+ * Records {@code charCount} characters copied through unchanged (a one to one run).
+ *
+ * @param charCount The number of UTF-16 characters; must not be negative.
+ * @return This builder.
+ */
+ public Builder equal(int charCount) {
+ if (charCount < 0) {
+ throw new IllegalArgumentException("charCount must not be negative: " + charCount);
+ }
+ for (int i = 0; i < charCount; i++) {
+ append(originalCursor, originalCursor + 1);
+ originalCursor++;
+ }
+ return this;
+ }
+
+ /**
+ * Records a rewritten block: {@code originalCount} original characters that produced
+ * {@code normalizedCount} normalized characters. Each produced character is attributed to the
+ * whole original block, since a collapse or expansion cannot be subdivided. {@code 0} for
+ * {@code normalizedCount} is a deletion; {@code 0} for {@code originalCount} is an insertion.
+ *
+ * @param originalCount The number of original characters consumed; must not be negative.
+ * @param normalizedCount The number of normalized characters produced; must not be negative.
+ * @return This builder.
+ */
+ public Builder replace(int originalCount, int normalizedCount) {
+ if (originalCount < 0 || normalizedCount < 0) {
+ throw new IllegalArgumentException("counts must not be negative: " + originalCount
+ + ", " + normalizedCount);
+ }
+ final int blockEnd = originalCursor + originalCount;
+ for (int i = 0; i < normalizedCount; i++) {
+ append(originalCursor, blockEnd);
+ }
+ originalCursor = blockEnd;
+ return this;
+ }
+
+ /**
+ * Finalizes the alignment.
+ *
+ * @param originalLength The full length of the original text.
+ * @return The immutable {@link Alignment}.
+ * @throws IllegalStateException Thrown if the recorded edits do not consume exactly
+ * {@code originalLength} original characters (a sign that some input was not accounted for).
+ */
+ public Alignment build(int originalLength) {
+ if (originalCursor != originalLength) {
+ throw new IllegalStateException("edits consumed " + originalCursor
+ + " original characters but originalLength is " + originalLength);
+ }
+ return new Alignment(Arrays.copyOf(starts, count), Arrays.copyOf(ends, count), originalLength);
+ }
+
+ private void append(int start, int end) {
+ if (count == starts.length) {
+ grow();
+ }
+ starts[count] = start;
+ ends[count] = end;
+ count++;
+ }
+
+ // Overflow-aware 1.5x growth: never wraps to a negative capacity, degrades to a clean
+ // OutOfMemoryError at the array-size ceiling instead of NegativeArraySizeException.
+ private void grow() {
+ int newCapacity = starts.length + (starts.length >> 1);
+ if (newCapacity < 0 || newCapacity > MAX_ARRAY_SIZE) {
+ newCapacity = MAX_ARRAY_SIZE;
+ }
+ if (newCapacity <= count) {
+ throw new OutOfMemoryError("Alignment exceeds maximum size");
+ }
+ starts = Arrays.copyOf(starts, newCapacity);
+ ends = Arrays.copyOf(ends, newCapacity);
+ }
+ }
+}
diff --git a/opennlp-api/src/main/java/opennlp/tools/util/normalizer/CharClass.java b/opennlp-api/src/main/java/opennlp/tools/util/normalizer/CharClass.java
index 64e924b0c..a10751bc8 100644
--- a/opennlp-api/src/main/java/opennlp/tools/util/normalizer/CharClass.java
+++ b/opennlp-api/src/main/java/opennlp/tools/util/normalizer/CharClass.java
@@ -300,6 +300,175 @@ public String removeAll(CharSequence text) {
return out.toString();
}
+ /**
+ * Like {@link #normalize(CharSequence)} but also produces the {@link Alignment} back to the
+ * original text.
+ *
+ * @param text The text to normalize.
+ * @return The normalized text and its alignment.
+ */
+ public AlignedText normalizeAligned(CharSequence text) {
+ Objects.requireNonNull(text, "text");
+ final StringBuilder out = new StringBuilder(text.length());
+ final Alignment.Builder alignment = new Alignment.Builder();
+ final int length = text.length();
+ int i = 0;
+ while (i < length) {
+ final int codePoint = Character.codePointAt(text, i);
+ final int charCount = Character.charCount(codePoint);
+ if (members.contains(codePoint)) {
+ out.appendCodePoint(replacement);
+ alignment.replace(charCount, Character.charCount(replacement));
+ } else {
+ out.appendCodePoint(codePoint);
+ alignment.equal(charCount);
+ }
+ i += charCount;
+ }
+ return new AlignedText(text, out.toString(), alignment.build(length));
+ }
+
+ /**
+ * Like {@link #collapse(CharSequence)} but also produces the {@link Alignment} back to the
+ * original text. Each collapsed run maps to the run's whole original extent.
+ *
+ * @param text The text to collapse.
+ * @return The collapsed text and its alignment.
+ */
+ public AlignedText collapseAligned(CharSequence text) {
+ Objects.requireNonNull(text, "text");
+ final StringBuilder out = new StringBuilder(text.length());
+ final Alignment.Builder alignment = new Alignment.Builder();
+ final int length = text.length();
+ int i = 0;
+ while (i < length) {
+ final int codePoint = Character.codePointAt(text, i);
+ if (members.contains(codePoint)) {
+ final int runEnd = skipRun(text, i);
+ out.appendCodePoint(replacement);
+ alignment.replace(runEnd - i, Character.charCount(replacement));
+ i = runEnd;
+ } else {
+ final int charCount = Character.charCount(codePoint);
+ out.appendCodePoint(codePoint);
+ alignment.equal(charCount);
+ i += charCount;
+ }
+ }
+ return new AlignedText(text, out.toString(), alignment.build(length));
+ }
+
+ /**
+ * Like {@link #collapsePreserving(CharSequence, CodePointSet, int)} but also produces the
+ * {@link Alignment} back to the original text.
+ *
+ * @param text The text to collapse.
+ * @param keep The member code points whose presence in a run preserves structure.
+ * @param keepReplacement The replacement emitted for a run that contains a {@code keep} member.
+ * @return The collapsed text and its alignment.
+ * @throws IllegalArgumentException Thrown if {@code keepReplacement} is not a valid code point.
+ */
+ public AlignedText collapsePreservingAligned(CharSequence text, CodePointSet keep,
+ int keepReplacement) {
+ Objects.requireNonNull(text, "text");
+ Objects.requireNonNull(keep, "keep");
+ requireValidCodePoint(keepReplacement);
+ final StringBuilder out = new StringBuilder(text.length());
+ final Alignment.Builder alignment = new Alignment.Builder();
+ final int length = text.length();
+ int i = 0;
+ while (i < length) {
+ final int codePoint = Character.codePointAt(text, i);
+ if (members.contains(codePoint)) {
+ boolean preserve = keep.contains(codePoint);
+ int j = i + Character.charCount(codePoint);
+ while (j < length) {
+ final int next = Character.codePointAt(text, j);
+ if (!members.contains(next)) {
+ break;
+ }
+ preserve |= keep.contains(next);
+ j += Character.charCount(next);
+ }
+ final int emitted = preserve ? keepReplacement : replacement;
+ out.appendCodePoint(emitted);
+ alignment.replace(j - i, Character.charCount(emitted));
+ i = j;
+ } else {
+ final int charCount = Character.charCount(codePoint);
+ out.appendCodePoint(codePoint);
+ alignment.equal(charCount);
+ i += charCount;
+ }
+ }
+ return new AlignedText(text, out.toString(), alignment.build(length));
+ }
+
+ /**
+ * Like {@link #trim(CharSequence)} but also produces the {@link Alignment} back to the original
+ * text. The trimmed leading and trailing members appear as deletions, so a span never reports
+ * through them.
+ *
+ * @param text The text to trim.
+ * @return The trimmed text and its alignment.
+ */
+ public AlignedText trimAligned(CharSequence text) {
+ Objects.requireNonNull(text, "text");
+ final int length = text.length();
+ int start = 0;
+ while (start < length) {
+ final int codePoint = Character.codePointAt(text, start);
+ if (!members.contains(codePoint)) {
+ break;
+ }
+ start += Character.charCount(codePoint);
+ }
+ int end = length;
+ while (end > start) {
+ final int codePoint = Character.codePointBefore(text, end);
+ if (!members.contains(codePoint)) {
+ break;
+ }
+ end -= Character.charCount(codePoint);
+ }
+ final Alignment.Builder alignment = new Alignment.Builder();
+ if (start > 0) {
+ alignment.replace(start, 0);
+ }
+ alignment.equal(end - start);
+ if (end < length) {
+ alignment.replace(length - end, 0);
+ }
+ return new AlignedText(text, text.subSequence(start, end).toString(), alignment.build(length));
+ }
+
+ /**
+ * Like {@link #removeAll(CharSequence)} but also produces the {@link Alignment} back to the
+ * original text. Every removed member appears as a deletion, so a span never reports through one.
+ *
+ * @param text The text to filter.
+ * @return The filtered text and its alignment.
+ */
+ public AlignedText removeAllAligned(CharSequence text) {
+ Objects.requireNonNull(text, "text");
+ final StringBuilder out = new StringBuilder(text.length());
+ final Alignment.Builder alignment = new Alignment.Builder();
+ final int length = text.length();
+ int i = 0;
+ while (i < length) {
+ final int codePoint = Character.codePointAt(text, i);
+ final int charCount = Character.charCount(codePoint);
+ if (members.contains(codePoint)) {
+ alignment.replace(charCount, 0);
+ } else {
+ out.appendCodePoint(codePoint);
+ alignment.equal(charCount);
+ }
+ i += charCount;
+ }
+ return new AlignedText(text, out.toString(), alignment.build(length));
+ }
+
/**
* Applies a per-code-point substitution: each code point for which {@code substitution} returns a
* non-null string is replaced by that string, and the rest are copied through. This is the shared,
@@ -329,6 +498,37 @@ public static String substitute(CharSequence text, IntFunction substitut
return out.toString();
}
+ /**
+ * Like {@link #substitute(CharSequence, IntFunction)} but also produces the {@link Alignment} back
+ * to the original text. Each replaced code point maps to its replacement string as one block.
+ *
+ * @param text The text to transform.
+ * @param substitution The replacement for a code point, or {@code null} to copy it through.
+ * @return The transformed text and its alignment.
+ */
+ public static AlignedText substituteAligned(CharSequence text, IntFunction<String> substitution) {
+ Objects.requireNonNull(text, "text");
+ Objects.requireNonNull(substitution, "substitution");
+ final StringBuilder out = new StringBuilder(text.length());
+ final Alignment.Builder alignment = new Alignment.Builder();
+ final int length = text.length();
+ int i = 0;
+ while (i < length) {
+ final int codePoint = Character.codePointAt(text, i);
+ final int charCount = Character.charCount(codePoint);
+ final String replacement = substitution.apply(codePoint);
+ if (replacement != null) {
+ out.append(replacement);
+ alignment.replace(charCount, replacement.length());
+ } else {
+ out.appendCodePoint(codePoint);
+ alignment.equal(charCount);
+ }
+ i += charCount;
+ }
+ return new AlignedText(text, out.toString(), alignment.build(length));
+ }
+
// Returns the offset just past the maximal run of members starting at runStart.
private int skipRun(CharSequence text, int runStart) {
final int length = text.length();
diff --git a/opennlp-api/src/main/java/opennlp/tools/util/normalizer/OffsetAwareNormalizer.java b/opennlp-api/src/main/java/opennlp/tools/util/normalizer/OffsetAwareNormalizer.java
new file mode 100644
index 000000000..e812d2864
--- /dev/null
+++ b/opennlp-api/src/main/java/opennlp/tools/util/normalizer/OffsetAwareNormalizer.java
@@ -0,0 +1,49 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.tools.util.normalizer;
+
+/**
+ * A {@link CharSequenceNormalizer} that can additionally report the {@link Alignment} from its
+ * normalized output back to the input, so a span found in the normalized text maps to the exact
+ * character offsets of the original.
+ *
+ * <p>Length-changing folds move offsets: collapsing a run of whitespace, folding a supplementary
+ * dash to one ASCII hyphen, or stripping invisible controls all shift every later character. A rung
+ * that performs such a fold over the cursor-based {@link CharClass} engine can record those edits
+ * and expose them through {@link #normalizeAligned(CharSequence)}. A rung that delegates to
+ * {@link java.text.Normalizer} (NFC/NFKC) or to a stemmer cannot report its edits, so it does not
+ * implement this interface; that is a deliberate capability split rather than an oversight.
+ *
+ * <p>{@code TextNormalizer.Builder.buildAligned()} composes a chain of these into a single
+ * offset-aware pipeline whose {@link AlignedText} maps a match all the way back to the original
+ * input. An interface-typed caller tests for the capability
+ * ({@code normalizer instanceof OffsetAwareNormalizer}) instead of depending on a concrete rung,
+ * the same plain {@code instanceof} pattern used by
+ * {@link opennlp.tools.namefind.OffsetMappingNameFinder} rather than reflection.
+ */
+public interface OffsetAwareNormalizer extends CharSequenceNormalizer {
+
+ /**
+ * Normalizes {@code text} and returns the result together with the {@link Alignment} back to the
+ * input. The normalized text is identical to {@link #normalize(CharSequence)}: that is,
+ * {@code normalizeAligned(text).normalized()} equals {@code normalize(text).toString()}.
+ *
+ * @param text The {@link CharSequence} to normalize.
+ * @return The normalized text paired with its alignment to {@code text}.
+ */
+ AlignedText normalizeAligned(CharSequence text);
+}
diff --git a/opennlp-api/src/test/java/opennlp/tools/util/normalizer/AlignmentTest.java b/opennlp-api/src/test/java/opennlp/tools/util/normalizer/AlignmentTest.java
new file mode 100644
index 000000000..07c92de0f
--- /dev/null
+++ b/opennlp-api/src/test/java/opennlp/tools/util/normalizer/AlignmentTest.java
@@ -0,0 +1,258 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.tools.util.normalizer;
+
+import org.junit.jupiter.api.Test;
+
+import opennlp.tools.util.Span;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertThrows;
+
+public class AlignmentTest {
+
+ private static void assertSpan(int start, int end, Span span) {
+ assertEquals(start, span.getStart(), "start");
+ assertEquals(end, span.getEnd(), "end");
+ }
+
+ @Test
+ void testIdentityMapsOneToOne() {
+ final Alignment a = new Alignment.Builder().equal(3).build(3); // "abc" unchanged
+ assertEquals(3, a.normalizedLength());
+ assertEquals(3, a.originalLength());
+ assertSpan(0, 3, a.toOriginalSpan(0, 3));
+ assertSpan(1, 2, a.toOriginalSpan(1, 2));
+ }
+
+ @Test
+ void testCollapsedRunMapsToWholeRun() {
+ // "ab  " (two trailing spaces) -> "ab " : keep "ab", collapse the two spaces into one.
+ final Alignment a = new Alignment.Builder().equal(2).replace(2, 1).build(4);
+ assertSpan(0, 2, a.toOriginalSpan(0, 2)); // "ab"
+ assertSpan(2, 4, a.toOriginalSpan(2, 3)); // the collapsed space covers both originals
+ assertSpan(0, 4, a.toOriginalSpan(0, 3));
+ }
+
+ @Test
+ void testInteriorDeletionDoesNotOverCover() {
+ // "a b c" -> "abc" : the two spaces are deleted. A per-character offset map over-covers here.
+ final Alignment a = new Alignment.Builder()
+ .equal(1).replace(1, 0).equal(1).replace(1, 0).equal(1).build(5);
+ assertEquals(3, a.normalizedLength());
+ assertEquals(5, a.originalLength());
+ assertSpan(0, 1, a.toOriginalSpan(0, 1)); // "a"
+ assertSpan(2, 3, a.toOriginalSpan(1, 2)); // "b" -> [2,3), NOT [2,4)
+ assertSpan(4, 5, a.toOriginalSpan(2, 3)); // "c"
+ assertSpan(0, 5, a.toOriginalSpan(0, 3)); // whole text
+ }
+
+ @Test
+ void testTrailingDeletionDoesNotOverCover() {
+ // "ab  " (two trailing spaces) -> "ab" : strip them. A match at the end must not absorb them.
+ final Alignment a = new Alignment.Builder().equal(2).replace(2, 0).build(4);
+ assertSpan(0, 2, a.toOriginalSpan(0, 2)); // "ab" -> [0,2), NOT [0,4)
+ assertSpan(1, 2, a.toOriginalSpan(1, 2)); // "b" -> [1,2)
+ }
+
+ @Test
+ void testExpansionSharesTheSingleSource() {
+ // "aßb" -> "assb" : the eszett expands to two characters that both come from it.
+ final Alignment a = new Alignment.Builder().equal(1).replace(1, 2).equal(1).build(3);
+ assertEquals(4, a.normalizedLength());
+ assertSpan(1, 2, a.toOriginalSpan(1, 3)); // "ss" -> the single "ß"
+ assertSpan(1, 2, a.toOriginalSpan(1, 2)); // first "s"
+ assertSpan(1, 2, a.toOriginalSpan(2, 3)); // second "s"
+ assertSpan(2, 3, a.toOriginalSpan(3, 4)); // "b"
+ }
+
+ @Test
+ void testReverseMappingAndDeletionsMapToEmptySpans() {
+ final Alignment a = new Alignment.Builder()
+ .equal(1).replace(1, 0).equal(1).replace(1, 0).equal(1).build(5); // "a b c" -> "abc"
+ assertSpan(1, 2, a.toNormalizedSpan(2, 3)); // original "b" -> normalized "b"
+ assertSpan(1, 1, a.toNormalizedSpan(1, 2)); // deleted space -> empty normalized span
+ assertSpan(0, 3, a.toNormalizedSpan(0, 5)); // whole original -> whole normalized
+ }
+
+ @Test
+ void testAndThenComposesTwoStages() {
+ // Stage 1: "a  b" -> "a b" (collapse the two spaces). Stage 2: "a b" -> "a-b" (space to dash).
+ final Alignment whitespace = new Alignment.Builder().equal(1).replace(2, 1).equal(1).build(4);
+ final Alignment dash = new Alignment.Builder().equal(1).replace(1, 1).equal(1).build(3);
+ final Alignment composed = whitespace.andThen(dash);
+
+ assertEquals(4, composed.originalLength());
+ assertEquals(3, composed.normalizedLength());
+ assertSpan(0, 1, composed.toOriginalSpan(0, 1)); // "a"
+ assertSpan(1, 3, composed.toOriginalSpan(1, 2)); // "-" maps back to the original two-space run
+ assertSpan(3, 4, composed.toOriginalSpan(2, 3)); // "b"
+ assertSpan(0, 4, composed.toOriginalSpan(0, 3));
+ }
+
+ @Test
+ void testAndThenRejectsMismatchedStages() {
+ final Alignment first = new Alignment.Builder().equal(2).build(2); // normalizedLength 2
+ final Alignment second = new Alignment.Builder().equal(3).build(3); // originalLength 3
+ assertThrows(IllegalArgumentException.class, () -> first.andThen(second));
+ }
+
+ @Test
+ void testAllDeletedProducesEmptyNormalized() {
+ final Alignment a = new Alignment.Builder().replace(2, 0).build(2); // two spaces -> ""
+ assertEquals(0, a.normalizedLength());
+ assertEquals(2, a.originalLength());
+ assertSpan(0, 0, a.toNormalizedSpan(0, 2)); // all original deleted -> empty normalized span
+ }
+
+ @Test
+ void testBuilderRejectsWrongOriginalLength() {
+ assertThrows(IllegalStateException.class, () -> new Alignment.Builder().equal(2).build(3));
+ }
+
+ @Test
+ void testBuilderRejectsNegativeCounts() {
+ assertThrows(IllegalArgumentException.class, () -> new Alignment.Builder().equal(-1));
+ assertThrows(IllegalArgumentException.class, () -> new Alignment.Builder().replace(-1, 0));
+ }
+
+ @Test
+ void testToOriginalSpanRejectsOutOfRange() {
+ final Alignment a = new Alignment.Builder().equal(2).build(2);
+ assertThrows(IndexOutOfBoundsException.class, () -> a.toOriginalSpan(-1, 1));
+ assertThrows(IndexOutOfBoundsException.class, () -> a.toOriginalSpan(0, 3));
+ assertThrows(IndexOutOfBoundsException.class, () -> a.toOriginalSpan(2, 1));
+ }
+
+ @Test
+ void testToOriginalOffsetConvenience() {
+ final Alignment a = new Alignment.Builder().equal(2).replace(2, 1).build(4); // "ab  "->"ab "
+ assertEquals(0, a.toOriginalOffset(0));
+ assertEquals(2, a.toOriginalOffset(2)); // start of the collapsed space
+ assertEquals(4, a.toOriginalOffset(3)); // end sentinel -> original length
+ }
+
+ @Test
+ void testBuilderGrowsBeyondInitialCapacity() {
+ // 20 equal chars force the builder past its initial 16-entry buffers (exercises grow()).
+ final Alignment a = new Alignment.Builder().equal(20).build(20);
+ assertEquals(20, a.normalizedLength());
+ assertEquals(20, a.originalLength());
+ assertSpan(0, 20, a.toOriginalSpan(0, 20));
+ assertSpan(17, 18, a.toOriginalSpan(17, 18));
+ }
+
+ @Test
+ void testAndThenChainsThreeStages() {
+ // "a  b" -> "a b" (collapse two spaces) -> "a-b" (space->dash) -> "a_b" (dash->underscore).
+ final Alignment s1 = new Alignment.Builder().equal(1).replace(2, 1).equal(1).build(4);
+ final Alignment s2 = new Alignment.Builder().equal(1).replace(1, 1).equal(1).build(3);
+ final Alignment s3 = new Alignment.Builder().equal(1).replace(1, 1).equal(1).build(3);
+ final Alignment composed = s1.andThen(s2).andThen(s3);
+
+ assertEquals(4, composed.originalLength());
+ assertEquals(3, composed.normalizedLength());
+ assertSpan(0, 1, composed.toOriginalSpan(0, 1)); // a
+ assertSpan(1, 3, composed.toOriginalSpan(1, 2)); // "_" maps all the way back to the two-space run
+ assertSpan(3, 4, composed.toOriginalSpan(2, 3)); // b
+ }
+
+ @Test
+ void testAndThenHandlesLeadingInsertionInNextStage() {
+ // Exercises the andThen branch where the next stage's character covers zero middle characters
+ // at offset 0 (a leading insertion: originalEnd == 0). The result must be a zero-width original
+ // span at 0, and the rest of the mapping must stay correct.
+ final Alignment first = new Alignment.Builder().equal(2).build(2); // "ab" unchanged
+ final Alignment next = new Alignment.Builder().replace(0, 1).equal(2).build(2); // "ab" -> "Xab"
+ final Alignment composed = first.andThen(next);
+
+ assertEquals(2, composed.originalLength());
+ assertEquals(3, composed.normalizedLength());
+ assertSpan(0, 0, composed.toOriginalSpan(0, 1)); // inserted "X" -> zero-width span at original 0
+ assertSpan(0, 1, composed.toOriginalSpan(1, 2)); // "a"
+ assertSpan(1, 2, composed.toOriginalSpan(2, 3)); // "b"
+ assertSpan(0, 2, composed.toOriginalSpan(0, 3)); // whole normalized -> whole original
+ }
+
+ @Test
+ void testAndThenHandlesInteriorInsertionInCopiedRegion() {
+ // An insertion in the next stage that is NOT at offset 0 and lands in a one-to-one (copied)
+ // region must still map to a zero-width original span at the insertion point: the andThen branch
+ // where middleStart == middleEnd with middleEnd > 0. Without correct handling this is exactly the
+ // case that would misattribute the inserted character to a neighbouring original character.
+ final Alignment first = new Alignment.Builder().equal(3).build(3); // "abc"
+ final Alignment next = new Alignment.Builder().equal(1).replace(0, 1).equal(2).build(3); // "abc"->"aXbc"
+ final Alignment composed = first.andThen(next);
+
+ assertEquals(3, composed.originalLength());
+ assertEquals(4, composed.normalizedLength());
+ assertSpan(0, 1, composed.toOriginalSpan(0, 1)); // "a"
+ assertSpan(1, 1, composed.toOriginalSpan(1, 2)); // inserted "X" -> zero-width span at original 1
+ assertSpan(1, 2, composed.toOriginalSpan(2, 3)); // "b"
+ assertSpan(2, 3, composed.toOriginalSpan(3, 4)); // "c"
+ assertSpan(0, 3, composed.toOriginalSpan(0, 4)); // whole normalized -> whole original
+ }
+
+ @Test
+ void testAndThenInsertionInsideExpansionStaysConsistent() {
+ // The hard case: stage 1 expands "ss" from one original character, then stage 2 inserts a
+ // character BETWEEN the two produced characters. The two halves of an expansion share one atomic
+ // original block ([1, 2)), which has no interior offset, so the inserted character is attributed
+ // to that whole block rather than a zero-width point. That is the only mapping that keeps
+ // originalStart/originalEnd sorted, so BOTH directions still resolve correctly -- a zero-width
+ // mapping here would push originalEnd below its predecessor and corrupt the reverse search.
+ // stage 1: "aXb" -> "assb" (X expands to "ss"); stage 2: "assb" -> "asYsb" (insert Y between).
+ final Alignment expand = new Alignment.Builder().equal(1).replace(1, 2).equal(1).build(3);
+ final Alignment insert = new Alignment.Builder().equal(2).replace(0, 1).equal(2).build(4);
+ final Alignment composed = expand.andThen(insert);
+
+ assertEquals(3, composed.originalLength());
+ assertEquals(5, composed.normalizedLength());
+ assertSpan(0, 1, composed.toOriginalSpan(0, 1)); // "a"
+ assertSpan(1, 2, composed.toOriginalSpan(1, 2)); // first "s" -> the expanded original char
+ assertSpan(1, 2, composed.toOriginalSpan(2, 3)); // inserted char -> attributed to the atomic block
+ assertSpan(1, 2, composed.toOriginalSpan(3, 4)); // second "s" -> the expanded original char
+ assertSpan(2, 3, composed.toOriginalSpan(4, 5)); // "b"
+ assertSpan(0, 3, composed.toOriginalSpan(0, 5)); // whole normalized -> whole original
+
+ // Reverse direction stays correct because the start/end arrays remain sorted: the expanded
+ // original character maps to its full normalized footprint (the two halves plus the insertion).
+ assertSpan(1, 4, composed.toNormalizedSpan(1, 2)); // expanded char -> "sYs"
+ assertSpan(0, 1, composed.toNormalizedSpan(0, 1)); // "a"
+ assertSpan(4, 5, composed.toNormalizedSpan(2, 3)); // "b"
+ }
+
+ @Test
+ void testToNormalizedSpanDoesNotOverCoverAcrossDeletions() {
+ // "a  b" -> "ab" : the two interior spaces are deleted. Forward mapping a span that ends inside
+ // the deleted run must stop at the last kept character rather than over-covering into "b".
+ final Alignment a = new Alignment.Builder().equal(1).replace(2, 0).equal(1).build(4);
+ assertEquals(2, a.normalizedLength());
+ assertSpan(0, 1, a.toNormalizedSpan(0, 3)); // "a" plus the two deleted spaces -> just "a"
+ assertSpan(1, 1, a.toNormalizedSpan(1, 3)); // only the deleted spaces -> empty normalized span
+ assertSpan(0, 2, a.toNormalizedSpan(0, 4)); // whole original -> whole normalized
+ assertSpan(1, 2, a.toNormalizedSpan(3, 4)); // "b"
+ }
+
+ @Test
+ void testToNormalizedSpanAcrossExpansion() {
+ final Alignment a = new Alignment.Builder().equal(1).replace(1, 2).equal(1).build(3); // ß->ss
+ assertSpan(1, 3, a.toNormalizedSpan(1, 2)); // original "ß" -> the two-char "ss"
+ assertSpan(0, 1, a.toNormalizedSpan(0, 1)); // a
+ assertSpan(3, 4, a.toNormalizedSpan(2, 3)); // b
+ }
+}
diff --git a/opennlp-api/src/test/java/opennlp/tools/util/normalizer/CharClassTest.java b/opennlp-api/src/test/java/opennlp/tools/util/normalizer/CharClassTest.java
index 76911a34d..052350d12 100644
--- a/opennlp-api/src/test/java/opennlp/tools/util/normalizer/CharClassTest.java
+++ b/opennlp-api/src/test/java/opennlp/tools/util/normalizer/CharClassTest.java
@@ -212,4 +212,187 @@ void testOfRejectsInvalidReplacement() {
() -> CharClass.of(CodePointSet.of(0x20), Character.MAX_CODE_POINT + 1));
}
+ // --- aligned variants (Alignment / AlignedText) ------------------------------------------
+
+ private static void assertSpan(int start, int end, Span span) {
+ assertEquals(start, span.getStart(), "start"); // the message labels which bound failed
+ assertEquals(end, span.getEnd(), "end"); // checked second, so a wrong start is reported first
+ }
+
+  @Test
+  void testCollapseAlignedMapsRunToWholeExtent() {
+    final AlignedText at = WS.collapseAligned("a  b"); // two spaces: a four-char input
+    assertEquals("a b", at.normalized());
+    assertSpan(0, 1, at.toOriginalSpan(0, 1)); // a
+    assertSpan(1, 3, at.toOriginalSpan(1, 2)); // the collapsed space covers both original spaces
+    assertSpan(3, 4, at.toOriginalSpan(2, 3)); // b
+  }
+
+  @Test
+  void testRemoveAllAlignedDoesNotOverCover() {
+    final AlignedText stripped = WS.removeAllAligned("a b c");
+    assertEquals("abc", stripped.normalized());
+    assertSpan(2, 3, stripped.toOriginalSpan(1, 2)); // "b" stays [2,3); the deleted space is not absorbed
+    assertSpan(0, 5, stripped.toOriginalSpan(0, 3)); // the full output spans the full input
+  }
+
+  @Test
+  void testTrimAlignedDropsEdgesWithoutOverCovering() {
+    final AlignedText at = WS.trimAligned("  ab  "); // two spaces on each side: six chars
+    assertEquals("ab", at.normalized());
+    assertEquals(6, at.alignment().originalLength());
+    assertSpan(2, 4, at.toOriginalSpan(0, 2)); // "ab" sits at original [2,4)
+    assertSpan(3, 4, at.toOriginalSpan(1, 2)); // "b"
+  }
+
+  @Test
+  void testCollapsePreservingAlignedKeepsLineBreak() {
+    final AlignedText kept = WS.collapsePreservingAligned("a\n\n\t\tb", lineBreaks(), '\n');
+    assertEquals("a\nb", kept.normalized());
+    assertSpan(1, 5, kept.toOriginalSpan(1, 2)); // one newline stands for the entire four-char run
+  }
+
+  @Test
+  void testNormalizeAlignedAcrossSupplementaryDash() {
+    final AlignedText folded = DASH.normalizeAligned("x" + YEZIDI_HYPHEN + "y");
+    assertEquals("x-y", folded.normalized());
+    assertSpan(0, 1, folded.toOriginalSpan(0, 1)); // x
+    assertSpan(1, 3, folded.toOriginalSpan(1, 2)); // the ASCII hyphen covers both chars of the source dash
+    assertSpan(3, 4, folded.toOriginalSpan(2, 3)); // y
+  }
+
+ // --- aligned edge cases (restore + extend the deleted *Mapped coverage) ------------------
+
+  @Test
+  void testCollapseAlignedAcrossMixedUnicodeWhitespaceRun() {
+    final AlignedText collapsed = WS.collapseAligned("a" + NBSP + IDEOGRAPHIC + cp(0x2002) + "b");
+    assertEquals("a b", collapsed.normalized());
+    assertSpan(1, 4, collapsed.toOriginalSpan(1, 2)); // single space <- three-char whitespace run
+    assertSpan(4, 5, collapsed.toOriginalSpan(2, 3)); // b
+  }
+
+  @Test
+  void testCollapseAlignedAcrossTabRun() {
+    final AlignedText collapsed = WS.collapseAligned("a\t\t\t\t\tb");
+    assertEquals("a b", collapsed.normalized());
+    assertSpan(1, 6, collapsed.toOriginalSpan(1, 2)); // one space stands for all five tabs
+    assertSpan(6, 7, collapsed.toOriginalSpan(2, 3)); // b
+  }
+
+  @Test
+  void testCollapseAlignedAcrossNewlineRun() {
+    final AlignedText collapsed = WS.collapseAligned("a\r\n\tb");
+    assertEquals("a b", collapsed.normalized());
+    assertSpan(1, 4, collapsed.toOriginalSpan(1, 2)); // CR, LF and tab fold into the one space
+  }
+
+  @Test
+  void testCollapseAlignedEmptySingleAndAllWhitespace() {
+    assertEquals("", WS.collapseAligned("").normalized());
+    assertEquals(0, WS.collapseAligned("").alignment().normalizedLength());
+
+    final AlignedText oneChar = WS.collapseAligned("a");
+    assertEquals("a", oneChar.normalized());
+    assertSpan(0, 1, oneChar.toOriginalSpan(0, 1));
+
+    final AlignedText tabsOnly = WS.collapseAligned("\t\t\t");
+    assertEquals(" ", tabsOnly.normalized()); // a whitespace-only input yields one space, never ""
+    assertSpan(0, 3, tabsOnly.toOriginalSpan(0, 1));
+  }
+
+  @Test
+  void testCollapseAlignedKeepsSurrogatePairOffsets() {
+    final AlignedText collapsed = WS.collapseAligned(GRINNING_FACE + "\t\tb");
+    assertEquals(GRINNING_FACE + " b", collapsed.normalized());
+    assertSpan(0, 2, collapsed.toOriginalSpan(0, 2)); // surrogate pair: two chars on both sides
+    assertSpan(2, 4, collapsed.toOriginalSpan(2, 3)); // one space <- the two tabs
+    assertSpan(4, 5, collapsed.toOriginalSpan(3, 4)); // b
+  }
+
+  @Test
+  void testNormalizeAlignedIsIdentityWhenNothingMatches() {
+    final AlignedText untouched = WS.normalizeAligned("abc");
+    assertEquals("abc", untouched.normalized());
+    for (int offset = 0; offset < 3; offset++) {
+      assertSpan(offset, offset + 1, untouched.toOriginalSpan(offset, offset + 1));
+    }
+  }
+
+  @Test
+  void testNormalizeAlignedPreservesSupplementaryNonMember() {
+    final AlignedText untouched = WS.normalizeAligned("a" + GRINNING_FACE + "b");
+    assertEquals("a" + GRINNING_FACE + "b", untouched.normalized());
+    assertSpan(1, 3, untouched.toOriginalSpan(1, 3)); // non-member emoji keeps its two-char extent
+  }
+
+  @Test
+  void testNormalizeAlignedExpandsToSupplementaryReplacement() {
+    // Replacing a one-char BMP member with a supplementary code point lengthens the text (1 -> 2).
+    final CharClass spaceToPenguin = CharClass.of(CodePointSet.of(' '), 0x1F427);
+    final AlignedText expanded = spaceToPenguin.normalizeAligned("a b");
+    assertEquals("a" + cp(0x1F427) + "b", expanded.normalized());
+    assertSpan(0, 1, expanded.toOriginalSpan(0, 1)); // a
+    assertSpan(1, 2, expanded.toOriginalSpan(1, 3)); // the surrogate pair shares the single source space
+    assertSpan(2, 3, expanded.toOriginalSpan(3, 4)); // b
+  }
+
+  @Test
+  void testRemoveAllAlignedLeadingAndTrailingDeletions() {
+    final AlignedText stripped = WS.removeAllAligned(" a b ");
+    assertEquals("ab", stripped.normalized());
+    assertSpan(1, 2, stripped.toOriginalSpan(0, 1)); // "a" sits after the deleted leading space
+    assertSpan(3, 4, stripped.toOriginalSpan(1, 2)); // "b" ends before the deleted trailing space
+  }
+
+  @Test
+  void testTrimAlignedAllWhitespaceIsEmpty() {
+    final AlignedText trimmed = WS.trimAligned("\t\t");
+    assertEquals("", trimmed.normalized());
+    assertEquals(0, trimmed.alignment().normalizedLength()); // nothing survives the trim
+    assertEquals(2, trimmed.alignment().originalLength()); // yet both input tabs stay accounted for
+  }
+
+  @Test
+  void testCollapsePreservingAlignedRunWithoutKeepCollapsesToReplacement() {
+    final AlignedText collapsed = WS.collapsePreservingAligned("a \t b", lineBreaks(), '\n');
+    assertEquals("a b", collapsed.normalized()); // the run holds no line break, so it becomes a space
+    assertSpan(1, 4, collapsed.toOriginalSpan(1, 2));
+  }
+
+  // Contract check: each *Aligned operation must yield the very same text as the plain operation it
+  // shadows, with the alignment as the only addition. Running both over one shared set of inputs
+  // keeps the two implementations from silently diverging.
+  @Test
+  void testAlignedOperationsAgreeWithPlainOutput() {
+    final CodePointSet preserved = lineBreaks();
+    final String[] samples = {
+        "",
+        "abc",
+        " a b ",
+        "a" + NBSP + IDEOGRAPHIC + "b",
+        "a\t\t\t\t\tb",
+        "a\r\n\tb",
+        "\n\nabc",
+        " ",
+        GRINNING_FACE + "\t\tb",
+        "x" + YEZIDI_HYPHEN + YEZIDI_HYPHEN + "y",
+        "well" + EM_DASH + EN_DASH + "known",
+        "5" + MINUS_SIGN + "3",
+    };
+    for (final CharClass cls : new CharClass[] {WS, DASH}) {
+      for (final String sample : samples) {
+        assertEquals(cls.normalize(sample), cls.normalizeAligned(sample).normalized(),
+            "normalize vs normalizeAligned for [" + sample + "]");
+        assertEquals(cls.collapse(sample), cls.collapseAligned(sample).normalized(),
+            "collapse vs collapseAligned for [" + sample + "]");
+        assertEquals(cls.trim(sample), cls.trimAligned(sample).normalized(),
+            "trim vs trimAligned for [" + sample + "]");
+        assertEquals(cls.removeAll(sample), cls.removeAllAligned(sample).normalized(),
+            "removeAll vs removeAllAligned for [" + sample + "]");
+        assertEquals(cls.collapsePreserving(sample, preserved, '\n'),
+            cls.collapsePreservingAligned(sample, preserved, '\n').normalized(),
+            "collapsePreserving vs collapsePreservingAligned for [" + sample + "]");
+      }
+    }
+  }
}
diff --git a/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/AlignedAggregateCharSequenceNormalizer.java b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/AlignedAggregateCharSequenceNormalizer.java
new file mode 100644
index 000000000..f57ddc29b
--- /dev/null
+++ b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/AlignedAggregateCharSequenceNormalizer.java
@@ -0,0 +1,67 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.tools.util.normalizer;
+
+/**
+ * An {@link OffsetAwareNormalizer} that applies a chain of offset-aware rungs in order and composes
+ * their per-stage {@link Alignment}s with {@link Alignment#andThen(Alignment)}, so the result maps a
+ * span found in the fully normalized text back to the original input through every stage.
+ *
+ * Produced by {@code TextNormalizer.Builder.buildAligned()}, which validates that every rung is
+ * offset-aware before constructing this.
+ */
+final class AlignedAggregateCharSequenceNormalizer implements OffsetAwareNormalizer {
+
+  private static final long serialVersionUID = 3056944120186103477L;
+
+  private final OffsetAwareNormalizer[] steps;
+
+  AlignedAggregateCharSequenceNormalizer(OffsetAwareNormalizer[] steps) {
+    // Defensive copy: later writes to the caller's array must not change this pipeline.
+    this.steps = steps.clone();
+  }
+
+  @Override
+  public CharSequence normalize(CharSequence text) {
+    CharSequence result = text;
+    for (final OffsetAwareNormalizer step : steps) {
+      result = step.normalize(result);
+    }
+    return result;
+  }
+
+  @Override
+  public AlignedText normalizeAligned(CharSequence text) {
+    // Snapshot the input as a String once, so the stored original and every alignment length
+    // agree even for a CharSequence whose length() differs from its toString().
+    final String input = text.toString();
+    if (steps.length == 0) {
+      // Identity pipeline: the same String serves as both the original and the normalized text.
+      return new AlignedText(input, input,
+          new Alignment.Builder().equal(input.length()).build(input.length()));
+    }
+    AlignedText stage = steps[0].normalizeAligned(input);
+    Alignment alignment = stage.alignment();
+    for (int i = 1; i < steps.length; i++) {
+      // Each later rung consumes the previous rung's output; andThen chains the two alignments.
+      final AlignedText next = steps[i].normalizeAligned(stage.normalized());
+      alignment = alignment.andThen(next.alignment());
+      stage = next;
+    }
+    return new AlignedText(input, stage.normalized(), alignment);
+  }
+}
diff --git a/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/BulletCharSequenceNormalizer.java b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/BulletCharSequenceNormalizer.java
index 9d1d63304..84476bf81 100644
--- a/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/BulletCharSequenceNormalizer.java
+++ b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/BulletCharSequenceNormalizer.java
@@ -25,7 +25,7 @@
* because it is a letter in Catalan ({@code l..l}) and other orthographies; only characters that
* are unambiguously list bullets are replaced.
*/
-public class BulletCharSequenceNormalizer implements CharSequenceNormalizer {
+public class BulletCharSequenceNormalizer implements OffsetAwareNormalizer {
private static final long serialVersionUID = 5521093348871625541L;
@@ -49,4 +49,8 @@ public CharSequence normalize(CharSequence text) {
return BULLETS.normalize(text);
}
+ @Override
+ public AlignedText normalizeAligned(CharSequence text) {
+ // Same BULLETS class that normalize(CharSequence) applies, through its alignment-returning variant.
+ return BULLETS.normalizeAligned(text);
+ }
}
diff --git a/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/DashCharSequenceNormalizer.java b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/DashCharSequenceNormalizer.java
index 21c25873b..308c4cfaf 100644
--- a/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/DashCharSequenceNormalizer.java
+++ b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/DashCharSequenceNormalizer.java
@@ -25,7 +25,7 @@
* regardless of which dash the source used. The mathematical minus signs are left untouched by
* default, and {@code U+00AD} SOFT HYPHEN (a format character) is not treated as a dash.
*/
-public class DashCharSequenceNormalizer implements CharSequenceNormalizer {
+public class DashCharSequenceNormalizer implements OffsetAwareNormalizer {
private static final long serialVersionUID = 6620885194730155303L;
@@ -43,4 +43,8 @@ public CharSequence normalize(CharSequence text) {
return DASHES.normalize(text);
}
+ @Override
+ public AlignedText normalizeAligned(CharSequence text) {
+ // Same DASHES class that normalize(CharSequence) applies, through its alignment-returning variant.
+ return DASHES.normalizeAligned(text);
+ }
}
diff --git a/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/DigitCharSequenceNormalizer.java b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/DigitCharSequenceNormalizer.java
index 10bb882fe..68039c1ab 100644
--- a/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/DigitCharSequenceNormalizer.java
+++ b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/DigitCharSequenceNormalizer.java
@@ -26,7 +26,7 @@
* left unchanged. Scanning is a single O(1)-per-code-point cursor pass with no regular
* expression.
*/
-public class DigitCharSequenceNormalizer implements CharSequenceNormalizer {
+public class DigitCharSequenceNormalizer implements OffsetAwareNormalizer {
private static final long serialVersionUID = 8451270936618204413L;
@@ -48,4 +48,8 @@ private static String toAscii(int codePoint) {
return value >= 0 ? String.valueOf((char) ('0' + value)) : null;
}
+ @Override
+ public AlignedText normalizeAligned(CharSequence text) {
+ // Feeds the toAscii per-code-point mapping to CharClass.substituteAligned, which yields the AlignedText.
+ return CharClass.substituteAligned(text, DigitCharSequenceNormalizer::toAscii);
+ }
}
diff --git a/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/EllipsisCharSequenceNormalizer.java b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/EllipsisCharSequenceNormalizer.java
index e4971aa40..e5c692d73 100644
--- a/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/EllipsisCharSequenceNormalizer.java
+++ b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/EllipsisCharSequenceNormalizer.java
@@ -24,7 +24,7 @@
* Scanning is a single O(1)-per-code-point cursor pass with no regular expression. ASCII dot
* runs are left unchanged.
*/
-public class EllipsisCharSequenceNormalizer implements CharSequenceNormalizer {
+public class EllipsisCharSequenceNormalizer implements OffsetAwareNormalizer {
private static final long serialVersionUID = 2298647015583729167L;
@@ -41,6 +41,10 @@ public CharSequence normalize(CharSequence text) {
return CharClass.substitute(text, EllipsisCharSequenceNormalizer::expansion);
}
+ @Override
+ public AlignedText normalizeAligned(CharSequence text) {
+ // Same expansion mapping as normalize(CharSequence), passed to the aligned substitute variant.
+ return CharClass.substituteAligned(text, EllipsisCharSequenceNormalizer::expansion);
+ }
// The ASCII expansion for an ellipsis or leader code point, or null to copy the code point through.
private static String expansion(int codePoint) {
diff --git a/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/GermanUmlautCharSequenceNormalizer.java b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/GermanUmlautCharSequenceNormalizer.java
index 79d4e71b7..d4c2c4645 100644
--- a/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/GermanUmlautCharSequenceNormalizer.java
+++ b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/GermanUmlautCharSequenceNormalizer.java
@@ -32,7 +32,7 @@
* (for example {@code a} + U+0308) is not a member and passes through unchanged, so apply NFC
* composition first if the input may contain decomposed forms.
*/
-public class GermanUmlautCharSequenceNormalizer implements CharSequenceNormalizer {
+public class GermanUmlautCharSequenceNormalizer implements OffsetAwareNormalizer {
private static final long serialVersionUID = 7106934482250176835L;
@@ -61,6 +61,11 @@ public CharSequence normalize(CharSequence text) {
return CharClass.substitute(text, GermanUmlautCharSequenceNormalizer::expansion);
}
+ @Override
+ public AlignedText normalizeAligned(CharSequence text) {
+ // Same expansion mapping as normalize(CharSequence), passed to the aligned substitute variant.
+ return CharClass.substituteAligned(text, GermanUmlautCharSequenceNormalizer::expansion);
+ }
+
// The DIN 5007-2 transliteration for an umlaut or eszett, or null to copy the code point through.
// All members are in the BMP, so a code point equals its char; supplementary code points miss every
// case and pass through.
diff --git a/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/InvisibleCharSequenceNormalizer.java b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/InvisibleCharSequenceNormalizer.java
index 5e0465f73..91c7f7c75 100644
--- a/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/InvisibleCharSequenceNormalizer.java
+++ b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/InvisibleCharSequenceNormalizer.java
@@ -28,7 +28,7 @@
* sequences; so are variation selectors. Use this only for a matching/search form, not for
* display.
*/
-public class InvisibleCharSequenceNormalizer implements CharSequenceNormalizer {
+public class InvisibleCharSequenceNormalizer implements OffsetAwareNormalizer {
private static final long serialVersionUID = 4837512098664301927L;
@@ -69,4 +69,8 @@ public CharSequence normalize(CharSequence text) {
return INVISIBLE.removeAll(text);
}
+ @Override
+ public AlignedText normalizeAligned(CharSequence text) {
+ // Same INVISIBLE class as normalize(CharSequence), using removeAllAligned in place of removeAll.
+ return INVISIBLE.removeAllAligned(text);
+ }
}
diff --git a/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/LineBreakPreservingWhitespaceCharSequenceNormalizer.java b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/LineBreakPreservingWhitespaceCharSequenceNormalizer.java
new file mode 100644
index 000000000..ec198fda1
--- /dev/null
+++ b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/LineBreakPreservingWhitespaceCharSequenceNormalizer.java
@@ -0,0 +1,72 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.tools.util.normalizer;
+
+/**
+ * An {@link OffsetAwareNormalizer} that collapses runs of Unicode whitespace the way
+ * {@link WhitespaceCharSequenceNormalizer} does while keeping line and paragraph structure: a
+ * whitespace run that contains a line break becomes a single newline ({@code U+000A}) instead of
+ * a space, and leading and trailing whitespace is trimmed.
+ *
+ * <p>This is the form wanted for readable snippets and display: horizontal runs of spaces and tabs
+ * become a single space, yet a blank line between paragraphs survives as one newline rather than
+ * being flattened into the surrounding text. It reuses the cursor based
+ * {@link CharClass#collapsePreserving(CharSequence, CodePointSet, int)} engine, so it recognizes the
+ * full Unicode {@code White_Space} set with no regular expression.
+ */
+public class LineBreakPreservingWhitespaceCharSequenceNormalizer implements OffsetAwareNormalizer {
+
+  private static final long serialVersionUID = 5471829006633512874L;
+
+  // Stateless, so one shared instance serves every caller.
+  private static final LineBreakPreservingWhitespaceCharSequenceNormalizer INSTANCE =
+      new LineBreakPreservingWhitespaceCharSequenceNormalizer();
+
+  private static final CharClass WHITESPACE = CharClass.whitespace();
+
+  // Replacement emitted for a whitespace run that contains a line break.
+  private static final int NEWLINE = 0x000A;
+
+  // The Unicode mandatory break code points (UAX #14 classes BK/CR/LF/NL). A whitespace run that
+  // contains any of these collapses to a single newline rather than a space, so line and
+  // paragraph structure survives while horizontal runs are squeezed.
+  private static final CodePointSet LINE_BREAKS = CodePointSet.of(
+      0x000A, 0x000B, 0x000C, 0x000D, // line feed, vertical tab, form feed, carriage return
+      0x0085,                         // next line
+      0x2028, 0x2029);                // line separator, paragraph separator
+
+  /** {@return the shared, stateless instance} */
+  public static LineBreakPreservingWhitespaceCharSequenceNormalizer getInstance() {
+    return INSTANCE;
+  }
+
+  @Override
+  public CharSequence normalize(CharSequence text) {
+    // Collapse first (break-bearing runs become NEWLINE), then trim the collapsed result.
+    return WHITESPACE.trim(WHITESPACE.collapsePreserving(text, LINE_BREAKS, NEWLINE));
+  }
+
+  @Override
+  public AlignedText normalizeAligned(CharSequence text) {
+    // The same collapse-then-trim as normalize(); each stage reports its own alignment and
+    // andThen chains the two so the result is relative to the caller's text.
+    final AlignedText first = WHITESPACE.collapsePreservingAligned(text, LINE_BREAKS, NEWLINE);
+    final AlignedText second = WHITESPACE.trimAligned(first.normalized());
+    return new AlignedText(
+        text, second.normalized(), first.alignment().andThen(second.alignment()));
+  }
+}
diff --git a/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/QuoteCharSequenceNormalizer.java b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/QuoteCharSequenceNormalizer.java
index ec86e4fa6..f4551d21d 100644
--- a/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/QuoteCharSequenceNormalizer.java
+++ b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/QuoteCharSequenceNormalizer.java
@@ -25,7 +25,7 @@
* two {@link CharClass} sets, so membership is O(1) and scanning is a single cursor pass with no
* regular expression. ASCII quotes are left unchanged.
*/
-public class QuoteCharSequenceNormalizer implements CharSequenceNormalizer {
+public class QuoteCharSequenceNormalizer implements OffsetAwareNormalizer {
private static final long serialVersionUID = 3415829076651283471L;
@@ -67,4 +67,11 @@ public CharSequence normalize(CharSequence text) {
return DOUBLE.normalize(SINGLE.normalize(text));
}
+  @Override
+  public AlignedText normalizeAligned(CharSequence text) {
+    // Same order as normalize(): SINGLE runs first, DOUBLE runs on its output; andThen chains them.
+    final AlignedText first = SINGLE.normalizeAligned(text);
+    final AlignedText second = DOUBLE.normalizeAligned(first.normalized());
+    return new AlignedText(text, second.normalized(), first.alignment().andThen(second.alignment()));
+  }
}
diff --git a/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/TextNormalizer.java b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/TextNormalizer.java
index c7d877ecc..a5f1bb8de 100644
--- a/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/TextNormalizer.java
+++ b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/TextNormalizer.java
@@ -93,6 +93,15 @@ public Builder whitespace() {
return add(Dimension.WHITESPACE.defaultNormalizer());
}
+ /**
+ * {@return this builder with whitespace collapsing that preserves line and paragraph breaks
+ * appended} Horizontal runs collapse to one space and a run containing a line break collapses to
+ * one newline, so paragraph structure survives; leading and trailing whitespace is trimmed.
+ */
+ public Builder whitespacePreservingLineBreaks() {
+ return add(LineBreakPreservingWhitespaceCharSequenceNormalizer.getInstance());
+ }
+
/** {@return this builder with quotation-mark folding appended} */
public Builder quotes() {
return add(QuoteCharSequenceNormalizer.getInstance());
@@ -143,6 +152,38 @@ public CharSequenceNormalizer build() {
return new AggregateCharSequenceNormalizer(steps.toArray(new CharSequenceNormalizer[0]));
}
+    /**
+     * {@return an offset-aware composition of the rungs added so far}
+     *
+     * <p>Every rung must be an {@link OffsetAwareNormalizer}. Each per-code-point fold is one;
+     * the folds that delegate to {@link java.text.Normalizer} or to JDK case mapping (NFC, NFKC,
+     * accent folding, confusable folding, and case folding) cannot report their per-character edits
+     * and so are rejected here. The returned normalizer's
+     * {@link OffsetAwareNormalizer#normalizeAligned(CharSequence)} maps a span found in the fully
+     * normalized text back to the original input through every stage, so a match in a normalized
+     * document reports its true offsets in the source.
+     *
+     * @throws IllegalStateException if any rung is not an {@link OffsetAwareNormalizer} (for example
+     *     NFC, NFKC, accent, confusable, or case folding); the message gives the rung's 0-based
+     *     index and class name.
+     */
+    public OffsetAwareNormalizer buildAligned() {
+      final OffsetAwareNormalizer[] aligned = new OffsetAwareNormalizer[steps.size()];
+      for (int i = 0; i < steps.size(); i++) {
+        final CharSequenceNormalizer step = steps.get(i);
+        // The type pattern binds the narrowed rung, so no separate cast is needed after the check.
+        if (!(step instanceof OffsetAwareNormalizer offsetAware)) {
+          throw new IllegalStateException("rung at 0-based index " + i + " (" + step.getClass().getName()
+              + ") is not offset-aware and cannot be composed into an aligned pipeline; the "
+              + "per-code-point folds report an alignment, while folds that delegate to "
+              + "java.text.Normalizer or JDK case mapping (such as NFC, NFKC, accent, confusable, "
+              + "or case folding) do not");
+        }
+        aligned[i] = offsetAware;
+      }
+      return new AlignedAggregateCharSequenceNormalizer(aligned);
+    }
+
private Builder add(CharSequenceNormalizer normalizer) {
steps.add(normalizer);
return this;
diff --git a/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/WhitespaceCharSequenceNormalizer.java b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/WhitespaceCharSequenceNormalizer.java
index 6aa267d39..a61ffed9c 100644
--- a/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/WhitespaceCharSequenceNormalizer.java
+++ b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/WhitespaceCharSequenceNormalizer.java
@@ -25,7 +25,7 @@
* and so on), so spacing copied from the web, PDFs, or non-Latin sources normalizes consistently.
* It is the Unicode-aware, regex-free counterpart to {@link ShrinkCharSequenceNormalizer}.
*/
-public class WhitespaceCharSequenceNormalizer implements CharSequenceNormalizer {
+public class WhitespaceCharSequenceNormalizer implements OffsetAwareNormalizer {
private static final long serialVersionUID = 6748290315562094783L;
@@ -44,4 +44,11 @@ public CharSequence normalize(CharSequence text) {
return WHITESPACE.trim(WHITESPACE.collapse(text));
}
+  @Override
+  public AlignedText normalizeAligned(CharSequence text) {
+    // Two stages, as in normalize(): collapse, then trim the collapsed text; andThen chains them.
+    final AlignedText first = WHITESPACE.collapseAligned(text);
+    final AlignedText second = WHITESPACE.trimAligned(first.normalized());
+    return new AlignedText(text, second.normalized(), first.alignment().andThen(second.alignment()));
+  }
}
diff --git a/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/util/normalizer/AlignedNormalizerPipelineTest.java b/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/util/normalizer/AlignedNormalizerPipelineTest.java
new file mode 100644
index 000000000..7813babe5
--- /dev/null
+++ b/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/util/normalizer/AlignedNormalizerPipelineTest.java
@@ -0,0 +1,342 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.tools.util.normalizer;
+
+import org.junit.jupiter.api.Test;
+
+import opennlp.tools.util.Span;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertFalse;
+import static org.junit.jupiter.api.Assertions.assertSame;
+import static org.junit.jupiter.api.Assertions.assertThrows;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
+/**
+ * Exercises {@link OffsetAwareNormalizer} and {@code TextNormalizer.Builder.buildAligned()}: the
+ * cursor-based rungs report alignments, an aligned pipeline composes them with
+ * {@link Alignment#andThen(Alignment)} so a span found in the fully normalized text maps back to the
+ * original input, and a non-alignable rung is rejected loudly.
+ */
+public class AlignedNormalizerPipelineTest {
+
+ private static final int ZERO_WIDTH_SPACE = 0x200B;
+ private static final int EM_DASH = 0x2014;
+ private static final int YEZIDI_HYPHEN = 0x10EAD; // a supplementary (non-BMP) dash
+ private static final int MATH_BOLD_DIGIT_ZERO = 0x1D7CE; // a supplementary decimal digit
+
+ private static String cp(int codePoint) {
+ return new String(Character.toChars(codePoint));
+ }
+
+ private static String covered(AlignedText aligned, int normalizedStart, int normalizedEnd) {
+ final Span span = aligned.toOriginalSpan(normalizedStart, normalizedEnd);
+ return aligned.original().subSequence(span.getStart(), span.getEnd()).toString();
+ }
+
+ // The aligned form must always reproduce exactly what the plain form produces.
+ @Test
+ void alignedNormalizedTextMatchesPlainForEveryRung() {
+ final OffsetAwareNormalizer[] rungs = {
+ WhitespaceCharSequenceNormalizer.getInstance(),
+ LineBreakPreservingWhitespaceCharSequenceNormalizer.getInstance(),
+ DashCharSequenceNormalizer.getInstance(),
+ InvisibleCharSequenceNormalizer.getInstance(),
+ QuoteCharSequenceNormalizer.getInstance(),
+ DigitCharSequenceNormalizer.getInstance(),
+ EllipsisCharSequenceNormalizer.getInstance(),
+ BulletCharSequenceNormalizer.getInstance(),
+ GermanUmlautCharSequenceNormalizer.getInstance()
+ };
+ final String[] inputs = {
+ "",
+ "plain",
+ " lots of\tspace ",
+ "\n\n para one\n\n\tpara two \n",
+ "a" + cp(ZERO_WIDTH_SPACE) + "b" + cp(YEZIDI_HYPHEN) + "c" + cp(EM_DASH) + "d",
+ cp(ZERO_WIDTH_SPACE) + " " + cp(ZERO_WIDTH_SPACE),
+ // quotes, ellipsis, eszett, bullet, fullwidth and supplementary digits in one string
+ cp(0x201C) + "don" + cp(0x2019) + "t " + cp(0x2026) + " Stra" + cp(0x00DF) + "e "
+ + cp(0x2022) + " " + cp(0xFF15) + cp(MATH_BOLD_DIGIT_ZERO)
+ };
+ for (final OffsetAwareNormalizer rung : rungs) {
+ for (final String input : inputs) {
+ assertEquals(rung.normalize(input).toString(), rung.normalizeAligned(input).normalized(),
+ rung.getClass().getSimpleName() + " on [" + input + "]");
+ }
+ }
+ }
+
+ @Test
+ void whitespaceCollapseAndTrimMapsSpanBackToOriginal() {
+ final String original = " hello world ";
+ final AlignedText aligned = WhitespaceCharSequenceNormalizer.getInstance()
+ .normalizeAligned(original);
+ assertEquals("hello world", aligned.normalized());
+ // "world" sits at [6, 11) in the collapsed/trimmed form.
+ final Span span = aligned.toOriginalSpan(6, 11);
+ assertEquals(original.indexOf("world"), span.getStart());
+ assertEquals("world", covered(aligned, 6, 11));
+ }
+
+ @Test
+ void dashFoldOfSupplementaryDashMapsSpanBackToOriginal() {
+ final String original = "a" + cp(YEZIDI_HYPHEN) + "b";
+ final AlignedText aligned = DashCharSequenceNormalizer.getInstance().normalizeAligned(original);
+ assertEquals("a-b", aligned.normalized());
+ // The two-unit supplementary dash folded to one ASCII hyphen, so 'b' moved from 3 to 2.
+ assertEquals("b", covered(aligned, 2, 3));
+ assertEquals(cp(YEZIDI_HYPHEN), covered(aligned, 1, 2));
+ }
+
+ @Test
+ void invisibleStripMapsSpanBackAcrossDeletion() {
+ final String original = "a" + cp(ZERO_WIDTH_SPACE) + "b";
+ final AlignedText aligned = InvisibleCharSequenceNormalizer.getInstance()
+ .normalizeAligned(original);
+ assertEquals("ab", aligned.normalized());
+ // 'b' is at index 1 in "ab" but index 2 in the original; the deleted ZWSP must not be covered.
+ assertEquals("b", covered(aligned, 1, 2));
+ assertEquals(2, aligned.toOriginalSpan(1, 2).getStart());
+ }
+
+ @Test
+ void pipelineComposesStripInvisibleWhitespaceAndDashesBackToOriginal() {
+ // 'a', zero-width space, two spaces, 'b', em dash, 'c'.
+ final String original = "a" + cp(ZERO_WIDTH_SPACE) + " b" + cp(EM_DASH) + "c";
+ final OffsetAwareNormalizer pipeline = TextNormalizer.builder()
+ .stripInvisible().whitespace().dashes().buildAligned();
+
+ final AlignedText aligned = pipeline.normalizeAligned(original);
+ assertEquals("a b-c", aligned.normalized());
+ assertEquals(pipeline.normalize(original).toString(), aligned.normalized());
+ // "b-c" at [2, 5) maps back across a deletion, a collapse, and a dash fold to 'b', em dash, 'c'.
+ assertEquals("b" + cp(EM_DASH) + "c", covered(aligned, 2, 5));
+ }
+
+ @Test
+ void emptyAlignedPipelineIsIdentity() {
+ final AlignedText aligned = TextNormalizer.builder().buildAligned().normalizeAligned("Hello");
+ assertEquals("Hello", aligned.normalized());
+ assertEquals("Hello", covered(aligned, 0, 5));
+ }
+
+ @Test
+ void buildAlignedRejectsNonAlignableRungLoudly() {
+ final IllegalStateException ex = assertThrows(IllegalStateException.class,
+ () -> TextNormalizer.builder().nfc().whitespace().buildAligned());
+ assertTrue(ex.getMessage().contains("Nfc"), ex.getMessage());
+ assertTrue(ex.getMessage().contains("offset-aware"), ex.getMessage());
+ }
+
+ @Test
+ void buildAlignedReportsTheOffendingRungIndexWhenItIsNotFirst() {
+ // A non-alignable rung after several offset-aware ones must still be rejected, and the message
+ // must name its 0-based position (index 2) and type so the failure points at the right fold.
+ final IllegalStateException ex = assertThrows(IllegalStateException.class,
+ () -> TextNormalizer.builder().whitespace().dashes().caseFold().buildAligned());
+ assertTrue(ex.getMessage().contains("rung at 0-based index 2"), ex.getMessage());
+ assertTrue(ex.getMessage().contains("CaseFold"), ex.getMessage());
+ }
+
+ @Test
+ void buildAlignedRejectsEachKindOfNonAlignableRung() {
+ // Each of these folds routes through java.text.Normalizer or JDK case mapping and must be rejected.
+ assertThrows(IllegalStateException.class,
+ () -> TextNormalizer.builder().nfkc().buildAligned());
+ assertThrows(IllegalStateException.class,
+ () -> TextNormalizer.builder().accentFold().buildAligned());
+ assertThrows(IllegalStateException.class,
+ () -> TextNormalizer.builder().caseFold().buildAligned());
+ }
+
+ @Test
+ void capabilityIsDetectableByInstanceOf() {
+ assertTrue(WhitespaceCharSequenceNormalizer.getInstance() instanceof OffsetAwareNormalizer);
+ assertTrue(DashCharSequenceNormalizer.getInstance() instanceof OffsetAwareNormalizer);
+ assertTrue(InvisibleCharSequenceNormalizer.getInstance() instanceof OffsetAwareNormalizer);
+ assertFalse(NfcCharSequenceNormalizer.getInstance() instanceof OffsetAwareNormalizer);
+ assertTrue(TextNormalizer.builder().whitespace().dashes().buildAligned()
+ instanceof OffsetAwareNormalizer);
+ // The per-code-point substitution folds are offset-aware too.
+ assertTrue(QuoteCharSequenceNormalizer.getInstance() instanceof OffsetAwareNormalizer);
+ assertTrue(DigitCharSequenceNormalizer.getInstance() instanceof OffsetAwareNormalizer);
+ assertTrue(EllipsisCharSequenceNormalizer.getInstance() instanceof OffsetAwareNormalizer);
+ assertTrue(BulletCharSequenceNormalizer.getInstance() instanceof OffsetAwareNormalizer);
+ assertTrue(GermanUmlautCharSequenceNormalizer.getInstance() instanceof OffsetAwareNormalizer);
+ // The folds that route through java.text.Normalizer or JDK case mapping cannot, by design.
+ assertFalse(NfkcCharSequenceNormalizer.getInstance() instanceof OffsetAwareNormalizer);
+ assertFalse(CaseFoldCharSequenceNormalizer.getInstance() instanceof OffsetAwareNormalizer);
+ assertFalse(AccentFoldCharSequenceNormalizer.getInstance() instanceof OffsetAwareNormalizer);
+ assertFalse(ConfusableSkeletonCharSequenceNormalizer.getInstance()
+ instanceof OffsetAwareNormalizer);
+ }
+
+ @Test
+ void roundTripOfAFullySpanningMatchReturnsTheWholeOriginal() {
+ final String original = " the quick ";
+ final AlignedText aligned = WhitespaceCharSequenceNormalizer.getInstance()
+ .normalizeAligned(original);
+ final String normalized = aligned.normalized();
+ assertEquals("the quick", normalized);
+ final Span whole = aligned.toOriginalSpan(0, normalized.length());
+ assertSame(original, aligned.original());
+ // The match spanning the whole normalized text covers the original from first to last kept char.
+ assertEquals("the quick", original.subSequence(whole.getStart(), whole.getEnd()).toString());
+ }
+
+ @Test
+ void lineBreakPreservingCollapsesHorizontalRunsButKeepsBreaks() {
+ final LineBreakPreservingWhitespaceCharSequenceNormalizer rung =
+ LineBreakPreservingWhitespaceCharSequenceNormalizer.getInstance();
+ final String original = "Hello world\n\n\tfoo bar";
+ assertEquals("Hello world\nfoo bar", rung.normalize(original).toString());
+
+ // The plain whitespace rung instead flattens the blank line into a single space.
+ assertEquals("Hello world foo bar",
+ WhitespaceCharSequenceNormalizer.getInstance().normalize(original).toString());
+
+ final AlignedText aligned = rung.normalizeAligned(original);
+ assertEquals(rung.normalize(original).toString(), aligned.normalized());
+ // "bar" sits at [16, 19) in the collapsed form and at [21, 24) in the original.
+ assertEquals(original.indexOf("bar"), aligned.toOriginalSpan(16, 19).getStart());
+ assertEquals("bar", covered(aligned, 16, 19));
+ // The preserved newline at index 11 maps back to the whole "\n\n\t" run it came from.
+ assertEquals("\n\n\t", covered(aligned, 11, 12));
+ }
+
+ @Test
+ void lineBreakPreservingTrimsLeadingAndTrailingBreaks() {
+ final LineBreakPreservingWhitespaceCharSequenceNormalizer rung =
+ LineBreakPreservingWhitespaceCharSequenceNormalizer.getInstance();
+ final String original = "\n\nHello\n\n";
+ final AlignedText aligned = rung.normalizeAligned(original);
+ assertEquals("Hello", aligned.normalized());
+ assertEquals("Hello", covered(aligned, 0, 5));
+ assertEquals(original.indexOf("Hello"), aligned.toOriginalSpan(0, 5).getStart());
+ }
+
+ @Test
+ void lineBreakPreservingComposesInAnAlignedPipeline() {
+ assertTrue(LineBreakPreservingWhitespaceCharSequenceNormalizer.getInstance()
+ instanceof OffsetAwareNormalizer);
+ final String original = "a" + cp(ZERO_WIDTH_SPACE) + " b\n\nc" + cp(EM_DASH) + "d";
+ final OffsetAwareNormalizer pipeline = TextNormalizer.builder()
+ .stripInvisible().whitespacePreservingLineBreaks().dashes().buildAligned();
+
+ final AlignedText aligned = pipeline.normalizeAligned(original);
+ assertEquals("a b\nc-d", aligned.normalized());
+ assertEquals(pipeline.normalize(original).toString(), aligned.normalized());
+ // "c-d" at [4, 7) maps back across a deletion, a break-preserving collapse, and a dash fold.
+ assertEquals("c" + cp(EM_DASH) + "d", covered(aligned, 4, 7));
+ }
+
+ @Test
+ void pipelineMapsAnOriginalSpanForwardToTheNormalizedText() {
+ final String original = "a" + cp(ZERO_WIDTH_SPACE) + " b" + cp(EM_DASH) + "c";
+ final AlignedText aligned = TextNormalizer.builder()
+ .stripInvisible().whitespace().dashes().buildAligned().normalizeAligned(original);
+ assertEquals("a b-c", aligned.normalized());
+ // 'b' is at original index 4 and normalized index 2; the forward mapping must agree.
+ final Span forward = aligned.toNormalizedSpan(4, 5);
+ assertEquals(2, forward.getStart());
+ assertEquals("b", aligned.normalized().substring(forward.getStart(), forward.getEnd()));
+ }
+
+ @Test
+ void lineBreakPreservingNormalizesCrLfAndUnicodeSeparators() {
+ final LineBreakPreservingWhitespaceCharSequenceNormalizer rung =
+ LineBreakPreservingWhitespaceCharSequenceNormalizer.getInstance();
+ assertEquals("a\nb", rung.normalize("a\r\nb").toString()); // CRLF -> one newline
+ assertEquals("a\nb", rung.normalize("a\n\n\n\nb").toString()); // blank lines -> one newline
+ assertEquals("x\ny", rung.normalize("x" + cp(0x2028) + "y").toString()); // line separator
+ assertEquals("p\nq", rung.normalize("p" + cp(0x2029) + "q").toString()); // paragraph separator
+ // A horizontal run still collapses to a space even when mixed with a break-bearing run.
+ assertEquals("a b\nc", rung.normalize("a b \n c").toString());
+ }
+
+ @Test
+ void whitespaceRungCollapsesAllWhitespaceToEmptyWithAValidSpan() {
+ final AlignedText aligned =
+ WhitespaceCharSequenceNormalizer.getInstance().normalizeAligned(" ");
+ assertEquals("", aligned.normalized());
+ // Mapping the empty match must yield a valid empty span rather than throwing.
+ final Span empty = aligned.toOriginalSpan(0, 0);
+ assertEquals(empty.getStart(), empty.getEnd());
+ }
+
+ @Test
+ void ellipsisExpansionMapsSpanBackToOriginal() {
+ final String original = "a" + cp(0x2026) + "b";
+ final AlignedText aligned = EllipsisCharSequenceNormalizer.getInstance()
+ .normalizeAligned(original);
+ assertEquals("a...b", aligned.normalized());
+ // The single ellipsis expanded to three dots, so 'b' moved from index 2 to index 4.
+ assertEquals("b", covered(aligned, 4, 5));
+ // The whole expansion, and any sub-span of it, maps back to the one source ellipsis.
+ assertEquals(cp(0x2026), covered(aligned, 1, 4));
+ assertEquals(cp(0x2026), covered(aligned, 2, 3));
+ }
+
+ @Test
+ void germanUmlautExpansionMapsSpanBackToOriginal() {
+ final String original = "Stra" + cp(0x00DF) + "e"; // "Strasse" from the eszett form
+ final AlignedText aligned = GermanUmlautCharSequenceNormalizer.getInstance()
+ .normalizeAligned(original);
+ assertEquals("Strasse", aligned.normalized());
+ // The eszett expanded to "ss", so the trailing 'e' moved from index 5 to index 6.
+ assertEquals("e", covered(aligned, 6, 7));
+ // Both halves of "ss" map back to the single source eszett.
+ assertEquals(cp(0x00DF), covered(aligned, 4, 6));
+ assertEquals(cp(0x00DF), covered(aligned, 5, 6));
+ }
+
+ @Test
+ void digitFoldOfSupplementaryDigitMapsSpanBackToOriginal() {
+ final String original = "a" + cp(MATH_BOLD_DIGIT_ZERO) + "b";
+ final AlignedText aligned = DigitCharSequenceNormalizer.getInstance()
+ .normalizeAligned(original);
+ assertEquals("a0b", aligned.normalized());
+ // The two-unit supplementary digit folded to one ASCII '0', so 'b' moved from 3 to 2.
+ assertEquals("b", covered(aligned, 2, 3));
+ assertEquals(cp(MATH_BOLD_DIGIT_ZERO), covered(aligned, 1, 2));
+ }
+
+ @Test
+ void quoteFoldMapsSpanBackToOriginal() {
+ final String original = cp(0x201C) + "hi" + cp(0x201D); // curly double quotes
+ final AlignedText aligned = QuoteCharSequenceNormalizer.getInstance()
+ .normalizeAligned(original);
+ assertEquals("\"hi\"", aligned.normalized());
+ assertEquals("hi", covered(aligned, 1, 3));
+ // A one-for-one fold, so the opening quote maps straight back to the curly source quote.
+ assertEquals(cp(0x201C), covered(aligned, 0, 1));
+ }
+
+ @Test
+ void substitutionFoldsComposeInAnAlignedPipeline() {
+ final String original = "say " + cp(0x201C) + "hi" + cp(0x201D) + cp(0x2026);
+ final OffsetAwareNormalizer pipeline = TextNormalizer.builder()
+ .quotes().ellipsis().buildAligned();
+ final AlignedText aligned = pipeline.normalizeAligned(original);
+ assertEquals("say \"hi\"...", aligned.normalized());
+ assertEquals(pipeline.normalize(original).toString(), aligned.normalized());
+ // The expanded "..." maps back across the quote fold to the single source ellipsis.
+ assertEquals(cp(0x2026), covered(aligned, 8, 11));
+ assertEquals("hi", covered(aligned, 5, 7));
+ }
+}
diff --git a/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/util/normalizer/GermanUmlautCharSequenceNormalizerTest.java b/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/util/normalizer/GermanUmlautCharSequenceNormalizerTest.java
index c4752fdad..ac9abdabd 100644
--- a/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/util/normalizer/GermanUmlautCharSequenceNormalizerTest.java
+++ b/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/util/normalizer/GermanUmlautCharSequenceNormalizerTest.java
@@ -60,6 +60,17 @@ void testCapitalEszett() {
assertEquals("STRASSE", fold("STRA" + cp(0x1E9E) + "E")); // STRASSE
}
+  @Test
+  void testCapitalEszettOffsets() {
+    // The capital eszett (U+1E9E) expands one source character into two, so the aligned fold reports
+    // a 1->2 replacement and a span over the two produced characters maps back to the single source.
+    final AlignedText aligned = FOLD.normalizeAligned("A" + cp(0x1E9E) + "B"); // 'A', eszett, 'B'
+    assertEquals("ASSB", aligned.normalized());
+    final var source = aligned.alignment().toOriginalSpan(1, 3); // the produced "SS"
+    assertEquals(1, source.getStart());
+    assertEquals(2, source.getEnd());
+  }
+
@Test
void testAsciiAndOtherCharactersUnchanged() {
assertEquals("hello world 123", fold("hello world 123"));