From b24c9ee3df90d01cfafc87a0bbebfd4b1e30066d Mon Sep 17 00:00:00 2001
From: Kristian Rickert <krickert@gmail.com>
Date: Tue, 23 Jun 2026 09:50:34 -0400
Subject: [PATCH] OPENNLP-1850 Offset/alignment layer: Alignment, AlignedText,
 buildAligned, *Aligned (1b)

The conceptually hard half of the former foundation PR, split out on review request: the
bidirectional Alignment edit-sequence and AlignedText, the OffsetAwareNormalizer capability
interface, TextNormalizer.buildAligned(), the *Aligned CharClass variants and the offset-aware
rungs, the line-break-preserving rung, and the dense span-mapping tests (binary-search span
mapping, expansion/deletion edge cases, andThen composition including the insertion-in-expansion
case). Builds on the engine in 1a.
---
 .../tools/util/normalizer/AlignedText.java    |  58 +++
 .../tools/util/normalizer/Alignment.java      | 293 +++++++++++++++
 .../tools/util/normalizer/CharClass.java      | 200 ++++++++++
 .../normalizer/OffsetAwareNormalizer.java     |  49 +++
 .../tools/util/normalizer/AlignmentTest.java  | 258 +++++++++++++
 .../tools/util/normalizer/CharClassTest.java  | 183 ++++++++++
 ...lignedAggregateCharSequenceNormalizer.java |  67 ++++
 .../BulletCharSequenceNormalizer.java         |   6 +-
 .../DashCharSequenceNormalizer.java           |   6 +-
 .../DigitCharSequenceNormalizer.java          |   6 +-
 .../EllipsisCharSequenceNormalizer.java       |   6 +-
 .../GermanUmlautCharSequenceNormalizer.java   |   7 +-
 .../InvisibleCharSequenceNormalizer.java      |   6 +-
 ...rvingWhitespaceCharSequenceNormalizer.java |  72 ++++
 .../QuoteCharSequenceNormalizer.java          |   9 +-
 .../tools/util/normalizer/TextNormalizer.java |  41 +++
 .../WhitespaceCharSequenceNormalizer.java     |   9 +-
 .../AlignedNormalizerPipelineTest.java        | 342 ++++++++++++++++++
 ...ermanUmlautCharSequenceNormalizerTest.java |  11 +
 19 files changed, 1621 insertions(+), 8 deletions(-)
 create mode 100644 opennlp-api/src/main/java/opennlp/tools/util/normalizer/AlignedText.java
 create mode 100644 opennlp-api/src/main/java/opennlp/tools/util/normalizer/Alignment.java
 create mode 100644 opennlp-api/src/main/java/opennlp/tools/util/normalizer/OffsetAwareNormalizer.java
 create mode 100644 opennlp-api/src/test/java/opennlp/tools/util/normalizer/AlignmentTest.java
 create mode 100644 opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/AlignedAggregateCharSequenceNormalizer.java
 create mode 100644 opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/LineBreakPreservingWhitespaceCharSequenceNormalizer.java
 create mode 100644 opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/util/normalizer/AlignedNormalizerPipelineTest.java

diff --git a/opennlp-api/src/main/java/opennlp/tools/util/normalizer/AlignedText.java b/opennlp-api/src/main/java/opennlp/tools/util/normalizer/AlignedText.java
new file mode 100644
index 000000000..dc68b1f09
--- /dev/null
+++ b/opennlp-api/src/main/java/opennlp/tools/util/normalizer/AlignedText.java
@@ -0,0 +1,58 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.tools.util.normalizer;
+
+import opennlp.tools.util.Span;
+
+/**
+ * The result of a normalization that keeps the original text alongside the normalized form and a
+ * full {@link Alignment} between them.
+ *
+ * <p>The original is the source of truth, the normalized form is the derived view tuned for
+ * matching and search, and the alignment maps spans between them through deletions, collapses, and
+ * expansions. Use {@link #toOriginalSpan(int, int)} to report a match found in the normalized
+ * form against the original, and {@link #toNormalizedSpan(int, int)} to go the other way. Both
+ * methods delegate to the {@link Alignment}, which rejects out-of-range or inverted offsets.</p>
+ *
+ * @param original   The untouched source text.
+ * @param normalized The normalized text.
+ * @param alignment  The alignment between the normalized and original text.
+ */
+public record AlignedText(CharSequence original, String normalized, Alignment alignment) {
+
+  /**
+   * Maps a span of the normalized text back to the tightest span of the original text.
+   *
+   * @param normalizedStart The inclusive start offset in the normalized text.
+   * @param normalizedEnd   The exclusive end offset in the normalized text.
+   * @return The corresponding original span, as computed by {@link Alignment#toOriginalSpan(int, int)}.
+   */
+  public Span toOriginalSpan(int normalizedStart, int normalizedEnd) {
+    return alignment.toOriginalSpan(normalizedStart, normalizedEnd);
+  }
+
+  /**
+   * Maps a span of the original text forward to the normalized text.
+   *
+   * @param originalStart The inclusive start offset in the original text.
+   * @param originalEnd   The exclusive end offset in the original text.
+   * @return The corresponding normalized span, as computed by {@link Alignment#toNormalizedSpan(int, int)}.
+   */
+  public Span toNormalizedSpan(int originalStart, int originalEnd) {
+    return alignment.toNormalizedSpan(originalStart, originalEnd);
+  }
+}
diff --git a/opennlp-api/src/main/java/opennlp/tools/util/normalizer/Alignment.java b/opennlp-api/src/main/java/opennlp/tools/util/normalizer/Alignment.java
new file mode 100644
index 000000000..0f1d47a6a
--- /dev/null
+++ b/opennlp-api/src/main/java/opennlp/tools/util/normalizer/Alignment.java
@@ -0,0 +1,293 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.tools.util.normalizer;
+
+import java.util.Arrays;
+
+import opennlp.tools.util.Span;
+
+/**
+ * A bidirectional alignment between an original text and a normalized form of it.
+ *
+ * <p>Normalization edits text in ways that move character offsets: a run of whitespace collapses to
+ * one space, a supplementary dash folds to a single ASCII hyphen, a case fold can grow text
+ * (German {@code eszett} to {@code ss}), and trimming or stripping deletes characters outright. An
+ * {@code Alignment} records those edits as a sequence of <em>equal</em> runs (text copied through
+ * unchanged in length) and <em>replace</em> runs (a block of original characters that produced a
+ * block of normalized characters), so any span in either form can be mapped to the other.</p>
+ *
+ * <p>Deletions are represented as gaps and expansions as shared blocks, rather than as a single
+ * original offset per normalized character (which would assume the normalized text contiguously
+ * covers the original). Mapping is therefore done span to span
+ * ({@link #toOriginalSpan(int, int)} / {@link #toNormalizedSpan(int, int)}), so a match that
+ * ends next to deleted text reports a tight span rather than over-covering the deletion. Two
+ * alignments compose with {@link #andThen(Alignment)}, which is what lets a multi-stage
+ * normalization pipeline still map a result all the way back to the original.</p>
+ *
+ * <p>Instances are immutable and thread-safe; build one with {@link Builder}.</p>
+ */
+public final class Alignment {
+
+  // For normalized character k, originalStart[k]/originalEnd[k] are the half-open original range it
+  // was produced from. Characters copied unchanged map one to one; characters from a collapse or
+  // expansion share their run's whole original range (it cannot be subdivided); deleted original
+  // characters appear as a gap that no normalized character covers.
+  private final int[] originalStart;
+  private final int[] originalEnd;
+  private final int originalLength;
+
+  private Alignment(int[] originalStart, int[] originalEnd, int originalLength) {
+    this.originalStart = originalStart; // kept by reference, not copied
+    this.originalEnd = originalEnd; // kept by reference, not copied
+    this.originalLength = originalLength;
+  }
+
+  /** {@return the length, in UTF-16 chars, of the normalized text this alignment was built for} */
+  public int normalizedLength() {
+    return originalStart.length; // one array slot per normalized char
+  }
+
+  /** {@return the length, in UTF-16 chars, of the original text this alignment was built for} */
+  public int originalLength() {
+    return originalLength;
+  }
+
+  /**
+   * Maps a half-open span of the normalized text to the tightest half-open span of the original
+   * text that produced it.
+   *
+   * @param normalizedStart The inclusive start offset, in {@code [0, normalizedLength()]}.
+   * @param normalizedEnd   The exclusive end offset, in {@code [normalizedStart, normalizedLength()]}.
+   * @return The corresponding original span; an empty normalized span yields an empty original span.
+   * @throws IndexOutOfBoundsException Thrown if the offsets are out of range or inverted.
+   */
+  public Span toOriginalSpan(int normalizedStart, int normalizedEnd) {
+    checkRange(normalizedStart, normalizedEnd, normalizedLength());
+    if (normalizedStart == normalizedEnd) { // empty span: no last character to read an end from
+      final int at = normalizedStart < normalizedLength()
+          ? originalStart[normalizedStart] : originalLength;
+      return new Span(at, at);
+    }
+    return new Span(originalStart[normalizedStart], originalEnd[normalizedEnd - 1]); // first..last
+  }
+
+  /**
+   * Maps a half-open span of the original text to the half-open span of the normalized text that
+   * covers it. Original characters that were deleted map to an empty span at the point where they
+   * were removed.
+   *
+   * @param originalStartOffset The inclusive start offset, in {@code [0, originalLength()]}.
+   * @param originalEndOffset   The exclusive end offset, in {@code [originalStartOffset, originalLength()]}.
+   * @return The corresponding normalized span; never inverted.
+   * @throws IndexOutOfBoundsException Thrown if the offsets are out of range or inverted.
+   */
+  public Span toNormalizedSpan(int originalStartOffset, int originalEndOffset) {
+    checkRange(originalStartOffset, originalEndOffset, originalLength);
+    final int start = firstIndexEndingAfter(originalStartOffset); // skips chars ending at or before it
+    final int end = firstIndexStartingAtOrAfter(originalEndOffset); // first char at or past the end
+    return new Span(start, Math.max(start, end)); // guards an empty span at an insertion point
+  }
+
+  /**
+   * Maps a normalized offset to the original offset where its character begins (start semantics).
+   * Prefer {@link #toOriginalSpan(int, int)} for mapping a match, since a single offset cannot
+   * distinguish the start and end of a span across a deletion.
+   *
+   * @param normalizedOffset An offset in {@code [0, normalizedLength()]}.
+   * @return The corresponding original offset; {@code originalLength()} when at the normalized end.
+   * @throws IndexOutOfBoundsException Thrown if {@code normalizedOffset} is out of range.
+   */
+  public int toOriginalOffset(int normalizedOffset) {
+    if (normalizedOffset < 0 || normalizedOffset > normalizedLength()) {
+      throw new IndexOutOfBoundsException("normalized offset " + normalizedOffset
+          + " is outside [0, " + normalizedLength() + "]");
+    }
+    return normalizedOffset < normalizedLength() ? originalStart[normalizedOffset] : originalLength;
+  }
+
+  /**
+   * Composes this alignment with one that further normalizes this alignment's normalized text.
+   *
+   * <p>If this maps {@code original -> middle} and {@code next} maps {@code middle -> final}, the
+   * result maps {@code original -> final} directly, so a span found in the final text can be mapped
+   * straight back to the original without keeping the intermediate stages.</p>
+   *
+   * @param next The next stage, whose original side is this stage's normalized text.
+   * @return The composed alignment, from this stage's original text to {@code next}'s normalized text.
+   * @throws IllegalArgumentException Thrown if {@code next.originalLength()} does not equal this
+   *     {@code normalizedLength()} (the stages do not line up).
+   */
+  public Alignment andThen(Alignment next) {
+    if (next.originalLength != normalizedLength()) {
+      throw new IllegalArgumentException("stages do not line up: this normalizedLength="
+          + normalizedLength() + " but next originalLength=" + next.originalLength);
+    }
+    final int finalLength = next.normalizedLength();
+    final int[] starts = new int[finalLength];
+    final int[] ends = new int[finalLength];
+    for (int f = 0; f < finalLength; f++) { // one entry per final character
+      final int middleStart = next.originalStart[f];
+      final int middleEnd = next.originalEnd[f];
+      final int start = middleStart < normalizedLength() ? originalStart[middleStart] : originalLength;
+      final int end = middleEnd > 0 ? originalEnd[middleEnd - 1] : 0; // last covered middle char
+      starts[f] = start;
+      // Math.max keeps the original span non-inverted. When next inserted this final character
+      // (a zero-width middle range, middleStart == middleEnd) the max collapses it to a zero-width
+      // original span -- correct for every insertion except one landing strictly inside an
+      // expansion this stage produced, where the characters on either side share one atomic
+      // original block (originalEnd[middleEnd - 1] > originalStart[middleStart]) that has no
+      // interior offset to point at. There the insertion is attributed to that whole block, the
+      // only choice that keeps originalStart/originalEnd sorted so toOriginalSpan/toNormalizedSpan
+      // keep their O(log n) search; forcing it to zero-width would push originalEnd below its
+      // predecessor and corrupt the reverse mapping.
+      ends[f] = Math.max(start, end);
+    }
+    return new Alignment(starts, ends, originalLength);
+  }
+
+  // First normalized index whose original coverage ends strictly after offset (so it covers or
+  // follows offset); normalizedLength() when offset is at or past the last covered original char.
+  private int firstIndexEndingAfter(int offset) {
+    int low = 0;
+    int high = originalEnd.length; // exclusive upper bound
+    while (low < high) { // binary search; relies on originalEnd being non-decreasing
+      final int mid = (low + high) >>> 1; // unsigned shift: the midpoint cannot overflow
+      if (originalEnd[mid] > offset) {
+        high = mid;
+      } else {
+        low = mid + 1;
+      }
+    }
+    return low;
+  }
+
+  // First normalized index whose original coverage starts at or after offset; length if none does.
+  private int firstIndexStartingAtOrAfter(int offset) {
+    int low = 0;
+    int high = originalStart.length; // exclusive upper bound
+    while (low < high) { // binary search; relies on originalStart being non-decreasing
+      final int mid = (low + high) >>> 1; // unsigned shift: the midpoint cannot overflow
+      if (originalStart[mid] >= offset) {
+        high = mid;
+      } else {
+        low = mid + 1;
+      }
+    }
+    return low;
+  }
+
+  private static void checkRange(int start, int end, int length) {
+    if (start < 0 || end > length || start > end) { // negative, past the end, or inverted
+      throw new IndexOutOfBoundsException("span [" + start + ", " + end + ") is outside [0, "
+          + length + "]");
+    }
+  }
+
+  /**
+   * Builds an {@link Alignment} as the normalized text is produced, by recording each edit in order.
+   * Call {@link #equal(int)} for characters copied through unchanged and {@link #replace(int, int)}
+   * for a block that was rewritten (including deletions and insertions), then {@link #build(int)}.
+   */
+  public static final class Builder {
+
+    private static final int MAX_ARRAY_SIZE = Integer.MAX_VALUE - 8; // capacity ceiling for grow()
+
+    private int[] starts = new int[16];
+    private int[] ends = new int[16];
+    private int count; // normalized characters recorded so far
+    private int originalCursor; // original characters consumed so far
+
+    /**
+     * Records {@code charCount} characters copied through unchanged (a one to one run).
+     *
+     * @param charCount The number of UTF-16 characters; must not be negative.
+     * @return This builder.
+     */
+    public Builder equal(int charCount) {
+      if (charCount < 0) {
+        throw new IllegalArgumentException("charCount must not be negative: " + charCount);
+      }
+      for (int i = 0; i < charCount; i++) {
+        append(originalCursor, originalCursor + 1); // each char covers exactly itself
+        originalCursor++;
+      }
+      return this;
+    }
+
+    /**
+     * Records a rewritten block: {@code originalCount} original characters that produced
+     * {@code normalizedCount} normalized characters. Each produced character is attributed to the
+     * whole original block, since a collapse or expansion cannot be subdivided. {@code 0} for
+     * {@code normalizedCount} is a deletion; {@code 0} for {@code originalCount} is an insertion.
+     *
+     * @param originalCount   The number of original characters consumed; must not be negative.
+     * @param normalizedCount The number of normalized characters produced; must not be negative.
+     * @return This builder.
+     */
+    public Builder replace(int originalCount, int normalizedCount) {
+      if (originalCount < 0 || normalizedCount < 0) {
+        throw new IllegalArgumentException("counts must not be negative: " + originalCount
+            + ", " + normalizedCount);
+      }
+      final int blockEnd = originalCursor + originalCount;
+      for (int i = 0; i < normalizedCount; i++) {
+        append(originalCursor, blockEnd); // every produced char shares the whole block
+      }
+      originalCursor = blockEnd; // a deletion only advances the cursor, leaving a gap
+      return this;
+    }
+
+    /**
+     * Finalizes the alignment.
+     *
+     * @param originalLength The full length of the original text.
+     * @return The immutable {@link Alignment}, backed by trimmed copies of this builder's arrays.
+     * @throws IllegalStateException Thrown if the recorded edits do not consume exactly
+     *     {@code originalLength} original characters (a sign that some input was not accounted for).
+     */
+    public Alignment build(int originalLength) {
+      if (originalCursor != originalLength) {
+        throw new IllegalStateException("edits consumed " + originalCursor
+            + " original characters but originalLength is " + originalLength);
+      }
+      return new Alignment(Arrays.copyOf(starts, count), Arrays.copyOf(ends, count), originalLength);
+    }
+
+    private void append(int start, int end) {
+      if (count == starts.length) { // starts and ends always grow together
+        grow();
+      }
+      starts[count] = start;
+      ends[count] = end;
+      count++;
+    }
+
+    // Overflow-aware 1.5x growth: never wraps to a negative capacity, degrades to a clean
+    // OutOfMemoryError at the array-size ceiling instead of NegativeArraySizeException.
+    private void grow() {
+      int newCapacity = starts.length + (starts.length >> 1);
+      if (newCapacity < 0 || newCapacity > MAX_ARRAY_SIZE) {
+        newCapacity = MAX_ARRAY_SIZE;
+      }
+      if (newCapacity <= count) { // already at the ceiling: nothing left to grow into
+        throw new OutOfMemoryError("Alignment exceeds maximum size");
+      }
+      starts = Arrays.copyOf(starts, newCapacity);
+      ends = Arrays.copyOf(ends, newCapacity);
+    }
+  }
+}
diff --git a/opennlp-api/src/main/java/opennlp/tools/util/normalizer/CharClass.java b/opennlp-api/src/main/java/opennlp/tools/util/normalizer/CharClass.java
index 64e924b0c..a10751bc8 100644
--- a/opennlp-api/src/main/java/opennlp/tools/util/normalizer/CharClass.java
+++ b/opennlp-api/src/main/java/opennlp/tools/util/normalizer/CharClass.java
@@ -300,6 +300,175 @@ public String removeAll(CharSequence text) {
     return out.toString();
   }
 
+  /**
+   * Like {@link #normalize(CharSequence)} but also produces the {@link Alignment} back to the
+   * original text.
+   *
+   * @param text The text to normalize; must not be {@code null}.
+   * @return The normalized text and its alignment.
+   */
+  public AlignedText normalizeAligned(CharSequence text) {
+    Objects.requireNonNull(text, "text");
+    final StringBuilder out = new StringBuilder(text.length());
+    final Alignment.Builder alignment = new Alignment.Builder();
+    final int length = text.length();
+    int i = 0;
+    while (i < length) {
+      final int codePoint = Character.codePointAt(text, i);
+      final int charCount = Character.charCount(codePoint);
+      if (members.contains(codePoint)) {
+        out.appendCodePoint(replacement);
+        alignment.replace(charCount, Character.charCount(replacement)); // UTF-16 lengths may differ
+      } else {
+        out.appendCodePoint(codePoint);
+        alignment.equal(charCount);
+      }
+      i += charCount; // advance by whole code points so surrogate pairs stay intact
+    }
+    return new AlignedText(text, out.toString(), alignment.build(length));
+  }
+
+  /**
+   * Like {@link #collapse(CharSequence)} but also produces the {@link Alignment} back to the
+   * original text. Each collapsed run maps to the run's whole original extent.
+   *
+   * @param text The text to collapse; must not be {@code null}.
+   * @return The collapsed text and its alignment.
+   */
+  public AlignedText collapseAligned(CharSequence text) {
+    Objects.requireNonNull(text, "text");
+    final StringBuilder out = new StringBuilder(text.length());
+    final Alignment.Builder alignment = new Alignment.Builder();
+    final int length = text.length();
+    int i = 0;
+    while (i < length) {
+      final int codePoint = Character.codePointAt(text, i);
+      if (members.contains(codePoint)) {
+        final int runEnd = skipRun(text, i);
+        out.appendCodePoint(replacement);
+        alignment.replace(runEnd - i, Character.charCount(replacement)); // whole run, one block
+        i = runEnd;
+      } else {
+        final int charCount = Character.charCount(codePoint);
+        out.appendCodePoint(codePoint);
+        alignment.equal(charCount);
+        i += charCount;
+      }
+    }
+    return new AlignedText(text, out.toString(), alignment.build(length));
+  }
+
+  /**
+   * Like {@link #collapsePreserving(CharSequence, CodePointSet, int)} but also produces the
+   * {@link Alignment} back to the original text.
+   *
+   * @param text The text to collapse; must not be {@code null}.
+   * @param keep The member code points whose presence in a run preserves structure.
+   * @param keepReplacement The replacement emitted for a run that contains a {@code keep} member.
+   * @return The collapsed text and its alignment.
+   * @throws IllegalArgumentException Thrown if {@code keepReplacement} is not a valid code point.
+   */
+  public AlignedText collapsePreservingAligned(CharSequence text, CodePointSet keep,
+                                               int keepReplacement) {
+    Objects.requireNonNull(text, "text");
+    Objects.requireNonNull(keep, "keep");
+    requireValidCodePoint(keepReplacement);
+    final StringBuilder out = new StringBuilder(text.length());
+    final Alignment.Builder alignment = new Alignment.Builder();
+    final int length = text.length();
+    int i = 0;
+    while (i < length) {
+      final int codePoint = Character.codePointAt(text, i);
+      if (members.contains(codePoint)) {
+        boolean preserve = keep.contains(codePoint); // sticky: any keep member in the run sets it
+        int j = i + Character.charCount(codePoint);
+        while (j < length) { // extend j to just past the maximal run of members
+          final int next = Character.codePointAt(text, j);
+          if (!members.contains(next)) {
+            break;
+          }
+          preserve |= keep.contains(next);
+          j += Character.charCount(next);
+        }
+        final int emitted = preserve ? keepReplacement : replacement;
+        out.appendCodePoint(emitted);
+        alignment.replace(j - i, Character.charCount(emitted)); // whole run, one block
+        i = j;
+      } else {
+        final int charCount = Character.charCount(codePoint);
+        out.appendCodePoint(codePoint);
+        alignment.equal(charCount);
+        i += charCount;
+      }
+    }
+    return new AlignedText(text, out.toString(), alignment.build(length));
+  }
+
+  /**
+   * Like {@link #trim(CharSequence)} but also produces the {@link Alignment} back to the original
+   * text. The trimmed leading and trailing members appear as deletions, so a span never reports
+   * through them.
+   *
+   * @param text The text to trim; must not be {@code null}.
+   * @return The trimmed text and its alignment.
+   */
+  public AlignedText trimAligned(CharSequence text) {
+    Objects.requireNonNull(text, "text");
+    final int length = text.length();
+    int start = 0;
+    while (start < length) { // skip leading members
+      final int codePoint = Character.codePointAt(text, start);
+      if (!members.contains(codePoint)) {
+        break;
+      }
+      start += Character.charCount(codePoint);
+    }
+    int end = length;
+    while (end > start) { // skip trailing members, never backing up past start
+      final int codePoint = Character.codePointBefore(text, end);
+      if (!members.contains(codePoint)) {
+        break;
+      }
+      end -= Character.charCount(codePoint);
+    }
+    final Alignment.Builder alignment = new Alignment.Builder();
+    if (start > 0) {
+      alignment.replace(start, 0); // leading deletion
+    }
+    alignment.equal(end - start); // the retained middle maps one to one
+    if (end < length) {
+      alignment.replace(length - end, 0); // trailing deletion
+    }
+    return new AlignedText(text, text.subSequence(start, end).toString(), alignment.build(length));
+  }
+
+  /**
+   * Like {@link #removeAll(CharSequence)} but also produces the {@link Alignment} back to the
+   * original text. Every removed member appears as a deletion, so a span never reports through one.
+   *
+   * @param text The text to filter; must not be {@code null}.
+   * @return The filtered text and its alignment.
+   */
+  public AlignedText removeAllAligned(CharSequence text) {
+    Objects.requireNonNull(text, "text");
+    final StringBuilder out = new StringBuilder(text.length());
+    final Alignment.Builder alignment = new Alignment.Builder();
+    final int length = text.length();
+    int i = 0;
+    while (i < length) {
+      final int codePoint = Character.codePointAt(text, i);
+      final int charCount = Character.charCount(codePoint);
+      if (members.contains(codePoint)) {
+        alignment.replace(charCount, 0); // deletion: consumes input, emits nothing
+      } else {
+        out.appendCodePoint(codePoint);
+        alignment.equal(charCount);
+      }
+      i += charCount; // advance by whole code points so surrogate pairs stay intact
+    }
+    return new AlignedText(text, out.toString(), alignment.build(length));
+  }
+
   /**
    * Applies a per-code-point substitution: each code point for which {@code substitution} returns a
    * non-null string is replaced by that string, and the rest are copied through. This is the shared,
@@ -329,6 +498,37 @@ public static String substitute(CharSequence text, IntFunction<String> substitut
     return out.toString();
   }
 
+  /**
+   * Like {@link #substitute(CharSequence, IntFunction)} but also produces the {@link Alignment} back
+   * to the original text. Each replaced code point maps to its replacement string as one block.
+   *
+   * @param text         The text to transform; must not be {@code null}.
+   * @param substitution The replacement for a code point, or {@code null} to copy it through.
+   * @return The transformed text and its alignment.
+   */
+  public static AlignedText substituteAligned(CharSequence text, IntFunction<String> substitution) {
+    Objects.requireNonNull(text, "text");
+    Objects.requireNonNull(substitution, "substitution");
+    final StringBuilder out = new StringBuilder(text.length());
+    final Alignment.Builder alignment = new Alignment.Builder();
+    final int length = text.length();
+    int i = 0;
+    while (i < length) {
+      final int codePoint = Character.codePointAt(text, i);
+      final int charCount = Character.charCount(codePoint);
+      final String replacement = substitution.apply(codePoint);
+      if (replacement != null) {
+        out.append(replacement);
+        alignment.replace(charCount, replacement.length()); // an empty string records a deletion
+      } else {
+        out.appendCodePoint(codePoint);
+        alignment.equal(charCount);
+      }
+      i += charCount; // advance by whole code points so surrogate pairs stay intact
+    }
+    return new AlignedText(text, out.toString(), alignment.build(length));
+  }
+
   // Returns the offset just past the maximal run of members starting at runStart.
   private int skipRun(CharSequence text, int runStart) {
     final int length = text.length();
diff --git a/opennlp-api/src/main/java/opennlp/tools/util/normalizer/OffsetAwareNormalizer.java b/opennlp-api/src/main/java/opennlp/tools/util/normalizer/OffsetAwareNormalizer.java
new file mode 100644
index 000000000..e812d2864
--- /dev/null
+++ b/opennlp-api/src/main/java/opennlp/tools/util/normalizer/OffsetAwareNormalizer.java
@@ -0,0 +1,49 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.tools.util.normalizer;
+
+/**
+ * A {@link CharSequenceNormalizer} that can additionally report the {@link Alignment} from its
+ * normalized output back to the input, so a span found in the normalized text maps to the exact
+ * character offsets of the original.
+ *
+ * <p>Length-changing folds move offsets: collapsing a run of whitespace, folding a supplementary
+ * dash to one ASCII hyphen, or stripping invisible controls all shift every later character. A rung
+ * that performs such a fold over the cursor-based {@link CharClass} engine can record those edits
+ * and expose them through {@link #normalizeAligned(CharSequence)}. A rung that delegates to
+ * {@link java.text.Normalizer} (NFC/NFKC) or to a stemmer cannot report its edits, so it does not
+ * implement this interface; that is a deliberate capability split rather than an oversight.</p>
+ *
+ * <p>{@code TextNormalizer.Builder.buildAligned()} composes a chain of these into a single
+ * offset-aware pipeline whose {@link AlignedText} maps a match all the way back to the original
+ * input. An interface-typed caller tests for the capability
+ * ({@code normalizer instanceof OffsetAwareNormalizer}) instead of depending on a concrete rung,
+ * the same plain {@code instanceof} pattern used by
+ * {@link opennlp.tools.namefind.OffsetMappingNameFinder} rather than reflection.</p>
+ */
+public interface OffsetAwareNormalizer extends CharSequenceNormalizer {
+
+  /**
+   * Normalizes {@code text} and returns the result together with the {@link Alignment} back to the
+   * input. The normalized text is identical to the result of {@link #normalize(CharSequence)}: that is,
+   * {@code normalizeAligned(text).normalized()} equals {@code normalize(text).toString()}.
+   *
+   * @param text The {@link CharSequence} to normalize.
+   * @return The normalized text paired with its alignment to {@code text}.
+   */
+  AlignedText normalizeAligned(CharSequence text);
+}
diff --git a/opennlp-api/src/test/java/opennlp/tools/util/normalizer/AlignmentTest.java b/opennlp-api/src/test/java/opennlp/tools/util/normalizer/AlignmentTest.java
new file mode 100644
index 000000000..07c92de0f
--- /dev/null
+++ b/opennlp-api/src/test/java/opennlp/tools/util/normalizer/AlignmentTest.java
@@ -0,0 +1,258 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.tools.util.normalizer;
+
+import org.junit.jupiter.api.Test;
+
+import opennlp.tools.util.Span;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertThrows;
+
+public class AlignmentTest {
+
+  private static void assertSpan(int start, int end, Span span) {
+    assertEquals(start, span.getStart(), "start");
+    assertEquals(end, span.getEnd(), "end");
+  }
+
+  @Test
+  void testIdentityMapsOneToOne() {
+    final Alignment a = new Alignment.Builder().equal(3).build(3); // "abc" unchanged
+    assertEquals(3, a.normalizedLength());
+    assertEquals(3, a.originalLength());
+    assertSpan(0, 3, a.toOriginalSpan(0, 3)); // whole text
+    assertSpan(1, 2, a.toOriginalSpan(1, 2)); // "b"
+  }
+
+  @Test
+  void testCollapsedRunMapsToWholeRun() {
+    // "ab  " -> "ab " : keep "ab", collapse two spaces into one.
+    final Alignment a = new Alignment.Builder().equal(2).replace(2, 1).build(4);
+    assertSpan(0, 2, a.toOriginalSpan(0, 2)); // "ab"
+    assertSpan(2, 4, a.toOriginalSpan(2, 3)); // the collapsed space covers both originals
+    assertSpan(0, 4, a.toOriginalSpan(0, 3)); // whole text
+  }
+
+  @Test
+  void testInteriorDeletionDoesNotOverCover() {
+    // "a b c" -> "abc" : the two spaces are deleted. A per-character offset map over-covers here.
+    final Alignment a = new Alignment.Builder()
+        .equal(1).replace(1, 0).equal(1).replace(1, 0).equal(1).build(5);
+    assertEquals(3, a.normalizedLength());
+    assertEquals(5, a.originalLength());
+    assertSpan(0, 1, a.toOriginalSpan(0, 1)); // "a"
+    assertSpan(2, 3, a.toOriginalSpan(1, 2)); // "b" -> [2,3), NOT [2,4)
+    assertSpan(4, 5, a.toOriginalSpan(2, 3)); // "c"
+    assertSpan(0, 5, a.toOriginalSpan(0, 3)); // whole text
+  }
+
+  @Test
+  void testTrailingDeletionDoesNotOverCover() {
+    // "ab  " -> "ab" : strip trailing spaces. A match at the end must not absorb them.
+    final Alignment a = new Alignment.Builder().equal(2).replace(2, 0).build(4);
+    assertSpan(0, 2, a.toOriginalSpan(0, 2)); // "ab" -> [0,2), NOT [0,4)
+    assertSpan(1, 2, a.toOriginalSpan(1, 2)); // "b" -> [1,2)
+  }
+
+  @Test
+  void testExpansionSharesTheSingleSource() {
+    // "aßb" -> "assb" : the eszett expands to two characters that both come from it.
+    final Alignment a = new Alignment.Builder().equal(1).replace(1, 2).equal(1).build(3);
+    assertEquals(4, a.normalizedLength());
+    assertSpan(1, 2, a.toOriginalSpan(1, 3)); // "ss" -> the single "ß"
+    assertSpan(1, 2, a.toOriginalSpan(1, 2)); // first "s"
+    assertSpan(1, 2, a.toOriginalSpan(2, 3)); // second "s"
+    assertSpan(2, 3, a.toOriginalSpan(3, 4)); // "b"
+  }
+
+  @Test
+  void testReverseMappingAndDeletionsMapToEmptySpans() {
+    final Alignment a = new Alignment.Builder()
+        .equal(1).replace(1, 0).equal(1).replace(1, 0).equal(1).build(5); // "a b c" -> "abc"
+    assertSpan(1, 2, a.toNormalizedSpan(2, 3)); // original "b" -> normalized "b"
+    assertSpan(1, 1, a.toNormalizedSpan(1, 2)); // deleted space -> empty normalized span
+    assertSpan(0, 3, a.toNormalizedSpan(0, 5)); // whole original -> whole normalized
+  }
+
+  @Test
+  void testAndThenComposesTwoStages() {
+    // Stage 1: "a  b" -> "a b" (collapse two spaces). Stage 2: "a b" -> "a-b" (space to dash).
+    final Alignment whitespace = new Alignment.Builder().equal(1).replace(2, 1).equal(1).build(4);
+    final Alignment dash = new Alignment.Builder().equal(1).replace(1, 1).equal(1).build(3);
+    final Alignment composed = whitespace.andThen(dash);
+
+    assertEquals(4, composed.originalLength());
+    assertEquals(3, composed.normalizedLength());
+    assertSpan(0, 1, composed.toOriginalSpan(0, 1)); // "a"
+    assertSpan(1, 3, composed.toOriginalSpan(1, 2)); // "-" maps back to the original "  "
+    assertSpan(3, 4, composed.toOriginalSpan(2, 3)); // "b"
+    assertSpan(0, 4, composed.toOriginalSpan(0, 3));
+  }
+
+  @Test
+  void testAndThenRejectsMismatchedStages() {
+    final Alignment first = new Alignment.Builder().equal(2).build(2);  // normalizedLength 2
+    final Alignment second = new Alignment.Builder().equal(3).build(3); // originalLength 3
+    assertThrows(IllegalArgumentException.class, () -> first.andThen(second));
+  }
+
+  @Test
+  void testAllDeletedProducesEmptyNormalized() {
+    final Alignment a = new Alignment.Builder().replace(2, 0).build(2); // "  " -> ""
+    assertEquals(0, a.normalizedLength());
+    assertEquals(2, a.originalLength());
+    assertSpan(0, 0, a.toNormalizedSpan(0, 2)); // all original deleted -> empty normalized span
+  }
+
+  @Test
+  void testBuilderRejectsWrongOriginalLength() {
+    assertThrows(IllegalStateException.class, () -> new Alignment.Builder().equal(2).build(3));
+  }
+
+  @Test
+  void testBuilderRejectsNegativeCounts() {
+    assertThrows(IllegalArgumentException.class, () -> new Alignment.Builder().equal(-1));
+    assertThrows(IllegalArgumentException.class, () -> new Alignment.Builder().replace(-1, 0));
+  }
+
+  @Test
+  void testToOriginalSpanRejectsOutOfRange() {
+    final Alignment a = new Alignment.Builder().equal(2).build(2);
+    assertThrows(IndexOutOfBoundsException.class, () -> a.toOriginalSpan(-1, 1)); // start < 0
+    assertThrows(IndexOutOfBoundsException.class, () -> a.toOriginalSpan(0, 3)); // end > length 2
+    assertThrows(IndexOutOfBoundsException.class, () -> a.toOriginalSpan(2, 1)); // start > end
+  }
+
+  @Test
+  void testToOriginalOffsetConvenience() {
+    final Alignment a = new Alignment.Builder().equal(2).replace(2, 1).build(4); // "ab  "->"ab "
+    assertEquals(0, a.toOriginalOffset(0));
+    assertEquals(2, a.toOriginalOffset(2)); // start of the collapsed space
+    assertEquals(4, a.toOriginalOffset(3)); // end sentinel -> original length
+  }
+
+  @Test
+  void testBuilderGrowsBeyondInitialCapacity() {
+    // 20 equal chars force the builder past its initial 16-entry buffers (exercises grow()).
+    final Alignment a = new Alignment.Builder().equal(20).build(20);
+    assertEquals(20, a.normalizedLength());
+    assertEquals(20, a.originalLength());
+    assertSpan(0, 20, a.toOriginalSpan(0, 20)); // whole text
+    assertSpan(17, 18, a.toOriginalSpan(17, 18)); // an entry past index 16 still maps one-to-one
+  }
+
+  @Test
+  void testAndThenChainsThreeStages() {
+    // "a  b" -> "a b" (collapse) -> "a-b" (space->dash) -> "a_b" (dash->underscore).
+    final Alignment s1 = new Alignment.Builder().equal(1).replace(2, 1).equal(1).build(4);
+    final Alignment s2 = new Alignment.Builder().equal(1).replace(1, 1).equal(1).build(3);
+    final Alignment s3 = new Alignment.Builder().equal(1).replace(1, 1).equal(1).build(3);
+    final Alignment composed = s1.andThen(s2).andThen(s3);
+
+    assertEquals(4, composed.originalLength());
+    assertEquals(3, composed.normalizedLength());
+    assertSpan(0, 1, composed.toOriginalSpan(0, 1)); // a
+    assertSpan(1, 3, composed.toOriginalSpan(1, 2)); // "_" maps all the way back to the "  "
+    assertSpan(3, 4, composed.toOriginalSpan(2, 3)); // b
+  }
+
+  @Test
+  void testAndThenHandlesLeadingInsertionInNextStage() {
+    // Exercises the andThen branch where the next stage's character covers zero middle characters
+    // at offset 0 (a leading insertion: originalEnd == 0). The result must be a zero-width original
+    // span at 0, and the rest of the mapping must stay correct.
+    final Alignment first = new Alignment.Builder().equal(2).build(2);            // "ab" unchanged
+    final Alignment next = new Alignment.Builder().replace(0, 1).equal(2).build(2); // "ab" -> "Xab"
+    final Alignment composed = first.andThen(next);
+
+    assertEquals(2, composed.originalLength());
+    assertEquals(3, composed.normalizedLength());
+    assertSpan(0, 0, composed.toOriginalSpan(0, 1)); // inserted "X" -> zero-width span at original 0
+    assertSpan(0, 1, composed.toOriginalSpan(1, 2)); // "a"
+    assertSpan(1, 2, composed.toOriginalSpan(2, 3)); // "b"
+    assertSpan(0, 2, composed.toOriginalSpan(0, 3)); // whole normalized -> whole original
+  }
+
+  @Test
+  void testAndThenHandlesInteriorInsertionInCopiedRegion() {
+    // An insertion in the next stage that is NOT at offset 0 and lands in a one-to-one (copied)
+    // region must still map to a zero-width original span at the insertion point: the andThen branch
+    // where middleStart == middleEnd with middleEnd > 0. Without correct handling this is exactly the
+    // case that would misattribute the inserted character to a neighbouring original character.
+    final Alignment first = new Alignment.Builder().equal(3).build(3);                       // "abc"
+    final Alignment next = new Alignment.Builder().equal(1).replace(0, 1).equal(2).build(3); // "abc"->"aXbc"
+    final Alignment composed = first.andThen(next);
+
+    assertEquals(3, composed.originalLength());
+    assertEquals(4, composed.normalizedLength());
+    assertSpan(0, 1, composed.toOriginalSpan(0, 1)); // "a"
+    assertSpan(1, 1, composed.toOriginalSpan(1, 2)); // inserted "X" -> zero-width span at original 1
+    assertSpan(1, 2, composed.toOriginalSpan(2, 3)); // "b"
+    assertSpan(2, 3, composed.toOriginalSpan(3, 4)); // "c"
+    assertSpan(0, 3, composed.toOriginalSpan(0, 4)); // whole normalized -> whole original
+  }
+
+  @Test
+  void testAndThenInsertionInsideExpansionStaysConsistent() {
+    // The hard case: stage 1 expands "ss" from one original character, then stage 2 inserts a
+    // character BETWEEN the two produced characters. The two halves of an expansion share one atomic
+    // original block ([1, 2)), which has no interior offset, so the inserted character is attributed
+    // to that whole block rather than a zero-width point. That is the only mapping that keeps
+    // originalStart/originalEnd sorted, so BOTH directions still resolve correctly -- a zero-width
+    // mapping here would push originalEnd below its predecessor and corrupt the reverse search.
+    // stage 1: "aXb" -> "assb" (X expands to "ss"); stage 2: "assb" -> "asYsb" (insert Y between).
+    final Alignment expand = new Alignment.Builder().equal(1).replace(1, 2).equal(1).build(3);
+    final Alignment insert = new Alignment.Builder().equal(2).replace(0, 1).equal(2).build(4);
+    final Alignment composed = expand.andThen(insert);
+
+    assertEquals(3, composed.originalLength());
+    assertEquals(5, composed.normalizedLength());
+    assertSpan(0, 1, composed.toOriginalSpan(0, 1)); // "a"
+    assertSpan(1, 2, composed.toOriginalSpan(1, 2)); // first "s" -> the expanded original char
+    assertSpan(1, 2, composed.toOriginalSpan(2, 3)); // inserted char -> attributed to the atomic block
+    assertSpan(1, 2, composed.toOriginalSpan(3, 4)); // second "s" -> the expanded original char
+    assertSpan(2, 3, composed.toOriginalSpan(4, 5)); // "b"
+    assertSpan(0, 3, composed.toOriginalSpan(0, 5)); // whole normalized -> whole original
+
+    // Reverse direction stays correct because the start/end arrays remain sorted: the expanded
+    // original character maps to its full normalized footprint (the two halves plus the insertion).
+    assertSpan(1, 4, composed.toNormalizedSpan(1, 2)); // expanded char -> "sYs"
+    assertSpan(0, 1, composed.toNormalizedSpan(0, 1)); // "a"
+    assertSpan(4, 5, composed.toNormalizedSpan(2, 3)); // "b"
+  }
+
+  @Test
+  void testToNormalizedSpanDoesNotOverCoverAcrossDeletions() {
+    // "a  b" -> "ab" : the two interior spaces are deleted. Mapping an original span that ends
+    // inside the deleted run must stop at the last kept character, not over-cover into "b".
+    final Alignment a = new Alignment.Builder().equal(1).replace(2, 0).equal(1).build(4);
+    assertEquals(2, a.normalizedLength());
+    assertSpan(0, 1, a.toNormalizedSpan(0, 3)); // "a" plus the two deleted spaces -> just "a"
+    assertSpan(1, 1, a.toNormalizedSpan(1, 3)); // only the deleted spaces -> empty normalized span
+    assertSpan(0, 2, a.toNormalizedSpan(0, 4)); // whole original -> whole normalized
+    assertSpan(1, 2, a.toNormalizedSpan(3, 4)); // "b"
+  }
+
+  @Test
+  void testToNormalizedSpanAcrossExpansion() {
+    final Alignment a = new Alignment.Builder().equal(1).replace(1, 2).equal(1).build(3); // ß->ss
+    assertSpan(1, 3, a.toNormalizedSpan(1, 2)); // original "ß" -> the two-char "ss"
+    assertSpan(0, 1, a.toNormalizedSpan(0, 1)); // a
+    assertSpan(3, 4, a.toNormalizedSpan(2, 3)); // b
+  }
+}
diff --git a/opennlp-api/src/test/java/opennlp/tools/util/normalizer/CharClassTest.java b/opennlp-api/src/test/java/opennlp/tools/util/normalizer/CharClassTest.java
index 76911a34d..052350d12 100644
--- a/opennlp-api/src/test/java/opennlp/tools/util/normalizer/CharClassTest.java
+++ b/opennlp-api/src/test/java/opennlp/tools/util/normalizer/CharClassTest.java
@@ -212,4 +212,187 @@ void testOfRejectsInvalidReplacement() {
         () -> CharClass.of(CodePointSet.of(0x20), Character.MAX_CODE_POINT + 1));
   }
 
+  // --- aligned variants (Alignment / AlignedText) ------------------------------------------
+
+  private static void assertSpan(int start, int end, Span span) {
+    assertEquals(start, span.getStart(), "start");
+    assertEquals(end, span.getEnd(), "end");
+  }
+
+  @Test
+  void testCollapseAlignedMapsRunToWholeExtent() {
+    final AlignedText at = WS.collapseAligned("a  b");
+    assertEquals("a b", at.normalized());
+    assertSpan(0, 1, at.toOriginalSpan(0, 1)); // a
+    assertSpan(1, 3, at.toOriginalSpan(1, 2)); // the collapsed space covers both originals
+    assertSpan(3, 4, at.toOriginalSpan(2, 3)); // b
+  }
+
+  @Test
+  void testRemoveAllAlignedDoesNotOverCover() {
+    final AlignedText at = WS.removeAllAligned("a b c");
+    assertEquals("abc", at.normalized());
+    assertSpan(2, 3, at.toOriginalSpan(1, 2)); // "b" -> [2,3), not [2,4)
+    assertSpan(0, 5, at.toOriginalSpan(0, 3)); // whole text
+  }
+
+  @Test
+  void testTrimAlignedDropsEdgesWithoutOverCovering() {
+    final AlignedText at = WS.trimAligned("  ab  ");
+    assertEquals("ab", at.normalized());
+    assertEquals(6, at.alignment().originalLength());
+    assertSpan(2, 4, at.toOriginalSpan(0, 2)); // "ab" sits at original [2,4)
+    assertSpan(3, 4, at.toOriginalSpan(1, 2)); // "b"
+  }
+
+  @Test
+  void testCollapsePreservingAlignedKeepsLineBreak() {
+    final AlignedText at = WS.collapsePreservingAligned("a\n\n\t\tb", lineBreaks(), '\n');
+    assertEquals("a\nb", at.normalized());
+    assertSpan(1, 5, at.toOriginalSpan(1, 2)); // the preserved newline covers the whole run
+  }
+
+  @Test
+  void testNormalizeAlignedAcrossSupplementaryDash() {
+    final AlignedText at = DASH.normalizeAligned("x" + YEZIDI_HYPHEN + "y");
+    assertEquals("x-y", at.normalized());
+    assertSpan(0, 1, at.toOriginalSpan(0, 1)); // x
+    assertSpan(1, 3, at.toOriginalSpan(1, 2)); // "-" maps back to the two-char Yezidi hyphen
+    assertSpan(3, 4, at.toOriginalSpan(2, 3)); // y
+  }
+
+  // --- aligned edge cases (restore + extend the deleted *Mapped coverage) ------------------
+
+  @Test
+  void testCollapseAlignedAcrossMixedUnicodeWhitespaceRun() {
+    final AlignedText at = WS.collapseAligned("a" + NBSP + IDEOGRAPHIC + cp(0x2002) + "b");
+    assertEquals("a b", at.normalized());
+    assertSpan(1, 4, at.toOriginalSpan(1, 2)); // the one space covers the three-char ws run
+    assertSpan(4, 5, at.toOriginalSpan(2, 3)); // b
+  }
+
+  @Test
+  void testCollapseAlignedAcrossTabRun() {
+    final AlignedText at = WS.collapseAligned("a\t\t\t\t\tb");
+    assertEquals("a b", at.normalized());
+    assertSpan(1, 6, at.toOriginalSpan(1, 2)); // five tabs collapse to one space
+    assertSpan(6, 7, at.toOriginalSpan(2, 3)); // b
+  }
+
+  @Test
+  void testCollapseAlignedAcrossNewlineRun() {
+    final AlignedText at = WS.collapseAligned("a\r\n\tb");
+    assertEquals("a b", at.normalized());
+    assertSpan(1, 4, at.toOriginalSpan(1, 2)); // CR, LF and tab collapse to one space
+  }
+
+  @Test
+  void testCollapseAlignedEmptySingleAndAllWhitespace() {
+    assertEquals("", WS.collapseAligned("").normalized());
+    assertEquals(0, WS.collapseAligned("").alignment().normalizedLength());
+
+    final AlignedText single = WS.collapseAligned("a");
+    assertEquals("a", single.normalized());
+    assertSpan(0, 1, single.toOriginalSpan(0, 1));
+
+    final AlignedText allWs = WS.collapseAligned("\t\t\t");
+    assertEquals(" ", allWs.normalized()); // all whitespace collapses to one space, not empty
+    assertSpan(0, 3, allWs.toOriginalSpan(0, 1));
+  }
+
+  @Test
+  void testCollapseAlignedKeepsSurrogatePairOffsets() {
+    final AlignedText at = WS.collapseAligned(GRINNING_FACE + "\t\tb");
+    assertEquals(GRINNING_FACE + " b", at.normalized());
+    assertSpan(0, 2, at.toOriginalSpan(0, 2)); // the emoji occupies two original chars
+    assertSpan(2, 4, at.toOriginalSpan(2, 3)); // the collapsed tabs
+    assertSpan(4, 5, at.toOriginalSpan(3, 4)); // b
+  }
+
+  @Test
+  void testNormalizeAlignedIsIdentityWhenNothingMatches() {
+    final AlignedText at = WS.normalizeAligned("abc");
+    assertEquals("abc", at.normalized());
+    for (int i = 0; i < 3; i++) {
+      assertSpan(i, i + 1, at.toOriginalSpan(i, i + 1)); // each char maps to itself
+    }
+  }
+
+  @Test
+  void testNormalizeAlignedPreservesSupplementaryNonMember() {
+    final AlignedText at = WS.normalizeAligned("a" + GRINNING_FACE + "b");
+    assertEquals("a" + GRINNING_FACE + "b", at.normalized());
+    assertSpan(1, 3, at.toOriginalSpan(1, 3)); // the emoji passes through unchanged
+  }
+
+  @Test
+  void testNormalizeAlignedExpandsToSupplementaryReplacement() {
+    // A BMP member replaced by a supplementary code point grows by one char (1 -> 2).
+    final CharClass toPenguin = CharClass.of(CodePointSet.of(' '), 0x1F427);
+    final AlignedText at = toPenguin.normalizeAligned("a b");
+    assertEquals("a" + cp(0x1F427) + "b", at.normalized());
+    assertSpan(0, 1, at.toOriginalSpan(0, 1)); // a
+    assertSpan(1, 2, at.toOriginalSpan(1, 3)); // both penguin halves come from the one space
+    assertSpan(2, 3, at.toOriginalSpan(3, 4)); // b
+  }
+
+  @Test
+  void testRemoveAllAlignedLeadingAndTrailingDeletions() {
+    final AlignedText at = WS.removeAllAligned(" a b ");
+    assertEquals("ab", at.normalized());
+    assertSpan(1, 2, at.toOriginalSpan(0, 1)); // a (leading space deleted)
+    assertSpan(3, 4, at.toOriginalSpan(1, 2)); // b (trailing space deleted, not over-covered)
+  }
+
+  @Test
+  void testTrimAlignedAllWhitespaceIsEmpty() {
+    final AlignedText at = WS.trimAligned("\t\t");
+    assertEquals("", at.normalized()); // nothing but whitespace, so everything is trimmed
+    assertEquals(0, at.alignment().normalizedLength());
+    assertEquals(2, at.alignment().originalLength()); // the two tabs still count as original
+  }
+
+  @Test
+  void testCollapsePreservingAlignedRunWithoutKeepCollapsesToReplacement() {
+    final AlignedText at = WS.collapsePreservingAligned("a \t b", lineBreaks(), '\n');
+    assertEquals("a b", at.normalized()); // no line break in the run -> plain space
+    assertSpan(1, 4, at.toOriginalSpan(1, 2));
+  }
+
+  // Every aligned operation must produce exactly the same string as its plain counterpart; only the
+  // alignment is extra. This pins that contract across a battery of inputs so the two code paths
+  // cannot drift apart.
+  @Test
+  void testAlignedOperationsAgreeWithPlainOutput() {
+    final CodePointSet keep = lineBreaks();
+    final String[] inputs = {
+        "",
+        "abc",
+        "  a b  ",
+        "a" + NBSP + IDEOGRAPHIC + "b",
+        "a\t\t\t\t\tb",
+        "a\r\n\tb",
+        "\n\nabc",
+        "  ",
+        GRINNING_FACE + "\t\tb",
+        "x" + YEZIDI_HYPHEN + YEZIDI_HYPHEN + "y",
+        "well" + EM_DASH + EN_DASH + "known",
+        "5" + MINUS_SIGN + "3",
+    };
+    for (final CharClass charClass : new CharClass[] {WS, DASH}) {
+      for (final String input : inputs) {
+        assertEquals(charClass.normalize(input), charClass.normalizeAligned(input).normalized(),
+            "normalize vs normalizeAligned for [" + input + "]");
+        assertEquals(charClass.collapse(input), charClass.collapseAligned(input).normalized(),
+            "collapse vs collapseAligned for [" + input + "]");
+        assertEquals(charClass.trim(input), charClass.trimAligned(input).normalized(),
+            "trim vs trimAligned for [" + input + "]");
+        assertEquals(charClass.removeAll(input), charClass.removeAllAligned(input).normalized(),
+            "removeAll vs removeAllAligned for [" + input + "]");
+        assertEquals(charClass.collapsePreserving(input, keep, '\n'),
+            charClass.collapsePreservingAligned(input, keep, '\n').normalized(),
+            "collapsePreserving vs collapsePreservingAligned for [" + input + "]");
+      }
+    }
+  }
 }
diff --git a/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/AlignedAggregateCharSequenceNormalizer.java b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/AlignedAggregateCharSequenceNormalizer.java
new file mode 100644
index 000000000..f57ddc29b
--- /dev/null
+++ b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/AlignedAggregateCharSequenceNormalizer.java
@@ -0,0 +1,67 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.tools.util.normalizer;
+
+/**
+ * An {@link OffsetAwareNormalizer} that applies a chain of offset-aware rungs in order and composes
+ * their per-stage {@link Alignment}s with {@link Alignment#andThen(Alignment)}, so the result maps a
+ * span found in the fully normalized text back to the original input through every stage.
+ *
+ * <p>Produced by {@code TextNormalizer.Builder.buildAligned()}, which validates that every rung is
+ * offset-aware before constructing this.</p>
+ */
+final class AlignedAggregateCharSequenceNormalizer implements OffsetAwareNormalizer {
+
+  private static final long serialVersionUID = 3056944120186103477L;
+
+  private final OffsetAwareNormalizer[] steps;
+
+  AlignedAggregateCharSequenceNormalizer(OffsetAwareNormalizer[] steps) {
+    // Defensive copy: a later write to the caller's array must not alter this pipeline.
+    this.steps = steps.clone();
+  }
+
+  @Override
+  public CharSequence normalize(CharSequence text) {
+    CharSequence result = text;
+    for (final OffsetAwareNormalizer step : steps) {
+      result = step.normalize(result);
+    }
+    return result;
+  }
+
+  @Override
+  public AlignedText normalizeAligned(CharSequence text) {
+    // Convert the input to a String once so the stored original and the per-stage alignment
+    // lengths agree even for a CharSequence whose length() differs from its toString().
+    final String input = text.toString();
+    if (steps.length == 0) {
+      // Identity pipeline: the one String is both the original and the normalized text.
+      return new AlignedText(input, input,
+          new Alignment.Builder().equal(input.length()).build(input.length()));
+    }
+    AlignedText stage = steps[0].normalizeAligned(input);
+    Alignment alignment = stage.alignment();
+    // Feed each later rung the previous stage's output and fold its alignment into the total.
+    for (int i = 1; i < steps.length; i++) {
+      final AlignedText next = steps[i].normalizeAligned(stage.normalized());
+      alignment = alignment.andThen(next.alignment());
+      stage = next;
+    }
+    return new AlignedText(input, stage.normalized(), alignment);
+  }
+}
diff --git a/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/BulletCharSequenceNormalizer.java b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/BulletCharSequenceNormalizer.java
index 9d1d63304..84476bf81 100644
--- a/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/BulletCharSequenceNormalizer.java
+++ b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/BulletCharSequenceNormalizer.java
@@ -25,7 +25,7 @@
  * because it is a letter in Catalan ({@code l..l}) and other orthographies; only characters that
  * are unambiguously list bullets are replaced.</p>
  */
-public class BulletCharSequenceNormalizer implements CharSequenceNormalizer {
+public class BulletCharSequenceNormalizer implements OffsetAwareNormalizer {
 
   private static final long serialVersionUID = 5521093348871625541L;
 
@@ -49,4 +49,8 @@ public CharSequence normalize(CharSequence text) {
     return BULLETS.normalize(text);
   }
 
+  @Override
+  public AlignedText normalizeAligned(CharSequence text) {
+    return BULLETS.normalizeAligned(text);
+  }
 }
diff --git a/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/DashCharSequenceNormalizer.java b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/DashCharSequenceNormalizer.java
index 21c25873b..308c4cfaf 100644
--- a/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/DashCharSequenceNormalizer.java
+++ b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/DashCharSequenceNormalizer.java
@@ -25,7 +25,7 @@
  * regardless of which dash the source used. The mathematical minus signs are left untouched by
  * default, and {@code U+00AD} SOFT HYPHEN (a format character) is not treated as a dash.</p>
  */
-public class DashCharSequenceNormalizer implements CharSequenceNormalizer {
+public class DashCharSequenceNormalizer implements OffsetAwareNormalizer {
 
   private static final long serialVersionUID = 6620885194730155303L;
 
@@ -43,4 +43,8 @@ public CharSequence normalize(CharSequence text) {
     return DASHES.normalize(text);
   }
 
+  @Override
+  public AlignedText normalizeAligned(CharSequence text) {
+    return DASHES.normalizeAligned(text);
+  }
 }
diff --git a/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/DigitCharSequenceNormalizer.java b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/DigitCharSequenceNormalizer.java
index 10bb882fe..68039c1ab 100644
--- a/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/DigitCharSequenceNormalizer.java
+++ b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/DigitCharSequenceNormalizer.java
@@ -26,7 +26,7 @@
  * left unchanged. Scanning is a single O(1)-per-code-point cursor pass with no regular
  * expression.</p>
  */
-public class DigitCharSequenceNormalizer implements CharSequenceNormalizer {
+public class DigitCharSequenceNormalizer implements OffsetAwareNormalizer {
 
   private static final long serialVersionUID = 8451270936618204413L;
 
@@ -48,4 +48,8 @@ private static String toAscii(int codePoint) {
     return value >= 0 ? String.valueOf((char) ('0' + value)) : null;
   }
 
+  @Override
+  public AlignedText normalizeAligned(CharSequence text) {
+    return CharClass.substituteAligned(text, DigitCharSequenceNormalizer::toAscii);
+  }
 }
diff --git a/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/EllipsisCharSequenceNormalizer.java b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/EllipsisCharSequenceNormalizer.java
index e4971aa40..e5c692d73 100644
--- a/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/EllipsisCharSequenceNormalizer.java
+++ b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/EllipsisCharSequenceNormalizer.java
@@ -24,7 +24,7 @@
  * <p>Scanning is a single O(1)-per-code-point cursor pass with no regular expression. ASCII dot
  * runs are left unchanged.</p>
  */
-public class EllipsisCharSequenceNormalizer implements CharSequenceNormalizer {
+public class EllipsisCharSequenceNormalizer implements OffsetAwareNormalizer {
 
   private static final long serialVersionUID = 2298647015583729167L;
 
@@ -41,6 +41,10 @@ public CharSequence normalize(CharSequence text) {
     return CharClass.substitute(text, EllipsisCharSequenceNormalizer::expansion);
   }
 
+  @Override
+  public AlignedText normalizeAligned(CharSequence text) {
+    return CharClass.substituteAligned(text, EllipsisCharSequenceNormalizer::expansion);
+  }
 
   // The ASCII expansion for an ellipsis or leader code point, or null to copy the code point through.
   private static String expansion(int codePoint) {
diff --git a/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/GermanUmlautCharSequenceNormalizer.java b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/GermanUmlautCharSequenceNormalizer.java
index 79d4e71b7..d4c2c4645 100644
--- a/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/GermanUmlautCharSequenceNormalizer.java
+++ b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/GermanUmlautCharSequenceNormalizer.java
@@ -32,7 +32,7 @@
  * (for example {@code a} + U+0308) is not a member and passes through unchanged, so apply NFC
  * composition first if the input may contain decomposed forms.</p>
  */
-public class GermanUmlautCharSequenceNormalizer implements CharSequenceNormalizer {
+public class GermanUmlautCharSequenceNormalizer implements OffsetAwareNormalizer {
 
   private static final long serialVersionUID = 7106934482250176835L;
 
@@ -61,6 +61,11 @@ public CharSequence normalize(CharSequence text) {
     return CharClass.substitute(text, GermanUmlautCharSequenceNormalizer::expansion);
   }
 
+  @Override
+  public AlignedText normalizeAligned(CharSequence text) {
+    return CharClass.substituteAligned(text, GermanUmlautCharSequenceNormalizer::expansion);
+  }
+
   // The DIN 5007-2 transliteration for an umlaut or eszett, or null to copy the code point through.
   // All members are in the BMP, so a code point equals its char; supplementary code points miss every
   // case and pass through.
diff --git a/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/InvisibleCharSequenceNormalizer.java b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/InvisibleCharSequenceNormalizer.java
index 5e0465f73..91c7f7c75 100644
--- a/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/InvisibleCharSequenceNormalizer.java
+++ b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/InvisibleCharSequenceNormalizer.java
@@ -28,7 +28,7 @@
  * sequences; so are variation selectors. Use this only for a matching/search form, not for
  * display.</p>
  */
-public class InvisibleCharSequenceNormalizer implements CharSequenceNormalizer {
+public class InvisibleCharSequenceNormalizer implements OffsetAwareNormalizer {
 
   private static final long serialVersionUID = 4837512098664301927L;
 
@@ -69,4 +69,8 @@ public CharSequence normalize(CharSequence text) {
     return INVISIBLE.removeAll(text);
   }
 
+  @Override
+  public AlignedText normalizeAligned(CharSequence text) { // same removal as normalize()
+    return INVISIBLE.removeAllAligned(text);
+  }
 }
diff --git a/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/LineBreakPreservingWhitespaceCharSequenceNormalizer.java b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/LineBreakPreservingWhitespaceCharSequenceNormalizer.java
new file mode 100644
index 000000000..ec198fda1
--- /dev/null
+++ b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/LineBreakPreservingWhitespaceCharSequenceNormalizer.java
@@ -0,0 +1,72 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.tools.util.normalizer;
+
+/**
+ * An {@link OffsetAwareNormalizer} that collapses each run of Unicode whitespace to one character,
+ * as {@link WhitespaceCharSequenceNormalizer} does, while keeping line and paragraph structure: a
+ * run that contains a line break becomes a single newline ({@code U+000A}) instead of a space.
+ * Leading and trailing whitespace is then trimmed.
+ *
+ * <p>Use it for readable snippets and display text. Horizontal runs of spaces and tabs shrink to
+ * one space, while a blank line between paragraphs survives as one newline instead of dissolving
+ * into the surrounding text. The collapse is done by the cursor based
+ * {@link CharClass#collapsePreserving(CharSequence, CodePointSet, int)} engine, so the full
+ * Unicode {@code White_Space} set is recognized without a regular expression.</p>
+ */
+public class LineBreakPreservingWhitespaceCharSequenceNormalizer implements OffsetAwareNormalizer {
+
+  private static final long serialVersionUID = 5471829006633512874L;
+
+  // The whitespace class from CharClass; it drives both the collapse and the trim.
+  private static final CharClass SPACE = CharClass.whitespace();
+
+  // The replacement written for a run that contains a line break: one line feed (U+000A).
+  private static final int LINE_FEED = '\n';
+
+  // The Unicode mandatory break code points (UAX #14 classes BK/CR/LF/NL). A whitespace run that
+  // holds any one of them collapses to LINE_FEED instead of a space, which is how line and
+  // paragraph structure survives while purely horizontal runs are squeezed. The hex values are
+  // grouped by range below, with each group's members named in its trailing comment.
+  private static final CodePointSet BREAKS = CodePointSet.of(
+      0x000A, 0x000B, 0x000C, 0x000D,   // line feed, vertical tab, form feed, carriage return
+      0x0085,                           // next line
+      0x2028, 0x2029);                  // line separator, paragraph separator
+
+  private static final LineBreakPreservingWhitespaceCharSequenceNormalizer SHARED =
+      new LineBreakPreservingWhitespaceCharSequenceNormalizer();
+
+  /** {@return the shared instance; this normalizer declares no instance state} */
+  public static LineBreakPreservingWhitespaceCharSequenceNormalizer getInstance() {
+    return SHARED;
+  }
+
+  @Override
+  public CharSequence normalize(CharSequence text) {
+    final CharSequence squeezed = SPACE.collapsePreserving(text, BREAKS, LINE_FEED);
+    return SPACE.trim(squeezed);
+  }
+
+  @Override
+  public AlignedText normalizeAligned(CharSequence text) {
+    // Same two passes as normalize(); the second alignment is relative to the first pass's output.
+    final AlignedText squeezed = SPACE.collapsePreservingAligned(text, BREAKS, LINE_FEED);
+    final AlignedText edged = SPACE.trimAligned(squeezed.normalized());
+    final Alignment endToEnd = squeezed.alignment().andThen(edged.alignment());
+    return new AlignedText(text, edged.normalized(), endToEnd);
+  }
+}
diff --git a/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/QuoteCharSequenceNormalizer.java b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/QuoteCharSequenceNormalizer.java
index ec86e4fa6..f4551d21d 100644
--- a/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/QuoteCharSequenceNormalizer.java
+++ b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/QuoteCharSequenceNormalizer.java
@@ -25,7 +25,7 @@
  * two {@link CharClass} sets, so membership is O(1) and scanning is a single cursor pass with no
  * regular expression. ASCII quotes are left unchanged.</p>
  */
-public class QuoteCharSequenceNormalizer implements CharSequenceNormalizer {
+public class QuoteCharSequenceNormalizer implements OffsetAwareNormalizer {
 
   private static final long serialVersionUID = 3415829076651283471L;
 
@@ -67,4 +67,11 @@ public CharSequence normalize(CharSequence text) {
     return DOUBLE.normalize(SINGLE.normalize(text));
   }
 
+  @Override
+  public AlignedText normalizeAligned(CharSequence text) {
+    // Same order as normalize(): the SINGLE fold runs first, the DOUBLE fold runs on its output.
+    final AlignedText ones = SINGLE.normalizeAligned(text);
+    final AlignedText twos = DOUBLE.normalizeAligned(ones.normalized());
+    return new AlignedText(text, twos.normalized(), ones.alignment().andThen(twos.alignment()));
+  }
 }
diff --git a/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/TextNormalizer.java b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/TextNormalizer.java
index c7d877ecc..a5f1bb8de 100644
--- a/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/TextNormalizer.java
+++ b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/TextNormalizer.java
@@ -93,6 +93,15 @@ public Builder whitespace() {
       return add(Dimension.WHITESPACE.defaultNormalizer());
     }
 
+    /**
+     * {@return this builder with whitespace collapsing that preserves line and paragraph breaks
+     * appended} Horizontal runs collapse to one space, a run containing a line break collapses to
+     * one newline, and the ends are trimmed; the rung is accepted by {@link #buildAligned()}.
+     */
+    public Builder whitespacePreservingLineBreaks() {
+      return add(LineBreakPreservingWhitespaceCharSequenceNormalizer.getInstance());
+    }
+
     /** {@return this builder with quotation-mark folding appended} */
     public Builder quotes() {
       return add(QuoteCharSequenceNormalizer.getInstance());
@@ -143,6 +152,38 @@ public CharSequenceNormalizer build() {
       return new AggregateCharSequenceNormalizer(steps.toArray(new CharSequenceNormalizer[0]));
     }
 
+    /**
+     * {@return an offset-aware composition of the rungs added so far}
+     *
+     * <p>Only rungs that implement {@link OffsetAwareNormalizer} can take part. The per-code-point
+     * folds do; the folds that delegate to {@link java.text.Normalizer} or to JDK case mapping
+     * (NFC, NFKC, accent folding, confusable folding, and case folding) have no way to report
+     * their per-character edits, so a builder holding one of them is rejected here. On the
+     * returned normalizer, {@link OffsetAwareNormalizer#normalizeAligned(CharSequence)} maps a span
+     * found in the fully normalized text back through every stage to the original input, so a
+     * match in a normalized document reports its true offsets in the source.</p>
+     *
+     * @throws IllegalStateException Thrown if any rung is not an {@link OffsetAwareNormalizer}
+     *     (for example NFC, NFKC, accent folding, confusable folding, or case folding, which
+     *     delegate to {@link java.text.Normalizer} or to JDK case mapping); the message names the
+     *     offending rung's 0-based index and class.
+     */
+    public OffsetAwareNormalizer buildAligned() {
+      final OffsetAwareNormalizer[] offsetAware = new OffsetAwareNormalizer[steps.size()];
+      int index = 0;
+      for (final CharSequenceNormalizer rung : steps) {
+        if (!(rung instanceof OffsetAwareNormalizer)) {
+          throw new IllegalStateException("rung at 0-based index " + index + " ("
+              + rung.getClass().getName() + ") is not offset-aware and cannot be composed into "
+              + "an aligned pipeline; the per-code-point folds report an alignment, while folds that "
+              + "delegate to java.text.Normalizer or JDK case mapping (such as NFC, NFKC, accent, "
+              + "confusable, or case folding) do not");
+        }
+        offsetAware[index++] = (OffsetAwareNormalizer) rung;
+      }
+      return new AlignedAggregateCharSequenceNormalizer(offsetAware);
+    }
+
     private Builder add(CharSequenceNormalizer normalizer) {
       steps.add(normalizer);
       return this;
diff --git a/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/WhitespaceCharSequenceNormalizer.java b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/WhitespaceCharSequenceNormalizer.java
index 6aa267d39..a61ffed9c 100644
--- a/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/WhitespaceCharSequenceNormalizer.java
+++ b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/WhitespaceCharSequenceNormalizer.java
@@ -25,7 +25,7 @@
  * and so on), so spacing copied from the web, PDFs, or non-Latin sources normalizes consistently.
  * It is the Unicode-aware, regex-free counterpart to {@link ShrinkCharSequenceNormalizer}.</p>
  */
-public class WhitespaceCharSequenceNormalizer implements CharSequenceNormalizer {
+public class WhitespaceCharSequenceNormalizer implements OffsetAwareNormalizer {
 
   private static final long serialVersionUID = 6748290315562094783L;
 
@@ -44,4 +44,11 @@ public CharSequence normalize(CharSequence text) {
     return WHITESPACE.trim(WHITESPACE.collapse(text));
   }
 
+  @Override
+  public AlignedText normalizeAligned(CharSequence text) {
+    // Same order as normalize(): collapse first, then trim the collapsed text.
+    final AlignedText packed = WHITESPACE.collapseAligned(text);
+    final AlignedText cut = WHITESPACE.trimAligned(packed.normalized());
+    return new AlignedText(text, cut.normalized(), packed.alignment().andThen(cut.alignment()));
+  }
 }
diff --git a/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/util/normalizer/AlignedNormalizerPipelineTest.java b/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/util/normalizer/AlignedNormalizerPipelineTest.java
new file mode 100644
index 000000000..7813babe5
--- /dev/null
+++ b/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/util/normalizer/AlignedNormalizerPipelineTest.java
@@ -0,0 +1,342 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.tools.util.normalizer;
+
+import org.junit.jupiter.api.Test;
+
+import opennlp.tools.util.Span;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertFalse;
+import static org.junit.jupiter.api.Assertions.assertSame;
+import static org.junit.jupiter.api.Assertions.assertThrows;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
+/**
+ * Exercises {@link OffsetAwareNormalizer} and {@code TextNormalizer.Builder.buildAligned()}: each
+ * offset-aware rung yields the same text as its plain {@code normalize}, an aligned pipeline
+ * composes the rungs' alignments with {@link Alignment#andThen(Alignment)} so a span found in the
+ * fully normalized text maps back to the original input, and a non-alignable rung is rejected.
+ */
+public class AlignedNormalizerPipelineTest {
+
+  private static final int ZERO_WIDTH_SPACE = 0x200B;
+  private static final int EM_DASH = 0x2014;
+  private static final int YEZIDI_HYPHEN = 0x10EAD; // a supplementary (non-BMP) dash
+  private static final int MATH_BOLD_DIGIT_ZERO = 0x1D7CE; // a supplementary decimal digit
+
+  private static String cp(int codePoint) { // the code point as a String (one or two chars)
+    return new String(Character.toChars(codePoint));
+  }
+
+  private static String covered(AlignedText aligned, int normalizedStart, int normalizedEnd) {
+    final Span span = aligned.toOriginalSpan(normalizedStart, normalizedEnd);
+    return aligned.original().subSequence(span.getStart(), span.getEnd()).toString();
+  }
+
+  // The aligned form must always reproduce exactly what the plain form produces.
+  @Test
+  void alignedNormalizedTextMatchesPlainForEveryRung() {
+    final OffsetAwareNormalizer[] rungs = {
+        WhitespaceCharSequenceNormalizer.getInstance(),
+        LineBreakPreservingWhitespaceCharSequenceNormalizer.getInstance(),
+        DashCharSequenceNormalizer.getInstance(),
+        InvisibleCharSequenceNormalizer.getInstance(),
+        QuoteCharSequenceNormalizer.getInstance(),
+        DigitCharSequenceNormalizer.getInstance(),
+        EllipsisCharSequenceNormalizer.getInstance(),
+        BulletCharSequenceNormalizer.getInstance(),
+        GermanUmlautCharSequenceNormalizer.getInstance()
+    };
+    final String[] inputs = {
+        "",
+        "plain",
+        "  lots   of\tspace  ",
+        "\n\n  para   one\n\n\tpara two  \n",
+        "a" + cp(ZERO_WIDTH_SPACE) + "b" + cp(YEZIDI_HYPHEN) + "c" + cp(EM_DASH) + "d",
+        cp(ZERO_WIDTH_SPACE) + "  " + cp(ZERO_WIDTH_SPACE),
+        // quotes, ellipsis, eszett, bullet, fullwidth and supplementary digits in one string
+        cp(0x201C) + "don" + cp(0x2019) + "t " + cp(0x2026) + " Stra" + cp(0x00DF) + "e "
+            + cp(0x2022) + " " + cp(0xFF15) + cp(MATH_BOLD_DIGIT_ZERO)
+    };
+    for (final OffsetAwareNormalizer rung : rungs) {
+      for (final String input : inputs) {
+        assertEquals(rung.normalize(input).toString(), rung.normalizeAligned(input).normalized(),
+            rung.getClass().getSimpleName() + " on [" + input + "]");
+      }
+    }
+  }
+
+  @Test
+  void whitespaceCollapseAndTrimMapsSpanBackToOriginal() {
+    final String original = "  hello   world  ";
+    final AlignedText aligned = WhitespaceCharSequenceNormalizer.getInstance()
+        .normalizeAligned(original);
+    assertEquals("hello world", aligned.normalized());
+    // "world" sits at [6, 11) in the collapsed/trimmed form.
+    final Span span = aligned.toOriginalSpan(6, 11);
+    assertEquals(original.indexOf("world"), span.getStart());
+    assertEquals("world", covered(aligned, 6, 11));
+  }
+
+  @Test
+  void dashFoldOfSupplementaryDashMapsSpanBackToOriginal() {
+    final String original = "a" + cp(YEZIDI_HYPHEN) + "b";
+    final AlignedText aligned = DashCharSequenceNormalizer.getInstance().normalizeAligned(original);
+    assertEquals("a-b", aligned.normalized());
+    // The two-unit supplementary dash folded to one ASCII hyphen, so 'b' moved from 3 to 2.
+    assertEquals("b", covered(aligned, 2, 3));
+    assertEquals(cp(YEZIDI_HYPHEN), covered(aligned, 1, 2));
+  }
+
+  @Test
+  void invisibleStripMapsSpanBackAcrossDeletion() {
+    final String original = "a" + cp(ZERO_WIDTH_SPACE) + "b";
+    final AlignedText aligned = InvisibleCharSequenceNormalizer.getInstance()
+        .normalizeAligned(original);
+    assertEquals("ab", aligned.normalized());
+    // 'b' is at index 1 in "ab" but index 2 in the original; the deleted ZWSP must not be covered.
+    assertEquals("b", covered(aligned, 1, 2));
+    assertEquals(2, aligned.toOriginalSpan(1, 2).getStart());
+  }
+
+  @Test
+  void pipelineComposesStripInvisibleWhitespaceAndDashesBackToOriginal() {
+    // 'a', zero-width space, two spaces, 'b', em dash, 'c'.
+    final String original = "a" + cp(ZERO_WIDTH_SPACE) + "  b" + cp(EM_DASH) + "c";
+    final OffsetAwareNormalizer pipeline = TextNormalizer.builder()
+        .stripInvisible().whitespace().dashes().buildAligned();
+
+    final AlignedText aligned = pipeline.normalizeAligned(original);
+    assertEquals("a b-c", aligned.normalized());
+    assertEquals(pipeline.normalize(original).toString(), aligned.normalized());
+    // "b-c" at [2, 5) maps back across a deletion, a collapse, and a dash fold to "b<em-dash>c".
+    assertEquals("b" + cp(EM_DASH) + "c", covered(aligned, 2, 5));
+  }
+
+  @Test
+  void emptyAlignedPipelineIsIdentity() {
+    final AlignedText aligned = TextNormalizer.builder().buildAligned().normalizeAligned("Hello");
+    assertEquals("Hello", aligned.normalized());
+    assertEquals("Hello", covered(aligned, 0, 5));
+  }
+
+  @Test
+  void buildAlignedRejectsNonAlignableRungLoudly() {
+    final IllegalStateException ex = assertThrows(IllegalStateException.class,
+        () -> TextNormalizer.builder().nfc().whitespace().buildAligned());
+    assertTrue(ex.getMessage().contains("Nfc"), ex.getMessage());
+    assertTrue(ex.getMessage().contains("offset-aware"), ex.getMessage());
+  }
+
+  @Test
+  void buildAlignedReportsTheOffendingRungIndexWhenItIsNotFirst() {
+    // A non-alignable rung after several offset-aware ones must still be rejected, and the message
+    // must name its 0-based position (index 2) and type so the failure points at the right fold.
+    final IllegalStateException ex = assertThrows(IllegalStateException.class,
+        () -> TextNormalizer.builder().whitespace().dashes().caseFold().buildAligned());
+    assertTrue(ex.getMessage().contains("rung at 0-based index 2"), ex.getMessage());
+    assertTrue(ex.getMessage().contains("CaseFold"), ex.getMessage());
+  }
+
+  @Test
+  void buildAlignedRejectsEachKindOfNonAlignableRung() {
+    // NFKC, accent, and case folding are each rejected; only the exception type is asserted here.
+    assertThrows(IllegalStateException.class,
+        () -> TextNormalizer.builder().nfkc().buildAligned());
+    assertThrows(IllegalStateException.class,
+        () -> TextNormalizer.builder().accentFold().buildAligned());
+    assertThrows(IllegalStateException.class,
+        () -> TextNormalizer.builder().caseFold().buildAligned());
+  }
+
+  @Test
+  void capabilityIsDetectableByInstanceOf() {
+    assertTrue(WhitespaceCharSequenceNormalizer.getInstance() instanceof OffsetAwareNormalizer);
+    assertTrue(DashCharSequenceNormalizer.getInstance() instanceof OffsetAwareNormalizer);
+    assertTrue(InvisibleCharSequenceNormalizer.getInstance() instanceof OffsetAwareNormalizer);
+    assertFalse(NfcCharSequenceNormalizer.getInstance() instanceof OffsetAwareNormalizer);
+    assertTrue(TextNormalizer.builder().whitespace().dashes().buildAligned()
+        instanceof OffsetAwareNormalizer);
+    // The per-code-point substitution folds are offset-aware too.
+    assertTrue(QuoteCharSequenceNormalizer.getInstance() instanceof OffsetAwareNormalizer);
+    assertTrue(DigitCharSequenceNormalizer.getInstance() instanceof OffsetAwareNormalizer);
+    assertTrue(EllipsisCharSequenceNormalizer.getInstance() instanceof OffsetAwareNormalizer);
+    assertTrue(BulletCharSequenceNormalizer.getInstance() instanceof OffsetAwareNormalizer);
+    assertTrue(GermanUmlautCharSequenceNormalizer.getInstance() instanceof OffsetAwareNormalizer);
+    // The folds that route through java.text.Normalizer or JDK case mapping cannot, by design.
+    assertFalse(NfkcCharSequenceNormalizer.getInstance() instanceof OffsetAwareNormalizer);
+    assertFalse(CaseFoldCharSequenceNormalizer.getInstance() instanceof OffsetAwareNormalizer);
+    assertFalse(AccentFoldCharSequenceNormalizer.getInstance() instanceof OffsetAwareNormalizer);
+    assertFalse(ConfusableSkeletonCharSequenceNormalizer.getInstance()
+        instanceof OffsetAwareNormalizer);
+  }
+
+  @Test
+  void roundTripOfAFullySpanningMatchReturnsTheWholeOriginal() {
+    final String original = "  the   quick  ";
+    final AlignedText aligned = WhitespaceCharSequenceNormalizer.getInstance()
+        .normalizeAligned(original);
+    final String normalized = aligned.normalized();
+    assertEquals("the quick", normalized);
+    final Span whole = aligned.toOriginalSpan(0, normalized.length());
+    assertSame(original, aligned.original());
+    // The match spanning the whole normalized text covers the original from first to last kept char.
+    assertEquals("the   quick", original.subSequence(whole.getStart(), whole.getEnd()).toString());
+  }
+
+  @Test
+  void lineBreakPreservingCollapsesHorizontalRunsButKeepsBreaks() {
+    final LineBreakPreservingWhitespaceCharSequenceNormalizer rung =
+        LineBreakPreservingWhitespaceCharSequenceNormalizer.getInstance();
+    final String original = "Hello   world\n\n\tfoo  bar";
+    assertEquals("Hello world\nfoo bar", rung.normalize(original).toString());
+
+    // The plain whitespace rung instead flattens the blank line into a single space.
+    assertEquals("Hello world foo bar",
+        WhitespaceCharSequenceNormalizer.getInstance().normalize(original).toString());
+
+    final AlignedText aligned = rung.normalizeAligned(original);
+    assertEquals(rung.normalize(original).toString(), aligned.normalized());
+    // "bar" sits at [16, 19) in the collapsed form and at [21, 24) in the original.
+    assertEquals(original.indexOf("bar"), aligned.toOriginalSpan(16, 19).getStart());
+    assertEquals("bar", covered(aligned, 16, 19));
+    // The preserved newline at index 11 maps back to the whole "\n\n\t" run it came from.
+    assertEquals("\n\n\t", covered(aligned, 11, 12));
+  }
+
+  @Test
+  void lineBreakPreservingTrimsLeadingAndTrailingBreaks() {
+    final LineBreakPreservingWhitespaceCharSequenceNormalizer rung =
+        LineBreakPreservingWhitespaceCharSequenceNormalizer.getInstance();
+    final String original = "\n\nHello\n\n"; // "Hello" occupies [2, 7) of the original
+    final AlignedText aligned = rung.normalizeAligned(original);
+    assertEquals("Hello", aligned.normalized());
+    assertEquals("Hello", covered(aligned, 0, 5));
+    assertEquals(original.indexOf("Hello"), aligned.toOriginalSpan(0, 5).getStart());
+  }
+
+  @Test
+  void lineBreakPreservingComposesInAnAlignedPipeline() {
+    assertTrue(LineBreakPreservingWhitespaceCharSequenceNormalizer.getInstance()
+        instanceof OffsetAwareNormalizer);
+    final String original = "a" + cp(ZERO_WIDTH_SPACE) + "  b\n\nc" + cp(EM_DASH) + "d";
+    final OffsetAwareNormalizer pipeline = TextNormalizer.builder()
+        .stripInvisible().whitespacePreservingLineBreaks().dashes().buildAligned();
+
+    final AlignedText aligned = pipeline.normalizeAligned(original);
+    assertEquals("a b\nc-d", aligned.normalized());
+    assertEquals(pipeline.normalize(original).toString(), aligned.normalized());
+    // "c-d" at [4, 7) maps back across a deletion, a break-preserving collapse, and a dash fold.
+    assertEquals("c" + cp(EM_DASH) + "d", covered(aligned, 4, 7));
+  }
+
+  @Test
+  void pipelineMapsAnOriginalSpanForwardToTheNormalizedText() {
+    final String original = "a" + cp(ZERO_WIDTH_SPACE) + "  b" + cp(EM_DASH) + "c";
+    final AlignedText aligned = TextNormalizer.builder()
+        .stripInvisible().whitespace().dashes().buildAligned().normalizeAligned(original);
+    assertEquals("a b-c", aligned.normalized());
+    // 'b' is at original index 4 and normalized index 2; the forward mapping must agree.
+    final Span forward = aligned.toNormalizedSpan(4, 5);
+    assertEquals(2, forward.getStart());
+    assertEquals("b", aligned.normalized().substring(forward.getStart(), forward.getEnd()));
+  }
+
+  @Test
+  void lineBreakPreservingNormalizesCrLfAndUnicodeSeparators() {
+    final LineBreakPreservingWhitespaceCharSequenceNormalizer rung =
+        LineBreakPreservingWhitespaceCharSequenceNormalizer.getInstance();
+    assertEquals("a\nb", rung.normalize("a\r\nb").toString());            // CRLF -> one newline
+    assertEquals("a\nb", rung.normalize("a\n\n\n\nb").toString());        // blank lines -> one newline
+    assertEquals("x\ny", rung.normalize("x" + cp(0x2028) + "y").toString()); // line separator
+    assertEquals("p\nq", rung.normalize("p" + cp(0x2029) + "q").toString()); // paragraph separator
+    // A horizontal run still collapses to a space even when mixed with a break-bearing run.
+    assertEquals("a b\nc", rung.normalize("a  b \n c").toString());
+  }
+
+  @Test
+  void whitespaceRungCollapsesAllWhitespaceToEmptyWithAValidSpan() {
+    final AlignedText aligned =
+        WhitespaceCharSequenceNormalizer.getInstance().normalizeAligned("   ");
+    assertEquals("", aligned.normalized());
+    // Mapping the empty match must yield a valid empty span rather than throwing.
+    final Span empty = aligned.toOriginalSpan(0, 0);
+    assertEquals(empty.getStart(), empty.getEnd());
+  }
+
+  @Test
+  void ellipsisExpansionMapsSpanBackToOriginal() {
+    final String original = "a" + cp(0x2026) + "b";
+    final AlignedText aligned = EllipsisCharSequenceNormalizer.getInstance()
+        .normalizeAligned(original);
+    assertEquals("a...b", aligned.normalized());
+    // The single ellipsis expanded to three dots, so 'b' moved from index 2 to index 4.
+    assertEquals("b", covered(aligned, 4, 5));
+    // The whole expansion, and any sub-span of it, maps back to the one source ellipsis.
+    assertEquals(cp(0x2026), covered(aligned, 1, 4));
+    assertEquals(cp(0x2026), covered(aligned, 2, 3));
+  }
+
+  @Test
+  void germanUmlautExpansionMapsSpanBackToOriginal() {
+    final String original = "Stra" + cp(0x00DF) + "e";   // "Strasse" from the eszett form
+    final AlignedText aligned = GermanUmlautCharSequenceNormalizer.getInstance()
+        .normalizeAligned(original);
+    assertEquals("Strasse", aligned.normalized());
+    // The eszett expanded to "ss", so the trailing 'e' moved from index 5 to index 6.
+    assertEquals("e", covered(aligned, 6, 7));
+    // Both halves of "ss" map back to the single source eszett.
+    assertEquals(cp(0x00DF), covered(aligned, 4, 6));
+    assertEquals(cp(0x00DF), covered(aligned, 5, 6));
+  }
+
+  @Test
+  void digitFoldOfSupplementaryDigitMapsSpanBackToOriginal() {
+    final String original = "a" + cp(MATH_BOLD_DIGIT_ZERO) + "b";
+    final AlignedText aligned = DigitCharSequenceNormalizer.getInstance()
+        .normalizeAligned(original);
+    assertEquals("a0b", aligned.normalized());
+    // The two-unit supplementary digit folded to one ASCII '0', so 'b' moved from 3 to 2.
+    assertEquals("b", covered(aligned, 2, 3));
+    assertEquals(cp(MATH_BOLD_DIGIT_ZERO), covered(aligned, 1, 2));
+  }
+
+  @Test
+  void quoteFoldMapsSpanBackToOriginal() {
+    final String original = cp(0x201C) + "hi" + cp(0x201D);   // curly double quotes
+    final AlignedText aligned = QuoteCharSequenceNormalizer.getInstance()
+        .normalizeAligned(original);
+    assertEquals("\"hi\"", aligned.normalized());
+    assertEquals("hi", covered(aligned, 1, 3));
+    // A one-for-one fold, so the opening quote maps straight back to the curly source quote.
+    assertEquals(cp(0x201C), covered(aligned, 0, 1));
+  }
+
+  @Test
+  void substitutionFoldsComposeInAnAlignedPipeline() {
+    final String original = "say " + cp(0x201C) + "hi" + cp(0x201D) + cp(0x2026);
+    final OffsetAwareNormalizer pipeline = TextNormalizer.builder()
+        .quotes().ellipsis().buildAligned();
+    final AlignedText aligned = pipeline.normalizeAligned(original);
+    assertEquals("say \"hi\"...", aligned.normalized());
+    assertEquals(pipeline.normalize(original).toString(), aligned.normalized());
+    // The expanded "..." maps back across the quote fold to the single source ellipsis.
+    assertEquals(cp(0x2026), covered(aligned, 8, 11));
+    assertEquals("hi", covered(aligned, 5, 7));
+  }
+}
diff --git a/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/util/normalizer/GermanUmlautCharSequenceNormalizerTest.java b/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/util/normalizer/GermanUmlautCharSequenceNormalizerTest.java
index c4752fdad..ac9abdabd 100644
--- a/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/util/normalizer/GermanUmlautCharSequenceNormalizerTest.java
+++ b/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/util/normalizer/GermanUmlautCharSequenceNormalizerTest.java
@@ -60,6 +60,17 @@ void testCapitalEszett() {
     assertEquals("STRASSE", fold("STRA" + cp(0x1E9E) + "E")); // STRASSE
   }
 
+  @Test
+  void testCapitalEszettOffsets() {
+    // U+1E9E is one source char that folds to the two chars "SS"; the alignment must record that
+    // 1->2 replacement so the produced pair [1, 3) maps back to the single source char [1, 2).
+    final AlignedText aligned = FOLD.normalizeAligned("A" + cp(0x1E9E) + "B"); // A<capital eszett>B
+    assertEquals("ASSB", aligned.normalized());
+    final var origin = aligned.alignment().toOriginalSpan(1, 3); // the produced "SS"
+    assertEquals(1, origin.getStart());
+    assertEquals(2, origin.getEnd());
+  }
+
   @Test
   void testAsciiAndOtherCharactersUnchanged() {
     assertEquals("hello world 123", fold("hello world 123"));