diff --git a/LICENSE b/LICENSE
index 807713a00..3626b6782 100644
--- a/LICENSE
+++ b/LICENSE
@@ -371,9 +371,13 @@ The following license applies to the SLF4J API:
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-The following license applies to the bundled Unicode data file in
+The following license applies to the bundled Unicode data files in
+opennlp-core/opennlp-runtime/src/main/resources/opennlp/tools/tokenize/uax29
+(WordBreakProperty.txt, ExtendedPictographic.txt),
opennlp-core/opennlp-runtime/src/main/resources/opennlp/tools/util/normalizer
-(confusables.txt):
+(confusables.txt), and
+opennlp-core/opennlp-runtime/src/test/resources/opennlp/tools/tokenize/uax29
+(WordBreakTest.txt):
UNICODE LICENSE V3
diff --git a/NOTICE b/NOTICE
index 0e02c8fba..08b702225 100644
--- a/NOTICE
+++ b/NOTICE
@@ -94,16 +94,30 @@ SOFTWARE.
============================================================================
-This product bundles a data file from the Unicode Security Mechanisms
-(UTS #39), version 17.0.0, published by Unicode, Inc.
-(https://www.unicode.org/Public/).
-
+This product bundles data files from the Unicode Character Database (UCD)
+and the Unicode Security Mechanisms, version 17.0.0, published by Unicode,
+Inc. (https://www.unicode.org/Public/).
+
+ * opennlp-core/opennlp-runtime/src/main/resources/opennlp/tools/tokenize/uax29/WordBreakProperty.txt
+ is the upstream WordBreakProperty-17.0.0.txt, unmodified except for the
+ file name.
+ * opennlp-core/opennlp-runtime/src/test/resources/opennlp/tools/tokenize/uax29/WordBreakTest.txt
+ is the upstream WordBreakTest-17.0.0.txt, unmodified except for the file
+ name.
* opennlp-core/opennlp-runtime/src/main/resources/opennlp/tools/util/normalizer/confusables.txt
- is the upstream confusables.txt, unmodified.
+ is the upstream confusables.txt from the Unicode Security Mechanisms
+ (UTS #39), unmodified.
+ * opennlp-core/opennlp-runtime/src/main/resources/opennlp/tools/tokenize/uax29/ExtendedPictographic.txt
+ is derived from the upstream emoji-data.txt (Emoji Data for UTS #51,
+ version 17.0): it keeps only the lines that assign the
+ Extended_Pictographic property and is renamed accordingly. It is a
+ filtered subset; the upstream file additionally carries the Emoji,
+ Emoji_Presentation, Emoji_Modifier, Emoji_Modifier_Base, and
+ Emoji_Component properties, which are not retained.
The original Unicode copyright and license header is preserved verbatim at the
-top of the bundled file. It is distributed under the Unicode License V3, the
-full text of which is reproduced in the LICENSE file accompanying this
+top of each bundled file. These files are distributed under the Unicode License
+V3, the full text of which is reproduced in the LICENSE file accompanying this
distribution.
Copyright (c) 1991-2025 Unicode, Inc. All rights reserved.
diff --git a/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/tokenize/uax29/ExtendedPictographic.java b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/tokenize/uax29/ExtendedPictographic.java
new file mode 100644
index 000000000..367873b41
--- /dev/null
+++ b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/tokenize/uax29/ExtendedPictographic.java
@@ -0,0 +1,109 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.tools.tokenize.uax29;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.UncheckedIOException;
+import java.nio.charset.StandardCharsets;
+import java.util.BitSet;
+
+/**
+ * Tests the Unicode {@code Extended_Pictographic} property of a code point.
+ *
+ * <p>This is the one extra property the word boundary algorithm needs (rule WB3c), to keep emoji
+ * zero-width-joiner sequences together. The data is loaded once, on first use, from the bundled
+ * {@code ExtendedPictographic.txt} resource (derived from the Unicode {@code emoji-data.txt}) and
+ * stored in a {@link BitSet}, so membership is an O(1) bit test.
+ */
+public final class ExtendedPictographic {
+
+ private static final String RESOURCE = "ExtendedPictographic.txt";
+
+ // Loaded lazily on first use (see members()) so a missing or unreadable resource surfaces as a
+ // catchable exception at call time rather than an ExceptionInInitializerError that permanently
+ // poisons the class -- a real risk in container, OSGi, shaded, or modular setups.
+ private static volatile BitSet members;
+
+ private ExtendedPictographic() {
+ }
+
+ // Double-checked lazy initialization: load() runs once on first use, and a failure leaves the
+ // field null so a later call retries instead of the class being permanently unusable.
+ private static BitSet members() {
+ BitSet set = members;
+ if (set == null) {
+ synchronized (ExtendedPictographic.class) {
+ set = members;
+ if (set == null) {
+ set = load();
+ members = set;
+ }
+ }
+ }
+ return set;
+ }
+
+  private static BitSet load() {
+    try (InputStream stream = ExtendedPictographic.class.getResourceAsStream(RESOURCE)) {
+      if (stream == null) {
+        throw new IllegalStateException("Missing Extended_Pictographic data resource: " + RESOURCE);
+      }
+      final BitSet loaded = new BitSet();
+      parse(stream, loaded);
+      return loaded;
+    } catch (IOException e) {
+      throw new UncheckedIOException(
+          "Unable to read Extended_Pictographic data resource " + RESOURCE, e);
+    }
+  }
+
+  private static void parse(InputStream in, BitSet set) throws IOException {
+    // A data line is "<code point>[..<code point>] ; ... # comment"; only the first field is read.
+    try (BufferedReader lines =
+        new BufferedReader(new InputStreamReader(in, StandardCharsets.UTF_8))) {
+      for (String raw = lines.readLine(); raw != null; raw = lines.readLine()) {
+        final int commentAt = raw.indexOf('#');
+        final String data = (commentAt >= 0 ? raw.substring(0, commentAt) : raw).strip();
+        if (data.isEmpty()) {
+          continue; // blank or comment-only line
+        }
+        final int fieldEnd = data.indexOf(';');
+        final String field = (fieldEnd >= 0 ? data.substring(0, fieldEnd) : data).strip();
+        final int rangeAt = field.indexOf("..");
+        if (rangeAt >= 0) {
+          final int first = Integer.parseInt(field.substring(0, rangeAt), 16);
+          final int lastInclusive = Integer.parseInt(field.substring(rangeAt + 2), 16);
+          set.set(first, lastInclusive + 1); // BitSet range end is exclusive
+        } else {
+          set.set(Integer.parseInt(field, 16));
+        }
+      }
+    }
+  }
+
+  /**
+   * {@return {@code true} if the code point carries the {@code Extended_Pictographic} property}
+   *
+   * @param codePoint The code point. Values outside {@code [0, U+10FFFF]} yield {@code false}.
+   */
+  public static boolean is(int codePoint) {
+    return Character.isValidCodePoint(codePoint) && members().get(codePoint);
+  }
+}
diff --git a/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/tokenize/uax29/WordBreak.java b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/tokenize/uax29/WordBreak.java
new file mode 100644
index 000000000..570df2b91
--- /dev/null
+++ b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/tokenize/uax29/WordBreak.java
@@ -0,0 +1,99 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.tools.tokenize.uax29;
+
+/**
+ * The Unicode {@code Word_Break} property values, used by the UAX #29 word boundary algorithm.
+ *
+ * <p>{@link #OTHER} is the default for code points that carry no {@code Word_Break} value in the
+ * Unicode Character Database. The remaining constants correspond one-to-one to the values in
+ * {@code WordBreakProperty.txt} (see
+ * <a href="https://www.unicode.org/reports/tr29/">UAX #29</a>).
+ */
+public enum WordBreak {
+
+ /** No assigned {@code Word_Break} value (the default). */
+ OTHER,
+ /** Carriage return ({@code U+000D}). */
+ CR,
+ /** Line feed ({@code U+000A}). */
+ LF,
+ /** Other mandatory line breaks (vertical tab, form feed, NEL, line/paragraph separators). */
+ NEWLINE,
+ /** Combining marks and other characters that extend the preceding one. */
+ EXTEND,
+ /** Zero width joiner ({@code U+200D}). */
+ ZWJ,
+ /** Regional indicator symbols (used in pairs for flag emoji). */
+ REGIONAL_INDICATOR,
+ /** Format characters. */
+ FORMAT,
+ /** Katakana letters. */
+ KATAKANA,
+ /** Hebrew letters (distinguished so a single quote may join them). */
+ HEBREW_LETTER,
+ /** Alphabetic letters. */
+ ALETTER,
+ /** The apostrophe ({@code U+0027}). */
+ SINGLE_QUOTE,
+ /** The quotation mark ({@code U+0022}). */
+ DOUBLE_QUOTE,
+ /** Characters that join letters or numbers (for example the full stop). */
+ MID_NUM_LET,
+ /** Characters that join letters (for example the middle dot). */
+ MID_LETTER,
+ /** Characters that join numbers (for example the comma). */
+ MID_NUM,
+ /** Decimal digits. */
+ NUMERIC,
+ /** Characters that extend a number or letter sequence (for example the low line). */
+ EXTEND_NUM_LET,
+ /** Whitespace that segments words ({@code Word_Break=WSegSpace}). */
+ WSEG_SPACE;
+
+ /**
+ * Maps a {@code Word_Break} value name, as written in {@code WordBreakProperty.txt}, to its
+ * constant. Matching is exact and case-sensitive; {@link #OTHER} is never returned.
+ *
+ * @param name The property value name (for example {@code ALetter}).
+ * @return The matching constant.
+ * @throws IllegalArgumentException Thrown if the name is not a known {@code Word_Break} value.
+ */
+ static WordBreak fromPropertyName(String name) {
+ return switch (name) {
+ case "CR" -> CR;
+ case "LF" -> LF;
+ case "Newline" -> NEWLINE;
+ case "Extend" -> EXTEND;
+ case "ZWJ" -> ZWJ;
+ case "Regional_Indicator" -> REGIONAL_INDICATOR;
+ case "Format" -> FORMAT;
+ case "Katakana" -> KATAKANA;
+ case "Hebrew_Letter" -> HEBREW_LETTER;
+ case "ALetter" -> ALETTER;
+ case "Single_Quote" -> SINGLE_QUOTE;
+ case "Double_Quote" -> DOUBLE_QUOTE;
+ case "MidNumLet" -> MID_NUM_LET;
+ case "MidLetter" -> MID_LETTER;
+ case "MidNum" -> MID_NUM;
+ case "Numeric" -> NUMERIC;
+ case "ExtendNumLet" -> EXTEND_NUM_LET;
+ case "WSegSpace" -> WSEG_SPACE;
+ default -> throw new IllegalArgumentException("Unknown Word_Break value: " + name);
+ };
+ }
+}
diff --git a/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/tokenize/uax29/WordBreakProperty.java b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/tokenize/uax29/WordBreakProperty.java
new file mode 100644
index 000000000..ded780bf6
--- /dev/null
+++ b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/tokenize/uax29/WordBreakProperty.java
@@ -0,0 +1,190 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.tools.tokenize.uax29;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.UncheckedIOException;
+import java.nio.charset.StandardCharsets;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+
+/**
+ * Looks up the Unicode {@link WordBreak Word_Break} property of a code point.
+ *
+ * <p>The data is loaded once from the {@code WordBreakProperty.txt} resource of the Unicode
+ * Character Database (parsed with simple cursor scanning, no regular expression). Lookup is O(1)
+ * for the Basic Multilingual Plane (a direct array index) and O(log n) for supplementary code
+ * points (a binary search over a small sorted range table), so it imposes no per-character
+ * allocation on the word boundary algorithm.
+ */
+public final class WordBreakProperty {
+
+ private static final String RESOURCE = "WordBreakProperty.txt";
+
+ private static final WordBreak[] VALUES = WordBreak.values();
+
+ // Loaded lazily on first use (see data()) so a missing or unreadable resource surfaces as a
+ // catchable exception at call time rather than an ExceptionInInitializerError that permanently
+ // poisons the class -- a real risk in container, OSGi, shaded, or modular setups.
+ private static volatile Data data;
+
+ private WordBreakProperty() {
+ }
+
+ // Word_Break tables, filled by load() before construction and only read afterwards: one ordinal
+ // per BMP code point, plus three parallel arrays describing the supplementary ranges.
+ private static final class Data {
+ final byte[] bmp; // WordBreak ordinal indexed by BMP code point (0 is OTHER)
+ final int[] supplementaryStart; // first code point of range i, ascending
+ final int[] supplementaryEnd; // last code point of range i, inclusive
+ final byte[] supplementaryValue; // WordBreak ordinal of range i
+
+ Data(byte[] bmp, int[] start, int[] end, byte[] value) {
+ this.bmp = bmp;
+ this.supplementaryStart = start;
+ this.supplementaryEnd = end;
+ this.supplementaryValue = value;
+ }
+ }
+
+ // Double-checked lazy initialization: load() runs once on first use, and a failure leaves the
+ // field null so a later call retries instead of the class being permanently unusable.
+ private static Data data() {
+ Data d = data;
+ if (d == null) {
+ synchronized (WordBreakProperty.class) {
+ d = data;
+ if (d == null) {
+ d = load();
+ data = d;
+ }
+ }
+ }
+ return d;
+ }
+
+  private static Data load() {
+    final byte[] bmp = new byte[0x10000]; // ordinal per BMP code point; zero-filled, i.e. OTHER
+    final List<int[]> supplementary = new ArrayList<>(); // {start, end, ordinal} per range
+    try (InputStream in = WordBreakProperty.class.getResourceAsStream(RESOURCE)) {
+      if (in == null) {
+        throw new IllegalStateException("Missing Word_Break data resource: " + RESOURCE);
+      }
+      parse(in, bmp, supplementary);
+    } catch (IOException e) {
+      throw new UncheckedIOException("Unable to read Word_Break data resource " + RESOURCE, e);
+    }
+    supplementary.sort((a, b) -> Integer.compare(a[0], b[0])); // ascending start, for binary search
+    final int[] start = new int[supplementary.size()];
+    final int[] end = new int[supplementary.size()];
+    final byte[] value = new byte[supplementary.size()];
+    for (int i = 0; i < supplementary.size(); i++) {
+      final int[] range = supplementary.get(i); // unpack into the parallel arrays Data holds
+      start[i] = range[0];
+      end[i] = range[1];
+      value[i] = (byte) range[2];
+    }
+    return new Data(bmp, start, end, value);
+  }
+
+  private static void parse(InputStream in, byte[] bmp, List<int[]> supplementary) throws IOException {
+    // A data line is "<code point>[..<code point>] ; <Word_Break value> # comment". Text after
+    // '#' is ignored, and lines that are blank once the comment is removed are skipped.
+    try (BufferedReader reader =
+        new BufferedReader(new InputStreamReader(in, StandardCharsets.UTF_8))) {
+      String line;
+      while ((line = reader.readLine()) != null) {
+        final int hash = line.indexOf('#');
+        final String content = (hash < 0 ? line : line.substring(0, hash)).strip();
+        if (content.isEmpty()) {
+          continue;
+        }
+        final int semicolon = content.indexOf(';');
+        if (semicolon < 0) {
+          // Name the offending line instead of failing with an opaque substring index error.
+          throw new IllegalArgumentException("Malformed Word_Break data line: " + content);
+        }
+        final String codePoints = content.substring(0, semicolon).strip();
+        final String value = content.substring(semicolon + 1).strip();
+        final byte ordinal = (byte) WordBreak.fromPropertyName(value).ordinal();
+
+        // A single code point is treated as a range whose start and end coincide.
+        final int dots = codePoints.indexOf("..");
+        final int start = Integer.parseInt(dots < 0 ? codePoints : codePoints.substring(0, dots), 16);
+        final int end = dots < 0 ? start : Integer.parseInt(codePoints.substring(dots + 2), 16);
+        assign(start, end, ordinal, bmp, supplementary);
+      }
+    }
+  }
+
+  private static void assign(int start, int end, byte ordinal, byte[] bmp, List<int[]> supplementary) {
+    final int bmpEnd = Math.min(end, 0xFFFF); // last BMP code point the range can reach
+    if (start <= bmpEnd) {
+      Arrays.fill(bmp, start, bmpEnd + 1, ordinal); // bulk fill the BMP portion of the range
+    }
+    if (end > 0xFFFF) { // the rest is recorded as {start, end, ordinal}, clipped to U+10000
+      supplementary.add(new int[] {Math.max(start, 0x10000), end, ordinal});
+    }
+  }
+
+  /**
+   * {@return the {@link WordBreak} value assigned to a code point}
+   *
+   * @param codePoint The code point. Outside {@code [0, U+10FFFF]} gives {@link WordBreak#OTHER}.
+   */
+  public static WordBreak of(int codePoint) {
+    final int ordinal = ordinalOf(codePoint);
+    return VALUES[ordinal];
+  }
+
+  /**
+   * {@return the {@link WordBreak#ordinal() ordinal} of the {@link WordBreak} value of a code point}
+   * This is the form of {@link #of(int)} meant for hot loops that work directly with ordinals.
+   *
+   * @param codePoint The code point. Values outside {@code [0, U+10FFFF]} return the ordinal of
+   * {@link WordBreak#OTHER}.
+   */
+  public static int ordinalOf(int codePoint) {
+    if (!Character.isBmpCodePoint(codePoint)) {
+      return ordinalOfSupplementary(codePoint); // supplementary, negative, or above U+10FFFF
+    }
+    return data().bmp[codePoint] & 0xFF; // unsigned byte ordinal
+  }
+
+  private static int ordinalOfSupplementary(int codePoint) {
+    if (codePoint <= 0xFFFF || codePoint > Character.MAX_CODE_POINT) {
+      return WordBreak.OTHER.ordinal();
+    }
+    final Data tables = data();
+    // Binary search over the ranges, which load() sorted by ascending start.
+    for (int lo = 0, hi = tables.supplementaryStart.length - 1; lo <= hi; ) {
+      final int probe = (lo + hi) >>> 1;
+      if (codePoint < tables.supplementaryStart[probe]) {
+        hi = probe - 1;
+      } else if (codePoint > tables.supplementaryEnd[probe]) {
+        lo = probe + 1;
+      } else {
+        return tables.supplementaryValue[probe] & 0xFF; // unsigned byte ordinal
+      }
+    }
+    return WordBreak.OTHER.ordinal();
+  }
+}
diff --git a/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/tokenize/uax29/WordSegmenter.java b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/tokenize/uax29/WordSegmenter.java
new file mode 100644
index 000000000..c1529d740
--- /dev/null
+++ b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/tokenize/uax29/WordSegmenter.java
@@ -0,0 +1,404 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.tools.tokenize.uax29;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+
+import opennlp.tools.util.Span;
+
+/**
+ * Finds word boundaries in text using the Unicode Text Segmentation algorithm
+ * (UAX #29), rules WB1 through WB999.
+ *
+ * <p>The implementation is a single forward cursor pass with O(1) {@link WordBreakProperty}
+ * lookups and no regular expression. It decodes each code point once, keeps only a constant amount
+ * of state, and allocates nothing per character. It implements the "ignore" semantics of WB4 (a
+ * base character absorbs following {@code Extend}, {@code Format}, and {@code ZWJ}), the look-ahead
+ * rules WB6/WB7/WB7b/WB12, the Hebrew quote rules WB7a-WB7c, the emoji zero-width-joiner rule WB3c,
+ * and regional-indicator pairing WB15/WB16. The look-ahead for the WB6/WB7b/WB12 rules is resolved
+ * lazily and only at mid-word punctuation, so the common case never scans ahead.
+ *
+ * <p>{@link #forEachSegment(CharSequence, SegmentConsumer)} streams the segments with no
+ * allocation; {@link #boundaries(CharSequence)} returns every boundary offset (always including
+ * {@code 0} and the text length); {@link #segments(CharSequence)} returns the spans between
+ * them.
+ */
+public final class WordSegmenter {
+
+ /** Callback that receives each word segment as the half-open character range {@code [start, end)}. */
+ @FunctionalInterface
+ public interface SegmentConsumer {
+ /**
+ * Accepts one segment. Offsets are {@code char} indices into the segmented text.
+ *
+ * @param start The inclusive start character offset.
+ * @param end The exclusive end character offset.
+ */
+ void accept(int start, int end);
+ }
+
+ // Decisions for the WB5-WB999 rules. NO_BREAK/BREAK are final; CONSULT marks a (last, current)
+ // pair whose decision also depends on look-ahead or regional-indicator parity, so the full rule
+ // cascade must be consulted. GO_SLOW appears only in the FAST table (never in TRANSITION) and
+ // marks a current class that can trigger a WB3-family or WB4 rule.
+ private static final byte NO_BREAK = 0;
+ private static final byte BREAK = 1;
+ private static final byte CONSULT = 2;
+ private static final byte GO_SLOW = 3;
+
+ private static final WordBreak[] CLASSES = WordBreak.values();
+ private static final int CLASS_COUNT = CLASSES.length;
+
+ // TRANSITION[last * CLASS_COUNT + current] holds the WB5-WB999 decision for a (last, current)
+ // pair: NO_BREAK or BREAK when the decision is the same for every secondLast, next significant
+ // value, and parity, or CONSULT otherwise. The table is derived from afterPrefix(...) at
+ // class-load, so it is equivalent to the rule cascade by construction; only the hot path reads it.
+ private static final byte[] TRANSITION = buildTransitionTable();
+
+ // Ordinals of the Word_Break classes that the WB3 family and WB4 examine. The hot loop works with
+ // ordinals to avoid materializing a WordBreak enum per character.
+ private static final int OTHER_ORDINAL = WordBreak.OTHER.ordinal();
+ private static final int CR_ORDINAL = WordBreak.CR.ordinal();
+ private static final int LF_ORDINAL = WordBreak.LF.ordinal();
+ private static final int NEWLINE_ORDINAL = WordBreak.NEWLINE.ordinal();
+ private static final int ZWJ_ORDINAL = WordBreak.ZWJ.ordinal();
+ private static final int WSEG_SPACE_ORDINAL = WordBreak.WSEG_SPACE.ordinal();
+ private static final int EXTEND_ORDINAL = WordBreak.EXTEND.ordinal();
+ private static final int FORMAT_ORDINAL = WordBreak.FORMAT.ordinal();
+ private static final int REGIONAL_INDICATOR_ORDINAL = WordBreak.REGIONAL_INDICATOR.ordinal();
+
+ // SPECIAL[ordinal] is true for the classes that can trigger a WB3-family or WB4 rule (the
+ // newline, ZWJ, word-segment-space, and ignorable classes). When neither the previous nor the
+ // current class is special, those rules cannot fire and the hot loop goes straight to the
+ // transition table.
+ private static final boolean[] SPECIAL = buildSpecialTable();
+
+ // FAST[last * CLASS_COUNT + current] is the hot-loop table: the TRANSITION decision when the
+ // current class is ordinary, or GO_SLOW when it is special. One read decides the common case and
+ // detects a special current class, so the loop never reloads SPECIAL[current].
+ private static final byte[] FAST = buildFastTable();
+
+ private WordSegmenter() {
+ }
+
+ private static boolean[] buildSpecialTable() {
+ final boolean[] special = new boolean[CLASS_COUNT];
+ special[CR_ORDINAL] = true;
+ special[LF_ORDINAL] = true;
+ special[NEWLINE_ORDINAL] = true;
+ special[ZWJ_ORDINAL] = true;
+ special[WSEG_SPACE_ORDINAL] = true;
+ special[EXTEND_ORDINAL] = true;
+ special[FORMAT_ORDINAL] = true;
+ return special;
+ }
+
+  private static byte[] buildFastTable() {
+    // Copy the TRANSITION decisions, then mark every cell with a special current class GO_SLOW.
+    final byte[] table = TRANSITION.clone();
+    for (int cell = 0; cell < table.length; cell++) {
+      if (SPECIAL[cell % CLASS_COUNT]) { // the column of a cell is its current class
+        table[cell] = GO_SLOW;
+      }
+    }
+    return table;
+  }
+
+  private static byte[] buildTransitionTable() {
+    final byte[] decisions = new byte[CLASS_COUNT * CLASS_COUNT];
+    for (int cell = 0; cell < decisions.length; cell++) { // row = last class, column = current
+      final WordBreak last = CLASSES[cell / CLASS_COUNT];
+      final WordBreak current = CLASSES[cell % CLASS_COUNT];
+      decisions[cell] = deriveDecision(last, current);
+    }
+    return decisions;
+  }
+
+  // Collapses afterPrefix(...) for one (last, current) pair into a table cell: BREAK or NO_BREAK
+  // when every secondLast, next, and parity combination agrees, CONSULT as soon as two of them
+  // disagree.
+  private static byte deriveDecision(WordBreak last, WordBreak current) {
+    // The first combination the loops visit sets the baseline that all others are compared to.
+    final boolean baseline = afterPrefix(current, last, CLASSES[0], CLASSES[0], 0);
+    for (final WordBreak secondLast : CLASSES) {
+      for (final WordBreak next : CLASSES) {
+        for (int parity = 0; parity <= 1; parity++) {
+          if (afterPrefix(current, last, secondLast, next, parity) != baseline) {
+            return CONSULT;
+          }
+        }
+      }
+    }
+    return baseline ? BREAK : NO_BREAK;
+  }
+
+ /**
+ * Streams the word segments of {@code text} to {@code consumer} in order, allocating nothing.
+ * Each segment is delivered as the half-open character range {@code [start, end)}; the segments
+ * are contiguous and together cover the whole text. Empty text yields no segment at all.
+ *
+ * @param text The text to segment.
+ * @param consumer The receiver of the segment ranges; called once per segment, in text order.
+ */
+ public static void forEachSegment(CharSequence text, SegmentConsumer consumer) {
+ final int length = text.length();
+ if (length == 0) {
+ return;
+ }
+
+ final int firstCp = Character.codePointAt(text, 0);
+ int prev = WordBreakProperty.ordinalOf(firstCp); // class of the immediately preceding code point
+ boolean prevSpecial = SPECIAL[prev];
+ int last = OTHER_ORDINAL; // most recent non-ignorable class
+ int secondLast = OTHER_ORDINAL; // the non-ignorable class before "last"
+ int regionalIndicatorRun = 0; // consecutive non-ignorable Regional_Indicator classes up to "last"
+ if (!isIgnorable(prev)) {
+ last = prev;
+ regionalIndicatorRun = prev == REGIONAL_INDICATOR_ORDINAL ? 1 : 0;
+ }
+
+ int segmentStart = 0;
+ int i = Character.charCount(firstCp); // boundaries are tested before each later code point
+ while (i < length) {
+ final int codePoint = Character.codePointAt(text, i);
+ final int charCount = Character.charCount(codePoint);
+ final int current = WordBreakProperty.ordinalOf(codePoint);
+
+ // One table read per character. It is the decision for the common case and, as GO_SLOW, the
+ // "current is special" flag; combined with the carried prevSpecial it avoids the two SPECIAL
+ // look-ups the rules would otherwise need.
+ final byte action = FAST[last * CLASS_COUNT + current];
+ final boolean currentSpecial = action == GO_SLOW;
+ final boolean breakHere;
+ if (prevSpecial || currentSpecial) {
+ breakHere = breakAtSpecial(prev, current, codePoint, last, secondLast,
+ regionalIndicatorRun, text, i + charCount, length);
+ } else {
+ breakHere = action == CONSULT
+ ? consult(text, i + charCount, length, current, last, secondLast, regionalIndicatorRun)
+ : action == BREAK;
+ }
+
+ if (breakHere) {
+ consumer.accept(segmentStart, i); // the boundary lies just before the current code point
+ segmentStart = i;
+ }
+
+ if (!isIgnorable(current)) { // Extend/Format/ZWJ never update the significant-class history
+ secondLast = last;
+ last = current;
+ regionalIndicatorRun = current == REGIONAL_INDICATOR_ORDINAL ? regionalIndicatorRun + 1 : 0;
+ }
+ prev = current;
+ prevSpecial = currentSpecial;
+ i += charCount;
+ }
+ consumer.accept(segmentStart, length); // the final segment always ends at the text length
+ }
+
+ // Handles a position where the previous or current class is special: applies the WB3 family and
+ // WB4 (which depend on the immediately preceding code point), then falls back to the transition
+ // table for the WB5-WB999 rules. Returns true when there is a boundary before the current one.
+ private static boolean breakAtSpecial(int prev, int current, int codePoint, int last,
+ int secondLast, int regionalIndicatorRun, CharSequence text, int nextFrom, int length) {
+ if (prev == CR_ORDINAL && current == LF_ORDINAL) {
+ return false; // WB3: never split CR LF
+ }
+ if (prev == CR_ORDINAL || prev == LF_ORDINAL || prev == NEWLINE_ORDINAL) {
+ return true; // WB3a: always break after a newline class
+ }
+ if (current == CR_ORDINAL || current == LF_ORDINAL || current == NEWLINE_ORDINAL) {
+ return true; // WB3b: always break before a newline class
+ }
+ if (prev == ZWJ_ORDINAL && ExtendedPictographic.is(codePoint)) {
+ return false; // WB3c: ZWJ followed by an Extended_Pictographic code point stays joined
+ }
+ if (prev == WSEG_SPACE_ORDINAL && current == WSEG_SPACE_ORDINAL) {
+ return false; // WB3d: adjacent WSegSpace characters stay together
+ }
+ if (current == EXTEND_ORDINAL || current == FORMAT_ORDINAL || current == ZWJ_ORDINAL) {
+ return false; // WB4: Extend, Format, and ZWJ attach to what precedes them
+ }
+ final byte action = TRANSITION[last * CLASS_COUNT + current]; // TRANSITION never holds GO_SLOW
+ return action == CONSULT
+ ? consult(text, nextFrom, length, current, last, secondLast, regionalIndicatorRun)
+ : action == BREAK;
+ }
+
+  // Resolves a CONSULT cell by running the full WB5-WB999 cascade. This is the only place that
+  // scans ahead for the next significant value, which the look-ahead rules (WB6/WB7b/WB12) need;
+  // the run length supplies the parity for WB15/WB16.
+  private static boolean consult(CharSequence text, int nextFrom, int length, int current,
+      int last, int secondLast, int regionalIndicatorRun) {
+    return afterPrefix(CLASSES[current], CLASSES[last], CLASSES[secondLast],
+        nextSignificant(text, nextFrom, length), regionalIndicatorRun);
+  }
+
+  /**
+   * Computes every word boundary of {@code text} as a character offset. The result is ascending
+   * and always contains {@code 0} and {@code text.length()}.
+   *
+   * @param text The text to segment.
+   * @return The boundary offsets; for empty text, {@code [0]}.
+   */
+  public static int[] boundaries(CharSequence text) {
+    final IntList collected = new IntList();
+    // WB1: the start of text is always a boundary. For empty text it is the only one, because
+    // forEachSegment then reports no segment at all.
+    collected.add(0);
+    // Every segment end is a boundary; the final end is the text length (WB2).
+    final SegmentConsumer recordEnd = (start, end) -> collected.add(end);
+    forEachSegment(text, recordEnd);
+    return collected.toArray();
+  }
+
+  /**
+   * Returns the word segments of {@code text} as spans between consecutive boundaries.
+   *
+   * @param text The text to segment.
+   * @return The segment spans, in order; an empty list for empty text.
+   */
+  public static List<Span> segments(CharSequence text) {
+    final List<Span> spans = new ArrayList<>();
+    forEachSegment(text, (start, end) -> spans.add(new Span(start, end)));
+    return spans;
+  }
+
+  // Scans forward from "from" for the first code point that WB4 does not absorb and returns its
+  // Word_Break value; OTHER when the text ends first.
+  private static WordBreak nextSignificant(CharSequence text, int from, int length) {
+    int offset = from;
+    while (offset < length) {
+      final WordBreak candidate = WordBreakProperty.of(Character.codePointAt(text, offset));
+      if (!isIgnorable(candidate)) {
+        return candidate;
+      }
+      offset = Character.offsetByCodePoints(text, offset, 1); // step over one code point
+    }
+    return WordBreak.OTHER;
+  }
+
+ // Applies WB5 through WB999 and returns true when there is a boundary before "current". These
+ // rules depend only on the last two significant values, the current value, the next significant
+ // value, and the regional-indicator parity, never on the immediately preceding code point; that
+ // is what lets them be captured by the transition table. "last"/"secondLast" skip WB4-absorbed ones.
+ private static boolean afterPrefix(WordBreak current, WordBreak last, WordBreak secondLast,
+ WordBreak next, int regionalIndicatorRun) {
+ if (isAhLetter(last) && isAhLetter(current)) {
+ return false; // WB5: letter followed by letter
+ }
+ if (isAhLetter(last) && isMidLetter(current) && isAhLetter(next)) {
+ return false; // WB6: letter, then mid-letter punctuation that a letter follows
+ }
+ if (isAhLetter(secondLast) && isMidLetter(last) && isAhLetter(current)) {
+ return false; // WB7: the letter that closes a letter / mid-letter / letter run
+ }
+ if (last == WordBreak.HEBREW_LETTER && current == WordBreak.SINGLE_QUOTE) {
+ return false; // WB7a: Hebrew letter followed by a single quote
+ }
+ if (last == WordBreak.HEBREW_LETTER && current == WordBreak.DOUBLE_QUOTE
+ && next == WordBreak.HEBREW_LETTER) {
+ return false; // WB7b: Hebrew letter, then a double quote that a Hebrew letter follows
+ }
+ if (secondLast == WordBreak.HEBREW_LETTER && last == WordBreak.DOUBLE_QUOTE
+ && current == WordBreak.HEBREW_LETTER) {
+ return false; // WB7c: the Hebrew letter that closes a Hebrew / double quote / Hebrew run
+ }
+ if (last == WordBreak.NUMERIC && current == WordBreak.NUMERIC) {
+ return false; // WB8: digit followed by digit
+ }
+ if (isAhLetter(last) && current == WordBreak.NUMERIC) {
+ return false; // WB9: letter followed by digit
+ }
+ if (last == WordBreak.NUMERIC && isAhLetter(current)) {
+ return false; // WB10: digit followed by letter
+ }
+ if (secondLast == WordBreak.NUMERIC && isMidNumber(last) && current == WordBreak.NUMERIC) {
+ return false; // WB11: the digit that closes a digit / mid-number / digit run
+ }
+ if (last == WordBreak.NUMERIC && isMidNumber(current) && next == WordBreak.NUMERIC) {
+ return false; // WB12: digit, then mid-number punctuation that a digit follows
+ }
+ if (last == WordBreak.KATAKANA && current == WordBreak.KATAKANA) {
+ return false; // WB13: Katakana followed by Katakana
+ }
+ if ((isAhLetter(last) || last == WordBreak.NUMERIC || last == WordBreak.KATAKANA
+ || last == WordBreak.EXTEND_NUM_LET) && current == WordBreak.EXTEND_NUM_LET) {
+ return false; // WB13a: letter, digit, Katakana, or ExtendNumLet followed by ExtendNumLet
+ }
+ if (last == WordBreak.EXTEND_NUM_LET && (isAhLetter(current) || current == WordBreak.NUMERIC
+ || current == WordBreak.KATAKANA)) {
+ return false; // WB13b: ExtendNumLet followed by letter, digit, or Katakana
+ }
+ if (current == WordBreak.REGIONAL_INDICATOR && last == WordBreak.REGIONAL_INDICATOR
+ && (regionalIndicatorRun & 1) == 1) {
+ return false; // WB15 / WB16: an odd run so far means "current" completes a pair
+ }
+ return true; // WB999: otherwise break
+ }
+
+  private static boolean isIgnorable(WordBreak value) { // the classes WB4 absorbs
+    return value == WordBreak.ZWJ || value == WordBreak.FORMAT || value == WordBreak.EXTEND;
+  }
+
+  private static boolean isIgnorable(int ordinal) { // ordinal form of the check, for the hot loop
+    return ordinal == ZWJ_ORDINAL || ordinal == FORMAT_ORDINAL || ordinal == EXTEND_ORDINAL;
+  }
+
+  private static boolean isAhLetter(WordBreak value) { // AHLetter = ALetter | Hebrew_Letter
+    return value == WordBreak.HEBREW_LETTER || value == WordBreak.ALETTER;
+  }
+
+  // MidLetter | MidNumLetQ, where MidNumLetQ = MidNumLet | Single_Quote (the set WB6 and WB7 use).
+  private static boolean isMidLetter(WordBreak value) {
+    return value == WordBreak.SINGLE_QUOTE || value == WordBreak.MID_NUM_LET
+        || value == WordBreak.MID_LETTER;
+  }
+
+  // MidNum | MidNumLetQ, where MidNumLetQ = MidNumLet | Single_Quote (the set WB11 and WB12 use).
+  private static boolean isMidNumber(WordBreak value) {
+    return value == WordBreak.SINGLE_QUOTE || value == WordBreak.MID_NUM_LET
+        || value == WordBreak.MID_NUM;
+  }
+
+  // A minimal growable int buffer: boundaries() appends primitive offsets to a single backing
+  // array rather than boxing every offset the way an ArrayList would.
+  private static final class IntList {
+    private static final int MAX_ARRAY_SIZE = Integer.MAX_VALUE - 8;
+    private int[] buffer = new int[16];
+    private int count;
+
+    // Appends one value, growing the buffer by half its length when it is full.
+    void add(int value) {
+      if (count == buffer.length) {
+        // Grow in long arithmetic so the 1.5x step can never wrap to a negative capacity; the
+        // request is capped at MAX_ARRAY_SIZE instead.
+        final long grown = (long) buffer.length + (buffer.length >> 1);
+        buffer = Arrays.copyOf(buffer, (int) Math.min(grown, MAX_ARRAY_SIZE));
+      }
+      buffer[count++] = value;
+    }
+
+    // Returns a copy holding exactly the values added so far, in insertion order.
+    int[] toArray() {
+      final int[] snapshot = Arrays.copyOf(buffer, count);
+      return snapshot;
+    }
+  }
+}
diff --git a/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/tokenize/uax29/WordToken.java b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/tokenize/uax29/WordToken.java
new file mode 100644
index 000000000..0e07287db
--- /dev/null
+++ b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/tokenize/uax29/WordToken.java
@@ -0,0 +1,39 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.tools.tokenize.uax29;
+
+import opennlp.tools.util.Span;
+
+/**
+ * One typed token emitted by {@link WordTokenizer}. It couples the {@link Span} locating the token
+ * in the source text with the {@link WordType} the token was classified as.
+ *
+ * @param span The token's character range within the source text.
+ * @param type The category assigned to the token.
+ */
+public record WordToken(Span span, WordType type) {
+
+  /**
+   * Extracts the characters of {@code source} that this token's span covers.
+   *
+   * @param source The text this token was produced from.
+   * @return The text covered by {@link #span()}.
+   */
+  public CharSequence text(CharSequence source) {
+    return span().getCoveredText(source);
+  }
+}
diff --git a/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/tokenize/uax29/WordTokenizer.java b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/tokenize/uax29/WordTokenizer.java
new file mode 100644
index 000000000..1f6707b13
--- /dev/null
+++ b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/tokenize/uax29/WordTokenizer.java
@@ -0,0 +1,171 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.tools.tokenize.uax29;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import opennlp.tools.tokenize.Tokenizer;
+import opennlp.tools.util.Span;
+
+/**
+ * A word tokenizer built on the Unicode Text Segmentation algorithm (UAX #29). It finds segments
+ * with {@link WordSegmenter}, keeps the ones that are words (letters, digits, ideographs, kana,
+ * Hangul, Southeast-Asian script, or emoji), drops whitespace and punctuation, and classifies each
+ * kept token with a {@link WordType}. Emoji here means any {@code Extended_Pictographic} code point,
+ * so symbol-like characters such as the copyright, trademark, and double-exclamation signs are kept
+ * (typed {@link WordType#EMOJI}) rather than dropped as punctuation.
+ * <p>
+ * A token longer than {@code maxTokenLength} is emitted as consecutive pieces, never splitting a
+ * surrogate pair. The tokenizer reports offset {@link Span}s, so the original text and its character
+ * offsets are preserved for downstream normalization.
+ * <p>
+ * It implements {@link Tokenizer}: {@link #tokenize(String)} returns the token strings and
+ * {@link #tokenizePos(String)} their offsets. {@link #tokenizeTyped(CharSequence)} additionally
+ * carries each token's {@link WordType}, and {@link #tokenize(CharSequence, TokenHandler)} streams
+ * tokens with no per-token allocation. Instances are immutable and thread-safe.
+ */
+// Implements Tokenizer directly rather than extending AbstractTokenizer: this tokenizer produces
+// its spans from the UAX #29 segmenter in one pass and shares none of AbstractTokenizer's
+// per-character probability/merge machinery, so subclassing it would only add unused state.
+public final class WordTokenizer implements Tokenizer {
+
+  /** Receives each word token as a character range and its type, with no allocation. */
+  @FunctionalInterface
+  public interface TokenHandler {
+    /**
+     * Accepts one word token.
+     *
+     * @param start The inclusive start character offset.
+     * @param end The exclusive end character offset.
+     * @param type The token category.
+     */
+    void token(int start, int end, WordType type);
+  }
+
+  /** The default maximum token length. */
+  public static final int DEFAULT_MAX_TOKEN_LENGTH = 255;
+
+  private final int maxTokenLength;
+
+  /**
+   * Creates a tokenizer with the {@linkplain #DEFAULT_MAX_TOKEN_LENGTH default} maximum token
+   * length.
+   */
+  public WordTokenizer() {
+    this(DEFAULT_MAX_TOKEN_LENGTH);
+  }
+
+  /**
+   * Creates a tokenizer with the given maximum token length.
+   *
+   * @param maxTokenLength The maximum number of characters in a token; longer tokens are chopped
+   *                       into consecutive pieces. Must be at least {@code 1}.
+   * @throws IllegalArgumentException if {@code maxTokenLength} is less than {@code 1}.
+   */
+  public WordTokenizer(int maxTokenLength) {
+    if (maxTokenLength < 1) {
+      throw new IllegalArgumentException("maxTokenLength must be at least 1, got " + maxTokenLength);
+    }
+    this.maxTokenLength = maxTokenLength;
+  }
+
+  /**
+   * Streams the word tokens of {@code text} to {@code handler} in order, allocating nothing.
+   *
+   * @param text The text to tokenize.
+   * @param handler The receiver of the tokens.
+   */
+  public void tokenize(CharSequence text, TokenHandler handler) {
+    WordSegmenter.forEachSegment(text, (start, end) -> {
+      final WordType type = WordType.of(text, start, end);
+      if (type != null) {
+        emit(text, start, end, type, handler);
+      }
+    });
+  }
+
+  /**
+   * Returns the word tokens of {@code s} as strings, in order.
+   *
+   * @param s The text to tokenize.
+   * @return The token strings.
+   */
+  @Override
+  public String[] tokenize(String s) {
+    final List<String> tokens = new ArrayList<>();
+    tokenize(s, (start, end, type) -> tokens.add(s.substring(start, end)));
+    return tokens.toArray(new String[0]);
+  }
+
+  /**
+   * Returns the offset spans of the word tokens of {@code s}, in order.
+   *
+   * @param s The text to tokenize.
+   * @return The token spans.
+   */
+  @Override
+  public Span[] tokenizePos(String s) {
+    final List<Span> spans = tokenizeSpans(s);
+    return spans.toArray(new Span[0]);
+  }
+
+  /**
+   * Returns the offset spans of the word tokens in {@code text}, in order.
+   *
+   * @param text The text to tokenize.
+   * @return The word-token spans.
+   */
+  public List<Span> tokenizeSpans(CharSequence text) {
+    final List<Span> spans = new ArrayList<>();
+    tokenize(text, (start, end, type) -> spans.add(new Span(start, end)));
+    return spans;
+  }
+
+  /**
+   * Returns the word tokens in {@code text}, each carrying its {@link WordType}, in order.
+   *
+   * @param text The text to tokenize.
+   * @return The typed word tokens.
+   */
+  public List<WordToken> tokenizeTyped(CharSequence text) {
+    final List<WordToken> tokens = new ArrayList<>();
+    tokenize(text, (start, end, type) -> tokens.add(new WordToken(new Span(start, end), type)));
+    return tokens;
+  }
+
+  // Emits [start, end) as one or more tokens no longer than maxTokenLength, never splitting a
+  // surrogate pair. The whole word is classified once and every piece carries that type.
+  private void emit(CharSequence text, int start, int end, WordType type, TokenHandler handler) {
+    int from = start;
+    while (end - from > maxTokenLength) {
+      int cut = from + maxTokenLength;
+      if (Character.isHighSurrogate(text.charAt(cut - 1))) {
+        cut--; // keep the surrogate pair together
+      }
+      if (cut <= from) {
+        // maxTokenLength is shorter than the leading code point; emit it whole rather than stall.
+        cut = from + Character.charCount(Character.codePointAt(text, from));
+      }
+      handler.token(from, cut, type);
+      from = cut;
+    }
+    if (from < end) {
+      handler.token(from, end, type);
+    }
+  }
+}
diff --git a/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/tokenize/uax29/WordType.java b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/tokenize/uax29/WordType.java
new file mode 100644
index 000000000..4dcd7cc27
--- /dev/null
+++ b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/tokenize/uax29/WordType.java
@@ -0,0 +1,148 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.tools.tokenize.uax29;
+
+/**
+ * Classifies a {@linkplain WordTokenizer word token}. Letter and digit words fall under
+ * {@link #ALPHANUMERIC} and {@link #NUMERIC}; every other category singles out a script, or emoji,
+ * for which script-specific handling is useful. Where the token boundaries fall is decided by the
+ * Unicode release shipped with {@link WordSegmenter}.
+ */
+public enum WordType {
+
+  /** A token with at least one letter, possibly mixed with digits and connectors. */
+  ALPHANUMERIC,
+
+  /** A token consisting only of digits and numeric connectors. */
+  NUMERIC,
+
+  /** A token containing a Han ideograph (UAX #29 segmentation yields one ideograph per token). */
+  IDEOGRAPHIC,
+
+  /** A token in Hiragana. */
+  HIRAGANA,
+
+  /** A token in Katakana. */
+  KATAKANA,
+
+  /** A token in Hangul. */
+  HANGUL,
+
+  /** A token in a Southeast Asian script that needs dictionary segmentation (Thai, Lao, ...). */
+  SOUTHEAST_ASIAN,
+
+  /** An emoji, an emoji sequence, or a regional-indicator flag. */
+  EMOJI;
+
+  private static final int REGIONAL_INDICATOR_FIRST = 0x1F1E6;
+  private static final int REGIONAL_INDICATOR_LAST = 0x1F1FF;
+
+  // Thai starts at U+0E00 and is the lowest script-specific category, so nothing below this value
+  // needs the relatively costly script look-up; Latin, Greek, Cyrillic, and ASCII text skip it.
+  private static final int LOWEST_SCRIPT_CODE_POINT = 0x0E00;
+
+  // Per-ASCII-code-point kind: 0 = neither, 1 = letter, 2 = digit. ASCII holds no pictographic or
+  // script-specific code point, so ASCII input bypasses those tests as well as the
+  // Character.isLetter / isDigit general-category look-ups.
+  private static final byte[] ASCII_KIND = buildAsciiKind();
+
+  private static byte[] buildAsciiKind() {
+    final byte[] table = new byte[0x80];
+    for (int c = 0; c < table.length; c++) {
+      final boolean upper = c >= 'A' && c <= 'Z';
+      final boolean lower = c >= 'a' && c <= 'z';
+      if (upper || lower) {
+        table[c] = 1;
+      } else if (c >= '0' && c <= '9') {
+        table[c] = 2;
+      }
+    }
+    return table;
+  }
+
+  // Returns the word-token type for text over [start, end), or null when the range is not a word
+  // (only whitespace, punctuation, or symbols). An emoji takes precedence over a script, and a
+  // script over the generic alphanumeric/numeric split.
+  static WordType of(CharSequence text, int start, int end) {
+    boolean sawLetter = false;
+    boolean sawDigit = false;
+    WordType scriptCategory = null;
+    int index = start;
+    while (index < end) {
+      final int cp = Character.codePointAt(text, index);
+      index += Character.charCount(cp);
+      if (cp < 0x80) {
+        // ASCII fast path: the table answers letter/digit without any Unicode property look-up.
+        final byte asciiKind = ASCII_KIND[cp];
+        sawLetter |= asciiKind == 1;
+        sawDigit |= asciiKind == 2;
+        continue;
+      }
+      // An emoji anywhere in the range settles the type at once.
+      if (ExtendedPictographic.is(cp) || isRegionalIndicator(cp)) {
+        return EMOJI;
+      }
+      // Keep the first script-specific category found; code points below
+      // LOWEST_SCRIPT_CODE_POINT never have one, so they skip the look-up.
+      if (scriptCategory == null && cp >= LOWEST_SCRIPT_CODE_POINT) {
+        scriptCategory = scriptType(cp);
+      }
+      if (Character.isLetter(cp)) {
+        sawLetter = true;
+      } else if (Character.isDigit(cp)) {
+        sawDigit = true;
+      }
+    }
+    // Precedence after the scan: script, then letters, then digits; anything else is not a word.
+    if (scriptCategory != null) {
+      return scriptCategory;
+    }
+    if (sawLetter) {
+      return ALPHANUMERIC;
+    }
+    return sawDigit ? NUMERIC : null;
+  }
+
+  private static boolean isRegionalIndicator(int codePoint) {
+    return REGIONAL_INDICATOR_FIRST <= codePoint && codePoint <= REGIONAL_INDICATOR_LAST;
+  }
+
+  // Translates the Unicode script of a code point into the matching script-specific token type.
+  // Scripts without a dedicated category (Latin, Greek, ...) yield null, and the caller then falls
+  // back to the generic alphanumeric/numeric classification.
+  private static WordType scriptType(int codePoint) {
+    final Character.UnicodeScript unicodeScript = Character.UnicodeScript.of(codePoint);
+    return switch (unicodeScript) {
+      // Han, the kana, and Hangul each map to a category of their own; the case labels are
+      // UnicodeScript constants and the results are WordType constants.
+      case HAN -> IDEOGRAPHIC;
+      case HIRAGANA -> HIRAGANA;
+      case KATAKANA -> KATAKANA;
+      case HANGUL -> HANGUL;
+      // The Southeast Asian scripts share a single category.
+      case THAI,
+           LAO,
+           MYANMAR,
+           KHMER,
+           TAI_LE,
+           NEW_TAI_LUE,
+           TAI_VIET -> SOUTHEAST_ASIAN;
+      // Every other script has no script-specific category.
+      default -> null;
+    };
+  }
+}
diff --git a/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/Dimension.java b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/Dimension.java
index 6ad068471..7caece7a0 100644
--- a/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/Dimension.java
+++ b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/Dimension.java
@@ -19,13 +19,13 @@
import java.util.function.Supplier;
/**
- * A layer of the {@code Term} normalization stack, in increasing order of aggressiveness. A
- * {@code TermAnalyzer} applies a configured prefix of these to each token; the declaration order is
+ * A layer of the {@link Term} normalization stack, in increasing order of aggressiveness. A
+ * {@link TermAnalyzer} applies a configured prefix of these to each token; the declaration order is
* the canonical pipeline order, because the transforms do not commute (case folding then accent
* folding differs from the reverse for Turkish dotted/dotless i and the German eszett).
*
* This enum is the single definition of the character-level steps: each one carries its default
- * {@link CharSequenceNormalizer}, which both {@code TermAnalyzer} and {@link TextNormalizer} read
+ * {@link CharSequenceNormalizer}, which both {@link TermAnalyzer} and {@link TextNormalizer} read
* from rather than re-listing. The default is resolved lazily, so loading this enum does not eagerly
* initialize heavy data such as the confusables table.
*
diff --git a/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/NormalizationProfile.java b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/NormalizationProfile.java
new file mode 100644
index 000000000..e45d4231d
--- /dev/null
+++ b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/NormalizationProfile.java
@@ -0,0 +1,66 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.tools.util.normalizer;
+
+import opennlp.tools.stemmer.Stemmer;
+import opennlp.tools.stemmer.snowball.SnowballStemmer;
+
+/**
+ * Per-language normalization settings, mirroring how OpenNLP already selects a Snowball stemmer by
+ * language. A profile pairs a language with its Snowball {@link SnowballStemmer.ALGORITHM} and the
+ * diacritic fold appropriate for that language (if any).
+ * <p>
+ * The {@code accentFold} normalizer is the language's diacritic transform for a search form, or
+ * {@code null} when folding is not appropriate. It is the generic
+ * {@link AccentFoldCharSequenceNormalizer} for English and the major Romance languages (where
+ * accented letters are search variants of their base letter), the German-specific
+ * {@link GermanUmlautCharSequenceNormalizer} (a-umlaut to {@code ae}, eszett to {@code ss}, ...) for
+ * German, and {@code null} where diacritics mark distinct letters (the Nordic languages and the
+ * non-Latin scripts), because folding there is language-wrong. This is a search-recall choice, not a
+ * statement of linguistic correctness; callers can build a {@link TermAnalyzer} directly to
+ * override it.
+ *
+ * @param language The language, as an ISO 639-3 code (for example {@code "eng"}).
+ * @param stemmerAlgorithm The Snowball algorithm for the language.
+ * @param accentFold The diacritic fold for the language, or {@code null} for none.
+ */
+public record NormalizationProfile(String language, SnowballStemmer.ALGORITHM stemmerAlgorithm,
+                                   CharSequenceNormalizer accentFold) {
+
+  /**
+   * {@return a new {@link Stemmer} for this language} A fresh instance is returned on each call
+   * because the Snowball stemmers are stateful and not thread-safe.
+   */
+  public Stemmer newStemmer() {
+    return new SnowballStemmer(stemmerAlgorithm);
+  }
+
+  /**
+   * Returns a search-oriented analyzer for this language: NFC, case folding, the language's
+   * {@linkplain #accentFold() diacritic fold} when it has one, then stemming. Each call builds an
+   * independent analyzer with its own stemmer, so use one per thread when stemming.
+   *
+   * @return the analyzer.
+   */
+  public TermAnalyzer searchAnalyzer() {
+    final TermAnalyzer.Builder builder = TermAnalyzer.builder().nfc().caseFold();
+    if (accentFold != null) {
+      builder.transform(Dimension.ACCENT_FOLD, accentFold);
+    }
+    return builder.stem(newStemmer()).build();
+  }
+}
diff --git a/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/NormalizationProfiles.java b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/NormalizationProfiles.java
new file mode 100644
index 000000000..c39abee43
--- /dev/null
+++ b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/NormalizationProfiles.java
@@ -0,0 +1,122 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.tools.util.normalizer;
+
+import java.util.HashMap;
+import java.util.Locale;
+import java.util.Map;
+import java.util.MissingResourceException;
+import java.util.Objects;
+import java.util.Optional;
+import java.util.Set;
+
+import opennlp.tools.langdetect.LanguageDetector;
+import opennlp.tools.stemmer.snowball.SnowballStemmer;
+
+/**
+ * A registry of {@link NormalizationProfile}s by language, with detection-based fallback. This is
+ * the language dispatch the design note calls for: pick the profile for a requested language, or
+ * detect the language with a {@link LanguageDetector} when it is unspecified. The covered languages
+ * are exactly those with a Snowball stemmer.
+ * <p>
+ * Profiles are keyed by ISO 639-3 code (what {@link LanguageDetector} produces);
+ * {@link #forLanguage(String)} also accepts ISO 639-1 two-letter codes.
+ */
+public final class NormalizationProfiles {
+
+  private static final Map<String, NormalizationProfile> BY_LANGUAGE = build();
+
+  private NormalizationProfiles() {
+  }
+
+  private static Map<String, NormalizationProfile> build() {
+    final Map<String, NormalizationProfile> map = new HashMap<>();
+    // The generic accent fold is used for English and the major Romance languages, German uses its
+    // own ae/oe/ue/ss fold, and folding is disabled elsewhere (Nordic, non-Latin) where diacritics
+    // mark distinct letters.
+    final CharSequenceNormalizer latin = AccentFoldCharSequenceNormalizer.getInstance();
+    final CharSequenceNormalizer german = GermanUmlautCharSequenceNormalizer.getInstance();
+    add(map, "ara", SnowballStemmer.ALGORITHM.ARABIC, null);
+    add(map, "cat", SnowballStemmer.ALGORITHM.CATALAN, latin);
+    add(map, "dan", SnowballStemmer.ALGORITHM.DANISH, null);
+    add(map, "deu", SnowballStemmer.ALGORITHM.GERMAN, german);
+    add(map, "ell", SnowballStemmer.ALGORITHM.GREEK, null);
+    add(map, "eng", SnowballStemmer.ALGORITHM.ENGLISH, latin);
+    add(map, "fin", SnowballStemmer.ALGORITHM.FINNISH, null);
+    add(map, "fra", SnowballStemmer.ALGORITHM.FRENCH, latin);
+    add(map, "gle", SnowballStemmer.ALGORITHM.IRISH, null);
+    add(map, "hun", SnowballStemmer.ALGORITHM.HUNGARIAN, null);
+    add(map, "ind", SnowballStemmer.ALGORITHM.INDONESIAN, null);
+    add(map, "ita", SnowballStemmer.ALGORITHM.ITALIAN, latin);
+    add(map, "nld", SnowballStemmer.ALGORITHM.DUTCH, null);
+    add(map, "nor", SnowballStemmer.ALGORITHM.NORWEGIAN, null);
+    add(map, "por", SnowballStemmer.ALGORITHM.PORTUGUESE, latin);
+    add(map, "ron", SnowballStemmer.ALGORITHM.ROMANIAN, null);
+    add(map, "rus", SnowballStemmer.ALGORITHM.RUSSIAN, null);
+    add(map, "spa", SnowballStemmer.ALGORITHM.SPANISH, latin);
+    add(map, "swe", SnowballStemmer.ALGORITHM.SWEDISH, null);
+    return Map.copyOf(map);
+  }
+
+  private static void add(Map<String, NormalizationProfile> map, String language,
+                          SnowballStemmer.ALGORITHM algorithm, CharSequenceNormalizer accentFold) {
+    map.put(language, new NormalizationProfile(language, algorithm, accentFold));
+  }
+
+  /**
+   * Returns the profile for a language.
+   *
+   * @param language An ISO 639-3 or ISO 639-1 language code; case-insensitive.
+   * @return The profile, or empty if the language has no Snowball stemmer.
+   */
+  public static Optional<NormalizationProfile> forLanguage(String language) {
+    Objects.requireNonNull(language, "language");
+    String code = language.strip().toLowerCase(Locale.ROOT);
+    if (code.length() == 2) {
+      try {
+        final String iso3 = Locale.of(code).getISO3Language();
+        if (!iso3.isEmpty()) {
+          code = iso3;
+        }
+      } catch (MissingResourceException ignored) {
+        // No ISO 639-3 code for this two-letter code; fall through and look up as given.
+      }
+    }
+    return Optional.ofNullable(BY_LANGUAGE.get(code));
+  }
+
+  /**
+   * Detects the language of {@code text} and returns its profile.
+   *
+   * @param text The text to detect.
+   * @param detector The language detector to use.
+   * @return The profile for the detected language, or empty if it has no Snowball stemmer.
+   */
+  public static Optional<NormalizationProfile> detect(CharSequence text,
+                                                      LanguageDetector detector) {
+    Objects.requireNonNull(text, "text");
+    Objects.requireNonNull(detector, "detector");
+    return forLanguage(detector.predictLanguage(text).getLang());
+  }
+
+  /**
+   * {@return the ISO 639-3 codes of the supported languages}
+   */
+  public static Set<String> supportedLanguages() {
+    return BY_LANGUAGE.keySet();
+  }
+}
diff --git a/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/Term.java b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/Term.java
new file mode 100644
index 000000000..eda3c4107
--- /dev/null
+++ b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/Term.java
@@ -0,0 +1,120 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.tools.util.normalizer;
+
+import java.util.EnumMap;
+import java.util.List;
+
+import opennlp.tools.util.Span;
+
+/**
+ * One token as a stack of normalization layers. The {@link #original()} form is the canonical
+ * source of truth; the other layers are derived, increasingly aggressive {@link Dimension}s tuned
+ * for matching and search. The dimensions configured on the producing {@link TermAnalyzer} are
+ * computed eagerly and cached; any other dimension is computed on first request, applied on top of
+ * the {@link #normalized() configured form}, and then cached.
+ * <p>
+ * Because the original is always retained, aggressive folding is safe: a match on a derived layer
+ * can always be reported in original coordinates through {@link #span()}. Querying a configured
+ * layer, or {@link #peel() peeling} the last-applied one, is O(1); adding an unconfigured dimension
+ * costs one transform on first touch and is O(1) thereafter.
+ * <p>
+ * Instances are created by {@link TermAnalyzer} and are not thread-safe (the lazy cache is
+ * mutated on first access of an unconfigured dimension).
+ */
+public final class Term {
+
+  private final TermAnalyzer analyzer;
+  private final Span span;
+  private final String posTag;
+  private final EnumMap<Dimension, String> layers = new EnumMap<>(Dimension.class);
+
+  Term(TermAnalyzer analyzer, String original, Span span, String posTag) {
+    this.analyzer = analyzer;
+    this.span = span;
+    this.posTag = posTag;
+    String value = original;
+    layers.put(Dimension.ORIGINAL, value);
+    for (final Dimension dimension : analyzer.dimensions()) {
+      value = analyzer.apply(dimension, value, posTag);
+      layers.put(dimension, value);
+    }
+  }
+
+  /**
+   * {@return the source span of this token, or {@code null} if it was supplied as a pre-tokenized
+   * string} The span indexes into the text passed to {@link TermAnalyzer#analyze(CharSequence)}.
+   */
+  public Span span() {
+    return span;
+  }
+
+  /**
+   * {@return the original token text}
+   */
+  public String original() {
+    return layers.get(Dimension.ORIGINAL);
+  }
+
+  /**
+   * {@return the token at the analyzer's final configured dimension} Equal to {@link #original()}
+   * when no dimensions were configured.
+   */
+  public String normalized() {
+    return at(analyzer.finalDimension());
+  }
+
+  /**
+   * Returns the token at {@code dimension}. Configured dimensions are cached; an unconfigured
+   * dimension is computed by applying its transform to {@link #normalized()} and then cached.
+   * <p>
+   * Note: an unconfigured dimension is applied on top of {@link #normalized()} (the most
+   * aggressive configured layer), not spliced into canonical pipeline order. Because the transforms
+   * do not commute (see {@link Dimension}), requesting a dimension that ranks earlier than
+   * the configured ones can differ from having configured it. For example, asking for
+   * {@link Dimension#CASE_FOLD} on an analyzer configured only through {@link Dimension#ACCENT_FOLD}
+   * case-folds the already accent-folded text, which is not the same as case-folding first.
+   * Configure the dimension on the analyzer when canonical order matters.
+   *
+   * @param dimension The dimension to project to.
+   * @return The token at that dimension.
+   * @throws IllegalStateException if the dimension needs an engine or tag that was not configured
+   *                               (see {@link Dimension#STEM} and {@link Dimension#LEMMA}).
+   */
+  public String at(Dimension dimension) {
+    final String cached = layers.get(dimension);
+    if (cached != null) {
+      return cached;
+    }
+    final String value = analyzer.apply(dimension, normalized(), posTag);
+    layers.put(dimension, value);
+    return value;
+  }
+
+  /**
+   * {@return the token at the dimension just below the final configured one} This is the
+   * last-applied layer removed (for example the form before stemming when {@link Dimension#STEM}
+   * is the final dimension); equal to {@link #original()} when at most one dimension is configured.
+   */
+  public String peel() {
+    final List<Dimension> dimensions = analyzer.dimensions();
+    if (dimensions.size() < 2) {
+      return original();
+    }
+    return at(dimensions.get(dimensions.size() - 2));
+  }
+}
diff --git a/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/TermAnalyzer.java b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/TermAnalyzer.java
new file mode 100644
index 000000000..0d9956e8e
--- /dev/null
+++ b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/TermAnalyzer.java
@@ -0,0 +1,368 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.tools.util.normalizer;
+
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.EnumMap;
+import java.util.EnumSet;
+import java.util.List;
+import java.util.Locale;
+import java.util.Objects;
+import java.util.Set;
+
+import opennlp.tools.lemmatizer.Lemmatizer;
+import opennlp.tools.stemmer.Stemmer;
+import opennlp.tools.tokenize.uax29.WordTokenizer;
+import opennlp.tools.util.Span;
+
+/**
+ * Builds {@link Term}s by segmenting text and applying a configured stack of normalization
+ * {@link Dimension}s to each token. The analyzer is the configuration; each {@link Term} is the
+ * layered result for one token, with the configured dimensions computed eagerly and any other
+ * dimension computed lazily on first request.
+ *
+ * <p>Segmentation uses the Unicode {@linkplain WordTokenizer UAX #29 word tokenizer}, so the
+ * input does not need to be pre-tokenized. The character-level dimensions ({@link Dimension#NFC}
+ * through {@link Dimension#ACCENT_FOLD}) have built-in defaults; {@link Dimension#STEM} and
+ * {@link Dimension#LEMMA} are enabled by supplying a {@link Stemmer} or {@link Lemmatizer}.
+ *
+ * <p>An instance is immutable and is thread-safe when its configured transforms are. The built-in
+ * character normalizers are stateless, but the Snowball stemmers are not, so an analyzer configured
+ * with a {@link Stemmer} (for example through {@link NormalizationProfile#searchAnalyzer()}) should
+ * not be shared across threads when {@link Dimension#STEM} is used. Build one with
+ * {@link #builder()}.
+ */
+public final class TermAnalyzer {
+
+  private final List<Dimension> chain;
+  private final Dimension finalDimension;
+  private final EnumMap<Dimension, CharSequenceNormalizer> transforms;
+  private final Stemmer stemmer;
+  private final Lemmatizer lemmatizer;
+  private final WordTokenizer tokenizer;
+
+  private TermAnalyzer(Builder builder) {
+    final List<Dimension> ordered = new ArrayList<>(builder.chain);
+    Collections.sort(ordered); // canonical pipeline order (enum declaration order)
+    this.chain = List.copyOf(ordered);
+    this.finalDimension = ordered.isEmpty() ? Dimension.ORIGINAL : ordered.get(ordered.size() - 1);
+    // Only the per-analyzer overrides from the builder; the defaults live on Dimension itself.
+    this.transforms = new EnumMap<>(builder.transforms);
+    this.stemmer = builder.stemmer;
+    this.lemmatizer = builder.lemmatizer;
+    this.tokenizer = builder.tokenizer;
+  }
+
+  /**
+   * {@return a new builder}
+   */
+  public static Builder builder() {
+    return new Builder();
+  }
+
+  /**
+   * Segments {@code text} with the UAX #29 word tokenizer and returns one {@link Term} per
+   * word token, in order. The terms carry no part-of-speech tag, so {@link Dimension#LEMMA} is not
+   * available from them.
+   *
+   * @param text The text to analyze.
+   * @return The terms.
+   */
+  public List<Term> analyze(CharSequence text) {
+    final List<Span> spans = tokenizer.tokenizeSpans(text);
+    final List<Term> terms = new ArrayList<>(spans.size());
+    for (final Span span : spans) {
+      terms.add(new Term(this, span.getCoveredText(text).toString(), span, null));
+    }
+    return terms;
+  }
+
+  /**
+   * Returns one {@link Term} per supplied token, attaching the matching part-of-speech tag so that
+   * {@link Dimension#LEMMA} can be computed. The terms have no source span.
+   *
+   * @param tokens The tokens.
+   * @param tags The part-of-speech tag for each token; must be the same length as {@code tokens}.
+   * @return The terms.
+   * @throws IllegalArgumentException if {@code tokens} and {@code tags} differ in length.
+   */
+  public List<Term> analyze(String[] tokens, String[] tags) {
+    if (tokens.length != tags.length) {
+      throw new IllegalArgumentException(
+          "tokens and tags must be the same length, got " + tokens.length + " and " + tags.length);
+    }
+    final List<Term> terms = new ArrayList<>(tokens.length);
+    for (int i = 0; i < tokens.length; i++) {
+      terms.add(new Term(this, tokens[i], null, tags[i]));
+    }
+    return terms;
+  }
+
+  /**
+   * {@return the configured dimensions that are computed eagerly, in canonical order} The list
+   * never includes {@link Dimension#ORIGINAL}, which is always present.
+   */
+  public List<Dimension> dimensions() {
+    return chain;
+  }
+
+  Dimension finalDimension() {
+    return finalDimension;
+  }
+
+  // Applies one dimension's transform to a single token value. Fails loudly when a token-level
+  // dimension was requested without the engine (or tag) it needs.
+  String apply(Dimension dimension, String input, String posTag) {
+    switch (dimension) {
+      case ORIGINAL:
+        return input;
+      case STEM:
+        if (stemmer == null) {
+          throw new IllegalStateException(
+              "Dimension STEM requires a Stemmer; configure it with builder().stem(...)");
+        }
+        return stemmer.stem(input).toString();
+      case LEMMA:
+        if (lemmatizer == null) {
+          throw new IllegalStateException(
+              "Dimension LEMMA requires a Lemmatizer; configure it with builder().lemmatize(...)");
+        }
+        if (posTag == null) {
+          throw new IllegalStateException(
+              "Dimension LEMMA requires a part-of-speech tag; use analyze(tokens, tags)");
+        }
+        return lemmatizer.lemmatize(new String[] {input}, new String[] {posTag})[0];
+      default:
+        // A builder override wins; otherwise the dimension's own default normalizer.
+        final CharSequenceNormalizer normalizer = transforms.containsKey(dimension)
+            ? transforms.get(dimension) : dimension.defaultNormalizer();
+        if (normalizer == null) {
+          throw new IllegalStateException("Dimension " + dimension + " has no default normalizer; "
+              + "configure it with builder().transform(" + dimension + ", ...)");
+        }
+        return normalizer.normalize(input).toString();
+    }
+  }
+
+  /** A builder for {@link TermAnalyzer}. */
+  public static final class Builder {
+
+    private final EnumSet<Dimension> chain = EnumSet.noneOf(Dimension.class);
+    private final EnumMap<Dimension, CharSequenceNormalizer> transforms =
+        new EnumMap<>(Dimension.class);
+    private Stemmer stemmer;
+    private Lemmatizer lemmatizer;
+    private WordTokenizer tokenizer = new WordTokenizer();
+
+    private Builder() {
+    }
+
+    /**
+     * Enables {@link Dimension#NFC}.
+     *
+     * @return this builder
+     */
+    public Builder nfc() {
+      chain.add(Dimension.NFC);
+      return this;
+    }
+
+    /**
+     * Enables {@link Dimension#NFKC}.
+     *
+     * @return this builder
+     */
+    public Builder nfkc() {
+      chain.add(Dimension.NFKC);
+      return this;
+    }
+
+    /**
+     * Enables {@link Dimension#WHITESPACE}.
+     *
+     * @return this builder
+     */
+    public Builder whitespace() {
+      chain.add(Dimension.WHITESPACE);
+      return this;
+    }
+
+    /**
+     * Enables {@link Dimension#WHITESPACE} with a specific normalizer, choosing the fold target and
+     * behavior. For a custom class and target use a {@link CharClass} method reference, for example
+     * {@code whitespace(CharClass.of(members, replacement)::collapse)}.
+     *
+     * @param normalizer The whitespace normalizer to use.
+     * @return this builder
+     */
+    public Builder whitespace(CharSequenceNormalizer normalizer) {
+      return transform(Dimension.WHITESPACE, normalizer);
+    }
+
+    /**
+     * Enables {@link Dimension#DASH}.
+     *
+     * @return this builder
+     */
+    public Builder dashes() {
+      chain.add(Dimension.DASH);
+      return this;
+    }
+
+    /**
+     * Enables {@link Dimension#DASH} with a specific normalizer (a custom dash set or target).
+     *
+     * @param normalizer The dash normalizer to use.
+     * @return this builder
+     */
+    public Builder dashes(CharSequenceNormalizer normalizer) {
+      return transform(Dimension.DASH, normalizer);
+    }
+
+    /**
+     * Enables {@link Dimension#CASE_FOLD}.
+     *
+     * @return this builder
+     */
+    public Builder caseFold() {
+      chain.add(Dimension.CASE_FOLD);
+      return this;
+    }
+
+    /**
+     * Enables {@link Dimension#CASE_FOLD} using the given locale's case rules (for example Turkish
+     * dotted/dotless i), instead of the default {@link Locale#ROOT}.
+     *
+     * @param locale The locale whose case rules to apply.
+     * @return this builder
+     */
+    public Builder caseFold(Locale locale) {
+      Objects.requireNonNull(locale, "locale");
+      return transform(Dimension.CASE_FOLD, CaseFoldCharSequenceNormalizer.getInstance(locale));
+    }
+
+    /**
+     * Enables {@link Dimension#ACCENT_FOLD}.
+     *
+     * @return this builder
+     */
+    public Builder accentFold() {
+      chain.add(Dimension.ACCENT_FOLD);
+      return this;
+    }
+
+    /**
+     * Enables {@link Dimension#ACCENT_FOLD} restricted to a specific set of scripts, instead of the
+     * default Latin/Greek/Cyrillic.
+     *
+     * @param foldScripts The scripts whose diacritics to fold.
+     * @param foldStrokeLetters Whether to also fold stroke letters such as o-slash and l-stroke.
+     * @return this builder
+     */
+    public Builder accentFold(Set foldScripts, boolean foldStrokeLetters) {
+      return transform(Dimension.ACCENT_FOLD,
+          new AccentFoldCharSequenceNormalizer(foldScripts, foldStrokeLetters));
+    }
+
+    /**
+     * Enables {@link Dimension#CONFUSABLE_FOLD}.
+     *
+     * @return this builder
+     */
+    public Builder confusableFold() {
+      chain.add(Dimension.CONFUSABLE_FOLD);
+      return this;
+    }
+
+    /**
+     * Enables a character-level dimension with a specific normalizer, overriding its default (for
+     * example a locale-specific case fold for a language profile).
+     *
+     * @param dimension The character-level dimension to enable.
+     * @param normalizer The normalizer to use for it.
+     * @return this builder
+     * @throws IllegalArgumentException if {@code dimension} is {@link Dimension#ORIGINAL},
+     *     {@link Dimension#STEM}, or {@link Dimension#LEMMA}.
+     */
+    public Builder transform(Dimension dimension, CharSequenceNormalizer normalizer) {
+      if (dimension == Dimension.ORIGINAL || dimension == Dimension.STEM
+          || dimension == Dimension.LEMMA) {
+        throw new IllegalArgumentException(
+            "transform(...) only applies to character-level dimensions, not " + dimension);
+      }
+      transforms.put(dimension, Objects.requireNonNull(normalizer, "normalizer"));
+      chain.add(dimension);
+      return this;
+    }
+
+    /**
+     * Enables {@link Dimension#STEM} through the given stemmer.
+     *
+     * @param value The stemmer.
+     * @return this builder
+     */
+    public Builder stem(Stemmer value) {
+      this.stemmer = Objects.requireNonNull(value, "stemmer");
+      chain.add(Dimension.STEM);
+      return this;
+    }
+
+    /**
+     * Enables {@link Dimension#LEMMA} through the given lemmatizer.
+     *
+     * @param value The lemmatizer.
+     * @return this builder
+     */
+    public Builder lemmatize(Lemmatizer value) {
+      this.lemmatizer = Objects.requireNonNull(value, "lemmatizer");
+      chain.add(Dimension.LEMMA);
+      return this;
+    }
+
+    /**
+     * Sets the tokenizer used by {@link TermAnalyzer#analyze(CharSequence)}.
+     *
+     * @param value The tokenizer.
+     * @return this builder
+     */
+    public Builder tokenizer(WordTokenizer value) {
+      this.tokenizer = Objects.requireNonNull(value, "tokenizer");
+      return this;
+    }
+
+    /**
+     * Sets the maximum token length of the tokenizer used by
+     * {@link TermAnalyzer#analyze(CharSequence)}. Convenience for
+     * {@code tokenizer(new WordTokenizer(maxTokenLength))}.
+     *
+     * @param maxTokenLength The maximum number of characters in a token.
+     * @return this builder
+     */
+    public Builder maxTokenLength(int maxTokenLength) {
+      this.tokenizer = new WordTokenizer(maxTokenLength);
+      return this;
+    }
+
+    /**
+     * {@return a new {@link TermAnalyzer} with this configuration}
+     */
+    public TermAnalyzer build() {
+      return new TermAnalyzer(this);
+    }
+  }
+}
diff --git a/opennlp-core/opennlp-runtime/src/main/resources/opennlp/tools/tokenize/uax29/ExtendedPictographic.txt b/opennlp-core/opennlp-runtime/src/main/resources/opennlp/tools/tokenize/uax29/ExtendedPictographic.txt
new file mode 100644
index 000000000..fcb66f495
--- /dev/null
+++ b/opennlp-core/opennlp-runtime/src/main/resources/opennlp/tools/tokenize/uax29/ExtendedPictographic.txt
@@ -0,0 +1,461 @@
+# emoji-data.txt
+# Date: 2025-07-25, 17:54:31 GMT
+# © 2025 Unicode®, Inc.
+# Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries.
+# For terms of use and license, see https://www.unicode.org/terms_of_use.html
+#
+# Emoji Data for UTS #51
+# Version: 17.0
+#
+# For documentation and usage, see https://www.unicode.org/reports/tr51
+00A9 ; Extended_Pictographic# E0.6 [1] (©️) copyright
+00AE ; Extended_Pictographic# E0.6 [1] (®️) registered
+203C ; Extended_Pictographic# E0.6 [1] (‼️) double exclamation mark
+2049 ; Extended_Pictographic# E0.6 [1] (⁉️) exclamation question mark
+2122 ; Extended_Pictographic# E0.6 [1] (™️) trade mark
+2139 ; Extended_Pictographic# E0.6 [1] (ℹ️) information
+2194..2199 ; Extended_Pictographic# E0.6 [6] (↔️..↙️) left-right arrow..down-left arrow
+21A9..21AA ; Extended_Pictographic# E0.6 [2] (↩️..↪️) right arrow curving left..left arrow curving right
+231A..231B ; Extended_Pictographic# E0.6 [2] (⌚..⌛) watch..hourglass done
+2328 ; Extended_Pictographic# E1.0 [1] (⌨️) keyboard
+23CF ; Extended_Pictographic# E1.0 [1] (⏏️) eject button
+23E9..23EC ; Extended_Pictographic# E0.6 [4] (⏩..⏬) fast-forward button..fast down button
+23ED..23EE ; Extended_Pictographic# E0.7 [2] (⏭️..⏮️) next track button..last track button
+23EF ; Extended_Pictographic# E1.0 [1] (⏯️) play or pause button
+23F0 ; Extended_Pictographic# E0.6 [1] (⏰) alarm clock
+23F1..23F2 ; Extended_Pictographic# E1.0 [2] (⏱️..⏲️) stopwatch..timer clock
+23F3 ; Extended_Pictographic# E0.6 [1] (⏳) hourglass not done
+23F8..23FA ; Extended_Pictographic# E0.7 [3] (⏸️..⏺️) pause button..record button
+24C2 ; Extended_Pictographic# E0.6 [1] (Ⓜ️) circled M
+25AA..25AB ; Extended_Pictographic# E0.6 [2] (▪️..▫️) black small square..white small square
+25B6 ; Extended_Pictographic# E0.6 [1] (▶️) play button
+25C0 ; Extended_Pictographic# E0.6 [1] (◀️) reverse button
+25FB..25FE ; Extended_Pictographic# E0.6 [4] (◻️..◾) white medium square..black medium-small square
+2600..2601 ; Extended_Pictographic# E0.6 [2] (☀️..☁️) sun..cloud
+2602..2603 ; Extended_Pictographic# E0.7 [2] (☂️..☃️) umbrella..snowman
+2604 ; Extended_Pictographic# E1.0 [1] (☄️) comet
+260E ; Extended_Pictographic# E0.6 [1] (☎️) telephone
+2611 ; Extended_Pictographic# E0.6 [1] (☑️) check box with check
+2614..2615 ; Extended_Pictographic# E0.6 [2] (☔..☕) umbrella with rain drops..hot beverage
+2618 ; Extended_Pictographic# E1.0 [1] (☘️) shamrock
+261D ; Extended_Pictographic# E0.6 [1] (☝️) index pointing up
+2620 ; Extended_Pictographic# E1.0 [1] (☠️) skull and crossbones
+2622..2623 ; Extended_Pictographic# E1.0 [2] (☢️..☣️) radioactive..biohazard
+2626 ; Extended_Pictographic# E1.0 [1] (☦️) orthodox cross
+262A ; Extended_Pictographic# E0.7 [1] (☪️) star and crescent
+262E ; Extended_Pictographic# E1.0 [1] (☮️) peace symbol
+262F ; Extended_Pictographic# E0.7 [1] (☯️) yin yang
+2638..2639 ; Extended_Pictographic# E0.7 [2] (☸️..☹️) wheel of dharma..frowning face
+263A ; Extended_Pictographic# E0.6 [1] (☺️) smiling face
+2640 ; Extended_Pictographic# E4.0 [1] (♀️) female sign
+2642 ; Extended_Pictographic# E4.0 [1] (♂️) male sign
+2648..2653 ; Extended_Pictographic# E0.6 [12] (♈..♓) Aries..Pisces
+265F ; Extended_Pictographic# E11.0 [1] (♟️) chess pawn
+2660 ; Extended_Pictographic# E0.6 [1] (♠️) spade suit
+2663 ; Extended_Pictographic# E0.6 [1] (♣️) club suit
+2665..2666 ; Extended_Pictographic# E0.6 [2] (♥️..♦️) heart suit..diamond suit
+2668 ; Extended_Pictographic# E0.6 [1] (♨️) hot springs
+267B ; Extended_Pictographic# E0.6 [1] (♻️) recycling symbol
+267E ; Extended_Pictographic# E11.0 [1] (♾️) infinity
+267F ; Extended_Pictographic# E0.6 [1] (♿) wheelchair symbol
+2692 ; Extended_Pictographic# E1.0 [1] (⚒️) hammer and pick
+2693 ; Extended_Pictographic# E0.6 [1] (⚓) anchor
+2694 ; Extended_Pictographic# E1.0 [1] (⚔️) crossed swords
+2695 ; Extended_Pictographic# E4.0 [1] (⚕️) medical symbol
+2696..2697 ; Extended_Pictographic# E1.0 [2] (⚖️..⚗️) balance scale..alembic
+2699 ; Extended_Pictographic# E1.0 [1] (⚙️) gear
+269B..269C ; Extended_Pictographic# E1.0 [2] (⚛️..⚜️) atom symbol..fleur-de-lis
+26A0..26A1 ; Extended_Pictographic# E0.6 [2] (⚠️..⚡) warning..high voltage
+26A7 ; Extended_Pictographic# E13.0 [1] (⚧️) transgender symbol
+26AA..26AB ; Extended_Pictographic# E0.6 [2] (⚪..⚫) white circle..black circle
+26B0..26B1 ; Extended_Pictographic# E1.0 [2] (⚰️..⚱️) coffin..funeral urn
+26BD..26BE ; Extended_Pictographic# E0.6 [2] (⚽..⚾) soccer ball..baseball
+26C4..26C5 ; Extended_Pictographic# E0.6 [2] (⛄..⛅) snowman without snow..sun behind cloud
+26C8 ; Extended_Pictographic# E0.7 [1] (⛈️) cloud with lightning and rain
+26CE ; Extended_Pictographic# E0.6 [1] (⛎) Ophiuchus
+26CF ; Extended_Pictographic# E0.7 [1] (⛏️) pick
+26D1 ; Extended_Pictographic# E0.7 [1] (⛑️) rescue worker’s helmet
+26D3 ; Extended_Pictographic# E0.7 [1] (⛓️) chains
+26D4 ; Extended_Pictographic# E0.6 [1] (⛔) no entry
+26E9 ; Extended_Pictographic# E0.7 [1] (⛩️) shinto shrine
+26EA ; Extended_Pictographic# E0.6 [1] (⛪) church
+26F0..26F1 ; Extended_Pictographic# E0.7 [2] (⛰️..⛱️) mountain..umbrella on ground
+26F2..26F3 ; Extended_Pictographic# E0.6 [2] (⛲..⛳) fountain..flag in hole
+26F4 ; Extended_Pictographic# E0.7 [1] (⛴️) ferry
+26F5 ; Extended_Pictographic# E0.6 [1] (⛵) sailboat
+26F7..26F9 ; Extended_Pictographic# E0.7 [3] (⛷️..⛹️) skier..person bouncing ball
+26FA ; Extended_Pictographic# E0.6 [1] (⛺) tent
+26FD ; Extended_Pictographic# E0.6 [1] (⛽) fuel pump
+2702 ; Extended_Pictographic# E0.6 [1] (✂️) scissors
+2705 ; Extended_Pictographic# E0.6 [1] (✅) check mark button
+2708..270C ; Extended_Pictographic# E0.6 [5] (✈️..✌️) airplane..victory hand
+270D ; Extended_Pictographic# E0.7 [1] (✍️) writing hand
+270F ; Extended_Pictographic# E0.6 [1] (✏️) pencil
+2712 ; Extended_Pictographic# E0.6 [1] (✒️) black nib
+2714 ; Extended_Pictographic# E0.6 [1] (✔️) check mark
+2716 ; Extended_Pictographic# E0.6 [1] (✖️) multiply
+271D ; Extended_Pictographic# E0.7 [1] (✝️) latin cross
+2721 ; Extended_Pictographic# E0.7 [1] (✡️) star of David
+2728 ; Extended_Pictographic# E0.6 [1] (✨) sparkles
+2733..2734 ; Extended_Pictographic# E0.6 [2] (✳️..✴️) eight-spoked asterisk..eight-pointed star
+2744 ; Extended_Pictographic# E0.6 [1] (❄️) snowflake
+2747 ; Extended_Pictographic# E0.6 [1] (❇️) sparkle
+274C ; Extended_Pictographic# E0.6 [1] (❌) cross mark
+274E ; Extended_Pictographic# E0.6 [1] (❎) cross mark button
+2753..2755 ; Extended_Pictographic# E0.6 [3] (❓..❕) red question mark..white exclamation mark
+2757 ; Extended_Pictographic# E0.6 [1] (❗) red exclamation mark
+2763 ; Extended_Pictographic# E1.0 [1] (❣️) heart exclamation
+2764 ; Extended_Pictographic# E0.6 [1] (❤️) red heart
+2795..2797 ; Extended_Pictographic# E0.6 [3] (➕..➗) plus..divide
+27A1 ; Extended_Pictographic# E0.6 [1] (➡️) right arrow
+27B0 ; Extended_Pictographic# E0.6 [1] (➰) curly loop
+27BF ; Extended_Pictographic# E1.0 [1] (➿) double curly loop
+2934..2935 ; Extended_Pictographic# E0.6 [2] (⤴️..⤵️) right arrow curving up..right arrow curving down
+2B05..2B07 ; Extended_Pictographic# E0.6 [3] (⬅️..⬇️) left arrow..down arrow
+2B1B..2B1C ; Extended_Pictographic# E0.6 [2] (⬛..⬜) black large square..white large square
+2B50 ; Extended_Pictographic# E0.6 [1] (⭐) star
+2B55 ; Extended_Pictographic# E0.6 [1] (⭕) hollow red circle
+3030 ; Extended_Pictographic# E0.6 [1] (〰️) wavy dash
+303D ; Extended_Pictographic# E0.6 [1] (〽️) part alternation mark
+3297 ; Extended_Pictographic# E0.6 [1] (㊗️) Japanese “congratulations” button
+3299 ; Extended_Pictographic# E0.6 [1] (㊙️) Japanese “secret” button
+1F004 ; Extended_Pictographic# E0.6 [1] (🀄) mahjong red dragon
+1F02C..1F02F ; Extended_Pictographic# E0.0 [4] (..) ..
+1F094..1F09F ; Extended_Pictographic# E0.0 [12] (..) ..
+1F0AF..1F0B0 ; Extended_Pictographic# E0.0 [2] (..) ..
+1F0C0 ; Extended_Pictographic# E0.0 [1] ()
+1F0CF ; Extended_Pictographic# E0.6 [1] (🃏) joker
+1F0D0 ; Extended_Pictographic# E0.0 [1] ()
+1F0F6..1F0FF ; Extended_Pictographic# E0.0 [10] (..) ..
+1F170..1F171 ; Extended_Pictographic# E0.6 [2] (🅰️..🅱️) A button (blood type)..B button (blood type)
+1F17E..1F17F ; Extended_Pictographic# E0.6 [2] (🅾️..🅿️) O button (blood type)..P button
+1F18E ; Extended_Pictographic# E0.6 [1] (🆎) AB button (blood type)
+1F191..1F19A ; Extended_Pictographic# E0.6 [10] (🆑..🆚) CL button..VS button
+1F1AE..1F1E5 ; Extended_Pictographic# E0.0 [56] (..) ..
+1F201..1F202 ; Extended_Pictographic# E0.6 [2] (🈁..🈂️) Japanese “here” button..Japanese “service charge” button
+1F203..1F20F ; Extended_Pictographic# E0.0 [13] (..) ..
+1F21A ; Extended_Pictographic# E0.6 [1] (🈚) Japanese “free of charge” button
+1F22F ; Extended_Pictographic# E0.6 [1] (🈯) Japanese “reserved” button
+1F232..1F23A ; Extended_Pictographic# E0.6 [9] (🈲..🈺) Japanese “prohibited” button..Japanese “open for business” button
+1F23C..1F23F ; Extended_Pictographic# E0.0 [4] (..) ..
+1F249..1F24F ; Extended_Pictographic# E0.0 [7] (..) ..
+1F250..1F251 ; Extended_Pictographic# E0.6 [2] (🉐..🉑) Japanese “bargain” button..Japanese “acceptable” button
+1F252..1F25F ; Extended_Pictographic# E0.0 [14] (..) ..
+1F266..1F2FF ; Extended_Pictographic# E0.0 [154] (..) ..
+1F300..1F30C ; Extended_Pictographic# E0.6 [13] (🌀..🌌) cyclone..milky way
+1F30D..1F30E ; Extended_Pictographic# E0.7 [2] (🌍..🌎) globe showing Europe-Africa..globe showing Americas
+1F30F ; Extended_Pictographic# E0.6 [1] (🌏) globe showing Asia-Australia
+1F310 ; Extended_Pictographic# E1.0 [1] (🌐) globe with meridians
+1F311 ; Extended_Pictographic# E0.6 [1] (🌑) new moon
+1F312 ; Extended_Pictographic# E1.0 [1] (🌒) waxing crescent moon
+1F313..1F315 ; Extended_Pictographic# E0.6 [3] (🌓..🌕) first quarter moon..full moon
+1F316..1F318 ; Extended_Pictographic# E1.0 [3] (🌖..🌘) waning gibbous moon..waning crescent moon
+1F319 ; Extended_Pictographic# E0.6 [1] (🌙) crescent moon
+1F31A ; Extended_Pictographic# E1.0 [1] (🌚) new moon face
+1F31B ; Extended_Pictographic# E0.6 [1] (🌛) first quarter moon face
+1F31C ; Extended_Pictographic# E0.7 [1] (🌜) last quarter moon face
+1F31D..1F31E ; Extended_Pictographic# E1.0 [2] (🌝..🌞) full moon face..sun with face
+1F31F..1F320 ; Extended_Pictographic# E0.6 [2] (🌟..🌠) glowing star..shooting star
+1F321 ; Extended_Pictographic# E0.7 [1] (🌡️) thermometer
+1F324..1F32C ; Extended_Pictographic# E0.7 [9] (🌤️..🌬️) sun behind small cloud..wind face
+1F32D..1F32F ; Extended_Pictographic# E1.0 [3] (🌭..🌯) hot dog..burrito
+1F330..1F331 ; Extended_Pictographic# E0.6 [2] (🌰..🌱) chestnut..seedling
+1F332..1F333 ; Extended_Pictographic# E1.0 [2] (🌲..🌳) evergreen tree..deciduous tree
+1F334..1F335 ; Extended_Pictographic# E0.6 [2] (🌴..🌵) palm tree..cactus
+1F336 ; Extended_Pictographic# E0.7 [1] (🌶️) hot pepper
+1F337..1F34A ; Extended_Pictographic# E0.6 [20] (🌷..🍊) tulip..tangerine
+1F34B ; Extended_Pictographic# E1.0 [1] (🍋) lemon
+1F34C..1F34F ; Extended_Pictographic# E0.6 [4] (🍌..🍏) banana..green apple
+1F350 ; Extended_Pictographic# E1.0 [1] (🍐) pear
+1F351..1F37B ; Extended_Pictographic# E0.6 [43] (🍑..🍻) peach..clinking beer mugs
+1F37C ; Extended_Pictographic# E1.0 [1] (🍼) baby bottle
+1F37D ; Extended_Pictographic# E0.7 [1] (🍽️) fork and knife with plate
+1F37E..1F37F ; Extended_Pictographic# E1.0 [2] (🍾..🍿) bottle with popping cork..popcorn
+1F380..1F393 ; Extended_Pictographic# E0.6 [20] (🎀..🎓) ribbon..graduation cap
+1F396..1F397 ; Extended_Pictographic# E0.7 [2] (🎖️..🎗️) military medal..reminder ribbon
+1F399..1F39B ; Extended_Pictographic# E0.7 [3] (🎙️..🎛️) studio microphone..control knobs
+1F39E..1F39F ; Extended_Pictographic# E0.7 [2] (🎞️..🎟️) film frames..admission tickets
+1F3A0..1F3C4 ; Extended_Pictographic# E0.6 [37] (🎠..🏄) carousel horse..person surfing
+1F3C5 ; Extended_Pictographic# E1.0 [1] (🏅) sports medal
+1F3C6 ; Extended_Pictographic# E0.6 [1] (🏆) trophy
+1F3C7 ; Extended_Pictographic# E1.0 [1] (🏇) horse racing
+1F3C8 ; Extended_Pictographic# E0.6 [1] (🏈) american football
+1F3C9 ; Extended_Pictographic# E1.0 [1] (🏉) rugby football
+1F3CA ; Extended_Pictographic# E0.6 [1] (🏊) person swimming
+1F3CB..1F3CE ; Extended_Pictographic# E0.7 [4] (🏋️..🏎️) person lifting weights..racing car
+1F3CF..1F3D3 ; Extended_Pictographic# E1.0 [5] (🏏..🏓) cricket game..ping pong
+1F3D4..1F3DF ; Extended_Pictographic# E0.7 [12] (🏔️..🏟️) snow-capped mountain..stadium
+1F3E0..1F3E3 ; Extended_Pictographic# E0.6 [4] (🏠..🏣) house..Japanese post office
+1F3E4 ; Extended_Pictographic# E1.0 [1] (🏤) post office
+1F3E5..1F3F0 ; Extended_Pictographic# E0.6 [12] (🏥..🏰) hospital..castle
+1F3F3 ; Extended_Pictographic# E0.7 [1] (🏳️) white flag
+1F3F4 ; Extended_Pictographic# E1.0 [1] (🏴) black flag
+1F3F5 ; Extended_Pictographic# E0.7 [1] (🏵️) rosette
+1F3F7 ; Extended_Pictographic# E0.7 [1] (🏷️) label
+1F3F8..1F3FA ; Extended_Pictographic# E1.0 [3] (🏸..🏺) badminton..amphora
+1F400..1F407 ; Extended_Pictographic# E1.0 [8] (🐀..🐇) rat..rabbit
+1F408 ; Extended_Pictographic# E0.7 [1] (🐈) cat
+1F409..1F40B ; Extended_Pictographic# E1.0 [3] (🐉..🐋) dragon..whale
+1F40C..1F40E ; Extended_Pictographic# E0.6 [3] (🐌..🐎) snail..horse
+1F40F..1F410 ; Extended_Pictographic# E1.0 [2] (🐏..🐐) ram..goat
+1F411..1F412 ; Extended_Pictographic# E0.6 [2] (🐑..🐒) ewe..monkey
+1F413 ; Extended_Pictographic# E1.0 [1] (🐓) rooster
+1F414 ; Extended_Pictographic# E0.6 [1] (🐔) chicken
+1F415 ; Extended_Pictographic# E0.7 [1] (🐕) dog
+1F416 ; Extended_Pictographic# E1.0 [1] (🐖) pig
+1F417..1F429 ; Extended_Pictographic# E0.6 [19] (🐗..🐩) boar..poodle
+1F42A ; Extended_Pictographic# E1.0 [1] (🐪) camel
+1F42B..1F43E ; Extended_Pictographic# E0.6 [20] (🐫..🐾) two-hump camel..paw prints
+1F43F ; Extended_Pictographic# E0.7 [1] (🐿️) chipmunk
+1F440 ; Extended_Pictographic# E0.6 [1] (👀) eyes
+1F441 ; Extended_Pictographic# E0.7 [1] (👁️) eye
+1F442..1F464 ; Extended_Pictographic# E0.6 [35] (👂..👤) ear..bust in silhouette
+1F465 ; Extended_Pictographic# E1.0 [1] (👥) busts in silhouette
+1F466..1F46B ; Extended_Pictographic# E0.6 [6] (👦..👫) boy..woman and man holding hands
+1F46C..1F46D ; Extended_Pictographic# E1.0 [2] (👬..👭) men holding hands..women holding hands
+1F46E..1F4AC ; Extended_Pictographic# E0.6 [63] (👮..💬) police officer..speech balloon
+1F4AD ; Extended_Pictographic# E1.0 [1] (💭) thought balloon
+1F4AE..1F4B5 ; Extended_Pictographic# E0.6 [8] (💮..💵) white flower..dollar banknote
+1F4B6..1F4B7 ; Extended_Pictographic# E1.0 [2] (💶..💷) euro banknote..pound banknote
+1F4B8..1F4EB ; Extended_Pictographic# E0.6 [52] (💸..📫) money with wings..closed mailbox with raised flag
+1F4EC..1F4ED ; Extended_Pictographic# E0.7 [2] (📬..📭) open mailbox with raised flag..open mailbox with lowered flag
+1F4EE ; Extended_Pictographic# E0.6 [1] (📮) postbox
+1F4EF ; Extended_Pictographic# E1.0 [1] (📯) postal horn
+1F4F0..1F4F4 ; Extended_Pictographic# E0.6 [5] (📰..📴) newspaper..mobile phone off
+1F4F5 ; Extended_Pictographic# E1.0 [1] (📵) no mobile phones
+1F4F6..1F4F7 ; Extended_Pictographic# E0.6 [2] (📶..📷) antenna bars..camera
+1F4F8 ; Extended_Pictographic# E1.0 [1] (📸) camera with flash
+1F4F9..1F4FC ; Extended_Pictographic# E0.6 [4] (📹..📼) video camera..videocassette
+1F4FD ; Extended_Pictographic# E0.7 [1] (📽️) film projector
+1F4FF..1F502 ; Extended_Pictographic# E1.0 [4] (📿..🔂) prayer beads..repeat single button
+1F503 ; Extended_Pictographic# E0.6 [1] (🔃) clockwise vertical arrows
+1F504..1F507 ; Extended_Pictographic# E1.0 [4] (🔄..🔇) counterclockwise arrows button..muted speaker
+1F508 ; Extended_Pictographic# E0.7 [1] (🔈) speaker low volume
+1F509 ; Extended_Pictographic# E1.0 [1] (🔉) speaker medium volume
+1F50A..1F514 ; Extended_Pictographic# E0.6 [11] (🔊..🔔) speaker high volume..bell
+1F515 ; Extended_Pictographic# E1.0 [1] (🔕) bell with slash
+1F516..1F52B ; Extended_Pictographic# E0.6 [22] (🔖..🔫) bookmark..water pistol
+1F52C..1F52D ; Extended_Pictographic# E1.0 [2] (🔬..🔭) microscope..telescope
+1F52E..1F53D ; Extended_Pictographic# E0.6 [16] (🔮..🔽) crystal ball..downwards button
+1F549..1F54A ; Extended_Pictographic# E0.7 [2] (🕉️..🕊️) om..dove
+1F54B..1F54E ; Extended_Pictographic# E1.0 [4] (🕋..🕎) kaaba..menorah
+1F550..1F55B ; Extended_Pictographic# E0.6 [12] (🕐..🕛) one o’clock..twelve o’clock
+1F55C..1F567 ; Extended_Pictographic# E0.7 [12] (🕜..🕧) one-thirty..twelve-thirty
+1F56F..1F570 ; Extended_Pictographic# E0.7 [2] (🕯️..🕰️) candle..mantelpiece clock
+1F573..1F579 ; Extended_Pictographic# E0.7 [7] (🕳️..🕹️) hole..joystick
+1F57A ; Extended_Pictographic# E3.0 [1] (🕺) man dancing
+1F587 ; Extended_Pictographic# E0.7 [1] (🖇️) linked paperclips
+1F58A..1F58D ; Extended_Pictographic# E0.7 [4] (🖊️..🖍️) pen..crayon
+1F590 ; Extended_Pictographic# E0.7 [1] (🖐️) hand with fingers splayed
+1F595..1F596 ; Extended_Pictographic# E1.0 [2] (🖕..🖖) middle finger..vulcan salute
+1F5A4 ; Extended_Pictographic# E3.0 [1] (🖤) black heart
+1F5A5 ; Extended_Pictographic# E0.7 [1] (🖥️) desktop computer
+1F5A8 ; Extended_Pictographic# E0.7 [1] (🖨️) printer
+1F5B1..1F5B2 ; Extended_Pictographic# E0.7 [2] (🖱️..🖲️) computer mouse..trackball
+1F5BC ; Extended_Pictographic# E0.7 [1] (🖼️) framed picture
+1F5C2..1F5C4 ; Extended_Pictographic# E0.7 [3] (🗂️..🗄️) card index dividers..file cabinet
+1F5D1..1F5D3 ; Extended_Pictographic# E0.7 [3] (🗑️..🗓️) wastebasket..spiral calendar
+1F5DC..1F5DE ; Extended_Pictographic# E0.7 [3] (🗜️..🗞️) clamp..rolled-up newspaper
+1F5E1 ; Extended_Pictographic# E0.7 [1] (🗡️) dagger
+1F5E3 ; Extended_Pictographic# E0.7 [1] (🗣️) speaking head
+1F5E8 ; Extended_Pictographic# E2.0 [1] (🗨️) left speech bubble
+1F5EF ; Extended_Pictographic# E0.7 [1] (🗯️) right anger bubble
+1F5F3 ; Extended_Pictographic# E0.7 [1] (🗳️) ballot box with ballot
+1F5FA ; Extended_Pictographic# E0.7 [1] (🗺️) world map
+1F5FB..1F5FF ; Extended_Pictographic# E0.6 [5] (🗻..🗿) mount fuji..moai
+1F600 ; Extended_Pictographic# E1.0 [1] (😀) grinning face
+1F601..1F606 ; Extended_Pictographic# E0.6 [6] (😁..😆) beaming face with smiling eyes..grinning squinting face
+1F607..1F608 ; Extended_Pictographic# E1.0 [2] (😇..😈) smiling face with halo..smiling face with horns
+1F609..1F60D ; Extended_Pictographic# E0.6 [5] (😉..😍) winking face..smiling face with heart-eyes
+1F60E ; Extended_Pictographic# E1.0 [1] (😎) smiling face with sunglasses
+1F60F ; Extended_Pictographic# E0.6 [1] (😏) smirking face
+1F610 ; Extended_Pictographic# E0.7 [1] (😐) neutral face
+1F611 ; Extended_Pictographic# E1.0 [1] (😑) expressionless face
+1F612..1F614 ; Extended_Pictographic# E0.6 [3] (😒..😔) unamused face..pensive face
+1F615 ; Extended_Pictographic# E1.0 [1] (😕) confused face
+1F616 ; Extended_Pictographic# E0.6 [1] (😖) confounded face
+1F617 ; Extended_Pictographic# E1.0 [1] (😗) kissing face
+1F618 ; Extended_Pictographic# E0.6 [1] (😘) face blowing a kiss
+1F619 ; Extended_Pictographic# E1.0 [1] (😙) kissing face with smiling eyes
+1F61A ; Extended_Pictographic# E0.6 [1] (😚) kissing face with closed eyes
+1F61B ; Extended_Pictographic# E1.0 [1] (😛) face with tongue
+1F61C..1F61E ; Extended_Pictographic# E0.6 [3] (😜..😞) winking face with tongue..disappointed face
+1F61F ; Extended_Pictographic# E1.0 [1] (😟) worried face
+1F620..1F625 ; Extended_Pictographic# E0.6 [6] (😠..😥) angry face..sad but relieved face
+1F626..1F627 ; Extended_Pictographic# E1.0 [2] (😦..😧) frowning face with open mouth..anguished face
+1F628..1F62B ; Extended_Pictographic# E0.6 [4] (😨..😫) fearful face..tired face
+1F62C ; Extended_Pictographic# E1.0 [1] (😬) grimacing face
+1F62D ; Extended_Pictographic# E0.6 [1] (😭) loudly crying face
+1F62E..1F62F ; Extended_Pictographic# E1.0 [2] (😮..😯) face with open mouth..hushed face
+1F630..1F633 ; Extended_Pictographic# E0.6 [4] (😰..😳) anxious face with sweat..flushed face
+1F634 ; Extended_Pictographic# E1.0 [1] (😴) sleeping face
+1F635 ; Extended_Pictographic# E0.6 [1] (😵) face with crossed-out eyes
+1F636 ; Extended_Pictographic# E1.0 [1] (😶) face without mouth
+1F637..1F640 ; Extended_Pictographic# E0.6 [10] (😷..🙀) face with medical mask..weary cat
+1F641..1F644 ; Extended_Pictographic# E1.0 [4] (🙁..🙄) slightly frowning face..face with rolling eyes
+1F645..1F64F ; Extended_Pictographic# E0.6 [11] (🙅..🙏) person gesturing NO..folded hands
+1F680 ; Extended_Pictographic# E0.6 [1] (🚀) rocket
+1F681..1F682 ; Extended_Pictographic# E1.0 [2] (🚁..🚂) helicopter..locomotive
+1F683..1F685 ; Extended_Pictographic# E0.6 [3] (🚃..🚅) railway car..bullet train
+1F686 ; Extended_Pictographic# E1.0 [1] (🚆) train
+1F687 ; Extended_Pictographic# E0.6 [1] (🚇) metro
+1F688 ; Extended_Pictographic# E1.0 [1] (🚈) light rail
+1F689 ; Extended_Pictographic# E0.6 [1] (🚉) station
+1F68A..1F68B ; Extended_Pictographic# E1.0 [2] (🚊..🚋) tram..tram car
+1F68C ; Extended_Pictographic# E0.6 [1] (🚌) bus
+1F68D ; Extended_Pictographic# E0.7 [1] (🚍) oncoming bus
+1F68E ; Extended_Pictographic# E1.0 [1] (🚎) trolleybus
+1F68F ; Extended_Pictographic# E0.6 [1] (🚏) bus stop
+1F690 ; Extended_Pictographic# E1.0 [1] (🚐) minibus
+1F691..1F693 ; Extended_Pictographic# E0.6 [3] (🚑..🚓) ambulance..police car
+1F694 ; Extended_Pictographic# E0.7 [1] (🚔) oncoming police car
+1F695 ; Extended_Pictographic# E0.6 [1] (🚕) taxi
+1F696 ; Extended_Pictographic# E1.0 [1] (🚖) oncoming taxi
+1F697 ; Extended_Pictographic# E0.6 [1] (🚗) automobile
+1F698 ; Extended_Pictographic# E0.7 [1] (🚘) oncoming automobile
+1F699..1F69A ; Extended_Pictographic# E0.6 [2] (🚙..🚚) sport utility vehicle..delivery truck
+1F69B..1F6A1 ; Extended_Pictographic# E1.0 [7] (🚛..🚡) articulated lorry..aerial tramway
+1F6A2 ; Extended_Pictographic# E0.6 [1] (🚢) ship
+1F6A3 ; Extended_Pictographic# E1.0 [1] (🚣) person rowing boat
+1F6A4..1F6A5 ; Extended_Pictographic# E0.6 [2] (🚤..🚥) speedboat..horizontal traffic light
+1F6A6 ; Extended_Pictographic# E1.0 [1] (🚦) vertical traffic light
+1F6A7..1F6AD ; Extended_Pictographic# E0.6 [7] (🚧..🚭) construction..no smoking
+1F6AE..1F6B1 ; Extended_Pictographic# E1.0 [4] (🚮..🚱) litter in bin sign..non-potable water
+1F6B2 ; Extended_Pictographic# E0.6 [1] (🚲) bicycle
+1F6B3..1F6B5 ; Extended_Pictographic# E1.0 [3] (🚳..🚵) no bicycles..person mountain biking
+1F6B6 ; Extended_Pictographic# E0.6 [1] (🚶) person walking
+1F6B7..1F6B8 ; Extended_Pictographic# E1.0 [2] (🚷..🚸) no pedestrians..children crossing
+1F6B9..1F6BE ; Extended_Pictographic# E0.6 [6] (🚹..🚾) men’s room..water closet
+1F6BF ; Extended_Pictographic# E1.0 [1] (🚿) shower
+1F6C0 ; Extended_Pictographic# E0.6 [1] (🛀) person taking bath
+1F6C1..1F6C5 ; Extended_Pictographic# E1.0 [5] (🛁..🛅) bathtub..left luggage
+1F6CB ; Extended_Pictographic# E0.7 [1] (🛋️) couch and lamp
+1F6CC ; Extended_Pictographic# E1.0 [1] (🛌) person in bed
+1F6CD..1F6CF ; Extended_Pictographic# E0.7 [3] (🛍️..🛏️) shopping bags..bed
+1F6D0 ; Extended_Pictographic# E1.0 [1] (🛐) place of worship
+1F6D1..1F6D2 ; Extended_Pictographic# E3.0 [2] (🛑..🛒) stop sign..shopping cart
+1F6D5 ; Extended_Pictographic# E12.0 [1] (🛕) hindu temple
+1F6D6..1F6D7 ; Extended_Pictographic# E13.0 [2] (🛖..🛗) hut..elevator
+1F6D8 ; Extended_Pictographic# E17.0 [1] () landslide
+1F6D9..1F6DB ; Extended_Pictographic# E0.0 [3] (..) ..
+1F6DC ; Extended_Pictographic# E15.0 [1] (🛜) wireless
+1F6DD..1F6DF ; Extended_Pictographic# E14.0 [3] (🛝..🛟) playground slide..ring buoy
+1F6E0..1F6E5 ; Extended_Pictographic# E0.7 [6] (🛠️..🛥️) hammer and wrench..motor boat
+1F6E9 ; Extended_Pictographic# E0.7 [1] (🛩️) small airplane
+1F6EB..1F6EC ; Extended_Pictographic# E1.0 [2] (🛫..🛬) airplane departure..airplane arrival
+1F6ED..1F6EF ; Extended_Pictographic# E0.0 [3] (..) ..
+1F6F0 ; Extended_Pictographic# E0.7 [1] (🛰️) satellite
+1F6F3 ; Extended_Pictographic# E0.7 [1] (🛳️) passenger ship
+1F6F4..1F6F6 ; Extended_Pictographic# E3.0 [3] (🛴..🛶) kick scooter..canoe
+1F6F7..1F6F8 ; Extended_Pictographic# E5.0 [2] (🛷..🛸) sled..flying saucer
+1F6F9 ; Extended_Pictographic# E11.0 [1] (🛹) skateboard
+1F6FA ; Extended_Pictographic# E12.0 [1] (🛺) auto rickshaw
+1F6FB..1F6FC ; Extended_Pictographic# E13.0 [2] (🛻..🛼) pickup truck..roller skate
+1F6FD..1F6FF ; Extended_Pictographic# E0.0 [3] (..) ..
+1F7DA..1F7DF ; Extended_Pictographic# E0.0 [6] (..) ..
+1F7E0..1F7EB ; Extended_Pictographic# E12.0 [12] (🟠..🟫) orange circle..brown square
+1F7EC..1F7EF ; Extended_Pictographic# E0.0 [4] (..) ..
+1F7F0 ; Extended_Pictographic# E14.0 [1] (🟰) heavy equals sign
+1F7F1..1F7FF ; Extended_Pictographic# E0.0 [15] (..) ..
+1F80C..1F80F ; Extended_Pictographic# E0.0 [4] (..) ..
+1F848..1F84F ; Extended_Pictographic# E0.0 [8] (..) ..
+1F85A..1F85F ; Extended_Pictographic# E0.0 [6] (..) ..
+1F888..1F88F ; Extended_Pictographic# E0.0 [8] (..) ..
+1F8AE..1F8AF ; Extended_Pictographic# E0.0 [2] (..) ..
+1F8BC..1F8BF ; Extended_Pictographic# E0.0 [4] (..) ..
+1F8C2..1F8CF ; Extended_Pictographic# E0.0 [14] (..) ..
+1F8D9..1F8FF ; Extended_Pictographic# E0.0 [39] (..) ..
+1F90C ; Extended_Pictographic# E13.0 [1] (🤌) pinched fingers
+1F90D..1F90F ; Extended_Pictographic# E12.0 [3] (🤍..🤏) white heart..pinching hand
+1F910..1F918 ; Extended_Pictographic# E1.0 [9] (🤐..🤘) zipper-mouth face..sign of the horns
+1F919..1F91E ; Extended_Pictographic# E3.0 [6] (🤙..🤞) call me hand..crossed fingers
+1F91F ; Extended_Pictographic# E5.0 [1] (🤟) love-you gesture
+1F920..1F927 ; Extended_Pictographic# E3.0 [8] (🤠..🤧) cowboy hat face..sneezing face
+1F928..1F92F ; Extended_Pictographic# E5.0 [8] (🤨..🤯) face with raised eyebrow..exploding head
+1F930 ; Extended_Pictographic# E3.0 [1] (🤰) pregnant woman
+1F931..1F932 ; Extended_Pictographic# E5.0 [2] (🤱..🤲) breast-feeding..palms up together
+1F933..1F93A ; Extended_Pictographic# E3.0 [8] (🤳..🤺) selfie..person fencing
+1F93C..1F93E ; Extended_Pictographic# E3.0 [3] (🤼..🤾) people wrestling..person playing handball
+1F93F ; Extended_Pictographic# E12.0 [1] (🤿) diving mask
+1F940..1F945 ; Extended_Pictographic# E3.0 [6] (🥀..🥅) wilted flower..goal net
+1F947..1F94B ; Extended_Pictographic# E3.0 [5] (🥇..🥋) 1st place medal..martial arts uniform
+1F94C ; Extended_Pictographic# E5.0 [1] (🥌) curling stone
+1F94D..1F94F ; Extended_Pictographic# E11.0 [3] (🥍..🥏) lacrosse..flying disc
+1F950..1F95E ; Extended_Pictographic# E3.0 [15] (🥐..🥞) croissant..pancakes
+1F95F..1F96B ; Extended_Pictographic# E5.0 [13] (🥟..🥫) dumpling..canned food
+1F96C..1F970 ; Extended_Pictographic# E11.0 [5] (🥬..🥰) leafy green..smiling face with hearts
+1F971 ; Extended_Pictographic# E12.0 [1] (🥱) yawning face
+1F972 ; Extended_Pictographic# E13.0 [1] (🥲) smiling face with tear
+1F973..1F976 ; Extended_Pictographic# E11.0 [4] (🥳..🥶) partying face..cold face
+1F977..1F978 ; Extended_Pictographic# E13.0 [2] (🥷..🥸) ninja..disguised face
+1F979 ; Extended_Pictographic# E14.0 [1] (🥹) face holding back tears
+1F97A ; Extended_Pictographic# E11.0 [1] (🥺) pleading face
+1F97B ; Extended_Pictographic# E12.0 [1] (🥻) sari
+1F97C..1F97F ; Extended_Pictographic# E11.0 [4] (🥼..🥿) lab coat..flat shoe
+1F980..1F984 ; Extended_Pictographic# E1.0 [5] (🦀..🦄) crab..unicorn
+1F985..1F991 ; Extended_Pictographic# E3.0 [13] (🦅..🦑) eagle..squid
+1F992..1F997 ; Extended_Pictographic# E5.0 [6] (🦒..🦗) giraffe..cricket
+1F998..1F9A2 ; Extended_Pictographic# E11.0 [11] (🦘..🦢) kangaroo..swan
+1F9A3..1F9A4 ; Extended_Pictographic# E13.0 [2] (🦣..🦤) mammoth..dodo
+1F9A5..1F9AA ; Extended_Pictographic# E12.0 [6] (🦥..🦪) sloth..oyster
+1F9AB..1F9AD ; Extended_Pictographic# E13.0 [3] (🦫..🦭) beaver..seal
+1F9AE..1F9AF ; Extended_Pictographic# E12.0 [2] (🦮..🦯) guide dog..white cane
+1F9B0..1F9B9 ; Extended_Pictographic# E11.0 [10] (🦰..🦹) red hair..supervillain
+1F9BA..1F9BF ; Extended_Pictographic# E12.0 [6] (🦺..🦿) safety vest..mechanical leg
+1F9C0 ; Extended_Pictographic# E1.0 [1] (🧀) cheese wedge
+1F9C1..1F9C2 ; Extended_Pictographic# E11.0 [2] (🧁..🧂) cupcake..salt
+1F9C3..1F9CA ; Extended_Pictographic# E12.0 [8] (🧃..🧊) beverage box..ice
+1F9CB ; Extended_Pictographic# E13.0 [1] (🧋) bubble tea
+1F9CC ; Extended_Pictographic# E14.0 [1] (🧌) troll
+1F9CD..1F9CF ; Extended_Pictographic# E12.0 [3] (🧍..🧏) person standing..deaf person
+1F9D0..1F9E6 ; Extended_Pictographic# E5.0 [23] (🧐..🧦) face with monocle..socks
+1F9E7..1F9FF ; Extended_Pictographic# E11.0 [25] (🧧..🧿) red envelope..nazar amulet
+1FA58..1FA5F ; Extended_Pictographic# E0.0 [8] (..) ..
+1FA6E..1FA6F ; Extended_Pictographic# E0.0 [2] (..) ..
+1FA70..1FA73 ; Extended_Pictographic# E12.0 [4] (🩰..🩳) ballet shoes..shorts
+1FA74 ; Extended_Pictographic# E13.0 [1] (🩴) thong sandal
+1FA75..1FA77 ; Extended_Pictographic# E15.0 [3] (🩵..🩷) light blue heart..pink heart
+1FA78..1FA7A ; Extended_Pictographic# E12.0 [3] (🩸..🩺) drop of blood..stethoscope
+1FA7B..1FA7C ; Extended_Pictographic# E14.0 [2] (🩻..🩼) x-ray..crutch
+1FA7D..1FA7F ; Extended_Pictographic# E0.0 [3] (..) ..
+1FA80..1FA82 ; Extended_Pictographic# E12.0 [3] (🪀..🪂) yo-yo..parachute
+1FA83..1FA86 ; Extended_Pictographic# E13.0 [4] (🪃..🪆) boomerang..nesting dolls
+1FA87..1FA88 ; Extended_Pictographic# E15.0 [2] (🪇..🪈) maracas..flute
+1FA89 ; Extended_Pictographic# E16.0 [1] () harp
+1FA8A ; Extended_Pictographic# E17.0 [1] () trombone
+1FA8B..1FA8D ; Extended_Pictographic# E0.0 [3] (..) ..
+1FA8E ; Extended_Pictographic# E17.0 [1] () treasure chest
+1FA8F ; Extended_Pictographic# E16.0 [1] () shovel
+1FA90..1FA95 ; Extended_Pictographic# E12.0 [6] (🪐..🪕) ringed planet..banjo
+1FA96..1FAA8 ; Extended_Pictographic# E13.0 [19] (🪖..🪨) military helmet..rock
+1FAA9..1FAAC ; Extended_Pictographic# E14.0 [4] (🪩..🪬) mirror ball..hamsa
+1FAAD..1FAAF ; Extended_Pictographic# E15.0 [3] (🪭..🪯) folding hand fan..khanda
+1FAB0..1FAB6 ; Extended_Pictographic# E13.0 [7] (🪰..🪶) fly..feather
+1FAB7..1FABA ; Extended_Pictographic# E14.0 [4] (🪷..🪺) lotus..nest with eggs
+1FABB..1FABD ; Extended_Pictographic# E15.0 [3] (🪻..🪽) hyacinth..wing
+1FABE ; Extended_Pictographic# E16.0 [1] () leafless tree
+1FABF ; Extended_Pictographic# E15.0 [1] (🪿) goose
+1FAC0..1FAC2 ; Extended_Pictographic# E13.0 [3] (🫀..🫂) anatomical heart..people hugging
+1FAC3..1FAC5 ; Extended_Pictographic# E14.0 [3] (🫃..🫅) pregnant man..person with crown
+1FAC6 ; Extended_Pictographic# E16.0 [1] () fingerprint
+1FAC7 ; Extended_Pictographic# E0.0 [1] ()
+1FAC8 ; Extended_Pictographic# E17.0 [1] () hairy creature
+1FAC9..1FACC ; Extended_Pictographic# E0.0 [4] (..) ..
+1FACD ; Extended_Pictographic# E17.0 [1] () orca
+1FACE..1FACF ; Extended_Pictographic# E15.0 [2] (🫎..🫏) moose..donkey
+1FAD0..1FAD6 ; Extended_Pictographic# E13.0 [7] (🫐..🫖) blueberries..teapot
+1FAD7..1FAD9 ; Extended_Pictographic# E14.0 [3] (🫗..🫙) pouring liquid..jar
+1FADA..1FADB ; Extended_Pictographic# E15.0 [2] (🫚..🫛) ginger root..pea pod
+1FADC ; Extended_Pictographic# E16.0 [1] () root vegetable
+1FADD..1FADE ; Extended_Pictographic# E0.0 [2] (..) ..
+1FADF ; Extended_Pictographic# E16.0 [1] () splatter
+1FAE0..1FAE7 ; Extended_Pictographic# E14.0 [8] (🫠..🫧) melting face..bubbles
+1FAE8 ; Extended_Pictographic# E15.0 [1] (🫨) shaking face
+1FAE9 ; Extended_Pictographic# E16.0 [1] () face with bags under eyes
+1FAEA ; Extended_Pictographic# E17.0 [1] () distorted face
+1FAEB..1FAEE ; Extended_Pictographic# E0.0 [4] (..) ..
+1FAEF ; Extended_Pictographic# E17.0 [1] () fight cloud
+1FAF0..1FAF6 ; Extended_Pictographic# E14.0 [7] (🫰..🫶) hand with index finger and thumb crossed..heart hands
+1FAF7..1FAF8 ; Extended_Pictographic# E15.0 [2] (🫷..🫸) leftwards pushing hand..rightwards pushing hand
+1FAF9..1FAFF ; Extended_Pictographic# E0.0 [7] (..) ..
+1FC00..1FFFD ; Extended_Pictographic# E0.0[1022] (..) ..
diff --git a/opennlp-core/opennlp-runtime/src/main/resources/opennlp/tools/tokenize/uax29/WordBreakProperty.txt b/opennlp-core/opennlp-runtime/src/main/resources/opennlp/tools/tokenize/uax29/WordBreakProperty.txt
new file mode 100644
index 000000000..20fa24e37
--- /dev/null
+++ b/opennlp-core/opennlp-runtime/src/main/resources/opennlp/tools/tokenize/uax29/WordBreakProperty.txt
@@ -0,0 +1,1541 @@
+# WordBreakProperty-17.0.0.txt
+# Date: 2025-06-30, 06:20:49 GMT
+# © 2025 Unicode®, Inc.
+# Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries.
+# For terms of use and license, see https://www.unicode.org/terms_of_use.html
+#
+# Unicode Character Database
+# For documentation, see https://www.unicode.org/reports/tr44/
+
+# ================================================
+
+# Property: Word_Break
+
+# All code points not explicitly listed for Word_Break
+# have the value Other (XX).
+
+# @missing: 0000..10FFFF; Other
+
+# ================================================
+
+0022 ; Double_Quote # Po QUOTATION MARK
+
+# Total code points: 1
+
+# ================================================
+
+0027 ; Single_Quote # Po APOSTROPHE
+
+# Total code points: 1
+
+# ================================================
+
+05D0..05EA ; Hebrew_Letter # Lo [27] HEBREW LETTER ALEF..HEBREW LETTER TAV
+05EF..05F2 ; Hebrew_Letter # Lo [4] HEBREW YOD TRIANGLE..HEBREW LIGATURE YIDDISH DOUBLE YOD
+FB1D ; Hebrew_Letter # Lo HEBREW LETTER YOD WITH HIRIQ
+FB1F..FB28 ; Hebrew_Letter # Lo [10] HEBREW LIGATURE YIDDISH YOD YOD PATAH..HEBREW LETTER WIDE TAV
+FB2A..FB36 ; Hebrew_Letter # Lo [13] HEBREW LETTER SHIN WITH SHIN DOT..HEBREW LETTER ZAYIN WITH DAGESH
+FB38..FB3C ; Hebrew_Letter # Lo [5] HEBREW LETTER TET WITH DAGESH..HEBREW LETTER LAMED WITH DAGESH
+FB3E ; Hebrew_Letter # Lo HEBREW LETTER MEM WITH DAGESH
+FB40..FB41 ; Hebrew_Letter # Lo [2] HEBREW LETTER NUN WITH DAGESH..HEBREW LETTER SAMEKH WITH DAGESH
+FB43..FB44 ; Hebrew_Letter # Lo [2] HEBREW LETTER FINAL PE WITH DAGESH..HEBREW LETTER PE WITH DAGESH
+FB46..FB4F ; Hebrew_Letter # Lo [10] HEBREW LETTER TSADI WITH DAGESH..HEBREW LIGATURE ALEF LAMED
+
+# Total code points: 75
+
+# ================================================
+
+000D ; CR # Cc
+
+# Total code points: 1
+
+# ================================================
+
+000A ; LF # Cc
+
+# Total code points: 1
+
+# ================================================
+
+000B..000C ; Newline # Cc [2] ..
+0085 ; Newline # Cc
+2028 ; Newline # Zl LINE SEPARATOR
+2029 ; Newline # Zp PARAGRAPH SEPARATOR
+
+# Total code points: 5
+
+# ================================================
+
+0300..036F ; Extend # Mn [112] COMBINING GRAVE ACCENT..COMBINING LATIN SMALL LETTER X
+0483..0487 ; Extend # Mn [5] COMBINING CYRILLIC TITLO..COMBINING CYRILLIC POKRYTIE
+0488..0489 ; Extend # Me [2] COMBINING CYRILLIC HUNDRED THOUSANDS SIGN..COMBINING CYRILLIC MILLIONS SIGN
+0591..05BD ; Extend # Mn [45] HEBREW ACCENT ETNAHTA..HEBREW POINT METEG
+05BF ; Extend # Mn HEBREW POINT RAFE
+05C1..05C2 ; Extend # Mn [2] HEBREW POINT SHIN DOT..HEBREW POINT SIN DOT
+05C4..05C5 ; Extend # Mn [2] HEBREW MARK UPPER DOT..HEBREW MARK LOWER DOT
+05C7 ; Extend # Mn HEBREW POINT QAMATS QATAN
+0610..061A ; Extend # Mn [11] ARABIC SIGN SALLALLAHOU ALAYHE WASSALLAM..ARABIC SMALL KASRA
+064B..065F ; Extend # Mn [21] ARABIC FATHATAN..ARABIC WAVY HAMZA BELOW
+0670 ; Extend # Mn ARABIC LETTER SUPERSCRIPT ALEF
+06D6..06DC ; Extend # Mn [7] ARABIC SMALL HIGH LIGATURE SAD WITH LAM WITH ALEF MAKSURA..ARABIC SMALL HIGH SEEN
+06DF..06E4 ; Extend # Mn [6] ARABIC SMALL HIGH ROUNDED ZERO..ARABIC SMALL HIGH MADDA
+06E7..06E8 ; Extend # Mn [2] ARABIC SMALL HIGH YEH..ARABIC SMALL HIGH NOON
+06EA..06ED ; Extend # Mn [4] ARABIC EMPTY CENTRE LOW STOP..ARABIC SMALL LOW MEEM
+0711 ; Extend # Mn SYRIAC LETTER SUPERSCRIPT ALAPH
+0730..074A ; Extend # Mn [27] SYRIAC PTHAHA ABOVE..SYRIAC BARREKH
+07A6..07B0 ; Extend # Mn [11] THAANA ABAFILI..THAANA SUKUN
+07EB..07F3 ; Extend # Mn [9] NKO COMBINING SHORT HIGH TONE..NKO COMBINING DOUBLE DOT ABOVE
+07FD ; Extend # Mn NKO DANTAYALAN
+0816..0819 ; Extend # Mn [4] SAMARITAN MARK IN..SAMARITAN MARK DAGESH
+081B..0823 ; Extend # Mn [9] SAMARITAN MARK EPENTHETIC YUT..SAMARITAN VOWEL SIGN A
+0825..0827 ; Extend # Mn [3] SAMARITAN VOWEL SIGN SHORT A..SAMARITAN VOWEL SIGN U
+0829..082D ; Extend # Mn [5] SAMARITAN VOWEL SIGN LONG I..SAMARITAN MARK NEQUDAA
+0859..085B ; Extend # Mn [3] MANDAIC AFFRICATION MARK..MANDAIC GEMINATION MARK
+0897..089F ; Extend # Mn [9] ARABIC PEPET..ARABIC HALF MADDA OVER MADDA
+08CA..08E1 ; Extend # Mn [24] ARABIC SMALL HIGH FARSI YEH..ARABIC SMALL HIGH SIGN SAFHA
+08E3..0902 ; Extend # Mn [32] ARABIC TURNED DAMMA BELOW..DEVANAGARI SIGN ANUSVARA
+0903 ; Extend # Mc DEVANAGARI SIGN VISARGA
+093A ; Extend # Mn DEVANAGARI VOWEL SIGN OE
+093B ; Extend # Mc DEVANAGARI VOWEL SIGN OOE
+093C ; Extend # Mn DEVANAGARI SIGN NUKTA
+093E..0940 ; Extend # Mc [3] DEVANAGARI VOWEL SIGN AA..DEVANAGARI VOWEL SIGN II
+0941..0948 ; Extend # Mn [8] DEVANAGARI VOWEL SIGN U..DEVANAGARI VOWEL SIGN AI
+0949..094C ; Extend # Mc [4] DEVANAGARI VOWEL SIGN CANDRA O..DEVANAGARI VOWEL SIGN AU
+094D ; Extend # Mn DEVANAGARI SIGN VIRAMA
+094E..094F ; Extend # Mc [2] DEVANAGARI VOWEL SIGN PRISHTHAMATRA E..DEVANAGARI VOWEL SIGN AW
+0951..0957 ; Extend # Mn [7] DEVANAGARI STRESS SIGN UDATTA..DEVANAGARI VOWEL SIGN UUE
+0962..0963 ; Extend # Mn [2] DEVANAGARI VOWEL SIGN VOCALIC L..DEVANAGARI VOWEL SIGN VOCALIC LL
+0981 ; Extend # Mn BENGALI SIGN CANDRABINDU
+0982..0983 ; Extend # Mc [2] BENGALI SIGN ANUSVARA..BENGALI SIGN VISARGA
+09BC ; Extend # Mn BENGALI SIGN NUKTA
+09BE..09C0 ; Extend # Mc [3] BENGALI VOWEL SIGN AA..BENGALI VOWEL SIGN II
+09C1..09C4 ; Extend # Mn [4] BENGALI VOWEL SIGN U..BENGALI VOWEL SIGN VOCALIC RR
+09C7..09C8 ; Extend # Mc [2] BENGALI VOWEL SIGN E..BENGALI VOWEL SIGN AI
+09CB..09CC ; Extend # Mc [2] BENGALI VOWEL SIGN O..BENGALI VOWEL SIGN AU
+09CD ; Extend # Mn BENGALI SIGN VIRAMA
+09D7 ; Extend # Mc BENGALI AU LENGTH MARK
+09E2..09E3 ; Extend # Mn [2] BENGALI VOWEL SIGN VOCALIC L..BENGALI VOWEL SIGN VOCALIC LL
+09FE ; Extend # Mn BENGALI SANDHI MARK
+0A01..0A02 ; Extend # Mn [2] GURMUKHI SIGN ADAK BINDI..GURMUKHI SIGN BINDI
+0A03 ; Extend # Mc GURMUKHI SIGN VISARGA
+0A3C ; Extend # Mn GURMUKHI SIGN NUKTA
+0A3E..0A40 ; Extend # Mc [3] GURMUKHI VOWEL SIGN AA..GURMUKHI VOWEL SIGN II
+0A41..0A42 ; Extend # Mn [2] GURMUKHI VOWEL SIGN U..GURMUKHI VOWEL SIGN UU
+0A47..0A48 ; Extend # Mn [2] GURMUKHI VOWEL SIGN EE..GURMUKHI VOWEL SIGN AI
+0A4B..0A4D ; Extend # Mn [3] GURMUKHI VOWEL SIGN OO..GURMUKHI SIGN VIRAMA
+0A51 ; Extend # Mn GURMUKHI SIGN UDAAT
+0A70..0A71 ; Extend # Mn [2] GURMUKHI TIPPI..GURMUKHI ADDAK
+0A75 ; Extend # Mn GURMUKHI SIGN YAKASH
+0A81..0A82 ; Extend # Mn [2] GUJARATI SIGN CANDRABINDU..GUJARATI SIGN ANUSVARA
+0A83 ; Extend # Mc GUJARATI SIGN VISARGA
+0ABC ; Extend # Mn GUJARATI SIGN NUKTA
+0ABE..0AC0 ; Extend # Mc [3] GUJARATI VOWEL SIGN AA..GUJARATI VOWEL SIGN II
+0AC1..0AC5 ; Extend # Mn [5] GUJARATI VOWEL SIGN U..GUJARATI VOWEL SIGN CANDRA E
+0AC7..0AC8 ; Extend # Mn [2] GUJARATI VOWEL SIGN E..GUJARATI VOWEL SIGN AI
+0AC9 ; Extend # Mc GUJARATI VOWEL SIGN CANDRA O
+0ACB..0ACC ; Extend # Mc [2] GUJARATI VOWEL SIGN O..GUJARATI VOWEL SIGN AU
+0ACD ; Extend # Mn GUJARATI SIGN VIRAMA
+0AE2..0AE3 ; Extend # Mn [2] GUJARATI VOWEL SIGN VOCALIC L..GUJARATI VOWEL SIGN VOCALIC LL
+0AFA..0AFF ; Extend # Mn [6] GUJARATI SIGN SUKUN..GUJARATI SIGN TWO-CIRCLE NUKTA ABOVE
+0B01 ; Extend # Mn ORIYA SIGN CANDRABINDU
+0B02..0B03 ; Extend # Mc [2] ORIYA SIGN ANUSVARA..ORIYA SIGN VISARGA
+0B3C ; Extend # Mn ORIYA SIGN NUKTA
+0B3E ; Extend # Mc ORIYA VOWEL SIGN AA
+0B3F ; Extend # Mn ORIYA VOWEL SIGN I
+0B40 ; Extend # Mc ORIYA VOWEL SIGN II
+0B41..0B44 ; Extend # Mn [4] ORIYA VOWEL SIGN U..ORIYA VOWEL SIGN VOCALIC RR
+0B47..0B48 ; Extend # Mc [2] ORIYA VOWEL SIGN E..ORIYA VOWEL SIGN AI
+0B4B..0B4C ; Extend # Mc [2] ORIYA VOWEL SIGN O..ORIYA VOWEL SIGN AU
+0B4D ; Extend # Mn ORIYA SIGN VIRAMA
+0B55..0B56 ; Extend # Mn [2] ORIYA SIGN OVERLINE..ORIYA AI LENGTH MARK
+0B57 ; Extend # Mc ORIYA AU LENGTH MARK
+0B62..0B63 ; Extend # Mn [2] ORIYA VOWEL SIGN VOCALIC L..ORIYA VOWEL SIGN VOCALIC LL
+0B82 ; Extend # Mn TAMIL SIGN ANUSVARA
+0BBE..0BBF ; Extend # Mc [2] TAMIL VOWEL SIGN AA..TAMIL VOWEL SIGN I
+0BC0 ; Extend # Mn TAMIL VOWEL SIGN II
+0BC1..0BC2 ; Extend # Mc [2] TAMIL VOWEL SIGN U..TAMIL VOWEL SIGN UU
+0BC6..0BC8 ; Extend # Mc [3] TAMIL VOWEL SIGN E..TAMIL VOWEL SIGN AI
+0BCA..0BCC ; Extend # Mc [3] TAMIL VOWEL SIGN O..TAMIL VOWEL SIGN AU
+0BCD ; Extend # Mn TAMIL SIGN VIRAMA
+0BD7 ; Extend # Mc TAMIL AU LENGTH MARK
+0C00 ; Extend # Mn TELUGU SIGN COMBINING CANDRABINDU ABOVE
+0C01..0C03 ; Extend # Mc [3] TELUGU SIGN CANDRABINDU..TELUGU SIGN VISARGA
+0C04 ; Extend # Mn TELUGU SIGN COMBINING ANUSVARA ABOVE
+0C3C ; Extend # Mn TELUGU SIGN NUKTA
+0C3E..0C40 ; Extend # Mn [3] TELUGU VOWEL SIGN AA..TELUGU VOWEL SIGN II
+0C41..0C44 ; Extend # Mc [4] TELUGU VOWEL SIGN U..TELUGU VOWEL SIGN VOCALIC RR
+0C46..0C48 ; Extend # Mn [3] TELUGU VOWEL SIGN E..TELUGU VOWEL SIGN AI
+0C4A..0C4D ; Extend # Mn [4] TELUGU VOWEL SIGN O..TELUGU SIGN VIRAMA
+0C55..0C56 ; Extend # Mn [2] TELUGU LENGTH MARK..TELUGU AI LENGTH MARK
+0C62..0C63 ; Extend # Mn [2] TELUGU VOWEL SIGN VOCALIC L..TELUGU VOWEL SIGN VOCALIC LL
+0C81 ; Extend # Mn KANNADA SIGN CANDRABINDU
+0C82..0C83 ; Extend # Mc [2] KANNADA SIGN ANUSVARA..KANNADA SIGN VISARGA
+0CBC ; Extend # Mn KANNADA SIGN NUKTA
+0CBE ; Extend # Mc KANNADA VOWEL SIGN AA
+0CBF ; Extend # Mn KANNADA VOWEL SIGN I
+0CC0..0CC4 ; Extend # Mc [5] KANNADA VOWEL SIGN II..KANNADA VOWEL SIGN VOCALIC RR
+0CC6 ; Extend # Mn KANNADA VOWEL SIGN E
+0CC7..0CC8 ; Extend # Mc [2] KANNADA VOWEL SIGN EE..KANNADA VOWEL SIGN AI
+0CCA..0CCB ; Extend # Mc [2] KANNADA VOWEL SIGN O..KANNADA VOWEL SIGN OO
+0CCC..0CCD ; Extend # Mn [2] KANNADA VOWEL SIGN AU..KANNADA SIGN VIRAMA
+0CD5..0CD6 ; Extend # Mc [2] KANNADA LENGTH MARK..KANNADA AI LENGTH MARK
+0CE2..0CE3 ; Extend # Mn [2] KANNADA VOWEL SIGN VOCALIC L..KANNADA VOWEL SIGN VOCALIC LL
+0CF3 ; Extend # Mc KANNADA SIGN COMBINING ANUSVARA ABOVE RIGHT
+0D00..0D01 ; Extend # Mn [2] MALAYALAM SIGN COMBINING ANUSVARA ABOVE..MALAYALAM SIGN CANDRABINDU
+0D02..0D03 ; Extend # Mc [2] MALAYALAM SIGN ANUSVARA..MALAYALAM SIGN VISARGA
+0D3B..0D3C ; Extend # Mn [2] MALAYALAM SIGN VERTICAL BAR VIRAMA..MALAYALAM SIGN CIRCULAR VIRAMA
+0D3E..0D40 ; Extend # Mc [3] MALAYALAM VOWEL SIGN AA..MALAYALAM VOWEL SIGN II
+0D41..0D44 ; Extend # Mn [4] MALAYALAM VOWEL SIGN U..MALAYALAM VOWEL SIGN VOCALIC RR
+0D46..0D48 ; Extend # Mc [3] MALAYALAM VOWEL SIGN E..MALAYALAM VOWEL SIGN AI
+0D4A..0D4C ; Extend # Mc [3] MALAYALAM VOWEL SIGN O..MALAYALAM VOWEL SIGN AU
+0D4D ; Extend # Mn MALAYALAM SIGN VIRAMA
+0D57 ; Extend # Mc MALAYALAM AU LENGTH MARK
+0D62..0D63 ; Extend # Mn [2] MALAYALAM VOWEL SIGN VOCALIC L..MALAYALAM VOWEL SIGN VOCALIC LL
+0D81 ; Extend # Mn SINHALA SIGN CANDRABINDU
+0D82..0D83 ; Extend # Mc [2] SINHALA SIGN ANUSVARAYA..SINHALA SIGN VISARGAYA
+0DCA ; Extend # Mn SINHALA SIGN AL-LAKUNA
+0DCF..0DD1 ; Extend # Mc [3] SINHALA VOWEL SIGN AELA-PILLA..SINHALA VOWEL SIGN DIGA AEDA-PILLA
+0DD2..0DD4 ; Extend # Mn [3] SINHALA VOWEL SIGN KETTI IS-PILLA..SINHALA VOWEL SIGN KETTI PAA-PILLA
+0DD6 ; Extend # Mn SINHALA VOWEL SIGN DIGA PAA-PILLA
+0DD8..0DDF ; Extend # Mc [8] SINHALA VOWEL SIGN GAETTA-PILLA..SINHALA VOWEL SIGN GAYANUKITTA
+0DF2..0DF3 ; Extend # Mc [2] SINHALA VOWEL SIGN DIGA GAETTA-PILLA..SINHALA VOWEL SIGN DIGA GAYANUKITTA
+0E31 ; Extend # Mn THAI CHARACTER MAI HAN-AKAT
+0E34..0E3A ; Extend # Mn [7] THAI CHARACTER SARA I..THAI CHARACTER PHINTHU
+0E47..0E4E ; Extend # Mn [8] THAI CHARACTER MAITAIKHU..THAI CHARACTER YAMAKKAN
+0EB1 ; Extend # Mn LAO VOWEL SIGN MAI KAN
+0EB4..0EBC ; Extend # Mn [9] LAO VOWEL SIGN I..LAO SEMIVOWEL SIGN LO
+0EC8..0ECE ; Extend # Mn [7] LAO TONE MAI EK..LAO YAMAKKAN
+0F18..0F19 ; Extend # Mn [2] TIBETAN ASTROLOGICAL SIGN -KHYUD PA..TIBETAN ASTROLOGICAL SIGN SDONG TSHUGS
+0F35 ; Extend # Mn TIBETAN MARK NGAS BZUNG NYI ZLA
+0F37 ; Extend # Mn TIBETAN MARK NGAS BZUNG SGOR RTAGS
+0F39 ; Extend # Mn TIBETAN MARK TSA -PHRU
+0F3E..0F3F ; Extend # Mc [2] TIBETAN SIGN YAR TSHES..TIBETAN SIGN MAR TSHES
+0F71..0F7E ; Extend # Mn [14] TIBETAN VOWEL SIGN AA..TIBETAN SIGN RJES SU NGA RO
+0F7F ; Extend # Mc TIBETAN SIGN RNAM BCAD
+0F80..0F84 ; Extend # Mn [5] TIBETAN VOWEL SIGN REVERSED I..TIBETAN MARK HALANTA
+0F86..0F87 ; Extend # Mn [2] TIBETAN SIGN LCI RTAGS..TIBETAN SIGN YANG RTAGS
+0F8D..0F97 ; Extend # Mn [11] TIBETAN SUBJOINED SIGN LCE TSA CAN..TIBETAN SUBJOINED LETTER JA
+0F99..0FBC ; Extend # Mn [36] TIBETAN SUBJOINED LETTER NYA..TIBETAN SUBJOINED LETTER FIXED-FORM RA
+0FC6 ; Extend # Mn TIBETAN SYMBOL PADMA GDAN
+102B..102C ; Extend # Mc [2] MYANMAR VOWEL SIGN TALL AA..MYANMAR VOWEL SIGN AA
+102D..1030 ; Extend # Mn [4] MYANMAR VOWEL SIGN I..MYANMAR VOWEL SIGN UU
+1031 ; Extend # Mc MYANMAR VOWEL SIGN E
+1032..1037 ; Extend # Mn [6] MYANMAR VOWEL SIGN AI..MYANMAR SIGN DOT BELOW
+1038 ; Extend # Mc MYANMAR SIGN VISARGA
+1039..103A ; Extend # Mn [2] MYANMAR SIGN VIRAMA..MYANMAR SIGN ASAT
+103B..103C ; Extend # Mc [2] MYANMAR CONSONANT SIGN MEDIAL YA..MYANMAR CONSONANT SIGN MEDIAL RA
+103D..103E ; Extend # Mn [2] MYANMAR CONSONANT SIGN MEDIAL WA..MYANMAR CONSONANT SIGN MEDIAL HA
+1056..1057 ; Extend # Mc [2] MYANMAR VOWEL SIGN VOCALIC R..MYANMAR VOWEL SIGN VOCALIC RR
+1058..1059 ; Extend # Mn [2] MYANMAR VOWEL SIGN VOCALIC L..MYANMAR VOWEL SIGN VOCALIC LL
+105E..1060 ; Extend # Mn [3] MYANMAR CONSONANT SIGN MON MEDIAL NA..MYANMAR CONSONANT SIGN MON MEDIAL LA
+1062..1064 ; Extend # Mc [3] MYANMAR VOWEL SIGN SGAW KAREN EU..MYANMAR TONE MARK SGAW KAREN KE PHO
+1067..106D ; Extend # Mc [7] MYANMAR VOWEL SIGN WESTERN PWO KAREN EU..MYANMAR SIGN WESTERN PWO KAREN TONE-5
+1071..1074 ; Extend # Mn [4] MYANMAR VOWEL SIGN GEBA KAREN I..MYANMAR VOWEL SIGN KAYAH EE
+1082 ; Extend # Mn MYANMAR CONSONANT SIGN SHAN MEDIAL WA
+1083..1084 ; Extend # Mc [2] MYANMAR VOWEL SIGN SHAN AA..MYANMAR VOWEL SIGN SHAN E
+1085..1086 ; Extend # Mn [2] MYANMAR VOWEL SIGN SHAN E ABOVE..MYANMAR VOWEL SIGN SHAN FINAL Y
+1087..108C ; Extend # Mc [6] MYANMAR SIGN SHAN TONE-2..MYANMAR SIGN SHAN COUNCIL TONE-3
+108D ; Extend # Mn MYANMAR SIGN SHAN COUNCIL EMPHATIC TONE
+108F ; Extend # Mc MYANMAR SIGN RUMAI PALAUNG TONE-5
+109A..109C ; Extend # Mc [3] MYANMAR SIGN KHAMTI TONE-1..MYANMAR VOWEL SIGN AITON A
+109D ; Extend # Mn MYANMAR VOWEL SIGN AITON AI
+135D..135F ; Extend # Mn [3] ETHIOPIC COMBINING GEMINATION AND VOWEL LENGTH MARK..ETHIOPIC COMBINING GEMINATION MARK
+1712..1714 ; Extend # Mn [3] TAGALOG VOWEL SIGN I..TAGALOG SIGN VIRAMA
+1715 ; Extend # Mc TAGALOG SIGN PAMUDPOD
+1732..1733 ; Extend # Mn [2] HANUNOO VOWEL SIGN I..HANUNOO VOWEL SIGN U
+1734 ; Extend # Mc HANUNOO SIGN PAMUDPOD
+1752..1753 ; Extend # Mn [2] BUHID VOWEL SIGN I..BUHID VOWEL SIGN U
+1772..1773 ; Extend # Mn [2] TAGBANWA VOWEL SIGN I..TAGBANWA VOWEL SIGN U
+17B4..17B5 ; Extend # Mn [2] KHMER VOWEL INHERENT AQ..KHMER VOWEL INHERENT AA
+17B6 ; Extend # Mc KHMER VOWEL SIGN AA
+17B7..17BD ; Extend # Mn [7] KHMER VOWEL SIGN I..KHMER VOWEL SIGN UA
+17BE..17C5 ; Extend # Mc [8] KHMER VOWEL SIGN OE..KHMER VOWEL SIGN AU
+17C6 ; Extend # Mn KHMER SIGN NIKAHIT
+17C7..17C8 ; Extend # Mc [2] KHMER SIGN REAHMUK..KHMER SIGN YUUKALEAPINTU
+17C9..17D3 ; Extend # Mn [11] KHMER SIGN MUUSIKATOAN..KHMER SIGN BATHAMASAT
+17DD ; Extend # Mn KHMER SIGN ATTHACAN
+180B..180D ; Extend # Mn [3] MONGOLIAN FREE VARIATION SELECTOR ONE..MONGOLIAN FREE VARIATION SELECTOR THREE
+180F ; Extend # Mn MONGOLIAN FREE VARIATION SELECTOR FOUR
+1885..1886 ; Extend # Mn [2] MONGOLIAN LETTER ALI GALI BALUDA..MONGOLIAN LETTER ALI GALI THREE BALUDA
+18A9 ; Extend # Mn MONGOLIAN LETTER ALI GALI DAGALGA
+1920..1922 ; Extend # Mn [3] LIMBU VOWEL SIGN A..LIMBU VOWEL SIGN U
+1923..1926 ; Extend # Mc [4] LIMBU VOWEL SIGN EE..LIMBU VOWEL SIGN AU
+1927..1928 ; Extend # Mn [2] LIMBU VOWEL SIGN E..LIMBU VOWEL SIGN O
+1929..192B ; Extend # Mc [3] LIMBU SUBJOINED LETTER YA..LIMBU SUBJOINED LETTER WA
+1930..1931 ; Extend # Mc [2] LIMBU SMALL LETTER KA..LIMBU SMALL LETTER NGA
+1932 ; Extend # Mn LIMBU SMALL LETTER ANUSVARA
+1933..1938 ; Extend # Mc [6] LIMBU SMALL LETTER TA..LIMBU SMALL LETTER LA
+1939..193B ; Extend # Mn [3] LIMBU SIGN MUKPHRENG..LIMBU SIGN SA-I
+1A17..1A18 ; Extend # Mn [2] BUGINESE VOWEL SIGN I..BUGINESE VOWEL SIGN U
+1A19..1A1A ; Extend # Mc [2] BUGINESE VOWEL SIGN E..BUGINESE VOWEL SIGN O
+1A1B ; Extend # Mn BUGINESE VOWEL SIGN AE
+1A55 ; Extend # Mc TAI THAM CONSONANT SIGN MEDIAL RA
+1A56 ; Extend # Mn TAI THAM CONSONANT SIGN MEDIAL LA
+1A57 ; Extend # Mc TAI THAM CONSONANT SIGN LA TANG LAI
+1A58..1A5E ; Extend # Mn [7] TAI THAM SIGN MAI KANG LAI..TAI THAM CONSONANT SIGN SA
+1A60 ; Extend # Mn TAI THAM SIGN SAKOT
+1A61 ; Extend # Mc TAI THAM VOWEL SIGN A
+1A62 ; Extend # Mn TAI THAM VOWEL SIGN MAI SAT
+1A63..1A64 ; Extend # Mc [2] TAI THAM VOWEL SIGN AA..TAI THAM VOWEL SIGN TALL AA
+1A65..1A6C ; Extend # Mn [8] TAI THAM VOWEL SIGN I..TAI THAM VOWEL SIGN OA BELOW
+1A6D..1A72 ; Extend # Mc [6] TAI THAM VOWEL SIGN OY..TAI THAM VOWEL SIGN THAM AI
+1A73..1A7C ; Extend # Mn [10] TAI THAM VOWEL SIGN OA ABOVE..TAI THAM SIGN KHUEN-LUE KARAN
+1A7F ; Extend # Mn TAI THAM COMBINING CRYPTOGRAMMIC DOT
+1AB0..1ABD ; Extend # Mn [14] COMBINING DOUBLED CIRCUMFLEX ACCENT..COMBINING PARENTHESES BELOW
+1ABE ; Extend # Me COMBINING PARENTHESES OVERLAY
+1ABF..1ADD ; Extend # Mn [31] COMBINING LATIN SMALL LETTER W BELOW..COMBINING DOT-AND-RING BELOW
+1AE0..1AEB ; Extend # Mn [12] COMBINING LEFT TACK ABOVE..COMBINING DOUBLE RIGHTWARDS ARROW ABOVE
+1B00..1B03 ; Extend # Mn [4] BALINESE SIGN ULU RICEM..BALINESE SIGN SURANG
+1B04 ; Extend # Mc BALINESE SIGN BISAH
+1B34 ; Extend # Mn BALINESE SIGN REREKAN
+1B35 ; Extend # Mc BALINESE VOWEL SIGN TEDUNG
+1B36..1B3A ; Extend # Mn [5] BALINESE VOWEL SIGN ULU..BALINESE VOWEL SIGN RA REPA
+1B3B ; Extend # Mc BALINESE VOWEL SIGN RA REPA TEDUNG
+1B3C ; Extend # Mn BALINESE VOWEL SIGN LA LENGA
+1B3D..1B41 ; Extend # Mc [5] BALINESE VOWEL SIGN LA LENGA TEDUNG..BALINESE VOWEL SIGN TALING REPA TEDUNG
+1B42 ; Extend # Mn BALINESE VOWEL SIGN PEPET
+1B43..1B44 ; Extend # Mc [2] BALINESE VOWEL SIGN PEPET TEDUNG..BALINESE ADEG ADEG
+1B6B..1B73 ; Extend # Mn [9] BALINESE MUSICAL SYMBOL COMBINING TEGEH..BALINESE MUSICAL SYMBOL COMBINING GONG
+1B80..1B81 ; Extend # Mn [2] SUNDANESE SIGN PANYECEK..SUNDANESE SIGN PANGLAYAR
+1B82 ; Extend # Mc SUNDANESE SIGN PANGWISAD
+1BA1 ; Extend # Mc SUNDANESE CONSONANT SIGN PAMINGKAL
+1BA2..1BA5 ; Extend # Mn [4] SUNDANESE CONSONANT SIGN PANYAKRA..SUNDANESE VOWEL SIGN PANYUKU
+1BA6..1BA7 ; Extend # Mc [2] SUNDANESE VOWEL SIGN PANAELAENG..SUNDANESE VOWEL SIGN PANOLONG
+1BA8..1BA9 ; Extend # Mn [2] SUNDANESE VOWEL SIGN PAMEPET..SUNDANESE VOWEL SIGN PANEULEUNG
+1BAA ; Extend # Mc SUNDANESE SIGN PAMAAEH
+1BAB..1BAD ; Extend # Mn [3] SUNDANESE SIGN VIRAMA..SUNDANESE CONSONANT SIGN PASANGAN WA
+1BE6 ; Extend # Mn BATAK SIGN TOMPI
+1BE7 ; Extend # Mc BATAK VOWEL SIGN E
+1BE8..1BE9 ; Extend # Mn [2] BATAK VOWEL SIGN PAKPAK E..BATAK VOWEL SIGN EE
+1BEA..1BEC ; Extend # Mc [3] BATAK VOWEL SIGN I..BATAK VOWEL SIGN O
+1BED ; Extend # Mn BATAK VOWEL SIGN KARO O
+1BEE ; Extend # Mc BATAK VOWEL SIGN U
+1BEF..1BF1 ; Extend # Mn [3] BATAK VOWEL SIGN U FOR SIMALUNGUN SA..BATAK CONSONANT SIGN H
+1BF2..1BF3 ; Extend # Mc [2] BATAK PANGOLAT..BATAK PANONGONAN
+1C24..1C2B ; Extend # Mc [8] LEPCHA SUBJOINED LETTER YA..LEPCHA VOWEL SIGN UU
+1C2C..1C33 ; Extend # Mn [8] LEPCHA VOWEL SIGN E..LEPCHA CONSONANT SIGN T
+1C34..1C35 ; Extend # Mc [2] LEPCHA CONSONANT SIGN NYIN-DO..LEPCHA CONSONANT SIGN KANG
+1C36..1C37 ; Extend # Mn [2] LEPCHA SIGN RAN..LEPCHA SIGN NUKTA
+1CD0..1CD2 ; Extend # Mn [3] VEDIC TONE KARSHANA..VEDIC TONE PRENKHA
+1CD4..1CE0 ; Extend # Mn [13] VEDIC SIGN YAJURVEDIC MIDLINE SVARITA..VEDIC TONE RIGVEDIC KASHMIRI INDEPENDENT SVARITA
+1CE1 ; Extend # Mc VEDIC TONE ATHARVAVEDIC INDEPENDENT SVARITA
+1CE2..1CE8 ; Extend # Mn [7] VEDIC SIGN VISARGA SVARITA..VEDIC SIGN VISARGA ANUDATTA WITH TAIL
+1CED ; Extend # Mn VEDIC SIGN TIRYAK
+1CF4 ; Extend # Mn VEDIC TONE CANDRA ABOVE
+1CF7 ; Extend # Mc VEDIC SIGN ATIKRAMA
+1CF8..1CF9 ; Extend # Mn [2] VEDIC TONE RING ABOVE..VEDIC TONE DOUBLE RING ABOVE
+1DC0..1DFF ; Extend # Mn [64] COMBINING DOTTED GRAVE ACCENT..COMBINING RIGHT ARROWHEAD AND DOWN ARROWHEAD BELOW
+200C ; Extend # Cf ZERO WIDTH NON-JOINER
+20D0..20DC ; Extend # Mn [13] COMBINING LEFT HARPOON ABOVE..COMBINING FOUR DOTS ABOVE
+20DD..20E0 ; Extend # Me [4] COMBINING ENCLOSING CIRCLE..COMBINING ENCLOSING CIRCLE BACKSLASH
+20E1 ; Extend # Mn COMBINING LEFT RIGHT ARROW ABOVE
+20E2..20E4 ; Extend # Me [3] COMBINING ENCLOSING SCREEN..COMBINING ENCLOSING UPWARD POINTING TRIANGLE
+20E5..20F0 ; Extend # Mn [12] COMBINING REVERSE SOLIDUS OVERLAY..COMBINING ASTERISK ABOVE
+2CEF..2CF1 ; Extend # Mn [3] COPTIC COMBINING NI ABOVE..COPTIC COMBINING SPIRITUS LENIS
+2D7F ; Extend # Mn TIFINAGH CONSONANT JOINER
+2DE0..2DFF ; Extend # Mn [32] COMBINING CYRILLIC LETTER BE..COMBINING CYRILLIC LETTER IOTIFIED BIG YUS
+302A..302D ; Extend # Mn [4] IDEOGRAPHIC LEVEL TONE MARK..IDEOGRAPHIC ENTERING TONE MARK
+302E..302F ; Extend # Mc [2] HANGUL SINGLE DOT TONE MARK..HANGUL DOUBLE DOT TONE MARK
+3099..309A ; Extend # Mn [2] COMBINING KATAKANA-HIRAGANA VOICED SOUND MARK..COMBINING KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK
+A66F ; Extend # Mn COMBINING CYRILLIC VZMET
+A670..A672 ; Extend # Me [3] COMBINING CYRILLIC TEN MILLIONS SIGN..COMBINING CYRILLIC THOUSAND MILLIONS SIGN
+A674..A67D ; Extend # Mn [10] COMBINING CYRILLIC LETTER UKRAINIAN IE..COMBINING CYRILLIC PAYEROK
+A69E..A69F ; Extend # Mn [2] COMBINING CYRILLIC LETTER EF..COMBINING CYRILLIC LETTER IOTIFIED E
+A6F0..A6F1 ; Extend # Mn [2] BAMUM COMBINING MARK KOQNDON..BAMUM COMBINING MARK TUKWENTIS
+A802 ; Extend # Mn SYLOTI NAGRI SIGN DVISVARA
+A806 ; Extend # Mn SYLOTI NAGRI SIGN HASANTA
+A80B ; Extend # Mn SYLOTI NAGRI SIGN ANUSVARA
+A823..A824 ; Extend # Mc [2] SYLOTI NAGRI VOWEL SIGN A..SYLOTI NAGRI VOWEL SIGN I
+A825..A826 ; Extend # Mn [2] SYLOTI NAGRI VOWEL SIGN U..SYLOTI NAGRI VOWEL SIGN E
+A827 ; Extend # Mc SYLOTI NAGRI VOWEL SIGN OO
+A82C ; Extend # Mn SYLOTI NAGRI SIGN ALTERNATE HASANTA
+A880..A881 ; Extend # Mc [2] SAURASHTRA SIGN ANUSVARA..SAURASHTRA SIGN VISARGA
+A8B4..A8C3 ; Extend # Mc [16] SAURASHTRA CONSONANT SIGN HAARU..SAURASHTRA VOWEL SIGN AU
+A8C4..A8C5 ; Extend # Mn [2] SAURASHTRA SIGN VIRAMA..SAURASHTRA SIGN CANDRABINDU
+A8E0..A8F1 ; Extend # Mn [18] COMBINING DEVANAGARI DIGIT ZERO..COMBINING DEVANAGARI SIGN AVAGRAHA
+A8FF ; Extend # Mn DEVANAGARI VOWEL SIGN AY
+A926..A92D ; Extend # Mn [8] KAYAH LI VOWEL UE..KAYAH LI TONE CALYA PLOPHU
+A947..A951 ; Extend # Mn [11] REJANG VOWEL SIGN I..REJANG CONSONANT SIGN R
+A952..A953 ; Extend # Mc [2] REJANG CONSONANT SIGN H..REJANG VIRAMA
+A980..A982 ; Extend # Mn [3] JAVANESE SIGN PANYANGGA..JAVANESE SIGN LAYAR
+A983 ; Extend # Mc JAVANESE SIGN WIGNYAN
+A9B3 ; Extend # Mn JAVANESE SIGN CECAK TELU
+A9B4..A9B5 ; Extend # Mc [2] JAVANESE VOWEL SIGN TARUNG..JAVANESE VOWEL SIGN TOLONG
+A9B6..A9B9 ; Extend # Mn [4] JAVANESE VOWEL SIGN WULU..JAVANESE VOWEL SIGN SUKU MENDUT
+A9BA..A9BB ; Extend # Mc [2] JAVANESE VOWEL SIGN TALING..JAVANESE VOWEL SIGN DIRGA MURE
+A9BC..A9BD ; Extend # Mn [2] JAVANESE VOWEL SIGN PEPET..JAVANESE CONSONANT SIGN KERET
+A9BE..A9C0 ; Extend # Mc [3] JAVANESE CONSONANT SIGN PENGKAL..JAVANESE PANGKON
+A9E5 ; Extend # Mn MYANMAR SIGN SHAN SAW
+AA29..AA2E ; Extend # Mn [6] CHAM VOWEL SIGN AA..CHAM VOWEL SIGN OE
+AA2F..AA30 ; Extend # Mc [2] CHAM VOWEL SIGN O..CHAM VOWEL SIGN AI
+AA31..AA32 ; Extend # Mn [2] CHAM VOWEL SIGN AU..CHAM VOWEL SIGN UE
+AA33..AA34 ; Extend # Mc [2] CHAM CONSONANT SIGN YA..CHAM CONSONANT SIGN RA
+AA35..AA36 ; Extend # Mn [2] CHAM CONSONANT SIGN LA..CHAM CONSONANT SIGN WA
+AA43 ; Extend # Mn CHAM CONSONANT SIGN FINAL NG
+AA4C ; Extend # Mn CHAM CONSONANT SIGN FINAL M
+AA4D ; Extend # Mc CHAM CONSONANT SIGN FINAL H
+AA7B ; Extend # Mc MYANMAR SIGN PAO KAREN TONE
+AA7C ; Extend # Mn MYANMAR SIGN TAI LAING TONE-2
+AA7D ; Extend # Mc MYANMAR SIGN TAI LAING TONE-5
+AAB0 ; Extend # Mn TAI VIET MAI KANG
+AAB2..AAB4 ; Extend # Mn [3] TAI VIET VOWEL I..TAI VIET VOWEL U
+AAB7..AAB8 ; Extend # Mn [2] TAI VIET MAI KHIT..TAI VIET VOWEL IA
+AABE..AABF ; Extend # Mn [2] TAI VIET VOWEL AM..TAI VIET TONE MAI EK
+AAC1 ; Extend # Mn TAI VIET TONE MAI THO
+AAEB ; Extend # Mc MEETEI MAYEK VOWEL SIGN II
+AAEC..AAED ; Extend # Mn [2] MEETEI MAYEK VOWEL SIGN UU..MEETEI MAYEK VOWEL SIGN AAI
+AAEE..AAEF ; Extend # Mc [2] MEETEI MAYEK VOWEL SIGN AU..MEETEI MAYEK VOWEL SIGN AAU
+AAF5 ; Extend # Mc MEETEI MAYEK VOWEL SIGN VISARGA
+AAF6 ; Extend # Mn MEETEI MAYEK VIRAMA
+ABE3..ABE4 ; Extend # Mc [2] MEETEI MAYEK VOWEL SIGN ONAP..MEETEI MAYEK VOWEL SIGN INAP
+ABE5 ; Extend # Mn MEETEI MAYEK VOWEL SIGN ANAP
+ABE6..ABE7 ; Extend # Mc [2] MEETEI MAYEK VOWEL SIGN YENAP..MEETEI MAYEK VOWEL SIGN SOUNAP
+ABE8 ; Extend # Mn MEETEI MAYEK VOWEL SIGN UNAP
+ABE9..ABEA ; Extend # Mc [2] MEETEI MAYEK VOWEL SIGN CHEINAP..MEETEI MAYEK VOWEL SIGN NUNG
+ABEC ; Extend # Mc MEETEI MAYEK LUM IYEK
+ABED ; Extend # Mn MEETEI MAYEK APUN IYEK
+FB1E ; Extend # Mn HEBREW POINT JUDEO-SPANISH VARIKA
+FE00..FE0F ; Extend # Mn [16] VARIATION SELECTOR-1..VARIATION SELECTOR-16
+FE20..FE2F ; Extend # Mn [16] COMBINING LIGATURE LEFT HALF..COMBINING CYRILLIC TITLO RIGHT HALF
+FF9E..FF9F ; Extend # Lm [2] HALFWIDTH KATAKANA VOICED SOUND MARK..HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK
+101FD ; Extend # Mn PHAISTOS DISC SIGN COMBINING OBLIQUE STROKE
+102E0 ; Extend # Mn COPTIC EPACT THOUSANDS MARK
+10376..1037A ; Extend # Mn [5] COMBINING OLD PERMIC LETTER AN..COMBINING OLD PERMIC LETTER SII
+10A01..10A03 ; Extend # Mn [3] KHAROSHTHI VOWEL SIGN I..KHAROSHTHI VOWEL SIGN VOCALIC R
+10A05..10A06 ; Extend # Mn [2] KHAROSHTHI VOWEL SIGN E..KHAROSHTHI VOWEL SIGN O
+10A0C..10A0F ; Extend # Mn [4] KHAROSHTHI VOWEL LENGTH MARK..KHAROSHTHI SIGN VISARGA
+10A38..10A3A ; Extend # Mn [3] KHAROSHTHI SIGN BAR ABOVE..KHAROSHTHI SIGN DOT BELOW
+10A3F ; Extend # Mn KHAROSHTHI VIRAMA
+10AE5..10AE6 ; Extend # Mn [2] MANICHAEAN ABBREVIATION MARK ABOVE..MANICHAEAN ABBREVIATION MARK BELOW
+10D24..10D27 ; Extend # Mn [4] HANIFI ROHINGYA SIGN HARBAHAY..HANIFI ROHINGYA SIGN TASSI
+10D69..10D6D ; Extend # Mn [5] GARAY VOWEL SIGN E..GARAY CONSONANT NASALIZATION MARK
+10EAB..10EAC ; Extend # Mn [2] YEZIDI COMBINING HAMZA MARK..YEZIDI COMBINING MADDA MARK
+10EFA..10EFF ; Extend # Mn [6] ARABIC DOUBLE VERTICAL BAR BELOW..ARABIC SMALL LOW WORD MADDA
+10F46..10F50 ; Extend # Mn [11] SOGDIAN COMBINING DOT BELOW..SOGDIAN COMBINING STROKE BELOW
+10F82..10F85 ; Extend # Mn [4] OLD UYGHUR COMBINING DOT ABOVE..OLD UYGHUR COMBINING TWO DOTS BELOW
+11000 ; Extend # Mc BRAHMI SIGN CANDRABINDU
+11001 ; Extend # Mn BRAHMI SIGN ANUSVARA
+11002 ; Extend # Mc BRAHMI SIGN VISARGA
+11038..11046 ; Extend # Mn [15] BRAHMI VOWEL SIGN AA..BRAHMI VIRAMA
+11070 ; Extend # Mn BRAHMI SIGN OLD TAMIL VIRAMA
+11073..11074 ; Extend # Mn [2] BRAHMI VOWEL SIGN OLD TAMIL SHORT E..BRAHMI VOWEL SIGN OLD TAMIL SHORT O
+1107F..11081 ; Extend # Mn [3] BRAHMI NUMBER JOINER..KAITHI SIGN ANUSVARA
+11082 ; Extend # Mc KAITHI SIGN VISARGA
+110B0..110B2 ; Extend # Mc [3] KAITHI VOWEL SIGN AA..KAITHI VOWEL SIGN II
+110B3..110B6 ; Extend # Mn [4] KAITHI VOWEL SIGN U..KAITHI VOWEL SIGN AI
+110B7..110B8 ; Extend # Mc [2] KAITHI VOWEL SIGN O..KAITHI VOWEL SIGN AU
+110B9..110BA ; Extend # Mn [2] KAITHI SIGN VIRAMA..KAITHI SIGN NUKTA
+110C2 ; Extend # Mn KAITHI VOWEL SIGN VOCALIC R
+11100..11102 ; Extend # Mn [3] CHAKMA SIGN CANDRABINDU..CHAKMA SIGN VISARGA
+11127..1112B ; Extend # Mn [5] CHAKMA VOWEL SIGN A..CHAKMA VOWEL SIGN UU
+1112C ; Extend # Mc CHAKMA VOWEL SIGN E
+1112D..11134 ; Extend # Mn [8] CHAKMA VOWEL SIGN AI..CHAKMA MAAYYAA
+11145..11146 ; Extend # Mc [2] CHAKMA VOWEL SIGN AA..CHAKMA VOWEL SIGN EI
+11173 ; Extend # Mn MAHAJANI SIGN NUKTA
+11180..11181 ; Extend # Mn [2] SHARADA SIGN CANDRABINDU..SHARADA SIGN ANUSVARA
+11182 ; Extend # Mc SHARADA SIGN VISARGA
+111B3..111B5 ; Extend # Mc [3] SHARADA VOWEL SIGN AA..SHARADA VOWEL SIGN II
+111B6..111BE ; Extend # Mn [9] SHARADA VOWEL SIGN U..SHARADA VOWEL SIGN O
+111BF..111C0 ; Extend # Mc [2] SHARADA VOWEL SIGN AU..SHARADA SIGN VIRAMA
+111C9..111CC ; Extend # Mn [4] SHARADA SANDHI MARK..SHARADA EXTRA SHORT VOWEL MARK
+111CE ; Extend # Mc SHARADA VOWEL SIGN PRISHTHAMATRA E
+111CF ; Extend # Mn SHARADA SIGN INVERTED CANDRABINDU
+1122C..1122E ; Extend # Mc [3] KHOJKI VOWEL SIGN AA..KHOJKI VOWEL SIGN II
+1122F..11231 ; Extend # Mn [3] KHOJKI VOWEL SIGN U..KHOJKI VOWEL SIGN AI
+11232..11233 ; Extend # Mc [2] KHOJKI VOWEL SIGN O..KHOJKI VOWEL SIGN AU
+11234 ; Extend # Mn KHOJKI SIGN ANUSVARA
+11235 ; Extend # Mc KHOJKI SIGN VIRAMA
+11236..11237 ; Extend # Mn [2] KHOJKI SIGN NUKTA..KHOJKI SIGN SHADDA
+1123E ; Extend # Mn KHOJKI SIGN SUKUN
+11241 ; Extend # Mn KHOJKI VOWEL SIGN VOCALIC R
+112DF ; Extend # Mn KHUDAWADI SIGN ANUSVARA
+112E0..112E2 ; Extend # Mc [3] KHUDAWADI VOWEL SIGN AA..KHUDAWADI VOWEL SIGN II
+112E3..112EA ; Extend # Mn [8] KHUDAWADI VOWEL SIGN U..KHUDAWADI SIGN VIRAMA
+11300..11301 ; Extend # Mn [2] GRANTHA SIGN COMBINING ANUSVARA ABOVE..GRANTHA SIGN CANDRABINDU
+11302..11303 ; Extend # Mc [2] GRANTHA SIGN ANUSVARA..GRANTHA SIGN VISARGA
+1133B..1133C ; Extend # Mn [2] COMBINING BINDU BELOW..GRANTHA SIGN NUKTA
+1133E..1133F ; Extend # Mc [2] GRANTHA VOWEL SIGN AA..GRANTHA VOWEL SIGN I
+11340 ; Extend # Mn GRANTHA VOWEL SIGN II
+11341..11344 ; Extend # Mc [4] GRANTHA VOWEL SIGN U..GRANTHA VOWEL SIGN VOCALIC RR
+11347..11348 ; Extend # Mc [2] GRANTHA VOWEL SIGN EE..GRANTHA VOWEL SIGN AI
+1134B..1134D ; Extend # Mc [3] GRANTHA VOWEL SIGN OO..GRANTHA SIGN VIRAMA
+11357 ; Extend # Mc GRANTHA AU LENGTH MARK
+11362..11363 ; Extend # Mc [2] GRANTHA VOWEL SIGN VOCALIC L..GRANTHA VOWEL SIGN VOCALIC LL
+11366..1136C ; Extend # Mn [7] COMBINING GRANTHA DIGIT ZERO..COMBINING GRANTHA DIGIT SIX
+11370..11374 ; Extend # Mn [5] COMBINING GRANTHA LETTER A..COMBINING GRANTHA LETTER PA
+113B8..113BA ; Extend # Mc [3] TULU-TIGALARI VOWEL SIGN AA..TULU-TIGALARI VOWEL SIGN II
+113BB..113C0 ; Extend # Mn [6] TULU-TIGALARI VOWEL SIGN U..TULU-TIGALARI VOWEL SIGN VOCALIC LL
+113C2 ; Extend # Mc TULU-TIGALARI VOWEL SIGN EE
+113C5 ; Extend # Mc TULU-TIGALARI VOWEL SIGN AI
+113C7..113CA ; Extend # Mc [4] TULU-TIGALARI VOWEL SIGN OO..TULU-TIGALARI SIGN CANDRA ANUNASIKA
+113CC..113CD ; Extend # Mc [2] TULU-TIGALARI SIGN ANUSVARA..TULU-TIGALARI SIGN VISARGA
+113CE ; Extend # Mn TULU-TIGALARI SIGN VIRAMA
+113CF ; Extend # Mc TULU-TIGALARI SIGN LOOPED VIRAMA
+113D0 ; Extend # Mn TULU-TIGALARI CONJOINER
+113D2 ; Extend # Mn TULU-TIGALARI GEMINATION MARK
+113E1..113E2 ; Extend # Mn [2] TULU-TIGALARI VEDIC TONE SVARITA..TULU-TIGALARI VEDIC TONE ANUDATTA
+11435..11437 ; Extend # Mc [3] NEWA VOWEL SIGN AA..NEWA VOWEL SIGN II
+11438..1143F ; Extend # Mn [8] NEWA VOWEL SIGN U..NEWA VOWEL SIGN AI
+11440..11441 ; Extend # Mc [2] NEWA VOWEL SIGN O..NEWA VOWEL SIGN AU
+11442..11444 ; Extend # Mn [3] NEWA SIGN VIRAMA..NEWA SIGN ANUSVARA
+11445 ; Extend # Mc NEWA SIGN VISARGA
+11446 ; Extend # Mn NEWA SIGN NUKTA
+1145E ; Extend # Mn NEWA SANDHI MARK
+114B0..114B2 ; Extend # Mc [3] TIRHUTA VOWEL SIGN AA..TIRHUTA VOWEL SIGN II
+114B3..114B8 ; Extend # Mn [6] TIRHUTA VOWEL SIGN U..TIRHUTA VOWEL SIGN VOCALIC LL
+114B9 ; Extend # Mc TIRHUTA VOWEL SIGN E
+114BA ; Extend # Mn TIRHUTA VOWEL SIGN SHORT E
+114BB..114BE ; Extend # Mc [4] TIRHUTA VOWEL SIGN AI..TIRHUTA VOWEL SIGN AU
+114BF..114C0 ; Extend # Mn [2] TIRHUTA SIGN CANDRABINDU..TIRHUTA SIGN ANUSVARA
+114C1 ; Extend # Mc TIRHUTA SIGN VISARGA
+114C2..114C3 ; Extend # Mn [2] TIRHUTA SIGN VIRAMA..TIRHUTA SIGN NUKTA
+115AF..115B1 ; Extend # Mc [3] SIDDHAM VOWEL SIGN AA..SIDDHAM VOWEL SIGN II
+115B2..115B5 ; Extend # Mn [4] SIDDHAM VOWEL SIGN U..SIDDHAM VOWEL SIGN VOCALIC RR
+115B8..115BB ; Extend # Mc [4] SIDDHAM VOWEL SIGN E..SIDDHAM VOWEL SIGN AU
+115BC..115BD ; Extend # Mn [2] SIDDHAM SIGN CANDRABINDU..SIDDHAM SIGN ANUSVARA
+115BE ; Extend # Mc SIDDHAM SIGN VISARGA
+115BF..115C0 ; Extend # Mn [2] SIDDHAM SIGN VIRAMA..SIDDHAM SIGN NUKTA
+115DC..115DD ; Extend # Mn [2] SIDDHAM VOWEL SIGN ALTERNATE U..SIDDHAM VOWEL SIGN ALTERNATE UU
+11630..11632 ; Extend # Mc [3] MODI VOWEL SIGN AA..MODI VOWEL SIGN II
+11633..1163A ; Extend # Mn [8] MODI VOWEL SIGN U..MODI VOWEL SIGN AI
+1163B..1163C ; Extend # Mc [2] MODI VOWEL SIGN O..MODI VOWEL SIGN AU
+1163D ; Extend # Mn MODI SIGN ANUSVARA
+1163E ; Extend # Mc MODI SIGN VISARGA
+1163F..11640 ; Extend # Mn [2] MODI SIGN VIRAMA..MODI SIGN ARDHACANDRA
+116AB ; Extend # Mn TAKRI SIGN ANUSVARA
+116AC ; Extend # Mc TAKRI SIGN VISARGA
+116AD ; Extend # Mn TAKRI VOWEL SIGN AA
+116AE..116AF ; Extend # Mc [2] TAKRI VOWEL SIGN I..TAKRI VOWEL SIGN II
+116B0..116B5 ; Extend # Mn [6] TAKRI VOWEL SIGN U..TAKRI VOWEL SIGN AU
+116B6 ; Extend # Mc TAKRI SIGN VIRAMA
+116B7 ; Extend # Mn TAKRI SIGN NUKTA
+1171D ; Extend # Mn AHOM CONSONANT SIGN MEDIAL LA
+1171E ; Extend # Mc AHOM CONSONANT SIGN MEDIAL RA
+1171F ; Extend # Mn AHOM CONSONANT SIGN MEDIAL LIGATING RA
+11720..11721 ; Extend # Mc [2] AHOM VOWEL SIGN A..AHOM VOWEL SIGN AA
+11722..11725 ; Extend # Mn [4] AHOM VOWEL SIGN I..AHOM VOWEL SIGN UU
+11726 ; Extend # Mc AHOM VOWEL SIGN E
+11727..1172B ; Extend # Mn [5] AHOM VOWEL SIGN AW..AHOM SIGN KILLER
+1182C..1182E ; Extend # Mc [3] DOGRA VOWEL SIGN AA..DOGRA VOWEL SIGN II
+1182F..11837 ; Extend # Mn [9] DOGRA VOWEL SIGN U..DOGRA SIGN ANUSVARA
+11838 ; Extend # Mc DOGRA SIGN VISARGA
+11839..1183A ; Extend # Mn [2] DOGRA SIGN VIRAMA..DOGRA SIGN NUKTA
+11930..11935 ; Extend # Mc [6] DIVES AKURU VOWEL SIGN AA..DIVES AKURU VOWEL SIGN E
+11937..11938 ; Extend # Mc [2] DIVES AKURU VOWEL SIGN AI..DIVES AKURU VOWEL SIGN O
+1193B..1193C ; Extend # Mn [2] DIVES AKURU SIGN ANUSVARA..DIVES AKURU SIGN CANDRABINDU
+1193D ; Extend # Mc DIVES AKURU SIGN HALANTA
+1193E ; Extend # Mn DIVES AKURU VIRAMA
+11940 ; Extend # Mc DIVES AKURU MEDIAL YA
+11942 ; Extend # Mc DIVES AKURU MEDIAL RA
+11943 ; Extend # Mn DIVES AKURU SIGN NUKTA
+119D1..119D3 ; Extend # Mc [3] NANDINAGARI VOWEL SIGN AA..NANDINAGARI VOWEL SIGN II
+119D4..119D7 ; Extend # Mn [4] NANDINAGARI VOWEL SIGN U..NANDINAGARI VOWEL SIGN VOCALIC RR
+119DA..119DB ; Extend # Mn [2] NANDINAGARI VOWEL SIGN E..NANDINAGARI VOWEL SIGN AI
+119DC..119DF ; Extend # Mc [4] NANDINAGARI VOWEL SIGN O..NANDINAGARI SIGN VISARGA
+119E0 ; Extend # Mn NANDINAGARI SIGN VIRAMA
+119E4 ; Extend # Mc NANDINAGARI VOWEL SIGN PRISHTHAMATRA E
+11A01..11A0A ; Extend # Mn [10] ZANABAZAR SQUARE VOWEL SIGN I..ZANABAZAR SQUARE VOWEL LENGTH MARK
+11A33..11A38 ; Extend # Mn [6] ZANABAZAR SQUARE FINAL CONSONANT MARK..ZANABAZAR SQUARE SIGN ANUSVARA
+11A39 ; Extend # Mc ZANABAZAR SQUARE SIGN VISARGA
+11A3B..11A3E ; Extend # Mn [4] ZANABAZAR SQUARE CLUSTER-FINAL LETTER YA..ZANABAZAR SQUARE CLUSTER-FINAL LETTER VA
+11A47 ; Extend # Mn ZANABAZAR SQUARE SUBJOINER
+11A51..11A56 ; Extend # Mn [6] SOYOMBO VOWEL SIGN I..SOYOMBO VOWEL SIGN OE
+11A57..11A58 ; Extend # Mc [2] SOYOMBO VOWEL SIGN AI..SOYOMBO VOWEL SIGN AU
+11A59..11A5B ; Extend # Mn [3] SOYOMBO VOWEL SIGN VOCALIC R..SOYOMBO VOWEL LENGTH MARK
+11A8A..11A96 ; Extend # Mn [13] SOYOMBO FINAL CONSONANT SIGN G..SOYOMBO SIGN ANUSVARA
+11A97 ; Extend # Mc SOYOMBO SIGN VISARGA
+11A98..11A99 ; Extend # Mn [2] SOYOMBO GEMINATION MARK..SOYOMBO SUBJOINER
+11B60 ; Extend # Mn SHARADA VOWEL SIGN OE
+11B61 ; Extend # Mc SHARADA VOWEL SIGN OOE
+11B62..11B64 ; Extend # Mn [3] SHARADA VOWEL SIGN UE..SHARADA VOWEL SIGN SHORT E
+11B65 ; Extend # Mc SHARADA VOWEL SIGN SHORT O
+11B66 ; Extend # Mn SHARADA VOWEL SIGN CANDRA E
+11B67 ; Extend # Mc SHARADA VOWEL SIGN CANDRA O
+11C2F ; Extend # Mc BHAIKSUKI VOWEL SIGN AA
+11C30..11C36 ; Extend # Mn [7] BHAIKSUKI VOWEL SIGN I..BHAIKSUKI VOWEL SIGN VOCALIC L
+11C38..11C3D ; Extend # Mn [6] BHAIKSUKI VOWEL SIGN E..BHAIKSUKI SIGN ANUSVARA
+11C3E ; Extend # Mc BHAIKSUKI SIGN VISARGA
+11C3F ; Extend # Mn BHAIKSUKI SIGN VIRAMA
+11C92..11CA7 ; Extend # Mn [22] MARCHEN SUBJOINED LETTER KA..MARCHEN SUBJOINED LETTER ZA
+11CA9 ; Extend # Mc MARCHEN SUBJOINED LETTER YA
+11CAA..11CB0 ; Extend # Mn [7] MARCHEN SUBJOINED LETTER RA..MARCHEN VOWEL SIGN AA
+11CB1 ; Extend # Mc MARCHEN VOWEL SIGN I
+11CB2..11CB3 ; Extend # Mn [2] MARCHEN VOWEL SIGN U..MARCHEN VOWEL SIGN E
+11CB4 ; Extend # Mc MARCHEN VOWEL SIGN O
+11CB5..11CB6 ; Extend # Mn [2] MARCHEN SIGN ANUSVARA..MARCHEN SIGN CANDRABINDU
+11D31..11D36 ; Extend # Mn [6] MASARAM GONDI VOWEL SIGN AA..MASARAM GONDI VOWEL SIGN VOCALIC R
+11D3A ; Extend # Mn MASARAM GONDI VOWEL SIGN E
+11D3C..11D3D ; Extend # Mn [2] MASARAM GONDI VOWEL SIGN AI..MASARAM GONDI VOWEL SIGN O
+11D3F..11D45 ; Extend # Mn [7] MASARAM GONDI VOWEL SIGN AU..MASARAM GONDI VIRAMA
+11D47 ; Extend # Mn MASARAM GONDI RA-KARA
+11D8A..11D8E ; Extend # Mc [5] GUNJALA GONDI VOWEL SIGN AA..GUNJALA GONDI VOWEL SIGN UU
+11D90..11D91 ; Extend # Mn [2] GUNJALA GONDI VOWEL SIGN EE..GUNJALA GONDI VOWEL SIGN AI
+11D93..11D94 ; Extend # Mc [2] GUNJALA GONDI VOWEL SIGN OO..GUNJALA GONDI VOWEL SIGN AU
+11D95 ; Extend # Mn GUNJALA GONDI SIGN ANUSVARA
+11D96 ; Extend # Mc GUNJALA GONDI SIGN VISARGA
+11D97 ; Extend # Mn GUNJALA GONDI VIRAMA
+11EF3..11EF4 ; Extend # Mn [2] MAKASAR VOWEL SIGN I..MAKASAR VOWEL SIGN U
+11EF5..11EF6 ; Extend # Mc [2] MAKASAR VOWEL SIGN E..MAKASAR VOWEL SIGN O
+11F00..11F01 ; Extend # Mn [2] KAWI SIGN CANDRABINDU..KAWI SIGN ANUSVARA
+11F03 ; Extend # Mc KAWI SIGN VISARGA
+11F34..11F35 ; Extend # Mc [2] KAWI VOWEL SIGN AA..KAWI VOWEL SIGN ALTERNATE AA
+11F36..11F3A ; Extend # Mn [5] KAWI VOWEL SIGN I..KAWI VOWEL SIGN VOCALIC R
+11F3E..11F3F ; Extend # Mc [2] KAWI VOWEL SIGN E..KAWI VOWEL SIGN AI
+11F40 ; Extend # Mn KAWI VOWEL SIGN EU
+11F41 ; Extend # Mc KAWI SIGN KILLER
+11F42 ; Extend # Mn KAWI CONJOINER
+11F5A ; Extend # Mn KAWI SIGN NUKTA
+13440 ; Extend # Mn EGYPTIAN HIEROGLYPH MIRROR HORIZONTALLY
+13447..13455 ; Extend # Mn [15] EGYPTIAN HIEROGLYPH MODIFIER DAMAGED AT TOP START..EGYPTIAN HIEROGLYPH MODIFIER DAMAGED
+1611E..16129 ; Extend # Mn [12] GURUNG KHEMA VOWEL SIGN AA..GURUNG KHEMA VOWEL LENGTH MARK
+1612A..1612C ; Extend # Mc [3] GURUNG KHEMA CONSONANT SIGN MEDIAL YA..GURUNG KHEMA CONSONANT SIGN MEDIAL HA
+1612D..1612F ; Extend # Mn [3] GURUNG KHEMA SIGN ANUSVARA..GURUNG KHEMA SIGN THOLHOMA
+16AF0..16AF4 ; Extend # Mn [5] BASSA VAH COMBINING HIGH TONE..BASSA VAH COMBINING HIGH-LOW TONE
+16B30..16B36 ; Extend # Mn [7] PAHAWH HMONG MARK CIM TUB..PAHAWH HMONG MARK CIM TAUM
+16F4F ; Extend # Mn MIAO SIGN CONSONANT MODIFIER BAR
+16F51..16F87 ; Extend # Mc [55] MIAO SIGN ASPIRATION..MIAO VOWEL SIGN UI
+16F8F..16F92 ; Extend # Mn [4] MIAO TONE RIGHT..MIAO TONE BELOW
+16FE4 ; Extend # Mn KHITAN SMALL SCRIPT FILLER
+16FF0..16FF1 ; Extend # Mc [2] VIETNAMESE ALTERNATE READING MARK CA..VIETNAMESE ALTERNATE READING MARK NHAY
+1BC9D..1BC9E ; Extend # Mn [2] DUPLOYAN THICK LETTER SELECTOR..DUPLOYAN DOUBLE MARK
+1CF00..1CF2D ; Extend # Mn [46] ZNAMENNY COMBINING MARK GORAZDO NIZKO S KRYZHEM ON LEFT..ZNAMENNY COMBINING MARK KRYZH ON LEFT
+1CF30..1CF46 ; Extend # Mn [23] ZNAMENNY COMBINING TONAL RANGE MARK MRACHNO..ZNAMENNY PRIZNAK MODIFIER ROG
+1D165..1D166 ; Extend # Mc [2] MUSICAL SYMBOL COMBINING STEM..MUSICAL SYMBOL COMBINING SPRECHGESANG STEM
+1D167..1D169 ; Extend # Mn [3] MUSICAL SYMBOL COMBINING TREMOLO-1..MUSICAL SYMBOL COMBINING TREMOLO-3
+1D16D..1D172 ; Extend # Mc [6] MUSICAL SYMBOL COMBINING AUGMENTATION DOT..MUSICAL SYMBOL COMBINING FLAG-5
+1D17B..1D182 ; Extend # Mn [8] MUSICAL SYMBOL COMBINING ACCENT..MUSICAL SYMBOL COMBINING LOURE
+1D185..1D18B ; Extend # Mn [7] MUSICAL SYMBOL COMBINING DOIT..MUSICAL SYMBOL COMBINING TRIPLE TONGUE
+1D1AA..1D1AD ; Extend # Mn [4] MUSICAL SYMBOL COMBINING DOWN BOW..MUSICAL SYMBOL COMBINING SNAP PIZZICATO
+1D242..1D244 ; Extend # Mn [3] COMBINING GREEK MUSICAL TRISEME..COMBINING GREEK MUSICAL PENTASEME
+1DA00..1DA36 ; Extend # Mn [55] SIGNWRITING HEAD RIM..SIGNWRITING AIR SUCKING IN
+1DA3B..1DA6C ; Extend # Mn [50] SIGNWRITING MOUTH CLOSED NEUTRAL..SIGNWRITING EXCITEMENT
+1DA75 ; Extend # Mn SIGNWRITING UPPER BODY TILTING FROM HIP JOINTS
+1DA84 ; Extend # Mn SIGNWRITING LOCATION HEAD NECK
+1DA9B..1DA9F ; Extend # Mn [5] SIGNWRITING FILL MODIFIER-2..SIGNWRITING FILL MODIFIER-6
+1DAA1..1DAAF ; Extend # Mn [15] SIGNWRITING ROTATION MODIFIER-2..SIGNWRITING ROTATION MODIFIER-16
+1E000..1E006 ; Extend # Mn [7] COMBINING GLAGOLITIC LETTER AZU..COMBINING GLAGOLITIC LETTER ZHIVETE
+1E008..1E018 ; Extend # Mn [17] COMBINING GLAGOLITIC LETTER ZEMLJA..COMBINING GLAGOLITIC LETTER HERU
+1E01B..1E021 ; Extend # Mn [7] COMBINING GLAGOLITIC LETTER SHTA..COMBINING GLAGOLITIC LETTER YATI
+1E023..1E024 ; Extend # Mn [2] COMBINING GLAGOLITIC LETTER YU..COMBINING GLAGOLITIC LETTER SMALL YUS
+1E026..1E02A ; Extend # Mn [5] COMBINING GLAGOLITIC LETTER YO..COMBINING GLAGOLITIC LETTER FITA
+1E08F ; Extend # Mn COMBINING CYRILLIC SMALL LETTER BYELORUSSIAN-UKRAINIAN I
+1E130..1E136 ; Extend # Mn [7] NYIAKENG PUACHUE HMONG TONE-B..NYIAKENG PUACHUE HMONG TONE-D
+1E2AE ; Extend # Mn TOTO SIGN RISING TONE
+1E2EC..1E2EF ; Extend # Mn [4] WANCHO TONE TUP..WANCHO TONE KOINI
+1E4EC..1E4EF ; Extend # Mn [4] NAG MUNDARI SIGN MUHOR..NAG MUNDARI SIGN SUTUH
+1E5EE..1E5EF ; Extend # Mn [2] OL ONAL SIGN MU..OL ONAL SIGN IKIR
+1E6E3 ; Extend # Mn TAI YO SIGN UE
+1E6E6 ; Extend # Mn TAI YO SIGN AU
+1E6EE..1E6EF ; Extend # Mn [2] TAI YO SIGN AY..TAI YO SIGN ANG
+1E6F5 ; Extend # Mn TAI YO SIGN OM
+1E8D0..1E8D6 ; Extend # Mn [7] MENDE KIKAKUI COMBINING NUMBER TEENS..MENDE KIKAKUI COMBINING NUMBER MILLIONS
+1E944..1E94A ; Extend # Mn [7] ADLAM ALIF LENGTHENER..ADLAM NUKTA
+1F3FB..1F3FF ; Extend # Sk [5] EMOJI MODIFIER FITZPATRICK TYPE-1-2..EMOJI MODIFIER FITZPATRICK TYPE-6
+E0020..E007F ; Extend # Cf [96] TAG SPACE..CANCEL TAG
+E0100..E01EF ; Extend # Mn [240] VARIATION SELECTOR-17..VARIATION SELECTOR-256
+
+# Total code points: 2647
+
+# ================================================
+
+1F1E6..1F1FF ; Regional_Indicator # So [26] REGIONAL INDICATOR SYMBOL LETTER A..REGIONAL INDICATOR SYMBOL LETTER Z
+
+# Total code points: 26
+
+# ================================================
+
+00AD ; Format # Cf SOFT HYPHEN
+061C ; Format # Cf ARABIC LETTER MARK
+180E ; Format # Cf MONGOLIAN VOWEL SEPARATOR
+200E..200F ; Format # Cf [2] LEFT-TO-RIGHT MARK..RIGHT-TO-LEFT MARK
+202A..202E ; Format # Cf [5] LEFT-TO-RIGHT EMBEDDING..RIGHT-TO-LEFT OVERRIDE
+2060..2064 ; Format # Cf [5] WORD JOINER..INVISIBLE PLUS
+2066..206F ; Format # Cf [10] LEFT-TO-RIGHT ISOLATE..NOMINAL DIGIT SHAPES
+FEFF ; Format # Cf ZERO WIDTH NO-BREAK SPACE
+FFF9..FFFB ; Format # Cf [3] INTERLINEAR ANNOTATION ANCHOR..INTERLINEAR ANNOTATION TERMINATOR
+13430..1343F ; Format # Cf [16] EGYPTIAN HIEROGLYPH VERTICAL JOINER..EGYPTIAN HIEROGLYPH END WALLED ENCLOSURE
+1BCA0..1BCA3 ; Format # Cf [4] SHORTHAND FORMAT LETTER OVERLAP..SHORTHAND FORMAT UP STEP
+1D173..1D17A ; Format # Cf [8] MUSICAL SYMBOL BEGIN BEAM..MUSICAL SYMBOL END PHRASE
+E0001 ; Format # Cf LANGUAGE TAG
+
+# Total code points: 58
+
+# ================================================
+
+3031..3035 ; Katakana # Lm [5] VERTICAL KANA REPEAT MARK..VERTICAL KANA REPEAT MARK LOWER HALF
+309B..309C ; Katakana # Sk [2] KATAKANA-HIRAGANA VOICED SOUND MARK..KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK
+30A0 ; Katakana # Pd KATAKANA-HIRAGANA DOUBLE HYPHEN
+30A1..30FA ; Katakana # Lo [90] KATAKANA LETTER SMALL A..KATAKANA LETTER VO
+30FC..30FE ; Katakana # Lm [3] KATAKANA-HIRAGANA PROLONGED SOUND MARK..KATAKANA VOICED ITERATION MARK
+30FF ; Katakana # Lo KATAKANA DIGRAPH KOTO
+31F0..31FF ; Katakana # Lo [16] KATAKANA LETTER SMALL KU..KATAKANA LETTER SMALL RO
+32D0..32FE ; Katakana # So [47] CIRCLED KATAKANA A..CIRCLED KATAKANA WO
+3300..3357 ; Katakana # So [88] SQUARE APAATO..SQUARE WATTO
+FF66..FF6F ; Katakana # Lo [10] HALFWIDTH KATAKANA LETTER WO..HALFWIDTH KATAKANA LETTER SMALL TU
+FF70 ; Katakana # Lm HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK
+FF71..FF9D ; Katakana # Lo [45] HALFWIDTH KATAKANA LETTER A..HALFWIDTH KATAKANA LETTER N
+1AFF0..1AFF3 ; Katakana # Lm [4] KATAKANA LETTER MINNAN TONE-2..KATAKANA LETTER MINNAN TONE-5
+1AFF5..1AFFB ; Katakana # Lm [7] KATAKANA LETTER MINNAN TONE-7..KATAKANA LETTER MINNAN NASALIZED TONE-5
+1AFFD..1AFFE ; Katakana # Lm [2] KATAKANA LETTER MINNAN NASALIZED TONE-7..KATAKANA LETTER MINNAN NASALIZED TONE-8
+1B000 ; Katakana # Lo KATAKANA LETTER ARCHAIC E
+1B120..1B122 ; Katakana # Lo [3] KATAKANA LETTER ARCHAIC YI..KATAKANA LETTER ARCHAIC WU
+1B155 ; Katakana # Lo KATAKANA LETTER SMALL KO
+1B164..1B167 ; Katakana # Lo [4] KATAKANA LETTER SMALL WI..KATAKANA LETTER SMALL N
+
+# Total code points: 331
+
+# ================================================
+
+0041..005A ; ALetter # L& [26] LATIN CAPITAL LETTER A..LATIN CAPITAL LETTER Z
+0061..007A ; ALetter # L& [26] LATIN SMALL LETTER A..LATIN SMALL LETTER Z
+00AA ; ALetter # Lo FEMININE ORDINAL INDICATOR
+00B5 ; ALetter # L& MICRO SIGN
+00B8 ; ALetter # Sk CEDILLA
+00BA ; ALetter # Lo MASCULINE ORDINAL INDICATOR
+00C0..00D6 ; ALetter # L& [23] LATIN CAPITAL LETTER A WITH GRAVE..LATIN CAPITAL LETTER O WITH DIAERESIS
+00D8..00F6 ; ALetter # L& [31] LATIN CAPITAL LETTER O WITH STROKE..LATIN SMALL LETTER O WITH DIAERESIS
+00F8..01BA ; ALetter # L& [195] LATIN SMALL LETTER O WITH STROKE..LATIN SMALL LETTER EZH WITH TAIL
+01BB ; ALetter # Lo LATIN LETTER TWO WITH STROKE
+01BC..01BF ; ALetter # L& [4] LATIN CAPITAL LETTER TONE FIVE..LATIN LETTER WYNN
+01C0..01C3 ; ALetter # Lo [4] LATIN LETTER DENTAL CLICK..LATIN LETTER RETROFLEX CLICK
+01C4..0293 ; ALetter # L& [208] LATIN CAPITAL LETTER DZ WITH CARON..LATIN SMALL LETTER EZH WITH CURL
+0294..0295 ; ALetter # Lo [2] LATIN LETTER GLOTTAL STOP..LATIN LETTER PHARYNGEAL VOICED FRICATIVE
+0296..02AF ; ALetter # L& [26] LATIN LETTER INVERTED GLOTTAL STOP..LATIN SMALL LETTER TURNED H WITH FISHHOOK AND TAIL
+02B0..02C1 ; ALetter # Lm [18] MODIFIER LETTER SMALL H..MODIFIER LETTER REVERSED GLOTTAL STOP
+02C2..02C5 ; ALetter # Sk [4] MODIFIER LETTER LEFT ARROWHEAD..MODIFIER LETTER DOWN ARROWHEAD
+02C6..02D1 ; ALetter # Lm [12] MODIFIER LETTER CIRCUMFLEX ACCENT..MODIFIER LETTER HALF TRIANGULAR COLON
+02D2..02D7 ; ALetter # Sk [6] MODIFIER LETTER CENTRED RIGHT HALF RING..MODIFIER LETTER MINUS SIGN
+02DE..02DF ; ALetter # Sk [2] MODIFIER LETTER RHOTIC HOOK..MODIFIER LETTER CROSS ACCENT
+02E0..02E4 ; ALetter # Lm [5] MODIFIER LETTER SMALL GAMMA..MODIFIER LETTER SMALL REVERSED GLOTTAL STOP
+02E5..02EB ; ALetter # Sk [7] MODIFIER LETTER EXTRA-HIGH TONE BAR..MODIFIER LETTER YANG DEPARTING TONE MARK
+02EC ; ALetter # Lm MODIFIER LETTER VOICING
+02ED ; ALetter # Sk MODIFIER LETTER UNASPIRATED
+02EE ; ALetter # Lm MODIFIER LETTER DOUBLE APOSTROPHE
+02EF..02FF ; ALetter # Sk [17] MODIFIER LETTER LOW DOWN ARROWHEAD..MODIFIER LETTER LOW LEFT ARROW
+0370..0373 ; ALetter # L& [4] GREEK CAPITAL LETTER HETA..GREEK SMALL LETTER ARCHAIC SAMPI
+0374 ; ALetter # Lm GREEK NUMERAL SIGN
+0376..0377 ; ALetter # L& [2] GREEK CAPITAL LETTER PAMPHYLIAN DIGAMMA..GREEK SMALL LETTER PAMPHYLIAN DIGAMMA
+037A ; ALetter # Lm GREEK YPOGEGRAMMENI
+037B..037D ; ALetter # L& [3] GREEK SMALL REVERSED LUNATE SIGMA SYMBOL..GREEK SMALL REVERSED DOTTED LUNATE SIGMA SYMBOL
+037F ; ALetter # L& GREEK CAPITAL LETTER YOT
+0386 ; ALetter # L& GREEK CAPITAL LETTER ALPHA WITH TONOS
+0388..038A ; ALetter # L& [3] GREEK CAPITAL LETTER EPSILON WITH TONOS..GREEK CAPITAL LETTER IOTA WITH TONOS
+038C ; ALetter # L& GREEK CAPITAL LETTER OMICRON WITH TONOS
+038E..03A1 ; ALetter # L& [20] GREEK CAPITAL LETTER UPSILON WITH TONOS..GREEK CAPITAL LETTER RHO
+03A3..03F5 ; ALetter # L& [83] GREEK CAPITAL LETTER SIGMA..GREEK LUNATE EPSILON SYMBOL
+03F7..0481 ; ALetter # L& [139] GREEK CAPITAL LETTER SHO..CYRILLIC SMALL LETTER KOPPA
+048A..052F ; ALetter # L& [166] CYRILLIC CAPITAL LETTER SHORT I WITH TAIL..CYRILLIC SMALL LETTER EL WITH DESCENDER
+0531..0556 ; ALetter # L& [38] ARMENIAN CAPITAL LETTER AYB..ARMENIAN CAPITAL LETTER FEH
+0559 ; ALetter # Lm ARMENIAN MODIFIER LETTER LEFT HALF RING
+055A..055C ; ALetter # Po [3] ARMENIAN APOSTROPHE..ARMENIAN EXCLAMATION MARK
+055E ; ALetter # Po ARMENIAN QUESTION MARK
+0560..0588 ; ALetter # L& [41] ARMENIAN SMALL LETTER TURNED AYB..ARMENIAN SMALL LETTER YI WITH STROKE
+058A ; ALetter # Pd ARMENIAN HYPHEN
+05F3 ; ALetter # Po HEBREW PUNCTUATION GERESH
+0620..063F ; ALetter # Lo [32] ARABIC LETTER KASHMIRI YEH..ARABIC LETTER FARSI YEH WITH THREE DOTS ABOVE
+0640 ; ALetter # Lm ARABIC TATWEEL
+0641..064A ; ALetter # Lo [10] ARABIC LETTER FEH..ARABIC LETTER YEH
+066E..066F ; ALetter # Lo [2] ARABIC LETTER DOTLESS BEH..ARABIC LETTER DOTLESS QAF
+0671..06D3 ; ALetter # Lo [99] ARABIC LETTER ALEF WASLA..ARABIC LETTER YEH BARREE WITH HAMZA ABOVE
+06D5 ; ALetter # Lo ARABIC LETTER AE
+06E5..06E6 ; ALetter # Lm [2] ARABIC SMALL WAW..ARABIC SMALL YEH
+06EE..06EF ; ALetter # Lo [2] ARABIC LETTER DAL WITH INVERTED V..ARABIC LETTER REH WITH INVERTED V
+06FA..06FC ; ALetter # Lo [3] ARABIC LETTER SHEEN WITH DOT BELOW..ARABIC LETTER GHAIN WITH DOT BELOW
+06FF ; ALetter # Lo ARABIC LETTER HEH WITH INVERTED V
+070F ; ALetter # Cf SYRIAC ABBREVIATION MARK
+0710 ; ALetter # Lo SYRIAC LETTER ALAPH
+0712..072F ; ALetter # Lo [30] SYRIAC LETTER BETH..SYRIAC LETTER PERSIAN DHALATH
+074D..07A5 ; ALetter # Lo [89] SYRIAC LETTER SOGDIAN ZHAIN..THAANA LETTER WAAVU
+07B1 ; ALetter # Lo THAANA LETTER NAA
+07CA..07EA ; ALetter # Lo [33] NKO LETTER A..NKO LETTER JONA RA
+07F4..07F5 ; ALetter # Lm [2] NKO HIGH TONE APOSTROPHE..NKO LOW TONE APOSTROPHE
+07FA ; ALetter # Lm NKO LAJANYALAN
+0800..0815 ; ALetter # Lo [22] SAMARITAN LETTER ALAF..SAMARITAN LETTER TAAF
+081A ; ALetter # Lm SAMARITAN MODIFIER LETTER EPENTHETIC YUT
+0824 ; ALetter # Lm SAMARITAN MODIFIER LETTER SHORT A
+0828 ; ALetter # Lm SAMARITAN MODIFIER LETTER I
+0840..0858 ; ALetter # Lo [25] MANDAIC LETTER HALQA..MANDAIC LETTER AIN
+0860..086A ; ALetter # Lo [11] SYRIAC LETTER MALAYALAM NGA..SYRIAC LETTER MALAYALAM SSA
+0870..0887 ; ALetter # Lo [24] ARABIC LETTER ALEF WITH ATTACHED FATHA..ARABIC BASELINE ROUND DOT
+0889..088F ; ALetter # Lo [7] ARABIC LETTER NOON WITH INVERTED SMALL V..ARABIC LETTER NOON WITH RING ABOVE
+08A0..08C8 ; ALetter # Lo [41] ARABIC LETTER BEH WITH SMALL V BELOW..ARABIC LETTER GRAF
+08C9 ; ALetter # Lm ARABIC SMALL FARSI YEH
+0904..0939 ; ALetter # Lo [54] DEVANAGARI LETTER SHORT A..DEVANAGARI LETTER HA
+093D ; ALetter # Lo DEVANAGARI SIGN AVAGRAHA
+0950 ; ALetter # Lo DEVANAGARI OM
+0958..0961 ; ALetter # Lo [10] DEVANAGARI LETTER QA..DEVANAGARI LETTER VOCALIC LL
+0971 ; ALetter # Lm DEVANAGARI SIGN HIGH SPACING DOT
+0972..0980 ; ALetter # Lo [15] DEVANAGARI LETTER CANDRA A..BENGALI ANJI
+0985..098C ; ALetter # Lo [8] BENGALI LETTER A..BENGALI LETTER VOCALIC L
+098F..0990 ; ALetter # Lo [2] BENGALI LETTER E..BENGALI LETTER AI
+0993..09A8 ; ALetter # Lo [22] BENGALI LETTER O..BENGALI LETTER NA
+09AA..09B0 ; ALetter # Lo [7] BENGALI LETTER PA..BENGALI LETTER RA
+09B2 ; ALetter # Lo BENGALI LETTER LA
+09B6..09B9 ; ALetter # Lo [4] BENGALI LETTER SHA..BENGALI LETTER HA
+09BD ; ALetter # Lo BENGALI SIGN AVAGRAHA
+09CE ; ALetter # Lo BENGALI LETTER KHANDA TA
+09DC..09DD ; ALetter # Lo [2] BENGALI LETTER RRA..BENGALI LETTER RHA
+09DF..09E1 ; ALetter # Lo [3] BENGALI LETTER YYA..BENGALI LETTER VOCALIC LL
+09F0..09F1 ; ALetter # Lo [2] BENGALI LETTER RA WITH MIDDLE DIAGONAL..BENGALI LETTER RA WITH LOWER DIAGONAL
+09FC ; ALetter # Lo BENGALI LETTER VEDIC ANUSVARA
+0A05..0A0A ; ALetter # Lo [6] GURMUKHI LETTER A..GURMUKHI LETTER UU
+0A0F..0A10 ; ALetter # Lo [2] GURMUKHI LETTER EE..GURMUKHI LETTER AI
+0A13..0A28 ; ALetter # Lo [22] GURMUKHI LETTER OO..GURMUKHI LETTER NA
+0A2A..0A30 ; ALetter # Lo [7] GURMUKHI LETTER PA..GURMUKHI LETTER RA
+0A32..0A33 ; ALetter # Lo [2] GURMUKHI LETTER LA..GURMUKHI LETTER LLA
+0A35..0A36 ; ALetter # Lo [2] GURMUKHI LETTER VA..GURMUKHI LETTER SHA
+0A38..0A39 ; ALetter # Lo [2] GURMUKHI LETTER SA..GURMUKHI LETTER HA
+0A59..0A5C ; ALetter # Lo [4] GURMUKHI LETTER KHHA..GURMUKHI LETTER RRA
+0A5E ; ALetter # Lo GURMUKHI LETTER FA
+0A72..0A74 ; ALetter # Lo [3] GURMUKHI IRI..GURMUKHI EK ONKAR
+0A85..0A8D ; ALetter # Lo [9] GUJARATI LETTER A..GUJARATI VOWEL CANDRA E
+0A8F..0A91 ; ALetter # Lo [3] GUJARATI LETTER E..GUJARATI VOWEL CANDRA O
+0A93..0AA8 ; ALetter # Lo [22] GUJARATI LETTER O..GUJARATI LETTER NA
+0AAA..0AB0 ; ALetter # Lo [7] GUJARATI LETTER PA..GUJARATI LETTER RA
+0AB2..0AB3 ; ALetter # Lo [2] GUJARATI LETTER LA..GUJARATI LETTER LLA
+0AB5..0AB9 ; ALetter # Lo [5] GUJARATI LETTER VA..GUJARATI LETTER HA
+0ABD ; ALetter # Lo GUJARATI SIGN AVAGRAHA
+0AD0 ; ALetter # Lo GUJARATI OM
+0AE0..0AE1 ; ALetter # Lo [2] GUJARATI LETTER VOCALIC RR..GUJARATI LETTER VOCALIC LL
+0AF9 ; ALetter # Lo GUJARATI LETTER ZHA
+0B05..0B0C ; ALetter # Lo [8] ORIYA LETTER A..ORIYA LETTER VOCALIC L
+0B0F..0B10 ; ALetter # Lo [2] ORIYA LETTER E..ORIYA LETTER AI
+0B13..0B28 ; ALetter # Lo [22] ORIYA LETTER O..ORIYA LETTER NA
+0B2A..0B30 ; ALetter # Lo [7] ORIYA LETTER PA..ORIYA LETTER RA
+0B32..0B33 ; ALetter # Lo [2] ORIYA LETTER LA..ORIYA LETTER LLA
+0B35..0B39 ; ALetter # Lo [5] ORIYA LETTER VA..ORIYA LETTER HA
+0B3D ; ALetter # Lo ORIYA SIGN AVAGRAHA
+0B5C..0B5D ; ALetter # Lo [2] ORIYA LETTER RRA..ORIYA LETTER RHA
+0B5F..0B61 ; ALetter # Lo [3] ORIYA LETTER YYA..ORIYA LETTER VOCALIC LL
+0B71 ; ALetter # Lo ORIYA LETTER WA
+0B83 ; ALetter # Lo TAMIL SIGN VISARGA
+0B85..0B8A ; ALetter # Lo [6] TAMIL LETTER A..TAMIL LETTER UU
+0B8E..0B90 ; ALetter # Lo [3] TAMIL LETTER E..TAMIL LETTER AI
+0B92..0B95 ; ALetter # Lo [4] TAMIL LETTER O..TAMIL LETTER KA
+0B99..0B9A ; ALetter # Lo [2] TAMIL LETTER NGA..TAMIL LETTER CA
+0B9C ; ALetter # Lo TAMIL LETTER JA
+0B9E..0B9F ; ALetter # Lo [2] TAMIL LETTER NYA..TAMIL LETTER TTA
+0BA3..0BA4 ; ALetter # Lo [2] TAMIL LETTER NNA..TAMIL LETTER TA
+0BA8..0BAA ; ALetter # Lo [3] TAMIL LETTER NA..TAMIL LETTER PA
+0BAE..0BB9 ; ALetter # Lo [12] TAMIL LETTER MA..TAMIL LETTER HA
+0BD0 ; ALetter # Lo TAMIL OM
+0C05..0C0C ; ALetter # Lo [8] TELUGU LETTER A..TELUGU LETTER VOCALIC L
+0C0E..0C10 ; ALetter # Lo [3] TELUGU LETTER E..TELUGU LETTER AI
+0C12..0C28 ; ALetter # Lo [23] TELUGU LETTER O..TELUGU LETTER NA
+0C2A..0C39 ; ALetter # Lo [16] TELUGU LETTER PA..TELUGU LETTER HA
+0C3D ; ALetter # Lo TELUGU SIGN AVAGRAHA
+0C58..0C5A ; ALetter # Lo [3] TELUGU LETTER TSA..TELUGU LETTER RRRA
+0C5C..0C5D ; ALetter # Lo [2] TELUGU ARCHAIC SHRII..TELUGU LETTER NAKAARA POLLU
+0C60..0C61 ; ALetter # Lo [2] TELUGU LETTER VOCALIC RR..TELUGU LETTER VOCALIC LL
+0C80 ; ALetter # Lo KANNADA SIGN SPACING CANDRABINDU
+0C85..0C8C ; ALetter # Lo [8] KANNADA LETTER A..KANNADA LETTER VOCALIC L
+0C8E..0C90 ; ALetter # Lo [3] KANNADA LETTER E..KANNADA LETTER AI
+0C92..0CA8 ; ALetter # Lo [23] KANNADA LETTER O..KANNADA LETTER NA
+0CAA..0CB3 ; ALetter # Lo [10] KANNADA LETTER PA..KANNADA LETTER LLA
+0CB5..0CB9 ; ALetter # Lo [5] KANNADA LETTER VA..KANNADA LETTER HA
+0CBD ; ALetter # Lo KANNADA SIGN AVAGRAHA
+0CDC..0CDE ; ALetter # Lo [3] KANNADA ARCHAIC SHRII..KANNADA LETTER FA
+0CE0..0CE1 ; ALetter # Lo [2] KANNADA LETTER VOCALIC RR..KANNADA LETTER VOCALIC LL
+0CF1..0CF2 ; ALetter # Lo [2] KANNADA SIGN JIHVAMULIYA..KANNADA SIGN UPADHMANIYA
+0D04..0D0C ; ALetter # Lo [9] MALAYALAM LETTER VEDIC ANUSVARA..MALAYALAM LETTER VOCALIC L
+0D0E..0D10 ; ALetter # Lo [3] MALAYALAM LETTER E..MALAYALAM LETTER AI
+0D12..0D3A ; ALetter # Lo [41] MALAYALAM LETTER O..MALAYALAM LETTER TTTA
+0D3D ; ALetter # Lo MALAYALAM SIGN AVAGRAHA
+0D4E ; ALetter # Lo MALAYALAM LETTER DOT REPH
+0D54..0D56 ; ALetter # Lo [3] MALAYALAM LETTER CHILLU M..MALAYALAM LETTER CHILLU LLL
+0D5F..0D61 ; ALetter # Lo [3] MALAYALAM LETTER ARCHAIC II..MALAYALAM LETTER VOCALIC LL
+0D7A..0D7F ; ALetter # Lo [6] MALAYALAM LETTER CHILLU NN..MALAYALAM LETTER CHILLU K
+0D85..0D96 ; ALetter # Lo [18] SINHALA LETTER AYANNA..SINHALA LETTER AUYANNA
+0D9A..0DB1 ; ALetter # Lo [24] SINHALA LETTER ALPAPRAANA KAYANNA..SINHALA LETTER DANTAJA NAYANNA
+0DB3..0DBB ; ALetter # Lo [9] SINHALA LETTER SANYAKA DAYANNA..SINHALA LETTER RAYANNA
+0DBD ; ALetter # Lo SINHALA LETTER DANTAJA LAYANNA
+0DC0..0DC6 ; ALetter # Lo [7] SINHALA LETTER VAYANNA..SINHALA LETTER FAYANNA
+0F00 ; ALetter # Lo TIBETAN SYLLABLE OM
+0F40..0F47 ; ALetter # Lo [8] TIBETAN LETTER KA..TIBETAN LETTER JA
+0F49..0F6C ; ALetter # Lo [36] TIBETAN LETTER NYA..TIBETAN LETTER RRA
+0F88..0F8C ; ALetter # Lo [5] TIBETAN SIGN LCE TSA CAN..TIBETAN SIGN INVERTED MCHU CAN
+10A0..10C5 ; ALetter # L& [38] GEORGIAN CAPITAL LETTER AN..GEORGIAN CAPITAL LETTER HOE
+10C7 ; ALetter # L& GEORGIAN CAPITAL LETTER YN
+10CD ; ALetter # L& GEORGIAN CAPITAL LETTER AEN
+10D0..10FA ; ALetter # L& [43] GEORGIAN LETTER AN..GEORGIAN LETTER AIN
+10FC ; ALetter # Lm MODIFIER LETTER GEORGIAN NAR
+10FD..10FF ; ALetter # L& [3] GEORGIAN LETTER AEN..GEORGIAN LETTER LABIAL SIGN
+1100..1248 ; ALetter # Lo [329] HANGUL CHOSEONG KIYEOK..ETHIOPIC SYLLABLE QWA
+124A..124D ; ALetter # Lo [4] ETHIOPIC SYLLABLE QWI..ETHIOPIC SYLLABLE QWE
+1250..1256 ; ALetter # Lo [7] ETHIOPIC SYLLABLE QHA..ETHIOPIC SYLLABLE QHO
+1258 ; ALetter # Lo ETHIOPIC SYLLABLE QHWA
+125A..125D ; ALetter # Lo [4] ETHIOPIC SYLLABLE QHWI..ETHIOPIC SYLLABLE QHWE
+1260..1288 ; ALetter # Lo [41] ETHIOPIC SYLLABLE BA..ETHIOPIC SYLLABLE XWA
+128A..128D ; ALetter # Lo [4] ETHIOPIC SYLLABLE XWI..ETHIOPIC SYLLABLE XWE
+1290..12B0 ; ALetter # Lo [33] ETHIOPIC SYLLABLE NA..ETHIOPIC SYLLABLE KWA
+12B2..12B5 ; ALetter # Lo [4] ETHIOPIC SYLLABLE KWI..ETHIOPIC SYLLABLE KWE
+12B8..12BE ; ALetter # Lo [7] ETHIOPIC SYLLABLE KXA..ETHIOPIC SYLLABLE KXO
+12C0 ; ALetter # Lo ETHIOPIC SYLLABLE KXWA
+12C2..12C5 ; ALetter # Lo [4] ETHIOPIC SYLLABLE KXWI..ETHIOPIC SYLLABLE KXWE
+12C8..12D6 ; ALetter # Lo [15] ETHIOPIC SYLLABLE WA..ETHIOPIC SYLLABLE PHARYNGEAL O
+12D8..1310 ; ALetter # Lo [57] ETHIOPIC SYLLABLE ZA..ETHIOPIC SYLLABLE GWA
+1312..1315 ; ALetter # Lo [4] ETHIOPIC SYLLABLE GWI..ETHIOPIC SYLLABLE GWE
+1318..135A ; ALetter # Lo [67] ETHIOPIC SYLLABLE GGA..ETHIOPIC SYLLABLE FYA
+1380..138F ; ALetter # Lo [16] ETHIOPIC SYLLABLE SEBATBEIT MWA..ETHIOPIC SYLLABLE PWE
+13A0..13F5 ; ALetter # L& [86] CHEROKEE LETTER A..CHEROKEE LETTER MV
+13F8..13FD ; ALetter # L& [6] CHEROKEE SMALL LETTER YE..CHEROKEE SMALL LETTER MV
+1401..166C ; ALetter # Lo [620] CANADIAN SYLLABICS E..CANADIAN SYLLABICS CARRIER TTSA
+166F..167F ; ALetter # Lo [17] CANADIAN SYLLABICS QAI..CANADIAN SYLLABICS BLACKFOOT W
+1681..169A ; ALetter # Lo [26] OGHAM LETTER BEITH..OGHAM LETTER PEITH
+16A0..16EA ; ALetter # Lo [75] RUNIC LETTER FEHU FEOH FE F..RUNIC LETTER X
+16EE..16F0 ; ALetter # Nl [3] RUNIC ARLAUG SYMBOL..RUNIC BELGTHOR SYMBOL
+16F1..16F8 ; ALetter # Lo [8] RUNIC LETTER K..RUNIC LETTER FRANKS CASKET AESC
+1700..1711 ; ALetter # Lo [18] TAGALOG LETTER A..TAGALOG LETTER HA
+171F..1731 ; ALetter # Lo [19] TAGALOG LETTER ARCHAIC RA..HANUNOO LETTER HA
+1740..1751 ; ALetter # Lo [18] BUHID LETTER A..BUHID LETTER HA
+1760..176C ; ALetter # Lo [13] TAGBANWA LETTER A..TAGBANWA LETTER YA
+176E..1770 ; ALetter # Lo [3] TAGBANWA LETTER LA..TAGBANWA LETTER SA
+1820..1842 ; ALetter # Lo [35] MONGOLIAN LETTER A..MONGOLIAN LETTER CHI
+1843 ; ALetter # Lm MONGOLIAN LETTER TODO LONG VOWEL SIGN
+1844..1878 ; ALetter # Lo [53] MONGOLIAN LETTER TODO E..MONGOLIAN LETTER CHA WITH TWO DOTS
+1880..1884 ; ALetter # Lo [5] MONGOLIAN LETTER ALI GALI ANUSVARA ONE..MONGOLIAN LETTER ALI GALI INVERTED UBADAMA
+1887..18A8 ; ALetter # Lo [34] MONGOLIAN LETTER ALI GALI A..MONGOLIAN LETTER MANCHU ALI GALI BHA
+18AA ; ALetter # Lo MONGOLIAN LETTER MANCHU ALI GALI LHA
+18B0..18F5 ; ALetter # Lo [70] CANADIAN SYLLABICS OY..CANADIAN SYLLABICS CARRIER DENTAL S
+1900..191E ; ALetter # Lo [31] LIMBU VOWEL-CARRIER LETTER..LIMBU LETTER TRA
+1A00..1A16 ; ALetter # Lo [23] BUGINESE LETTER KA..BUGINESE LETTER HA
+1B05..1B33 ; ALetter # Lo [47] BALINESE LETTER AKARA..BALINESE LETTER HA
+1B45..1B4C ; ALetter # Lo [8] BALINESE LETTER KAF SASAK..BALINESE LETTER ARCHAIC JNYA
+1B83..1BA0 ; ALetter # Lo [30] SUNDANESE LETTER A..SUNDANESE LETTER HA
+1BAE..1BAF ; ALetter # Lo [2] SUNDANESE LETTER KHA..SUNDANESE LETTER SYA
+1BBA..1BE5 ; ALetter # Lo [44] SUNDANESE AVAGRAHA..BATAK LETTER U
+1C00..1C23 ; ALetter # Lo [36] LEPCHA LETTER KA..LEPCHA LETTER A
+1C4D..1C4F ; ALetter # Lo [3] LEPCHA LETTER TTA..LEPCHA LETTER DDA
+1C5A..1C77 ; ALetter # Lo [30] OL CHIKI LETTER LA..OL CHIKI LETTER OH
+1C78..1C7D ; ALetter # Lm [6] OL CHIKI MU TTUDDAG..OL CHIKI AHAD
+1C80..1C8A ; ALetter # L& [11] CYRILLIC SMALL LETTER ROUNDED VE..CYRILLIC SMALL LETTER TJE
+1C90..1CBA ; ALetter # L& [43] GEORGIAN MTAVRULI CAPITAL LETTER AN..GEORGIAN MTAVRULI CAPITAL LETTER AIN
+1CBD..1CBF ; ALetter # L& [3] GEORGIAN MTAVRULI CAPITAL LETTER AEN..GEORGIAN MTAVRULI CAPITAL LETTER LABIAL SIGN
+1CE9..1CEC ; ALetter # Lo [4] VEDIC SIGN ANUSVARA ANTARGOMUKHA..VEDIC SIGN ANUSVARA VAMAGOMUKHA WITH TAIL
+1CEE..1CF3 ; ALetter # Lo [6] VEDIC SIGN HEXIFORM LONG ANUSVARA..VEDIC SIGN ROTATED ARDHAVISARGA
+1CF5..1CF6 ; ALetter # Lo [2] VEDIC SIGN JIHVAMULIYA..VEDIC SIGN UPADHMANIYA
+1CFA ; ALetter # Lo VEDIC SIGN DOUBLE ANUSVARA ANTARGOMUKHA
+1D00..1D2B ; ALetter # L& [44] LATIN LETTER SMALL CAPITAL A..CYRILLIC LETTER SMALL CAPITAL EL
+1D2C..1D6A ; ALetter # Lm [63] MODIFIER LETTER CAPITAL A..GREEK SUBSCRIPT SMALL LETTER CHI
+1D6B..1D77 ; ALetter # L& [13] LATIN SMALL LETTER UE..LATIN SMALL LETTER TURNED G
+1D78 ; ALetter # Lm MODIFIER LETTER CYRILLIC EN
+1D79..1D9A ; ALetter # L& [34] LATIN SMALL LETTER INSULAR G..LATIN SMALL LETTER EZH WITH RETROFLEX HOOK
+1D9B..1DBF ; ALetter # Lm [37] MODIFIER LETTER SMALL TURNED ALPHA..MODIFIER LETTER SMALL THETA
+1E00..1F15 ; ALetter # L& [278] LATIN CAPITAL LETTER A WITH RING BELOW..GREEK SMALL LETTER EPSILON WITH DASIA AND OXIA
+1F18..1F1D ; ALetter # L& [6] GREEK CAPITAL LETTER EPSILON WITH PSILI..GREEK CAPITAL LETTER EPSILON WITH DASIA AND OXIA
+1F20..1F45 ; ALetter # L& [38] GREEK SMALL LETTER ETA WITH PSILI..GREEK SMALL LETTER OMICRON WITH DASIA AND OXIA
+1F48..1F4D ; ALetter # L& [6] GREEK CAPITAL LETTER OMICRON WITH PSILI..GREEK CAPITAL LETTER OMICRON WITH DASIA AND OXIA
+1F50..1F57 ; ALetter # L& [8] GREEK SMALL LETTER UPSILON WITH PSILI..GREEK SMALL LETTER UPSILON WITH DASIA AND PERISPOMENI
+1F59 ; ALetter # L& GREEK CAPITAL LETTER UPSILON WITH DASIA
+1F5B ; ALetter # L& GREEK CAPITAL LETTER UPSILON WITH DASIA AND VARIA
+1F5D ; ALetter # L& GREEK CAPITAL LETTER UPSILON WITH DASIA AND OXIA
+1F5F..1F7D ; ALetter # L& [31] GREEK CAPITAL LETTER UPSILON WITH DASIA AND PERISPOMENI..GREEK SMALL LETTER OMEGA WITH OXIA
+1F80..1FB4 ; ALetter # L& [53] GREEK SMALL LETTER ALPHA WITH PSILI AND YPOGEGRAMMENI..GREEK SMALL LETTER ALPHA WITH OXIA AND YPOGEGRAMMENI
+1FB6..1FBC ; ALetter # L& [7] GREEK SMALL LETTER ALPHA WITH PERISPOMENI..GREEK CAPITAL LETTER ALPHA WITH PROSGEGRAMMENI
+1FBE ; ALetter # L& GREEK PROSGEGRAMMENI
+1FC2..1FC4 ; ALetter # L& [3] GREEK SMALL LETTER ETA WITH VARIA AND YPOGEGRAMMENI..GREEK SMALL LETTER ETA WITH OXIA AND YPOGEGRAMMENI
+1FC6..1FCC ; ALetter # L& [7] GREEK SMALL LETTER ETA WITH PERISPOMENI..GREEK CAPITAL LETTER ETA WITH PROSGEGRAMMENI
+1FD0..1FD3 ; ALetter # L& [4] GREEK SMALL LETTER IOTA WITH VRACHY..GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA
+1FD6..1FDB ; ALetter # L& [6] GREEK SMALL LETTER IOTA WITH PERISPOMENI..GREEK CAPITAL LETTER IOTA WITH OXIA
+1FE0..1FEC ; ALetter # L& [13] GREEK SMALL LETTER UPSILON WITH VRACHY..GREEK CAPITAL LETTER RHO WITH DASIA
+1FF2..1FF4 ; ALetter # L& [3] GREEK SMALL LETTER OMEGA WITH VARIA AND YPOGEGRAMMENI..GREEK SMALL LETTER OMEGA WITH OXIA AND YPOGEGRAMMENI
+1FF6..1FFC ; ALetter # L& [7] GREEK SMALL LETTER OMEGA WITH PERISPOMENI..GREEK CAPITAL LETTER OMEGA WITH PROSGEGRAMMENI
+2071 ; ALetter # Lm SUPERSCRIPT LATIN SMALL LETTER I
+207F ; ALetter # Lm SUPERSCRIPT LATIN SMALL LETTER N
+2090..209C ; ALetter # Lm [13] LATIN SUBSCRIPT SMALL LETTER A..LATIN SUBSCRIPT SMALL LETTER T
+2102 ; ALetter # L& DOUBLE-STRUCK CAPITAL C
+2107 ; ALetter # L& EULER CONSTANT
+210A..2113 ; ALetter # L& [10] SCRIPT SMALL G..SCRIPT SMALL L
+2115 ; ALetter # L& DOUBLE-STRUCK CAPITAL N
+2119..211D ; ALetter # L& [5] DOUBLE-STRUCK CAPITAL P..DOUBLE-STRUCK CAPITAL R
+2124 ; ALetter # L& DOUBLE-STRUCK CAPITAL Z
+2126 ; ALetter # L& OHM SIGN
+2128 ; ALetter # L& BLACK-LETTER CAPITAL Z
+212A..212D ; ALetter # L& [4] KELVIN SIGN..BLACK-LETTER CAPITAL C
+212F..2134 ; ALetter # L& [6] SCRIPT SMALL E..SCRIPT SMALL O
+2135..2138 ; ALetter # Lo [4] ALEF SYMBOL..DALET SYMBOL
+2139 ; ALetter # L& INFORMATION SOURCE
+213C..213F ; ALetter # L& [4] DOUBLE-STRUCK SMALL PI..DOUBLE-STRUCK CAPITAL PI
+2145..2149 ; ALetter # L& [5] DOUBLE-STRUCK ITALIC CAPITAL D..DOUBLE-STRUCK ITALIC SMALL J
+214E ; ALetter # L& TURNED SMALL F
+2160..2182 ; ALetter # Nl [35] ROMAN NUMERAL ONE..ROMAN NUMERAL TEN THOUSAND
+2183..2184 ; ALetter # L& [2] ROMAN NUMERAL REVERSED ONE HUNDRED..LATIN SMALL LETTER REVERSED C
+2185..2188 ; ALetter # Nl [4] ROMAN NUMERAL SIX LATE FORM..ROMAN NUMERAL ONE HUNDRED THOUSAND
+24B6..24E9 ; ALetter # So [52] CIRCLED LATIN CAPITAL LETTER A..CIRCLED LATIN SMALL LETTER Z
+2C00..2C7B ; ALetter # L& [124] GLAGOLITIC CAPITAL LETTER AZU..LATIN LETTER SMALL CAPITAL TURNED E
+2C7C..2C7D ; ALetter # Lm [2] LATIN SUBSCRIPT SMALL LETTER J..MODIFIER LETTER CAPITAL V
+2C7E..2CE4 ; ALetter # L& [103] LATIN CAPITAL LETTER S WITH SWASH TAIL..COPTIC SYMBOL KAI
+2CEB..2CEE ; ALetter # L& [4] COPTIC CAPITAL LETTER CRYPTOGRAMMIC SHEI..COPTIC SMALL LETTER CRYPTOGRAMMIC GANGIA
+2CF2..2CF3 ; ALetter # L& [2] COPTIC CAPITAL LETTER BOHAIRIC KHEI..COPTIC SMALL LETTER BOHAIRIC KHEI
+2D00..2D25 ; ALetter # L& [38] GEORGIAN SMALL LETTER AN..GEORGIAN SMALL LETTER HOE
+2D27 ; ALetter # L& GEORGIAN SMALL LETTER YN
+2D2D ; ALetter # L& GEORGIAN SMALL LETTER AEN
+2D30..2D67 ; ALetter # Lo [56] TIFINAGH LETTER YA..TIFINAGH LETTER YO
+2D6F ; ALetter # Lm TIFINAGH MODIFIER LETTER LABIALIZATION MARK
+2D80..2D96 ; ALetter # Lo [23] ETHIOPIC SYLLABLE LOA..ETHIOPIC SYLLABLE GGWE
+2DA0..2DA6 ; ALetter # Lo [7] ETHIOPIC SYLLABLE SSA..ETHIOPIC SYLLABLE SSO
+2DA8..2DAE ; ALetter # Lo [7] ETHIOPIC SYLLABLE CCA..ETHIOPIC SYLLABLE CCO
+2DB0..2DB6 ; ALetter # Lo [7] ETHIOPIC SYLLABLE ZZA..ETHIOPIC SYLLABLE ZZO
+2DB8..2DBE ; ALetter # Lo [7] ETHIOPIC SYLLABLE CCHA..ETHIOPIC SYLLABLE CCHO
+2DC0..2DC6 ; ALetter # Lo [7] ETHIOPIC SYLLABLE QYA..ETHIOPIC SYLLABLE QYO
+2DC8..2DCE ; ALetter # Lo [7] ETHIOPIC SYLLABLE KYA..ETHIOPIC SYLLABLE KYO
+2DD0..2DD6 ; ALetter # Lo [7] ETHIOPIC SYLLABLE XYA..ETHIOPIC SYLLABLE XYO
+2DD8..2DDE ; ALetter # Lo [7] ETHIOPIC SYLLABLE GYA..ETHIOPIC SYLLABLE GYO
+2E2F ; ALetter # Lm VERTICAL TILDE
+3005 ; ALetter # Lm IDEOGRAPHIC ITERATION MARK
+303B ; ALetter # Lm VERTICAL IDEOGRAPHIC ITERATION MARK
+303C ; ALetter # Lo MASU MARK
+3105..312F ; ALetter # Lo [43] BOPOMOFO LETTER B..BOPOMOFO LETTER NN
+3131..318E ; ALetter # Lo [94] HANGUL LETTER KIYEOK..HANGUL LETTER ARAEAE
+31A0..31BF ; ALetter # Lo [32] BOPOMOFO LETTER BU..BOPOMOFO LETTER AH
+A000..A014 ; ALetter # Lo [21] YI SYLLABLE IT..YI SYLLABLE E
+A015 ; ALetter # Lm YI SYLLABLE WU
+A016..A48C ; ALetter # Lo [1143] YI SYLLABLE BIT..YI SYLLABLE YYR
+A4D0..A4F7 ; ALetter # Lo [40] LISU LETTER BA..LISU LETTER OE
+A4F8..A4FD ; ALetter # Lm [6] LISU LETTER TONE MYA TI..LISU LETTER TONE MYA JEU
+A500..A60B ; ALetter # Lo [268] VAI SYLLABLE EE..VAI SYLLABLE NG
+A60C ; ALetter # Lm VAI SYLLABLE LENGTHENER
+A610..A61F ; ALetter # Lo [16] VAI SYLLABLE NDOLE FA..VAI SYMBOL JONG
+A62A..A62B ; ALetter # Lo [2] VAI SYLLABLE NDOLE MA..VAI SYLLABLE NDOLE DO
+A640..A66D ; ALetter # L& [46] CYRILLIC CAPITAL LETTER ZEMLYA..CYRILLIC SMALL LETTER DOUBLE MONOCULAR O
+A66E ; ALetter # Lo CYRILLIC LETTER MULTIOCULAR O
+A67F ; ALetter # Lm CYRILLIC PAYEROK
+A680..A69B ; ALetter # L& [28] CYRILLIC CAPITAL LETTER DWE..CYRILLIC SMALL LETTER CROSSED O
+A69C..A69D ; ALetter # Lm [2] MODIFIER LETTER CYRILLIC HARD SIGN..MODIFIER LETTER CYRILLIC SOFT SIGN
+A6A0..A6E5 ; ALetter # Lo [70] BAMUM LETTER A..BAMUM LETTER KI
+A6E6..A6EF ; ALetter # Nl [10] BAMUM LETTER MO..BAMUM LETTER KOGHOM
+A708..A716 ; ALetter # Sk [15] MODIFIER LETTER EXTRA-HIGH DOTTED TONE BAR..MODIFIER LETTER EXTRA-LOW LEFT-STEM TONE BAR
+A717..A71F ; ALetter # Lm [9] MODIFIER LETTER DOT VERTICAL BAR..MODIFIER LETTER LOW INVERTED EXCLAMATION MARK
+A720..A721 ; ALetter # Sk [2] MODIFIER LETTER STRESS AND HIGH TONE..MODIFIER LETTER STRESS AND LOW TONE
+A722..A76F ; ALetter # L& [78] LATIN CAPITAL LETTER EGYPTOLOGICAL ALEF..LATIN SMALL LETTER CON
+A770 ; ALetter # Lm MODIFIER LETTER US
+A771..A787 ; ALetter # L& [23] LATIN SMALL LETTER DUM..LATIN SMALL LETTER INSULAR T
+A788 ; ALetter # Lm MODIFIER LETTER LOW CIRCUMFLEX ACCENT
+A789..A78A ; ALetter # Sk [2] MODIFIER LETTER COLON..MODIFIER LETTER SHORT EQUALS SIGN
+A78B..A78E ; ALetter # L& [4] LATIN CAPITAL LETTER SALTILLO..LATIN SMALL LETTER L WITH RETROFLEX HOOK AND BELT
+A78F ; ALetter # Lo LATIN LETTER SINOLOGICAL DOT
+A790..A7DC ; ALetter # L& [77] LATIN CAPITAL LETTER N WITH DESCENDER..LATIN CAPITAL LETTER LAMBDA WITH STROKE
+A7F1..A7F4 ; ALetter # Lm [4] MODIFIER LETTER CAPITAL S..MODIFIER LETTER CAPITAL Q
+A7F5..A7F6 ; ALetter # L& [2] LATIN CAPITAL LETTER REVERSED HALF H..LATIN SMALL LETTER REVERSED HALF H
+A7F7 ; ALetter # Lo LATIN EPIGRAPHIC LETTER SIDEWAYS I
+A7F8..A7F9 ; ALetter # Lm [2] MODIFIER LETTER CAPITAL H WITH STROKE..MODIFIER LETTER SMALL LIGATURE OE
+A7FA ; ALetter # L& LATIN LETTER SMALL CAPITAL TURNED M
+A7FB..A801 ; ALetter # Lo [7] LATIN EPIGRAPHIC LETTER REVERSED F..SYLOTI NAGRI LETTER I
+A803..A805 ; ALetter # Lo [3] SYLOTI NAGRI LETTER U..SYLOTI NAGRI LETTER O
+A807..A80A ; ALetter # Lo [4] SYLOTI NAGRI LETTER KO..SYLOTI NAGRI LETTER GHO
+A80C..A822 ; ALetter # Lo [23] SYLOTI NAGRI LETTER CO..SYLOTI NAGRI LETTER HO
+A840..A873 ; ALetter # Lo [52] PHAGS-PA LETTER KA..PHAGS-PA LETTER CANDRABINDU
+A882..A8B3 ; ALetter # Lo [50] SAURASHTRA LETTER A..SAURASHTRA LETTER LLA
+A8F2..A8F7 ; ALetter # Lo [6] DEVANAGARI SIGN SPACING CANDRABINDU..DEVANAGARI SIGN CANDRABINDU AVAGRAHA
+A8FB ; ALetter # Lo DEVANAGARI HEADSTROKE
+A8FD..A8FE ; ALetter # Lo [2] DEVANAGARI JAIN OM..DEVANAGARI LETTER AY
+A90A..A925 ; ALetter # Lo [28] KAYAH LI LETTER KA..KAYAH LI LETTER OO
+A930..A946 ; ALetter # Lo [23] REJANG LETTER KA..REJANG LETTER A
+A960..A97C ; ALetter # Lo [29] HANGUL CHOSEONG TIKEUT-MIEUM..HANGUL CHOSEONG SSANGYEORINHIEUH
+A984..A9B2 ; ALetter # Lo [47] JAVANESE LETTER A..JAVANESE LETTER HA
+A9CF ; ALetter # Lm JAVANESE PANGRANGKEP
+AA00..AA28 ; ALetter # Lo [41] CHAM LETTER A..CHAM LETTER HA
+AA40..AA42 ; ALetter # Lo [3] CHAM LETTER FINAL K..CHAM LETTER FINAL NG
+AA44..AA4B ; ALetter # Lo [8] CHAM LETTER FINAL CH..CHAM LETTER FINAL SS
+AAE0..AAEA ; ALetter # Lo [11] MEETEI MAYEK LETTER E..MEETEI MAYEK LETTER SSA
+AAF2 ; ALetter # Lo MEETEI MAYEK ANJI
+AAF3..AAF4 ; ALetter # Lm [2] MEETEI MAYEK SYLLABLE REPETITION MARK..MEETEI MAYEK WORD REPETITION MARK
+AB01..AB06 ; ALetter # Lo [6] ETHIOPIC SYLLABLE TTHU..ETHIOPIC SYLLABLE TTHO
+AB09..AB0E ; ALetter # Lo [6] ETHIOPIC SYLLABLE DDHU..ETHIOPIC SYLLABLE DDHO
+AB11..AB16 ; ALetter # Lo [6] ETHIOPIC SYLLABLE DZU..ETHIOPIC SYLLABLE DZO
+AB20..AB26 ; ALetter # Lo [7] ETHIOPIC SYLLABLE CCHHA..ETHIOPIC SYLLABLE CCHHO
+AB28..AB2E ; ALetter # Lo [7] ETHIOPIC SYLLABLE BBA..ETHIOPIC SYLLABLE BBO
+AB30..AB5A ; ALetter # L& [43] LATIN SMALL LETTER BARRED ALPHA..LATIN SMALL LETTER Y WITH SHORT RIGHT LEG
+AB5B ; ALetter # Sk MODIFIER BREVE WITH INVERTED BREVE
+AB5C..AB5F ; ALetter # Lm [4] MODIFIER LETTER SMALL HENG..MODIFIER LETTER SMALL U WITH LEFT HOOK
+AB60..AB68 ; ALetter # L& [9] LATIN SMALL LETTER SAKHA YAT..LATIN SMALL LETTER TURNED R WITH MIDDLE TILDE
+AB69 ; ALetter # Lm MODIFIER LETTER SMALL TURNED W
+AB70..ABBF ; ALetter # L& [80] CHEROKEE SMALL LETTER A..CHEROKEE SMALL LETTER YA
+ABC0..ABE2 ; ALetter # Lo [35] MEETEI MAYEK LETTER KOK..MEETEI MAYEK LETTER I LONSUM
+AC00..D7A3 ; ALetter # Lo [11172] HANGUL SYLLABLE GA..HANGUL SYLLABLE HIH
+D7B0..D7C6 ; ALetter # Lo [23] HANGUL JUNGSEONG O-YEO..HANGUL JUNGSEONG ARAEA-E
+D7CB..D7FB ; ALetter # Lo [49] HANGUL JONGSEONG NIEUN-RIEUL..HANGUL JONGSEONG PHIEUPH-THIEUTH
+FB00..FB06 ; ALetter # L& [7] LATIN SMALL LIGATURE FF..LATIN SMALL LIGATURE ST
+FB13..FB17 ; ALetter # L& [5] ARMENIAN SMALL LIGATURE MEN NOW..ARMENIAN SMALL LIGATURE MEN XEH
+FB50..FBB1 ; ALetter # Lo [98] ARABIC LETTER ALEF WASLA ISOLATED FORM..ARABIC LETTER YEH BARREE WITH HAMZA ABOVE FINAL FORM
+FBD3..FD3D ; ALetter # Lo [363] ARABIC LETTER NG ISOLATED FORM..ARABIC LIGATURE ALEF WITH FATHATAN ISOLATED FORM
+FD50..FD8F ; ALetter # Lo [64] ARABIC LIGATURE TEH WITH JEEM WITH MEEM INITIAL FORM..ARABIC LIGATURE MEEM WITH KHAH WITH MEEM INITIAL FORM
+FD92..FDC7 ; ALetter # Lo [54] ARABIC LIGATURE MEEM WITH JEEM WITH KHAH INITIAL FORM..ARABIC LIGATURE NOON WITH JEEM WITH YEH FINAL FORM
+FDF0..FDFB ; ALetter # Lo [12] ARABIC LIGATURE SALLA USED AS KORANIC STOP SIGN ISOLATED FORM..ARABIC LIGATURE JALLAJALALOUHOU
+FE70..FE74 ; ALetter # Lo [5] ARABIC FATHATAN ISOLATED FORM..ARABIC KASRATAN ISOLATED FORM
+FE76..FEFC ; ALetter # Lo [135] ARABIC FATHA ISOLATED FORM..ARABIC LIGATURE LAM WITH ALEF FINAL FORM
+FF21..FF3A ; ALetter # L& [26] FULLWIDTH LATIN CAPITAL LETTER A..FULLWIDTH LATIN CAPITAL LETTER Z
+FF41..FF5A ; ALetter # L& [26] FULLWIDTH LATIN SMALL LETTER A..FULLWIDTH LATIN SMALL LETTER Z
+FFA0..FFBE ; ALetter # Lo [31] HALFWIDTH HANGUL FILLER..HALFWIDTH HANGUL LETTER HIEUH
+FFC2..FFC7 ; ALetter # Lo [6] HALFWIDTH HANGUL LETTER A..HALFWIDTH HANGUL LETTER E
+FFCA..FFCF ; ALetter # Lo [6] HALFWIDTH HANGUL LETTER YEO..HALFWIDTH HANGUL LETTER OE
+FFD2..FFD7 ; ALetter # Lo [6] HALFWIDTH HANGUL LETTER YO..HALFWIDTH HANGUL LETTER YU
+FFDA..FFDC ; ALetter # Lo [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HANGUL LETTER I
+10000..1000B ; ALetter # Lo [12] LINEAR B SYLLABLE B008 A..LINEAR B SYLLABLE B046 JE
+1000D..10026 ; ALetter # Lo [26] LINEAR B SYLLABLE B036 JO..LINEAR B SYLLABLE B032 QO
+10028..1003A ; ALetter # Lo [19] LINEAR B SYLLABLE B060 RA..LINEAR B SYLLABLE B042 WO
+1003C..1003D ; ALetter # Lo [2] LINEAR B SYLLABLE B017 ZA..LINEAR B SYLLABLE B074 ZE
+1003F..1004D ; ALetter # Lo [15] LINEAR B SYLLABLE B020 ZO..LINEAR B SYLLABLE B091 TWO
+10050..1005D ; ALetter # Lo [14] LINEAR B SYMBOL B018..LINEAR B SYMBOL B089
+10080..100FA ; ALetter # Lo [123] LINEAR B IDEOGRAM B100 MAN..LINEAR B IDEOGRAM VESSEL B305
+10140..10174 ; ALetter # Nl [53] GREEK ACROPHONIC ATTIC ONE QUARTER..GREEK ACROPHONIC STRATIAN FIFTY MNAS
+10280..1029C ; ALetter # Lo [29] LYCIAN LETTER A..LYCIAN LETTER X
+102A0..102D0 ; ALetter # Lo [49] CARIAN LETTER A..CARIAN LETTER UUU3
+10300..1031F ; ALetter # Lo [32] OLD ITALIC LETTER A..OLD ITALIC LETTER ESS
+1032D..10340 ; ALetter # Lo [20] OLD ITALIC LETTER YE..GOTHIC LETTER PAIRTHRA
+10341 ; ALetter # Nl GOTHIC LETTER NINETY
+10342..10349 ; ALetter # Lo [8] GOTHIC LETTER RAIDA..GOTHIC LETTER OTHAL
+1034A ; ALetter # Nl GOTHIC LETTER NINE HUNDRED
+10350..10375 ; ALetter # Lo [38] OLD PERMIC LETTER AN..OLD PERMIC LETTER IA
+10380..1039D ; ALetter # Lo [30] UGARITIC LETTER ALPA..UGARITIC LETTER SSU
+103A0..103C3 ; ALetter # Lo [36] OLD PERSIAN SIGN A..OLD PERSIAN SIGN HA
+103C8..103CF ; ALetter # Lo [8] OLD PERSIAN SIGN AURAMAZDAA..OLD PERSIAN SIGN BUUMISH
+103D1..103D5 ; ALetter # Nl [5] OLD PERSIAN NUMBER ONE..OLD PERSIAN NUMBER HUNDRED
+10400..1044F ; ALetter # L& [80] DESERET CAPITAL LETTER LONG I..DESERET SMALL LETTER EW
+10450..1049D ; ALetter # Lo [78] SHAVIAN LETTER PEEP..OSMANYA LETTER OO
+104B0..104D3 ; ALetter # L& [36] OSAGE CAPITAL LETTER A..OSAGE CAPITAL LETTER ZHA
+104D8..104FB ; ALetter # L& [36] OSAGE SMALL LETTER A..OSAGE SMALL LETTER ZHA
+10500..10527 ; ALetter # Lo [40] ELBASAN LETTER A..ELBASAN LETTER KHE
+10530..10563 ; ALetter # Lo [52] CAUCASIAN ALBANIAN LETTER ALT..CAUCASIAN ALBANIAN LETTER KIW
+10570..1057A ; ALetter # L& [11] VITHKUQI CAPITAL LETTER A..VITHKUQI CAPITAL LETTER GA
+1057C..1058A ; ALetter # L& [15] VITHKUQI CAPITAL LETTER HA..VITHKUQI CAPITAL LETTER RE
+1058C..10592 ; ALetter # L& [7] VITHKUQI CAPITAL LETTER SE..VITHKUQI CAPITAL LETTER XE
+10594..10595 ; ALetter # L& [2] VITHKUQI CAPITAL LETTER Y..VITHKUQI CAPITAL LETTER ZE
+10597..105A1 ; ALetter # L& [11] VITHKUQI SMALL LETTER A..VITHKUQI SMALL LETTER GA
+105A3..105B1 ; ALetter # L& [15] VITHKUQI SMALL LETTER HA..VITHKUQI SMALL LETTER RE
+105B3..105B9 ; ALetter # L& [7] VITHKUQI SMALL LETTER SE..VITHKUQI SMALL LETTER XE
+105BB..105BC ; ALetter # L& [2] VITHKUQI SMALL LETTER Y..VITHKUQI SMALL LETTER ZE
+105C0..105F3 ; ALetter # Lo [52] TODHRI LETTER A..TODHRI LETTER OO
+10600..10736 ; ALetter # Lo [311] LINEAR A SIGN AB001..LINEAR A SIGN A664
+10740..10755 ; ALetter # Lo [22] LINEAR A SIGN A701 A..LINEAR A SIGN A732 JE
+10760..10767 ; ALetter # Lo [8] LINEAR A SIGN A800..LINEAR A SIGN A807
+10780..10785 ; ALetter # Lm [6] MODIFIER LETTER SMALL CAPITAL AA..MODIFIER LETTER SMALL B WITH HOOK
+10787..107B0 ; ALetter # Lm [42] MODIFIER LETTER SMALL DZ DIGRAPH..MODIFIER LETTER SMALL V WITH RIGHT HOOK
+107B2..107BA ; ALetter # Lm [9] MODIFIER LETTER SMALL CAPITAL Y..MODIFIER LETTER SMALL S WITH CURL
+10800..10805 ; ALetter # Lo [6] CYPRIOT SYLLABLE A..CYPRIOT SYLLABLE JA
+10808 ; ALetter # Lo CYPRIOT SYLLABLE JO
+1080A..10835 ; ALetter # Lo [44] CYPRIOT SYLLABLE KA..CYPRIOT SYLLABLE WO
+10837..10838 ; ALetter # Lo [2] CYPRIOT SYLLABLE XA..CYPRIOT SYLLABLE XE
+1083C ; ALetter # Lo CYPRIOT SYLLABLE ZA
+1083F..10855 ; ALetter # Lo [23] CYPRIOT SYLLABLE ZO..IMPERIAL ARAMAIC LETTER TAW
+10860..10876 ; ALetter # Lo [23] PALMYRENE LETTER ALEPH..PALMYRENE LETTER TAW
+10880..1089E ; ALetter # Lo [31] NABATAEAN LETTER FINAL ALEPH..NABATAEAN LETTER TAW
+108E0..108F2 ; ALetter # Lo [19] HATRAN LETTER ALEPH..HATRAN LETTER QOPH
+108F4..108F5 ; ALetter # Lo [2] HATRAN LETTER SHIN..HATRAN LETTER TAW
+10900..10915 ; ALetter # Lo [22] PHOENICIAN LETTER ALF..PHOENICIAN LETTER TAU
+10920..10939 ; ALetter # Lo [26] LYDIAN LETTER A..LYDIAN LETTER C
+10940..10959 ; ALetter # Lo [26] SIDETIC LETTER N01..SIDETIC LETTER N26
+10980..109B7 ; ALetter # Lo [56] MEROITIC HIEROGLYPHIC LETTER A..MEROITIC CURSIVE LETTER DA
+109BE..109BF ; ALetter # Lo [2] MEROITIC CURSIVE LOGOGRAM RMT..MEROITIC CURSIVE LOGOGRAM IMN
+10A00 ; ALetter # Lo KHAROSHTHI LETTER A
+10A10..10A13 ; ALetter # Lo [4] KHAROSHTHI LETTER KA..KHAROSHTHI LETTER GHA
+10A15..10A17 ; ALetter # Lo [3] KHAROSHTHI LETTER CA..KHAROSHTHI LETTER JA
+10A19..10A35 ; ALetter # Lo [29] KHAROSHTHI LETTER NYA..KHAROSHTHI LETTER VHA
+10A60..10A7C ; ALetter # Lo [29] OLD SOUTH ARABIAN LETTER HE..OLD SOUTH ARABIAN LETTER THETH
+10A80..10A9C ; ALetter # Lo [29] OLD NORTH ARABIAN LETTER HEH..OLD NORTH ARABIAN LETTER ZAH
+10AC0..10AC7 ; ALetter # Lo [8] MANICHAEAN LETTER ALEPH..MANICHAEAN LETTER WAW
+10AC9..10AE4 ; ALetter # Lo [28] MANICHAEAN LETTER ZAYIN..MANICHAEAN LETTER TAW
+10B00..10B35 ; ALetter # Lo [54] AVESTAN LETTER A..AVESTAN LETTER HE
+10B40..10B55 ; ALetter # Lo [22] INSCRIPTIONAL PARTHIAN LETTER ALEPH..INSCRIPTIONAL PARTHIAN LETTER TAW
+10B60..10B72 ; ALetter # Lo [19] INSCRIPTIONAL PAHLAVI LETTER ALEPH..INSCRIPTIONAL PAHLAVI LETTER TAW
+10B80..10B91 ; ALetter # Lo [18] PSALTER PAHLAVI LETTER ALEPH..PSALTER PAHLAVI LETTER TAW
+10C00..10C48 ; ALetter # Lo [73] OLD TURKIC LETTER ORKHON A..OLD TURKIC LETTER ORKHON BASH
+10C80..10CB2 ; ALetter # L& [51] OLD HUNGARIAN CAPITAL LETTER A..OLD HUNGARIAN CAPITAL LETTER US
+10CC0..10CF2 ; ALetter # L& [51] OLD HUNGARIAN SMALL LETTER A..OLD HUNGARIAN SMALL LETTER US
+10D00..10D23 ; ALetter # Lo [36] HANIFI ROHINGYA LETTER A..HANIFI ROHINGYA MARK NA KHONNA
+10D4A..10D4D ; ALetter # Lo [4] GARAY VOWEL SIGN A..GARAY VOWEL SIGN EE
+10D4E ; ALetter # Lm GARAY VOWEL LENGTH MARK
+10D4F ; ALetter # Lo GARAY SUKUN
+10D50..10D65 ; ALetter # L& [22] GARAY CAPITAL LETTER A..GARAY CAPITAL LETTER OLD NA
+10D6F ; ALetter # Lm GARAY REDUPLICATION MARK
+10D70..10D85 ; ALetter # L& [22] GARAY SMALL LETTER A..GARAY SMALL LETTER OLD NA
+10E80..10EA9 ; ALetter # Lo [42] YEZIDI LETTER ELIF..YEZIDI LETTER ET
+10EB0..10EB1 ; ALetter # Lo [2] YEZIDI LETTER LAM WITH DOT ABOVE..YEZIDI LETTER YOT WITH CIRCUMFLEX ABOVE
+10EC2..10EC4 ; ALetter # Lo [3] ARABIC LETTER DAL WITH TWO DOTS VERTICALLY BELOW..ARABIC LETTER KAF WITH TWO DOTS VERTICALLY BELOW
+10EC5 ; ALetter # Lm ARABIC SMALL YEH BARREE WITH TWO DOTS BELOW
+10EC6..10EC7 ; ALetter # Lo [2] ARABIC LETTER THIN NOON..ARABIC LETTER YEH WITH FOUR DOTS BELOW
+10F00..10F1C ; ALetter # Lo [29] OLD SOGDIAN LETTER ALEPH..OLD SOGDIAN LETTER FINAL TAW WITH VERTICAL TAIL
+10F27 ; ALetter # Lo OLD SOGDIAN LIGATURE AYIN-DALETH
+10F30..10F45 ; ALetter # Lo [22] SOGDIAN LETTER ALEPH..SOGDIAN INDEPENDENT SHIN
+10F70..10F81 ; ALetter # Lo [18] OLD UYGHUR LETTER ALEPH..OLD UYGHUR LETTER LESH
+10FB0..10FC4 ; ALetter # Lo [21] CHORASMIAN LETTER ALEPH..CHORASMIAN LETTER TAW
+10FE0..10FF6 ; ALetter # Lo [23] ELYMAIC LETTER ALEPH..ELYMAIC LIGATURE ZAYIN-YODH
+11003..11037 ; ALetter # Lo [53] BRAHMI SIGN JIHVAMULIYA..BRAHMI LETTER OLD TAMIL NNNA
+11071..11072 ; ALetter # Lo [2] BRAHMI LETTER OLD TAMIL SHORT E..BRAHMI LETTER OLD TAMIL SHORT O
+11075 ; ALetter # Lo BRAHMI LETTER OLD TAMIL LLA
+11083..110AF ; ALetter # Lo [45] KAITHI LETTER A..KAITHI LETTER HA
+110D0..110E8 ; ALetter # Lo [25] SORA SOMPENG LETTER SAH..SORA SOMPENG LETTER MAE
+11103..11126 ; ALetter # Lo [36] CHAKMA LETTER AA..CHAKMA LETTER HAA
+11144 ; ALetter # Lo CHAKMA LETTER LHAA
+11147 ; ALetter # Lo CHAKMA LETTER VAA
+11150..11172 ; ALetter # Lo [35] MAHAJANI LETTER A..MAHAJANI LETTER RRA
+11176 ; ALetter # Lo MAHAJANI LIGATURE SHRI
+11183..111B2 ; ALetter # Lo [48] SHARADA LETTER A..SHARADA LETTER HA
+111C1..111C4 ; ALetter # Lo [4] SHARADA SIGN AVAGRAHA..SHARADA OM
+111DA ; ALetter # Lo SHARADA EKAM
+111DC ; ALetter # Lo SHARADA HEADSTROKE
+11200..11211 ; ALetter # Lo [18] KHOJKI LETTER A..KHOJKI LETTER JJA
+11213..1122B ; ALetter # Lo [25] KHOJKI LETTER NYA..KHOJKI LETTER LLA
+1123F..11240 ; ALetter # Lo [2] KHOJKI LETTER QA..KHOJKI LETTER SHORT I
+11280..11286 ; ALetter # Lo [7] MULTANI LETTER A..MULTANI LETTER GA
+11288 ; ALetter # Lo MULTANI LETTER GHA
+1128A..1128D ; ALetter # Lo [4] MULTANI LETTER CA..MULTANI LETTER JJA
+1128F..1129D ; ALetter # Lo [15] MULTANI LETTER NYA..MULTANI LETTER BA
+1129F..112A8 ; ALetter # Lo [10] MULTANI LETTER BHA..MULTANI LETTER RHA
+112B0..112DE ; ALetter # Lo [47] KHUDAWADI LETTER A..KHUDAWADI LETTER HA
+11305..1130C ; ALetter # Lo [8] GRANTHA LETTER A..GRANTHA LETTER VOCALIC L
+1130F..11310 ; ALetter # Lo [2] GRANTHA LETTER EE..GRANTHA LETTER AI
+11313..11328 ; ALetter # Lo [22] GRANTHA LETTER OO..GRANTHA LETTER NA
+1132A..11330 ; ALetter # Lo [7] GRANTHA LETTER PA..GRANTHA LETTER RA
+11332..11333 ; ALetter # Lo [2] GRANTHA LETTER LA..GRANTHA LETTER LLA
+11335..11339 ; ALetter # Lo [5] GRANTHA LETTER VA..GRANTHA LETTER HA
+1133D ; ALetter # Lo GRANTHA SIGN AVAGRAHA
+11350 ; ALetter # Lo GRANTHA OM
+1135D..11361 ; ALetter # Lo [5] GRANTHA SIGN PLUTA..GRANTHA LETTER VOCALIC LL
+11380..11389 ; ALetter # Lo [10] TULU-TIGALARI LETTER A..TULU-TIGALARI LETTER VOCALIC LL
+1138B ; ALetter # Lo TULU-TIGALARI LETTER EE
+1138E ; ALetter # Lo TULU-TIGALARI LETTER AI
+11390..113B5 ; ALetter # Lo [38] TULU-TIGALARI LETTER OO..TULU-TIGALARI LETTER LLLA
+113B7 ; ALetter # Lo TULU-TIGALARI SIGN AVAGRAHA
+113D1 ; ALetter # Lo TULU-TIGALARI REPHA
+113D3 ; ALetter # Lo TULU-TIGALARI SIGN PLUTA
+11400..11434 ; ALetter # Lo [53] NEWA LETTER A..NEWA LETTER HA
+11447..1144A ; ALetter # Lo [4] NEWA SIGN AVAGRAHA..NEWA SIDDHI
+1145F..11461 ; ALetter # Lo [3] NEWA LETTER VEDIC ANUSVARA..NEWA SIGN UPADHMANIYA
+11480..114AF ; ALetter # Lo [48] TIRHUTA ANJI..TIRHUTA LETTER HA
+114C4..114C5 ; ALetter # Lo [2] TIRHUTA SIGN AVAGRAHA..TIRHUTA GVANG
+114C7 ; ALetter # Lo TIRHUTA OM
+11580..115AE ; ALetter # Lo [47] SIDDHAM LETTER A..SIDDHAM LETTER HA
+115D8..115DB ; ALetter # Lo [4] SIDDHAM LETTER THREE-CIRCLE ALTERNATE I..SIDDHAM LETTER ALTERNATE U
+11600..1162F ; ALetter # Lo [48] MODI LETTER A..MODI LETTER LLA
+11644 ; ALetter # Lo MODI SIGN HUVA
+11680..116AA ; ALetter # Lo [43] TAKRI LETTER A..TAKRI LETTER RRA
+116B8 ; ALetter # Lo TAKRI LETTER ARCHAIC KHA
+11800..1182B ; ALetter # Lo [44] DOGRA LETTER A..DOGRA LETTER RRA
+118A0..118DF ; ALetter # L& [64] WARANG CITI CAPITAL LETTER NGAA..WARANG CITI SMALL LETTER VIYO
+118FF..11906 ; ALetter # Lo [8] WARANG CITI OM..DIVES AKURU LETTER E
+11909 ; ALetter # Lo DIVES AKURU LETTER O
+1190C..11913 ; ALetter # Lo [8] DIVES AKURU LETTER KA..DIVES AKURU LETTER JA
+11915..11916 ; ALetter # Lo [2] DIVES AKURU LETTER NYA..DIVES AKURU LETTER TTA
+11918..1192F ; ALetter # Lo [24] DIVES AKURU LETTER DDA..DIVES AKURU LETTER ZA
+1193F ; ALetter # Lo DIVES AKURU PREFIXED NASAL SIGN
+11941 ; ALetter # Lo DIVES AKURU INITIAL RA
+119A0..119A7 ; ALetter # Lo [8] NANDINAGARI LETTER A..NANDINAGARI LETTER VOCALIC RR
+119AA..119D0 ; ALetter # Lo [39] NANDINAGARI LETTER E..NANDINAGARI LETTER RRA
+119E1 ; ALetter # Lo NANDINAGARI SIGN AVAGRAHA
+119E3 ; ALetter # Lo NANDINAGARI HEADSTROKE
+11A00 ; ALetter # Lo ZANABAZAR SQUARE LETTER A
+11A0B..11A32 ; ALetter # Lo [40] ZANABAZAR SQUARE LETTER KA..ZANABAZAR SQUARE LETTER KSSA
+11A3A ; ALetter # Lo ZANABAZAR SQUARE CLUSTER-INITIAL LETTER RA
+11A50 ; ALetter # Lo SOYOMBO LETTER A
+11A5C..11A89 ; ALetter # Lo [46] SOYOMBO LETTER KA..SOYOMBO CLUSTER-INITIAL LETTER SA
+11A9D ; ALetter # Lo SOYOMBO MARK PLUTA
+11AB0..11AF8 ; ALetter # Lo [73] CANADIAN SYLLABICS NATTILIK HI..PAU CIN HAU GLOTTAL STOP FINAL
+11BC0..11BE0 ; ALetter # Lo [33] SUNUWAR LETTER DEVI..SUNUWAR LETTER KLOKO
+11C00..11C08 ; ALetter # Lo [9] BHAIKSUKI LETTER A..BHAIKSUKI LETTER VOCALIC L
+11C0A..11C2E ; ALetter # Lo [37] BHAIKSUKI LETTER E..BHAIKSUKI LETTER HA
+11C40 ; ALetter # Lo BHAIKSUKI SIGN AVAGRAHA
+11C72..11C8F ; ALetter # Lo [30] MARCHEN LETTER KA..MARCHEN LETTER A
+11D00..11D06 ; ALetter # Lo [7] MASARAM GONDI LETTER A..MASARAM GONDI LETTER E
+11D08..11D09 ; ALetter # Lo [2] MASARAM GONDI LETTER AI..MASARAM GONDI LETTER O
+11D0B..11D30 ; ALetter # Lo [38] MASARAM GONDI LETTER AU..MASARAM GONDI LETTER TRA
+11D46 ; ALetter # Lo MASARAM GONDI REPHA
+11D60..11D65 ; ALetter # Lo [6] GUNJALA GONDI LETTER A..GUNJALA GONDI LETTER UU
+11D67..11D68 ; ALetter # Lo [2] GUNJALA GONDI LETTER EE..GUNJALA GONDI LETTER AI
+11D6A..11D89 ; ALetter # Lo [32] GUNJALA GONDI LETTER OO..GUNJALA GONDI LETTER SA
+11D98 ; ALetter # Lo GUNJALA GONDI OM
+11DB0..11DD8 ; ALetter # Lo [41] TOLONG SIKI LETTER I..TOLONG SIKI LETTER RRH
+11DD9 ; ALetter # Lm TOLONG SIKI SIGN SELA
+11DDA..11DDB ; ALetter # Lo [2] TOLONG SIKI SIGN HECAKA..TOLONG SIKI UNGGA
+11EE0..11EF2 ; ALetter # Lo [19] MAKASAR LETTER KA..MAKASAR ANGKA
+11F02 ; ALetter # Lo KAWI SIGN REPHA
+11F04..11F10 ; ALetter # Lo [13] KAWI LETTER A..KAWI LETTER O
+11F12..11F33 ; ALetter # Lo [34] KAWI LETTER KA..KAWI LETTER JNYA
+11FB0 ; ALetter # Lo LISU LETTER YHA
+12000..12399 ; ALetter # Lo [922] CUNEIFORM SIGN A..CUNEIFORM SIGN U U
+12400..1246E ; ALetter # Nl [111] CUNEIFORM NUMERIC SIGN TWO ASH..CUNEIFORM NUMERIC SIGN NINE U VARIANT FORM
+12480..12543 ; ALetter # Lo [196] CUNEIFORM SIGN AB TIMES NUN TENU..CUNEIFORM SIGN ZU5 TIMES THREE DISH TENU
+12F90..12FF0 ; ALetter # Lo [97] CYPRO-MINOAN SIGN CM001..CYPRO-MINOAN SIGN CM114
+13000..1342F ; ALetter # Lo [1072] EGYPTIAN HIEROGLYPH A001..EGYPTIAN HIEROGLYPH V011D
+13441..13446 ; ALetter # Lo [6] EGYPTIAN HIEROGLYPH FULL BLANK..EGYPTIAN HIEROGLYPH WIDE LOST SIGN
+13460..143FA ; ALetter # Lo [3995] EGYPTIAN HIEROGLYPH-13460..EGYPTIAN HIEROGLYPH-143FA
+14400..14646 ; ALetter # Lo [583] ANATOLIAN HIEROGLYPH A001..ANATOLIAN HIEROGLYPH A530
+16100..1611D ; ALetter # Lo [30] GURUNG KHEMA LETTER A..GURUNG KHEMA LETTER SA
+16800..16A38 ; ALetter # Lo [569] BAMUM LETTER PHASE-A NGKUE MFON..BAMUM LETTER PHASE-F VUEQ
+16A40..16A5E ; ALetter # Lo [31] MRO LETTER TA..MRO LETTER TEK
+16A70..16ABE ; ALetter # Lo [79] TANGSA LETTER OZ..TANGSA LETTER ZA
+16AD0..16AED ; ALetter # Lo [30] BASSA VAH LETTER ENNI..BASSA VAH LETTER I
+16B00..16B2F ; ALetter # Lo [48] PAHAWH HMONG VOWEL KEEB..PAHAWH HMONG CONSONANT CAU
+16B40..16B43 ; ALetter # Lm [4] PAHAWH HMONG SIGN VOS SEEV..PAHAWH HMONG SIGN IB YAM
+16B63..16B77 ; ALetter # Lo [21] PAHAWH HMONG SIGN VOS LUB..PAHAWH HMONG SIGN CIM NRES TOS
+16B7D..16B8F ; ALetter # Lo [19] PAHAWH HMONG CLAN SIGN TSHEEJ..PAHAWH HMONG CLAN SIGN VWJ
+16D40..16D42 ; ALetter # Lm [3] KIRAT RAI SIGN ANUSVARA..KIRAT RAI SIGN VISARGA
+16D43..16D6A ; ALetter # Lo [40] KIRAT RAI LETTER A..KIRAT RAI VOWEL SIGN AU
+16D6B..16D6C ; ALetter # Lm [2] KIRAT RAI SIGN VIRAMA..KIRAT RAI SIGN SAAT
+16E40..16E7F ; ALetter # L& [64] MEDEFAIDRIN CAPITAL LETTER M..MEDEFAIDRIN SMALL LETTER Y
+16EA0..16EB8 ; ALetter # L& [25] BERIA ERFE CAPITAL LETTER ARKAB..BERIA ERFE CAPITAL LETTER AY
+16EBB..16ED3 ; ALetter # L& [25] BERIA ERFE SMALL LETTER ARKAB..BERIA ERFE SMALL LETTER AY
+16F00..16F4A ; ALetter # Lo [75] MIAO LETTER PA..MIAO LETTER RTE
+16F50 ; ALetter # Lo MIAO LETTER NASALIZATION
+16F93..16F9F ; ALetter # Lm [13] MIAO LETTER TONE-2..MIAO LETTER REFORMED TONE-8
+16FE0..16FE1 ; ALetter # Lm [2] TANGUT ITERATION MARK..NUSHU ITERATION MARK
+16FE3 ; ALetter # Lm OLD CHINESE ITERATION MARK
+1BC00..1BC6A ; ALetter # Lo [107] DUPLOYAN LETTER H..DUPLOYAN LETTER VOCALIC M
+1BC70..1BC7C ; ALetter # Lo [13] DUPLOYAN AFFIX LEFT HORIZONTAL SECANT..DUPLOYAN AFFIX ATTACHED TANGENT HOOK
+1BC80..1BC88 ; ALetter # Lo [9] DUPLOYAN AFFIX HIGH ACUTE..DUPLOYAN AFFIX HIGH VERTICAL
+1BC90..1BC99 ; ALetter # Lo [10] DUPLOYAN AFFIX LOW ACUTE..DUPLOYAN AFFIX LOW ARROW
+1D400..1D454 ; ALetter # L& [85] MATHEMATICAL BOLD CAPITAL A..MATHEMATICAL ITALIC SMALL G
+1D456..1D49C ; ALetter # L& [71] MATHEMATICAL ITALIC SMALL I..MATHEMATICAL SCRIPT CAPITAL A
+1D49E..1D49F ; ALetter # L& [2] MATHEMATICAL SCRIPT CAPITAL C..MATHEMATICAL SCRIPT CAPITAL D
+1D4A2 ; ALetter # L& MATHEMATICAL SCRIPT CAPITAL G
+1D4A5..1D4A6 ; ALetter # L& [2] MATHEMATICAL SCRIPT CAPITAL J..MATHEMATICAL SCRIPT CAPITAL K
+1D4A9..1D4AC ; ALetter # L& [4] MATHEMATICAL SCRIPT CAPITAL N..MATHEMATICAL SCRIPT CAPITAL Q
+1D4AE..1D4B9 ; ALetter # L& [12] MATHEMATICAL SCRIPT CAPITAL S..MATHEMATICAL SCRIPT SMALL D
+1D4BB ; ALetter # L& MATHEMATICAL SCRIPT SMALL F
+1D4BD..1D4C3 ; ALetter # L& [7] MATHEMATICAL SCRIPT SMALL H..MATHEMATICAL SCRIPT SMALL N
+1D4C5..1D505 ; ALetter # L& [65] MATHEMATICAL SCRIPT SMALL P..MATHEMATICAL FRAKTUR CAPITAL B
+1D507..1D50A ; ALetter # L& [4] MATHEMATICAL FRAKTUR CAPITAL D..MATHEMATICAL FRAKTUR CAPITAL G
+1D50D..1D514 ; ALetter # L& [8] MATHEMATICAL FRAKTUR CAPITAL J..MATHEMATICAL FRAKTUR CAPITAL Q
+1D516..1D51C ; ALetter # L& [7] MATHEMATICAL FRAKTUR CAPITAL S..MATHEMATICAL FRAKTUR CAPITAL Y
+1D51E..1D539 ; ALetter # L& [28] MATHEMATICAL FRAKTUR SMALL A..MATHEMATICAL DOUBLE-STRUCK CAPITAL B
+1D53B..1D53E ; ALetter # L& [4] MATHEMATICAL DOUBLE-STRUCK CAPITAL D..MATHEMATICAL DOUBLE-STRUCK CAPITAL G
+1D540..1D544 ; ALetter # L& [5] MATHEMATICAL DOUBLE-STRUCK CAPITAL I..MATHEMATICAL DOUBLE-STRUCK CAPITAL M
+1D546 ; ALetter # L& MATHEMATICAL DOUBLE-STRUCK CAPITAL O
+1D54A..1D550 ; ALetter # L& [7] MATHEMATICAL DOUBLE-STRUCK CAPITAL S..MATHEMATICAL DOUBLE-STRUCK CAPITAL Y
+1D552..1D6A5 ; ALetter # L& [340] MATHEMATICAL DOUBLE-STRUCK SMALL A..MATHEMATICAL ITALIC SMALL DOTLESS J
+1D6A8..1D6C0 ; ALetter # L& [25] MATHEMATICAL BOLD CAPITAL ALPHA..MATHEMATICAL BOLD CAPITAL OMEGA
+1D6C2..1D6DA ; ALetter # L& [25] MATHEMATICAL BOLD SMALL ALPHA..MATHEMATICAL BOLD SMALL OMEGA
+1D6DC..1D6FA ; ALetter # L& [31] MATHEMATICAL BOLD EPSILON SYMBOL..MATHEMATICAL ITALIC CAPITAL OMEGA
+1D6FC..1D714 ; ALetter # L& [25] MATHEMATICAL ITALIC SMALL ALPHA..MATHEMATICAL ITALIC SMALL OMEGA
+1D716..1D734 ; ALetter # L& [31] MATHEMATICAL ITALIC EPSILON SYMBOL..MATHEMATICAL BOLD ITALIC CAPITAL OMEGA
+1D736..1D74E ; ALetter # L& [25] MATHEMATICAL BOLD ITALIC SMALL ALPHA..MATHEMATICAL BOLD ITALIC SMALL OMEGA
+1D750..1D76E ; ALetter # L& [31] MATHEMATICAL BOLD ITALIC EPSILON SYMBOL..MATHEMATICAL SANS-SERIF BOLD CAPITAL OMEGA
+1D770..1D788 ; ALetter # L& [25] MATHEMATICAL SANS-SERIF BOLD SMALL ALPHA..MATHEMATICAL SANS-SERIF BOLD SMALL OMEGA
+1D78A..1D7A8 ; ALetter # L& [31] MATHEMATICAL SANS-SERIF BOLD EPSILON SYMBOL..MATHEMATICAL SANS-SERIF BOLD ITALIC CAPITAL OMEGA
+1D7AA..1D7C2 ; ALetter # L& [25] MATHEMATICAL SANS-SERIF BOLD ITALIC SMALL ALPHA..MATHEMATICAL SANS-SERIF BOLD ITALIC SMALL OMEGA
+1D7C4..1D7CB ; ALetter # L& [8] MATHEMATICAL SANS-SERIF BOLD ITALIC EPSILON SYMBOL..MATHEMATICAL BOLD SMALL DIGAMMA
+1DF00..1DF09 ; ALetter # L& [10] LATIN SMALL LETTER FENG DIGRAPH WITH TRILL..LATIN SMALL LETTER T WITH HOOK AND RETROFLEX HOOK
+1DF0A ; ALetter # Lo LATIN LETTER RETROFLEX CLICK WITH RETROFLEX HOOK
+1DF0B..1DF1E ; ALetter # L& [20] LATIN SMALL LETTER ESH WITH DOUBLE BAR..LATIN SMALL LETTER S WITH CURL
+1DF25..1DF2A ; ALetter # L& [6] LATIN SMALL LETTER D WITH MID-HEIGHT LEFT HOOK..LATIN SMALL LETTER T WITH MID-HEIGHT LEFT HOOK
+1E030..1E06D ; ALetter # Lm [62] MODIFIER LETTER CYRILLIC SMALL A..MODIFIER LETTER CYRILLIC SMALL STRAIGHT U WITH STROKE
+1E100..1E12C ; ALetter # Lo [45] NYIAKENG PUACHUE HMONG LETTER MA..NYIAKENG PUACHUE HMONG LETTER W
+1E137..1E13D ; ALetter # Lm [7] NYIAKENG PUACHUE HMONG SIGN FOR PERSON..NYIAKENG PUACHUE HMONG SYLLABLE LENGTHENER
+1E14E ; ALetter # Lo NYIAKENG PUACHUE HMONG LOGOGRAM NYAJ
+1E290..1E2AD ; ALetter # Lo [30] TOTO LETTER PA..TOTO LETTER A
+1E2C0..1E2EB ; ALetter # Lo [44] WANCHO LETTER AA..WANCHO LETTER YIH
+1E4D0..1E4EA ; ALetter # Lo [27] NAG MUNDARI LETTER O..NAG MUNDARI LETTER ELL
+1E4EB ; ALetter # Lm NAG MUNDARI SIGN OJOD
+1E5D0..1E5ED ; ALetter # Lo [30] OL ONAL LETTER O..OL ONAL LETTER EG
+1E5F0 ; ALetter # Lo OL ONAL SIGN HODDOND
+1E6C0..1E6DE ; ALetter # Lo [31] TAI YO LETTER LOW KO..TAI YO LETTER HIGH KVO
+1E6E0..1E6E2 ; ALetter # Lo [3] TAI YO LETTER AA..TAI YO LETTER UE
+1E6E4..1E6E5 ; ALetter # Lo [2] TAI YO LETTER U..TAI YO LETTER AE
+1E6E7..1E6ED ; ALetter # Lo [7] TAI YO LETTER O..TAI YO LETTER AUE
+1E6F0..1E6F4 ; ALetter # Lo [5] TAI YO LETTER AN..TAI YO LETTER AP
+1E6FE ; ALetter # Lo TAI YO SYMBOL MUEANG
+1E6FF ; ALetter # Lm TAI YO XAM LAI
+1E7E0..1E7E6 ; ALetter # Lo [7] ETHIOPIC SYLLABLE HHYA..ETHIOPIC SYLLABLE HHYO
+1E7E8..1E7EB ; ALetter # Lo [4] ETHIOPIC SYLLABLE GURAGE HHWA..ETHIOPIC SYLLABLE HHWE
+1E7ED..1E7EE ; ALetter # Lo [2] ETHIOPIC SYLLABLE GURAGE MWI..ETHIOPIC SYLLABLE GURAGE MWEE
+1E7F0..1E7FE ; ALetter # Lo [15] ETHIOPIC SYLLABLE GURAGE QWI..ETHIOPIC SYLLABLE GURAGE PWEE
+1E800..1E8C4 ; ALetter # Lo [197] MENDE KIKAKUI SYLLABLE M001 KI..MENDE KIKAKUI SYLLABLE M060 NYON
+1E900..1E943 ; ALetter # L& [68] ADLAM CAPITAL LETTER ALIF..ADLAM SMALL LETTER SHA
+1E94B ; ALetter # Lm ADLAM NASALIZATION MARK
+1EE00..1EE03 ; ALetter # Lo [4] ARABIC MATHEMATICAL ALEF..ARABIC MATHEMATICAL DAL
+1EE05..1EE1F ; ALetter # Lo [27] ARABIC MATHEMATICAL WAW..ARABIC MATHEMATICAL DOTLESS QAF
+1EE21..1EE22 ; ALetter # Lo [2] ARABIC MATHEMATICAL INITIAL BEH..ARABIC MATHEMATICAL INITIAL JEEM
+1EE24 ; ALetter # Lo ARABIC MATHEMATICAL INITIAL HEH
+1EE27 ; ALetter # Lo ARABIC MATHEMATICAL INITIAL HAH
+1EE29..1EE32 ; ALetter # Lo [10] ARABIC MATHEMATICAL INITIAL YEH..ARABIC MATHEMATICAL INITIAL QAF
+1EE34..1EE37 ; ALetter # Lo [4] ARABIC MATHEMATICAL INITIAL SHEEN..ARABIC MATHEMATICAL INITIAL KHAH
+1EE39 ; ALetter # Lo ARABIC MATHEMATICAL INITIAL DAD
+1EE3B ; ALetter # Lo ARABIC MATHEMATICAL INITIAL GHAIN
+1EE42 ; ALetter # Lo ARABIC MATHEMATICAL TAILED JEEM
+1EE47 ; ALetter # Lo ARABIC MATHEMATICAL TAILED HAH
+1EE49 ; ALetter # Lo ARABIC MATHEMATICAL TAILED YEH
+1EE4B ; ALetter # Lo ARABIC MATHEMATICAL TAILED LAM
+1EE4D..1EE4F ; ALetter # Lo [3] ARABIC MATHEMATICAL TAILED NOON..ARABIC MATHEMATICAL TAILED AIN
+1EE51..1EE52 ; ALetter # Lo [2] ARABIC MATHEMATICAL TAILED SAD..ARABIC MATHEMATICAL TAILED QAF
+1EE54 ; ALetter # Lo ARABIC MATHEMATICAL TAILED SHEEN
+1EE57 ; ALetter # Lo ARABIC MATHEMATICAL TAILED KHAH
+1EE59 ; ALetter # Lo ARABIC MATHEMATICAL TAILED DAD
+1EE5B ; ALetter # Lo ARABIC MATHEMATICAL TAILED GHAIN
+1EE5D ; ALetter # Lo ARABIC MATHEMATICAL TAILED DOTLESS NOON
+1EE5F ; ALetter # Lo ARABIC MATHEMATICAL TAILED DOTLESS QAF
+1EE61..1EE62 ; ALetter # Lo [2] ARABIC MATHEMATICAL STRETCHED BEH..ARABIC MATHEMATICAL STRETCHED JEEM
+1EE64 ; ALetter # Lo ARABIC MATHEMATICAL STRETCHED HEH
+1EE67..1EE6A ; ALetter # Lo [4] ARABIC MATHEMATICAL STRETCHED HAH..ARABIC MATHEMATICAL STRETCHED KAF
+1EE6C..1EE72 ; ALetter # Lo [7] ARABIC MATHEMATICAL STRETCHED MEEM..ARABIC MATHEMATICAL STRETCHED QAF
+1EE74..1EE77 ; ALetter # Lo [4] ARABIC MATHEMATICAL STRETCHED SHEEN..ARABIC MATHEMATICAL STRETCHED KHAH
+1EE79..1EE7C ; ALetter # Lo [4] ARABIC MATHEMATICAL STRETCHED DAD..ARABIC MATHEMATICAL STRETCHED DOTLESS BEH
+1EE7E ; ALetter # Lo ARABIC MATHEMATICAL STRETCHED DOTLESS FEH
+1EE80..1EE89 ; ALetter # Lo [10] ARABIC MATHEMATICAL LOOPED ALEF..ARABIC MATHEMATICAL LOOPED YEH
+1EE8B..1EE9B ; ALetter # Lo [17] ARABIC MATHEMATICAL LOOPED LAM..ARABIC MATHEMATICAL LOOPED GHAIN
+1EEA1..1EEA3 ; ALetter # Lo [3] ARABIC MATHEMATICAL DOUBLE-STRUCK BEH..ARABIC MATHEMATICAL DOUBLE-STRUCK DAL
+1EEA5..1EEA9 ; ALetter # Lo [5] ARABIC MATHEMATICAL DOUBLE-STRUCK WAW..ARABIC MATHEMATICAL DOUBLE-STRUCK YEH
+1EEAB..1EEBB ; ALetter # Lo [17] ARABIC MATHEMATICAL DOUBLE-STRUCK LAM..ARABIC MATHEMATICAL DOUBLE-STRUCK GHAIN
+1F130..1F149 ; ALetter # So [26] SQUARED LATIN CAPITAL LETTER A..SQUARED LATIN CAPITAL LETTER Z
+1F150..1F169 ; ALetter # So [26] NEGATIVE CIRCLED LATIN CAPITAL LETTER A..NEGATIVE CIRCLED LATIN CAPITAL LETTER Z
+1F170..1F189 ; ALetter # So [26] NEGATIVE SQUARED LATIN CAPITAL LETTER A..NEGATIVE SQUARED LATIN CAPITAL LETTER Z
+
+# Total code points: 33973
+
+# ================================================
+
+003A ; MidLetter # Po COLON
+00B7 ; MidLetter # Po MIDDLE DOT
+0387 ; MidLetter # Po GREEK ANO TELEIA
+055F ; MidLetter # Po ARMENIAN ABBREVIATION MARK
+05F4 ; MidLetter # Po HEBREW PUNCTUATION GERSHAYIM
+2027 ; MidLetter # Po HYPHENATION POINT
+FE13 ; MidLetter # Po PRESENTATION FORM FOR VERTICAL COLON
+FE55 ; MidLetter # Po SMALL COLON
+FF1A ; MidLetter # Po FULLWIDTH COLON
+
+# Total code points: 9
+
+# ================================================
+
+002C ; MidNum # Po COMMA
+003B ; MidNum # Po SEMICOLON
+037E ; MidNum # Po GREEK QUESTION MARK
+0589 ; MidNum # Po ARMENIAN FULL STOP
+060C..060D ; MidNum # Po [2] ARABIC COMMA..ARABIC DATE SEPARATOR
+066C ; MidNum # Po ARABIC THOUSANDS SEPARATOR
+07F8 ; MidNum # Po NKO COMMA
+2044 ; MidNum # Sm FRACTION SLASH
+FE50 ; MidNum # Po SMALL COMMA
+FE54 ; MidNum # Po SMALL SEMICOLON
+FF0C ; MidNum # Po FULLWIDTH COMMA
+FF1B ; MidNum # Po FULLWIDTH SEMICOLON
+
+# Total code points: 13
+
+# ================================================
+
+002E ; MidNumLet # Po FULL STOP
+2018 ; MidNumLet # Pi LEFT SINGLE QUOTATION MARK
+2019 ; MidNumLet # Pf RIGHT SINGLE QUOTATION MARK
+2024 ; MidNumLet # Po ONE DOT LEADER
+FE52 ; MidNumLet # Po SMALL FULL STOP
+FF07 ; MidNumLet # Po FULLWIDTH APOSTROPHE
+FF0E ; MidNumLet # Po FULLWIDTH FULL STOP
+
+# Total code points: 7
+
+# ================================================
+
+0030..0039 ; Numeric # Nd [10] DIGIT ZERO..DIGIT NINE
+0600..0605 ; Numeric # Cf [6] ARABIC NUMBER SIGN..ARABIC NUMBER MARK ABOVE
+0660..0669 ; Numeric # Nd [10] ARABIC-INDIC DIGIT ZERO..ARABIC-INDIC DIGIT NINE
+066B ; Numeric # Po ARABIC DECIMAL SEPARATOR
+06DD ; Numeric # Cf ARABIC END OF AYAH
+06F0..06F9 ; Numeric # Nd [10] EXTENDED ARABIC-INDIC DIGIT ZERO..EXTENDED ARABIC-INDIC DIGIT NINE
+07C0..07C9 ; Numeric # Nd [10] NKO DIGIT ZERO..NKO DIGIT NINE
+0890..0891 ; Numeric # Cf [2] ARABIC POUND MARK ABOVE..ARABIC PIASTRE MARK ABOVE
+08E2 ; Numeric # Cf ARABIC DISPUTED END OF AYAH
+0966..096F ; Numeric # Nd [10] DEVANAGARI DIGIT ZERO..DEVANAGARI DIGIT NINE
+09E6..09EF ; Numeric # Nd [10] BENGALI DIGIT ZERO..BENGALI DIGIT NINE
+0A66..0A6F ; Numeric # Nd [10] GURMUKHI DIGIT ZERO..GURMUKHI DIGIT NINE
+0AE6..0AEF ; Numeric # Nd [10] GUJARATI DIGIT ZERO..GUJARATI DIGIT NINE
+0B66..0B6F ; Numeric # Nd [10] ORIYA DIGIT ZERO..ORIYA DIGIT NINE
+0BE6..0BEF ; Numeric # Nd [10] TAMIL DIGIT ZERO..TAMIL DIGIT NINE
+0C66..0C6F ; Numeric # Nd [10] TELUGU DIGIT ZERO..TELUGU DIGIT NINE
+0CE6..0CEF ; Numeric # Nd [10] KANNADA DIGIT ZERO..KANNADA DIGIT NINE
+0D66..0D6F ; Numeric # Nd [10] MALAYALAM DIGIT ZERO..MALAYALAM DIGIT NINE
+0DE6..0DEF ; Numeric # Nd [10] SINHALA LITH DIGIT ZERO..SINHALA LITH DIGIT NINE
+0E50..0E59 ; Numeric # Nd [10] THAI DIGIT ZERO..THAI DIGIT NINE
+0ED0..0ED9 ; Numeric # Nd [10] LAO DIGIT ZERO..LAO DIGIT NINE
+0F20..0F29 ; Numeric # Nd [10] TIBETAN DIGIT ZERO..TIBETAN DIGIT NINE
+1040..1049 ; Numeric # Nd [10] MYANMAR DIGIT ZERO..MYANMAR DIGIT NINE
+1090..1099 ; Numeric # Nd [10] MYANMAR SHAN DIGIT ZERO..MYANMAR SHAN DIGIT NINE
+17E0..17E9 ; Numeric # Nd [10] KHMER DIGIT ZERO..KHMER DIGIT NINE
+1810..1819 ; Numeric # Nd [10] MONGOLIAN DIGIT ZERO..MONGOLIAN DIGIT NINE
+1946..194F ; Numeric # Nd [10] LIMBU DIGIT ZERO..LIMBU DIGIT NINE
+19D0..19D9 ; Numeric # Nd [10] NEW TAI LUE DIGIT ZERO..NEW TAI LUE DIGIT NINE
+19DA ; Numeric # No NEW TAI LUE THAM DIGIT ONE
+1A80..1A89 ; Numeric # Nd [10] TAI THAM HORA DIGIT ZERO..TAI THAM HORA DIGIT NINE
+1A90..1A99 ; Numeric # Nd [10] TAI THAM THAM DIGIT ZERO..TAI THAM THAM DIGIT NINE
+1B50..1B59 ; Numeric # Nd [10] BALINESE DIGIT ZERO..BALINESE DIGIT NINE
+1BB0..1BB9 ; Numeric # Nd [10] SUNDANESE DIGIT ZERO..SUNDANESE DIGIT NINE
+1C40..1C49 ; Numeric # Nd [10] LEPCHA DIGIT ZERO..LEPCHA DIGIT NINE
+1C50..1C59 ; Numeric # Nd [10] OL CHIKI DIGIT ZERO..OL CHIKI DIGIT NINE
+A620..A629 ; Numeric # Nd [10] VAI DIGIT ZERO..VAI DIGIT NINE
+A8D0..A8D9 ; Numeric # Nd [10] SAURASHTRA DIGIT ZERO..SAURASHTRA DIGIT NINE
+A900..A909 ; Numeric # Nd [10] KAYAH LI DIGIT ZERO..KAYAH LI DIGIT NINE
+A9D0..A9D9 ; Numeric # Nd [10] JAVANESE DIGIT ZERO..JAVANESE DIGIT NINE
+A9F0..A9F9 ; Numeric # Nd [10] MYANMAR TAI LAING DIGIT ZERO..MYANMAR TAI LAING DIGIT NINE
+AA50..AA59 ; Numeric # Nd [10] CHAM DIGIT ZERO..CHAM DIGIT NINE
+ABF0..ABF9 ; Numeric # Nd [10] MEETEI MAYEK DIGIT ZERO..MEETEI MAYEK DIGIT NINE
+FF10..FF19 ; Numeric # Nd [10] FULLWIDTH DIGIT ZERO..FULLWIDTH DIGIT NINE
+104A0..104A9 ; Numeric # Nd [10] OSMANYA DIGIT ZERO..OSMANYA DIGIT NINE
+10D30..10D39 ; Numeric # Nd [10] HANIFI ROHINGYA DIGIT ZERO..HANIFI ROHINGYA DIGIT NINE
+10D40..10D49 ; Numeric # Nd [10] GARAY DIGIT ZERO..GARAY DIGIT NINE
+11066..1106F ; Numeric # Nd [10] BRAHMI DIGIT ZERO..BRAHMI DIGIT NINE
+110BD ; Numeric # Cf KAITHI NUMBER SIGN
+110CD ; Numeric # Cf KAITHI NUMBER SIGN ABOVE
+110F0..110F9 ; Numeric # Nd [10] SORA SOMPENG DIGIT ZERO..SORA SOMPENG DIGIT NINE
+11136..1113F ; Numeric # Nd [10] CHAKMA DIGIT ZERO..CHAKMA DIGIT NINE
+111D0..111D9 ; Numeric # Nd [10] SHARADA DIGIT ZERO..SHARADA DIGIT NINE
+112F0..112F9 ; Numeric # Nd [10] KHUDAWADI DIGIT ZERO..KHUDAWADI DIGIT NINE
+11450..11459 ; Numeric # Nd [10] NEWA DIGIT ZERO..NEWA DIGIT NINE
+114D0..114D9 ; Numeric # Nd [10] TIRHUTA DIGIT ZERO..TIRHUTA DIGIT NINE
+11650..11659 ; Numeric # Nd [10] MODI DIGIT ZERO..MODI DIGIT NINE
+116C0..116C9 ; Numeric # Nd [10] TAKRI DIGIT ZERO..TAKRI DIGIT NINE
+116D0..116E3 ; Numeric # Nd [20] MYANMAR PAO DIGIT ZERO..MYANMAR EASTERN PWO KAREN DIGIT NINE
+11730..11739 ; Numeric # Nd [10] AHOM DIGIT ZERO..AHOM DIGIT NINE
+118E0..118E9 ; Numeric # Nd [10] WARANG CITI DIGIT ZERO..WARANG CITI DIGIT NINE
+11950..11959 ; Numeric # Nd [10] DIVES AKURU DIGIT ZERO..DIVES AKURU DIGIT NINE
+11BF0..11BF9 ; Numeric # Nd [10] SUNUWAR DIGIT ZERO..SUNUWAR DIGIT NINE
+11C50..11C59 ; Numeric # Nd [10] BHAIKSUKI DIGIT ZERO..BHAIKSUKI DIGIT NINE
+11D50..11D59 ; Numeric # Nd [10] MASARAM GONDI DIGIT ZERO..MASARAM GONDI DIGIT NINE
+11DA0..11DA9 ; Numeric # Nd [10] GUNJALA GONDI DIGIT ZERO..GUNJALA GONDI DIGIT NINE
+11DE0..11DE9 ; Numeric # Nd [10] TOLONG SIKI DIGIT ZERO..TOLONG SIKI DIGIT NINE
+11F50..11F59 ; Numeric # Nd [10] KAWI DIGIT ZERO..KAWI DIGIT NINE
+16130..16139 ; Numeric # Nd [10] GURUNG KHEMA DIGIT ZERO..GURUNG KHEMA DIGIT NINE
+16A60..16A69 ; Numeric # Nd [10] MRO DIGIT ZERO..MRO DIGIT NINE
+16AC0..16AC9 ; Numeric # Nd [10] TANGSA DIGIT ZERO..TANGSA DIGIT NINE
+16B50..16B59 ; Numeric # Nd [10] PAHAWH HMONG DIGIT ZERO..PAHAWH HMONG DIGIT NINE
+16D70..16D79 ; Numeric # Nd [10] KIRAT RAI DIGIT ZERO..KIRAT RAI DIGIT NINE
+1CCF0..1CCF9 ; Numeric # Nd [10] OUTLINED DIGIT ZERO..OUTLINED DIGIT NINE
+1D7CE..1D7FF ; Numeric # Nd [50] MATHEMATICAL BOLD DIGIT ZERO..MATHEMATICAL MONOSPACE DIGIT NINE
+1E140..1E149 ; Numeric # Nd [10] NYIAKENG PUACHUE HMONG DIGIT ZERO..NYIAKENG PUACHUE HMONG DIGIT NINE
+1E2F0..1E2F9 ; Numeric # Nd [10] WANCHO DIGIT ZERO..WANCHO DIGIT NINE
+1E4F0..1E4F9 ; Numeric # Nd [10] NAG MUNDARI DIGIT ZERO..NAG MUNDARI DIGIT NINE
+1E5F1..1E5FA ; Numeric # Nd [10] OL ONAL DIGIT ZERO..OL ONAL DIGIT NINE
+1E950..1E959 ; Numeric # Nd [10] ADLAM DIGIT ZERO..ADLAM DIGIT NINE
+1FBF0..1FBF9 ; Numeric # Nd [10] SEGMENTED DIGIT ZERO..SEGMENTED DIGIT NINE
+
+# Total code points: 784
+
+# ================================================
+
+005F ; ExtendNumLet # Pc LOW LINE
+202F ; ExtendNumLet # Zs NARROW NO-BREAK SPACE
+203F..2040 ; ExtendNumLet # Pc [2] UNDERTIE..CHARACTER TIE
+2054 ; ExtendNumLet # Pc INVERTED UNDERTIE
+FE33..FE34 ; ExtendNumLet # Pc [2] PRESENTATION FORM FOR VERTICAL LOW LINE..PRESENTATION FORM FOR VERTICAL WAVY LOW LINE
+FE4D..FE4F ; ExtendNumLet # Pc [3] DASHED LOW LINE..WAVY LOW LINE
+FF3F ; ExtendNumLet # Pc FULLWIDTH LOW LINE
+
+# Total code points: 11
+
+# ================================================
+
+200D ; ZWJ # Cf ZERO WIDTH JOINER
+
+# Total code points: 1
+
+# ================================================
+
+0020 ; WSegSpace # Zs SPACE
+1680 ; WSegSpace # Zs OGHAM SPACE MARK
+2000..2006 ; WSegSpace # Zs [7] EN QUAD..SIX-PER-EM SPACE
+2008..200A ; WSegSpace # Zs [3] PUNCTUATION SPACE..HAIR SPACE
+205F ; WSegSpace # Zs MEDIUM MATHEMATICAL SPACE
+3000 ; WSegSpace # Zs IDEOGRAPHIC SPACE
+
+# Total code points: 14
+
+# EOF
diff --git a/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/tokenize/uax29/ExtendedPictographicTest.java b/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/tokenize/uax29/ExtendedPictographicTest.java
new file mode 100644
index 000000000..9325fc584
--- /dev/null
+++ b/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/tokenize/uax29/ExtendedPictographicTest.java
@@ -0,0 +1,46 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.tools.tokenize.uax29;
+
+import org.junit.jupiter.params.ParameterizedTest;
+import org.junit.jupiter.params.provider.ValueSource;
+
+import static org.junit.jupiter.api.Assertions.assertFalse;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
+public class ExtendedPictographicTest {
+ // Spot checks for ExtendedPictographic.is(int): members, non-members, and invalid code points.
+ @ParameterizedTest
+ @ValueSource(ints = {0x00A9, 0x00AE, 0x203C, 0x2764, 0x1F600, 0x1F468})
+ void testPictographicCodePoints(int codePoint) { // samples from the BMP and beyond it
+ assertTrue(ExtendedPictographic.is(codePoint),
+ () -> String.format("U+%04X should be Extended_Pictographic", codePoint));
+ }
+
+ @ParameterizedTest
+ @ValueSource(ints = {'a', '5', ' ', 0x0301, 0x05D0, 0x1F1E6})
+ void testNonPictographicCodePoints(int codePoint) {
+ // 0x1F1E6 (regional indicator A) is a supplementary code point that is NOT pictographic.
+ assertFalse(ExtendedPictographic.is(codePoint));
+ }
+
+ @ParameterizedTest
+ @ValueSource(ints = {-1, Integer.MIN_VALUE, Character.MAX_CODE_POINT + 1, Integer.MAX_VALUE})
+ void testOutOfRangeIsFalseAndSafe(int codePoint) { // outside [0, Character.MAX_CODE_POINT]
+ assertFalse(ExtendedPictographic.is(codePoint)); // an exception here would fail the test
+ }
+}
diff --git a/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/tokenize/uax29/WordBoundaryConformanceTest.java b/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/tokenize/uax29/WordBoundaryConformanceTest.java
new file mode 100644
index 000000000..a763f6786
--- /dev/null
+++ b/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/tokenize/uax29/WordBoundaryConformanceTest.java
@@ -0,0 +1,96 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.tools.tokenize.uax29;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.nio.charset.StandardCharsets;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+import java.util.Objects;
+
+import org.junit.jupiter.api.Test;
+
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
+/**
+ * Runs the official Unicode {@code WordBreakTest.txt} conformance suite against
+ * {@link WordSegmenter}. Each line marks boundaries with U+00F7 (division sign) and non-boundaries
+ * with U+00D7 (multiplication sign) between code points.
+ */
+public class WordBoundaryConformanceTest {
+
+  private static final int BOUNDARY = 0x00F7; // division sign
+
+  @Test
+  void testOfficialUnicodeWordBreakConformance() throws IOException {
+    int total = 0;
+    int passed = 0;
+    final List<String> failures = new ArrayList<>(); // capped at 25 entries further down
+
+    try (InputStream in = Objects.requireNonNull(
+        WordBoundaryConformanceTest.class.getResourceAsStream("WordBreakTest.txt"),
+        "Missing test resource: WordBreakTest.txt");
+         BufferedReader reader =
+             new BufferedReader(new InputStreamReader(in, StandardCharsets.UTF_8))) {
+      String raw;
+      int lineNumber = 0;
+      while ((raw = reader.readLine()) != null) {
+        lineNumber++;
+        final int hash = raw.indexOf('#'); // everything from '#' onwards is dropped as a comment
+        final String content = (hash < 0 ? raw : raw.substring(0, hash)).strip();
+        if (content.isEmpty()) {
+          continue;
+        }
+        final String[] tokens = content.split("\\s+");
+        // Tokens alternate marker / hex code point / marker, so odd indices hold code points.
+        final StringBuilder text = new StringBuilder();
+        final List<Integer> expected = new ArrayList<>(); // boundary offsets in UTF-16 units
+        expected.add(0); // tokens[0] is always a leading boundary marker.
+        int offset = 0;
+        for (int k = 1; k < tokens.length; k += 2) {
+          final int codePoint = Integer.parseInt(tokens[k], 16);
+          text.appendCodePoint(codePoint);
+          offset += Character.charCount(codePoint); // supplementary code points take two chars
+          if (tokens[k + 1].codePointAt(0) == BOUNDARY) {
+            expected.add(offset);
+          }
+        }
+
+        final int[] actual = WordSegmenter.boundaries(text);
+        final int[] expectedArray = expected.stream().mapToInt(Integer::intValue).toArray();
+        total++;
+        if (Arrays.equals(actual, expectedArray)) {
+          passed++;
+        } else if (failures.size() < 25) {
+          failures.add("line " + lineNumber + ": " + content
+              + "\n expected=" + Arrays.toString(expectedArray)
+              + "\n actual =" + Arrays.toString(actual));
+        }
+      }
+    }
+
+    final int passRate = total == 0 ? 0 : passed * 100 / total;
+    assertTrue(total > 1900, "expected the full conformance suite to load, ran only " + total);
+    assertTrue(failures.isEmpty(),
+        "UAX#29 word-break conformance: " + passed + "/" + total + " (" + passRate
+            + "%). First failures:\n" + String.join("\n", failures));
+  }
+}
diff --git a/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/tokenize/uax29/WordBreakPropertyTest.java b/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/tokenize/uax29/WordBreakPropertyTest.java
new file mode 100644
index 000000000..86383013b
--- /dev/null
+++ b/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/tokenize/uax29/WordBreakPropertyTest.java
@@ -0,0 +1,88 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.tools.tokenize.uax29;
+
+import org.junit.jupiter.api.Test;
+import org.junit.jupiter.params.ParameterizedTest;
+import org.junit.jupiter.params.provider.ValueSource;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertSame;
+
+public class WordBreakPropertyTest {
+ // Spot checks for WordBreakProperty.of(int) and WordBreak.fromPropertyName(String).
+ @Test
+ void testAsciiLettersAndDigits() {
+ assertSame(WordBreak.ALETTER, WordBreakProperty.of('a'));
+ assertSame(WordBreak.ALETTER, WordBreakProperty.of('Z'));
+ assertSame(WordBreak.NUMERIC, WordBreakProperty.of('0'));
+ assertSame(WordBreak.NUMERIC, WordBreakProperty.of('9'));
+ }
+
+ @Test
+ void testWhitespaceAndLineBreaks() {
+ assertSame(WordBreak.WSEG_SPACE, WordBreakProperty.of(0x0020)); // space
+ assertSame(WordBreak.CR, WordBreakProperty.of(0x000D)); // carriage return
+ assertSame(WordBreak.LF, WordBreakProperty.of(0x000A)); // line feed
+ assertSame(WordBreak.NEWLINE, WordBreakProperty.of(0x000B)); // vertical tab
+ }
+
+ @Test
+ void testMidAndExtendClasses() {
+ assertSame(WordBreak.MID_NUM, WordBreakProperty.of(0x002C)); // comma
+ assertSame(WordBreak.MID_NUM_LET, WordBreakProperty.of(0x002E)); // full stop
+ assertSame(WordBreak.MID_LETTER, WordBreakProperty.of(0x003A)); // colon
+ assertSame(WordBreak.EXTEND_NUM_LET, WordBreakProperty.of(0x005F)); // low line
+ assertSame(WordBreak.EXTEND, WordBreakProperty.of(0x0301)); // combining acute
+ }
+
+ @Test
+ void testQuotesJoinerAndScriptLetters() {
+ assertSame(WordBreak.SINGLE_QUOTE, WordBreakProperty.of(0x0027)); // apostrophe
+ assertSame(WordBreak.DOUBLE_QUOTE, WordBreakProperty.of(0x0022)); // quotation mark
+ assertSame(WordBreak.ZWJ, WordBreakProperty.of(0x200D)); // zero width joiner
+ assertSame(WordBreak.HEBREW_LETTER, WordBreakProperty.of(0x05D0)); // Hebrew letter alef
+ assertSame(WordBreak.KATAKANA, WordBreakProperty.of(0x30A1)); // katakana letter small a
+ }
+
+ @Test
+ void testSupplementaryCodePointsUseTheRangeTable() {
+ assertSame(WordBreak.REGIONAL_INDICATOR, WordBreakProperty.of(0x1F1E6)); // regional indicator A
+ assertSame(WordBreak.ALETTER, WordBreakProperty.of(0x1D400)); // math bold A
+ assertSame(WordBreak.OTHER, WordBreakProperty.of(0x1F600)); // grinning face
+ }
+
+ // Assigned punctuation/symbols ('!', '@', em dash) carry no Word_Break property and map to OTHER.
+ @ParameterizedTest
+ @ValueSource(ints = {0x0021, 0x0040, 0x2014})
+ void testPunctuationAndSymbolsAreOther(int codePoint) {
+ assertSame(WordBreak.OTHER, WordBreakProperty.of(codePoint));
+ }
+
+ @ParameterizedTest
+ @ValueSource(ints = {-1, Integer.MIN_VALUE, Character.MAX_CODE_POINT + 1, Integer.MAX_VALUE})
+ void testOutOfRangeIsOtherAndSafe(int codePoint) { // outside [0, Character.MAX_CODE_POINT]
+ assertSame(WordBreak.OTHER, WordBreakProperty.of(codePoint)); // must not throw either
+ }
+
+ @Test
+ void testFromPropertyNameRejectsUnknown() {
+ assertEquals(WordBreak.ALETTER, WordBreak.fromPropertyName("ALetter")); // name as spelled in the UCD data
+ org.junit.jupiter.api.Assertions.assertThrows(IllegalArgumentException.class,
+ () -> WordBreak.fromPropertyName("NotAValue"));
+ }
+}
diff --git a/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/tokenize/uax29/WordSegmenterTest.java b/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/tokenize/uax29/WordSegmenterTest.java
new file mode 100644
index 000000000..96dbc1491
--- /dev/null
+++ b/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/tokenize/uax29/WordSegmenterTest.java
@@ -0,0 +1,110 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.tools.tokenize.uax29;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import org.junit.jupiter.api.Test;
+
+import opennlp.tools.util.Span;
+
+import static org.junit.jupiter.api.Assertions.assertArrayEquals;
+import static org.junit.jupiter.api.Assertions.assertEquals;
+
+public class WordSegmenterTest { // checks WordSegmenter.segments/boundaries against UAX #29 word rules
+
+  private static String cp(int codePoint) { // String holding exactly one code point
+    return new String(Character.toChars(codePoint));
+  }
+
+  private static List<String> words(String text) { // covered text of every segment, in order
+    final List<String> out = new ArrayList<>();
+    for (final Span span : WordSegmenter.segments(text)) {
+      out.add(span.getCoveredText(text).toString());
+    }
+    return out;
+  }
+
+  @Test
+  void testEnglishSentenceKeepsWordsAndSeparators() {
+    assertEquals(List.of("The", " ", "quick", " ", "fox"), words("The quick fox"));
+  }
+
+  @Test
+  void testContractionStaysOneWord() {
+    assertEquals(List.of("don't"), words("don't")); // WB6/WB7 over the apostrophe
+  }
+
+  @Test
+  void testDecimalNumberStaysOneToken() {
+    assertEquals(List.of("3.14"), words("3.14")); // WB11/WB12
+  }
+
+  @Test
+  void testAcronymWithInternalDotsStaysOneToken() {
+    assertEquals(List.of("U.S.A"), words("U.S.A")); // WB6/WB7
+  }
+
+  @Test
+  void testLettersAndDigitsJoin() {
+    assertEquals(List.of("a1b2"), words("a1b2")); // WB9/WB10
+  }
+
+  @Test
+  void testWhitespaceRunIsASingleSegment() {
+    assertEquals(List.of("a", "  ", "b"), words("a  b")); // WB3d: two spaces form one segment
+  }
+
+  @Test
+  void testNewlineBreaksOnBothSides() {
+    assertEquals(List.of("a", "\n", "b"), words("a\nb")); // WB3a/WB3b
+  }
+
+  @Test
+  void testCarriageReturnLineFeedStayTogether() {
+    assertEquals(List.of("a", "\r\n", "b"), words("a\r\nb")); // WB3
+  }
+
+  @Test
+  void testIdeographsSplitPerCharacter() {
+    assertEquals(List.of(cp(0x4E2D), cp(0x6587)), words(cp(0x4E2D) + cp(0x6587)));
+  }
+
+  @Test
+  void testEmojiZwjSequenceStaysTogether() {
+    final String family = cp(0x1F468) + cp(0x200D) + cp(0x1F469); // man + ZWJ + woman
+    assertEquals(List.of(family), words(family)); // WB3c
+  }
+
+  @Test
+  void testRegionalIndicatorFlagIsOneToken() {
+    final String flag = cp(0x1F1FA) + cp(0x1F1F8); // regional indicators U + S
+    assertEquals(List.of(flag), words(flag)); // WB15/WB16
+  }
+
+  @Test
+  void testEmptyText() {
+    assertEquals(List.of(), words(""));
+    assertArrayEquals(new int[] {0}, WordSegmenter.boundaries("")); // empty text: only offset 0
+  }
+
+  @Test
+  void testBoundariesIncludeStartAndEnd() {
+    assertArrayEquals(new int[] {0, 2, 3, 5}, WordSegmenter.boundaries("ab cd"));
+  }
+}
diff --git a/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/tokenize/uax29/WordTokenizerTest.java b/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/tokenize/uax29/WordTokenizerTest.java
new file mode 100644
index 000000000..3d46ce870
--- /dev/null
+++ b/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/tokenize/uax29/WordTokenizerTest.java
@@ -0,0 +1,176 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.tools.tokenize.uax29;
+
+import java.util.List;
+
+import org.junit.jupiter.api.Test;
+
+import opennlp.tools.tokenize.Tokenizer;
+import opennlp.tools.util.Span;
+
+import static org.junit.jupiter.api.Assertions.assertArrayEquals;
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertThrows;
+
+public class WordTokenizerTest { // checks WordTokenizer token text, WordType and length limiting
+
+  private static final WordTokenizer TOKENIZER = new WordTokenizer();
+
+  private static String cp(int codePoint) { // String holding exactly one code point
+    return new String(Character.toChars(codePoint));
+  }
+
+  private static List<String> words(String text) { // token strings from the shared tokenizer
+    return List.of(TOKENIZER.tokenize(text));
+  }
+
+  @Test
+  void testDropsWhitespaceAndPunctuation() {
+    assertEquals(List.of("Hello", "world"), words("Hello, world!"));
+  }
+
+  @Test
+  void testAlphanumericAndNumericTypes() {
+    final List<WordToken> tokens = TOKENIZER.tokenizeTyped("abc 123");
+    assertEquals(2, tokens.size());
+    assertEquals(WordType.ALPHANUMERIC, tokens.get(0).type());
+    assertEquals("abc", tokens.get(0).text("abc 123"));
+    assertEquals(WordType.NUMERIC, tokens.get(1).type());
+    assertEquals("123", tokens.get(1).text("abc 123"));
+  }
+
+  @Test
+  void testDecimalIsSingleNumericToken() {
+    final List<WordToken> tokens = TOKENIZER.tokenizeTyped("3.14");
+    assertEquals(1, tokens.size());
+    assertEquals(WordType.NUMERIC, tokens.get(0).type());
+    assertEquals("3.14", tokens.get(0).text("3.14"));
+  }
+
+  @Test
+  void testIdeographsOnePerToken() {
+    final String text = cp(0x4E2D) + cp(0x6587);
+    final List<WordToken> tokens = TOKENIZER.tokenizeTyped(text);
+    assertEquals(2, tokens.size());
+    assertEquals(WordType.IDEOGRAPHIC, tokens.get(0).type());
+    assertEquals(WordType.IDEOGRAPHIC, tokens.get(1).type());
+  }
+
+  @Test
+  void testHiraganaSplitsPerCharacter() {
+    final String text = cp(0x3042) + cp(0x3044); // a + i
+    final List<WordToken> tokens = TOKENIZER.tokenizeTyped(text);
+    assertEquals(2, tokens.size());
+    assertEquals(WordType.HIRAGANA, tokens.get(0).type());
+    assertEquals(WordType.HIRAGANA, tokens.get(1).type());
+  }
+
+  @Test
+  void testKatakanaRunStaysTogether() {
+    final String text = cp(0x30A2) + cp(0x30A4); // a + i
+    final List<WordToken> tokens = TOKENIZER.tokenizeTyped(text);
+    assertEquals(1, tokens.size());
+    assertEquals(WordType.KATAKANA, tokens.get(0).type());
+    assertEquals(text, tokens.get(0).text(text));
+  }
+
+  @Test
+  void testExtendedPictographicSymbolsAreKeptAsEmoji() {
+    // Extended_Pictographic includes symbol-like characters (copyright U+00A9, trademark U+2122,
+    // double exclamation U+203C), which WordType classifies as EMOJI, so the tokenizer keeps them
+    // rather than dropping them as punctuation.
+    final String text = "a " + cp(0x00A9) + " " + cp(0x2122) + " " + cp(0x203C) + " b";
+    final List<WordToken> tokens = TOKENIZER.tokenizeTyped(text);
+    assertEquals(List.of(WordType.ALPHANUMERIC, WordType.EMOJI, WordType.EMOJI,
+        WordType.EMOJI, WordType.ALPHANUMERIC),
+        tokens.stream().map(WordToken::type).toList());
+  }
+
+  @Test
+  void testHangulSyllablesStayTogether() {
+    final String text = cp(0xAC00) + cp(0xB098); // ga + na
+    final List<WordToken> tokens = TOKENIZER.tokenizeTyped(text);
+    assertEquals(1, tokens.size());
+    assertEquals(WordType.HANGUL, tokens.get(0).type());
+    assertEquals(text, tokens.get(0).text(text));
+  }
+
+  @Test
+  void testSoutheastAsianType() {
+    final String text = cp(0x0E01); // Thai letter ko kai
+    final List<WordToken> tokens = TOKENIZER.tokenizeTyped(text);
+    assertEquals(1, tokens.size());
+    assertEquals(WordType.SOUTHEAST_ASIAN, tokens.get(0).type());
+  }
+
+  @Test
+  void testEmojiType() {
+    final String text = cp(0x1F600); // grinning face
+    final List<WordToken> tokens = TOKENIZER.tokenizeTyped(text);
+    assertEquals(1, tokens.size());
+    assertEquals(WordType.EMOJI, tokens.get(0).type());
+  }
+
+  @Test
+  void testRegionalIndicatorFlagIsOneEmoji() {
+    final String flag = cp(0x1F1FA) + cp(0x1F1F8); // U + S
+    final List<WordToken> tokens = TOKENIZER.tokenizeTyped(flag);
+    assertEquals(1, tokens.size());
+    assertEquals(WordType.EMOJI, tokens.get(0).type());
+    assertEquals(flag, tokens.get(0).text(flag));
+  }
+
+  @Test
+  void testMaxTokenLengthChopsLongWords() {
+    final WordTokenizer tokenizer = new WordTokenizer(3);
+    assertEquals(List.of("abc", "def", "g"), List.of(tokenizer.tokenize("abcdefg")));
+  }
+
+  @Test
+  void testMaxTokenLengthNeverSplitsASurrogatePair() {
+    // A two-char emoji must be emitted whole even when the limit is one char.
+    final WordTokenizer tokenizer = new WordTokenizer(1);
+    final String emoji = cp(0x1F600);
+    final List<WordToken> tokens = tokenizer.tokenizeTyped(emoji);
+    assertEquals(1, tokens.size());
+    assertEquals(emoji, tokens.get(0).text(emoji));
+  }
+
+  @Test
+  void testConstructorRejectsNonPositiveLength() {
+    assertThrows(IllegalArgumentException.class, () -> new WordTokenizer(0));
+    assertThrows(IllegalArgumentException.class, () -> new WordTokenizer(-5));
+  }
+
+  @Test
+  void testEmptyText() {
+    assertEquals(List.of(), words(""));
+    assertEquals(List.of(), TOKENIZER.tokenizeTyped(""));
+  }
+
+  @Test
+  void testUsableThroughTokenizerInterface() {
+    final Tokenizer tokenizer = new WordTokenizer(); // typed as the general Tokenizer interface
+    final String text = "Hello, world!";
+    assertArrayEquals(new String[] {"Hello", "world"}, tokenizer.tokenize(text));
+    final Span[] spans = tokenizer.tokenizePos(text);
+    assertEquals(2, spans.length);
+    assertEquals("Hello", spans[0].getCoveredText(text).toString());
+    assertEquals("world", spans[1].getCoveredText(text).toString());
+  }
+}
diff --git a/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/util/normalizer/ConfusablesTest.java b/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/util/normalizer/ConfusablesTest.java
new file mode 100644
index 000000000..262fe5aa9
--- /dev/null
+++ b/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/util/normalizer/ConfusablesTest.java
@@ -0,0 +1,81 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.tools.util.normalizer;
+
+import org.junit.jupiter.api.Test;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertFalse;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
+public class ConfusablesTest { // covers Confusables.skeleton/confusable plus normalizer and TermAnalyzer wiring
+
+ private static String cp(int codePoint) { // String holding exactly one code point
+ return new String(Character.toChars(codePoint));
+ }
+
+ @Test
+ void testCyrillicLetterIsConfusableWithLatin() {
+ final String cyrillicA = cp(0x0430); // CYRILLIC SMALL LETTER A, looks like Latin 'a'
+ assertTrue(Confusables.confusable(cyrillicA, "a"));
+ assertFalse(Confusables.confusable(cyrillicA, "b"));
+ }
+
+ @Test
+ void testHomoglyphSpoofWordReducesToLatinSpelling() {
+ final String spoof = "p" + cp(0x0430) + "yp" + cp(0x0430) + "l"; // paypal with Cyrillic a's
+ assertTrue(Confusables.confusable(spoof, "paypal"));
+ assertEquals(Confusables.skeleton("paypal"), Confusables.skeleton(spoof));
+ }
+
+ @Test
+ void testHorizontalEllipsisFoldsToThreeFullStops() {
+ assertEquals(Confusables.skeleton("..."), Confusables.skeleton(cp(0x2026))); // U+2026 ellipsis
+ assertTrue(Confusables.confusable(cp(0x2026), "..."));
+ }
+
+ @Test
+ void testDistinctWordsAreNotConfusable() {
+ assertFalse(Confusables.confusable("cat", "dog"));
+ }
+
+ @Test
+ void testSkeletonIsIdempotent() {
+ final String skeleton = Confusables.skeleton(cp(0x0430) + "bc"); // Cyrillic a followed by "bc"
+ assertEquals(skeleton, Confusables.skeleton(skeleton)); // a second pass must change nothing
+ }
+
+ @Test
+ void testNormalizerProducesTheSkeleton() {
+ final String spoof = "p" + cp(0x0430) + "yp" + cp(0x0430) + "l"; // paypal with Cyrillic a's
+ assertEquals(Confusables.skeleton(spoof),
+ ConfusableSkeletonCharSequenceNormalizer.getInstance().normalize(spoof).toString());
+ }
+
+ @Test
+ void testMultipleCyrillicLookalikesFold() {
+ final String spoof = "d" + cp(0x0430) + "t" + cp(0x0430); // "data" with Cyrillic a's
+ assertEquals(Confusables.skeleton("data"), Confusables.skeleton(spoof));
+ }
+
+ @Test
+ void testTermConfusableFoldDimension() {
+ final String spoof = "p" + cp(0x0430) + "yp" + cp(0x0430) + "l"; // paypal with Cyrillic a's
+ final TermAnalyzer analyzer = TermAnalyzer.builder().confusableFold().build();
+ assertEquals(Confusables.skeleton("paypal"), analyzer.analyze(spoof).get(0).normalized());
+ }
+}
diff --git a/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/util/normalizer/NormalizationProfilesTest.java b/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/util/normalizer/NormalizationProfilesTest.java
new file mode 100644
index 000000000..6dbe95260
--- /dev/null
+++ b/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/util/normalizer/NormalizationProfilesTest.java
@@ -0,0 +1,140 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.tools.util.normalizer;
+
+import java.util.List;
+
+import org.junit.jupiter.api.Test;
+
+import opennlp.tools.langdetect.Language;
+import opennlp.tools.langdetect.LanguageDetector;
+import opennlp.tools.stemmer.snowball.SnowballStemmer;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertNull;
+import static org.junit.jupiter.api.Assertions.assertSame;
+import static org.junit.jupiter.api.Assertions.assertThrows;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
+public class NormalizationProfilesTest { // covers NormalizationProfiles.forLanguage/detect/supportedLanguages
+
+ @Test
+ void testEnglishUsesTheGenericAccentFold() {
+ final NormalizationProfile profile = NormalizationProfiles.forLanguage("eng").orElseThrow();
+ assertEquals(SnowballStemmer.ALGORITHM.ENGLISH, profile.stemmerAlgorithm());
+ assertSame(AccentFoldCharSequenceNormalizer.getInstance(), profile.accentFold());
+ assertEquals(List.of(Dimension.NFC, Dimension.CASE_FOLD, Dimension.ACCENT_FOLD, Dimension.STEM),
+ profile.searchAnalyzer().dimensions());
+ }
+
+ @Test
+ void testTwoLetterCodeResolvesToProfile() {
+ assertEquals(SnowballStemmer.ALGORITHM.GERMAN, // "de" must reach the German stemmer
+ NormalizationProfiles.forLanguage("de").orElseThrow().stemmerAlgorithm());
+ }
+
+ @Test
+ void testGermanUsesTheGermanSpecificFold() {
+ final NormalizationProfile profile = NormalizationProfiles.forLanguage("deu").orElseThrow();
+ assertSame(GermanUmlautCharSequenceNormalizer.getInstance(), profile.accentFold());
+ assertEquals(List.of(Dimension.NFC, Dimension.CASE_FOLD, Dimension.ACCENT_FOLD, Dimension.STEM),
+ profile.searchAnalyzer().dimensions());
+ }
+
+ @Test
+ void testRomanceLanguagesUseTheGenericFold() {
+ for (final String language : List.of("fra", "spa", "por", "ita", "cat")) {
+ assertSame(AccentFoldCharSequenceNormalizer.getInstance(),
+ NormalizationProfiles.forLanguage(language).orElseThrow().accentFold());
+ }
+ }
+
+ @Test
+ void testNordicLanguageHasNoFold() {
+ final NormalizationProfile swedish = NormalizationProfiles.forLanguage("swe").orElseThrow();
+ assertNull(swedish.accentFold()); // no accent-fold normalizer expected for "swe"
+ assertEquals(List.of(Dimension.NFC, Dimension.CASE_FOLD, Dimension.STEM),
+ swedish.searchAnalyzer().dimensions());
+ }
+
+ @Test
+ void testUnsupportedLanguageIsEmpty() {
+ assertTrue(NormalizationProfiles.forLanguage("jpn").isEmpty());
+ assertTrue(NormalizationProfiles.forLanguage("zzz").isEmpty());
+ }
+
+ @Test
+ void testSearchAnalyzerStemsThroughTheChain() {
+ final NormalizationProfile english = NormalizationProfiles.forLanguage("eng").orElseThrow();
+ assertEquals("cat", english.searchAnalyzer().analyze("Cats").get(0).normalized());
+ }
+
+ @Test
+ void testDetectDispatchesThroughTheDetector() {
+ final LanguageDetector detector = new LanguageDetector() { // stub that always answers "deu"
+ @Override
+ public Language[] predictLanguages(CharSequence content) {
+ return new Language[] {new Language("deu")};
+ }
+
+ @Override
+ public Language predictLanguage(CharSequence content) {
+ return new Language("deu");
+ }
+
+ @Override
+ public String[] getSupportedLanguages() {
+ return new String[] {"deu"};
+ }
+ };
+ final NormalizationProfile profile =
+ NormalizationProfiles.detect("Guten Tag", detector).orElseThrow();
+ assertEquals(SnowballStemmer.ALGORITHM.GERMAN, profile.stemmerAlgorithm());
+ }
+
+ @Test
+ void testDetectUnsupportedLanguageIsEmpty() {
+ final LanguageDetector detector = new LanguageDetector() { // stub that always answers "jpn"
+ @Override
+ public Language[] predictLanguages(CharSequence content) {
+ return new Language[] {new Language("jpn")};
+ }
+
+ @Override
+ public Language predictLanguage(CharSequence content) {
+ return new Language("jpn");
+ }
+
+ @Override
+ public String[] getSupportedLanguages() {
+ return new String[] {"jpn"};
+ }
+ };
+ assertTrue(NormalizationProfiles.detect("text", detector).isEmpty());
+ }
+
+ @Test
+ void testSupportedLanguagesCoverTheSnowballSet() {
+ assertEquals(19, NormalizationProfiles.supportedLanguages().size());
+ assertTrue(NormalizationProfiles.supportedLanguages().containsAll(List.of("eng", "deu", "fra")));
+ }
+
+ @Test
+ void testForLanguageRejectsNull() {
+ assertThrows(NullPointerException.class, () -> NormalizationProfiles.forLanguage(null));
+ }
+}
diff --git a/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/util/normalizer/TermAnalyzerTest.java b/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/util/normalizer/TermAnalyzerTest.java
new file mode 100644
index 000000000..56f16899d
--- /dev/null
+++ b/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/util/normalizer/TermAnalyzerTest.java
@@ -0,0 +1,211 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.tools.util.normalizer;
+
+import java.util.List;
+import java.util.Locale;
+import java.util.Set;
+
+import org.junit.jupiter.api.Test;
+
+import opennlp.tools.lemmatizer.Lemmatizer;
+import opennlp.tools.stemmer.PorterStemmer;
+import opennlp.tools.util.Span;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertNull;
+import static org.junit.jupiter.api.Assertions.assertSame;
+import static org.junit.jupiter.api.Assertions.assertThrows;
+
+public class TermAnalyzerTest { // covers TermAnalyzer builder options and the Term layer accessors
+
+  private static String cp(int codePoint) { // String holding exactly one code point
+    return new String(Character.toChars(codePoint));
+  }
+
+  @Test
+  void testNoDimensionsLeavesTokenUnchanged() {
+    final TermAnalyzer analyzer = TermAnalyzer.builder().build();
+    final Term term = analyzer.analyze("Hello").get(0);
+    assertEquals("Hello", term.original());
+    assertEquals("Hello", term.normalized());
+    assertEquals("Hello", term.peel());
+    assertEquals(List.of(), analyzer.dimensions());
+  }
+
+  @Test
+  void testChainAppliesInCanonicalOrderRegardlessOfBuilderOrder() {
+    // accentFold added before caseFold, but the canonical order is caseFold then accentFold.
+    final TermAnalyzer analyzer = TermAnalyzer.builder().accentFold().caseFold().build();
+    assertEquals(List.of(Dimension.CASE_FOLD, Dimension.ACCENT_FOLD), analyzer.dimensions());
+    final String input = "CAF" + cp(0x00C9); // CAFE with capital acute E
+    final Term term = analyzer.analyze(input).get(0);
+    assertEquals(input, term.original());
+    assertEquals("cafe", term.normalized());
+    assertEquals("caf" + cp(0x00E9), term.peel()); // before accent folding: lower-case, acute kept
+  }
+
+  @Test
+  void testStemIsTheTopLayer() {
+    final TermAnalyzer analyzer =
+        TermAnalyzer.builder().caseFold().stem(new PorterStemmer()).build();
+    final Term term = analyzer.analyze("Running").get(0);
+    assertEquals("running", term.peel()); // case-folded form, before stemming
+    assertEquals("run", term.normalized());
+    assertEquals("run", term.at(Dimension.STEM));
+  }
+
+  @Test
+  void testUnconfiguredCharDimensionComputedLazily() {
+    final TermAnalyzer analyzer = TermAnalyzer.builder().build();
+    final Term term = analyzer.analyze("HELLO").get(0);
+    assertEquals("HELLO", term.normalized());
+    assertEquals("hello", term.at(Dimension.CASE_FOLD)); // lazily added on top of the final form
+  }
+
+  @Test
+  void testStemDimensionWithoutStemmerFailsLoudly() {
+    final TermAnalyzer analyzer = TermAnalyzer.builder().caseFold().build();
+    final Term term = analyzer.analyze("running").get(0);
+    assertThrows(IllegalStateException.class, () -> term.at(Dimension.STEM));
+  }
+
+  @Test
+  void testLemmaWithoutLemmatizerFailsLoudly() {
+    final TermAnalyzer analyzer = TermAnalyzer.builder().build();
+    final Term term = analyzer.analyze("running").get(0);
+    assertThrows(IllegalStateException.class, () -> term.at(Dimension.LEMMA));
+  }
+
+  @Test
+  void testAnalyzeTextProducesSpans() {
+    final TermAnalyzer analyzer = TermAnalyzer.builder().caseFold().build();
+    final List<Term> terms = analyzer.analyze("The Cats");
+    assertEquals(2, terms.size());
+    assertEquals("The", terms.get(0).original());
+    assertEquals("the", terms.get(0).normalized());
+    assertEquals(new Span(0, 3), terms.get(0).span());
+    assertEquals("Cats", terms.get(1).original());
+    assertEquals(new Span(4, 8), terms.get(1).span());
+  }
+
+  @Test
+  void testAnalyzeTokensHasNoSpan() {
+    final TermAnalyzer analyzer = TermAnalyzer.builder().caseFold().build();
+    final List<Term> terms = analyzer.analyze(new String[] {"Cats"}, new String[] {"NNS"});
+    assertNull(terms.get(0).span());
+    assertEquals("cats", terms.get(0).normalized());
+  }
+
+  @Test
+  void testAnalyzeTokensRejectsLengthMismatch() {
+    final TermAnalyzer analyzer = TermAnalyzer.builder().build();
+    assertThrows(IllegalArgumentException.class,
+        () -> analyzer.analyze(new String[] {"a", "b"}, new String[] {"X"}));
+  }
+
+  @Test
+  void testTransformRejectsNonCharacterDimension() {
+    assertThrows(IllegalArgumentException.class, () -> TermAnalyzer.builder()
+        .transform(Dimension.STEM, CaseFoldCharSequenceNormalizer.getInstance()));
+  }
+
+  @Test
+  void testLemmaWithLemmatizerAndTag() {
+    final Lemmatizer lemmatizer = new Lemmatizer() { // stub that always answers "be"
+      @Override
+      public String[] lemmatize(String[] tokens, String[] tags) {
+        return new String[] {"be"};
+      }
+
+      @Override
+      public List<List<String>> lemmatize(List<String> tokens, List<String> tags) {
+        return List.of(List.of("be"));
+      }
+    };
+    final TermAnalyzer analyzer =
+        TermAnalyzer.builder().caseFold().lemmatize(lemmatizer).build();
+    final Term term = analyzer.analyze(new String[] {"was"}, new String[] {"VBD"}).get(0);
+    assertEquals("be", term.normalized());
+  }
+
+  @Test
+  void testConfusableFoldComposesWithCaseFold() {
+    final TermAnalyzer analyzer = TermAnalyzer.builder().caseFold().confusableFold().build();
+    final String spoof = "P" + cp(0x0430) + "yp" + cp(0x0430) + "l"; // Paypal with Cyrillic a's
+    assertEquals(Confusables.skeleton("paypal"), analyzer.analyze(spoof).get(0).normalized());
+  }
+
+  @Test
+  void testAtIsMemoized() {
+    final TermAnalyzer analyzer = TermAnalyzer.builder().build();
+    final Term term = analyzer.analyze("HELLO").get(0);
+    final String first = term.at(Dimension.CASE_FOLD);
+    assertSame(first, term.at(Dimension.CASE_FOLD)); // same instance on the second call
+  }
+
+  @Test
+  void testWhitespaceTargetIsConfigurable() {
+    final CharClass lineFold = CharClass.of(CodePointSet.of('\n', '\t'), '\n');
+    final TermAnalyzer analyzer = TermAnalyzer.builder().whitespace(lineFold::collapse).build();
+    final Term term = analyzer.analyze(new String[] {"a\n\n\tb"}, new String[] {"X"}).get(0);
+    assertEquals("a\nb", term.normalized());
+  }
+
+  @Test
+  void testCaseFoldLocaleAppliesTurkishRules() {
+    final TermAnalyzer analyzer =
+        TermAnalyzer.builder().caseFold(Locale.forLanguageTag("tr")).build();
+    assertEquals(cp(0x0131), analyzer.analyze("I").get(0).normalized()); // dotless lowercase i
+  }
+
+  @Test
+  void testAccentFoldScopeFoldsLatin() {
+    final TermAnalyzer analyzer = TermAnalyzer.builder()
+        .accentFold(Set.of(Character.UnicodeScript.LATIN), false).build();
+    assertEquals("cafe", analyzer.analyze("caf" + cp(0x00E9)).get(0).normalized()); // cafe + acute
+  }
+
+  @Test
+  void testMaxTokenLengthChopsTokens() {
+    final List<Term> terms = TermAnalyzer.builder().maxTokenLength(3).build().analyze("abcdefg");
+    assertEquals(3, terms.size());
+    assertEquals("abc", terms.get(0).original());
+    assertEquals("def", terms.get(1).original());
+    assertEquals("g", terms.get(2).original());
+  }
+
+  @Test
+  void testAnalyzeEmptyTextProducesNoTerms() {
+    assertEquals(List.of(), TermAnalyzer.builder().caseFold().build().analyze(""));
+  }
+
+  @Test
+  void testWhitespaceOnlyInputHasNoWordTerms() {
+    assertEquals(List.of(), TermAnalyzer.builder().build().analyze(" \t "));
+  }
+
+  @Test
+  void testAtDimensionBelowFinalIsAppliedOnTop() {
+    // Final dimension is STEM; asking for NFC applies it on top of the stem (documented behavior).
+    final TermAnalyzer analyzer =
+        TermAnalyzer.builder().caseFold().stem(new PorterStemmer()).build();
+    final Term term = analyzer.analyze("Running").get(0);
+    assertEquals("run", term.normalized());
+    assertEquals("run", term.at(Dimension.NFC));
+  }
+}
diff --git a/opennlp-core/opennlp-runtime/src/test/resources/opennlp/tools/tokenize/uax29/WordBreakTest.txt b/opennlp-core/opennlp-runtime/src/test/resources/opennlp/tools/tokenize/uax29/WordBreakTest.txt
new file mode 100644
index 000000000..042b02e77
--- /dev/null
+++ b/opennlp-core/opennlp-runtime/src/test/resources/opennlp/tools/tokenize/uax29/WordBreakTest.txt
@@ -0,0 +1,1974 @@
+# WordBreakTest-17.0.0.txt
+# Date: 2025-03-24, 14:46:35 GMT
+# © 2025 Unicode®, Inc.
+# Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries.
+# For terms of use and license, see https://www.unicode.org/terms_of_use.html
+#
+# Unicode Character Database
+# For documentation, see https://www.unicode.org/reports/tr44/
+#
+# Default Word_Break Test
+#
+# Format:
+# <string> (# <comment>)?
+# <string> contains hex Unicode code points, with
+# ÷ wherever there is a break opportunity, and
+# × wherever there is not.
+# <comment> the format can change, but currently it shows:
+# - the sample character name
+# - (x) the Word_Break property value for the sample character and
+# any other properties relevant to the algorithm, as described in
+# WordBreakTest.html
+# - [x] the rule that determines whether there is a break or not,
+# as listed in the Rules section of WordBreakTest.html
+#
+# These samples may be extended or changed in the future.
+#
+÷ 000D ÷ 000D ÷ # ÷ [0.2] (CR) ÷ [3.1] (CR) ÷ [0.3]
+÷ 000D ÷ 0308 ÷ 000D ÷ # ÷ [0.2] (CR) ÷ [3.1] COMBINING DIAERESIS (Extend) ÷ [3.2] (CR) ÷ [0.3]
+÷ 000D × 000A ÷ # ÷ [0.2] (CR) × [3.0] (LF) ÷ [0.3]
+÷ 000D ÷ 0308 ÷ 000A ÷ # ÷ [0.2] (CR) ÷ [3.1] COMBINING DIAERESIS (Extend) ÷ [3.2] (LF) ÷ [0.3]
+÷ 000D ÷ 000B ÷ # ÷ [0.2] (CR) ÷ [3.1] (Newline) ÷ [0.3]
+÷ 000D ÷ 0308 ÷ 000B ÷ # ÷ [0.2] (CR) ÷ [3.1] COMBINING DIAERESIS (Extend) ÷ [3.2] (Newline) ÷ [0.3]
+÷ 000D ÷ 0300 ÷ # ÷ [0.2] (CR) ÷ [3.1] COMBINING GRAVE ACCENT (Extend) ÷ [0.3]
+÷ 000D ÷ 0308 × 0300 ÷ # ÷ [0.2] (CR) ÷ [3.1] COMBINING DIAERESIS (Extend) × [4.0] COMBINING GRAVE ACCENT (Extend) ÷ [0.3]
+÷ 000D ÷ 00AD ÷ # ÷ [0.2] (CR) ÷ [3.1] SOFT HYPHEN (Format) ÷ [0.3]
+÷ 000D ÷ 0308 × 00AD ÷ # ÷ [0.2] (CR) ÷ [3.1] COMBINING DIAERESIS (Extend) × [4.0] SOFT HYPHEN (Format) ÷ [0.3]
+÷ 000D ÷ 3031 ÷ # ÷ [0.2] (CR) ÷ [3.1] VERTICAL KANA REPEAT MARK (Katakana) ÷ [0.3]
+÷ 000D ÷ 0308 ÷ 3031 ÷ # ÷ [0.2] (CR) ÷ [3.1] COMBINING DIAERESIS (Extend) ÷ [999.0] VERTICAL KANA REPEAT MARK (Katakana) ÷ [0.3]
+÷ 000D ÷ 24C2 ÷ # ÷ [0.2] (CR) ÷ [3.1] CIRCLED LATIN CAPITAL LETTER M (ALetter_ExtPict) ÷ [0.3]
+÷ 000D ÷ 0308 ÷ 24C2 ÷ # ÷ [0.2] (CR) ÷ [3.1] COMBINING DIAERESIS (Extend) ÷ [999.0] CIRCLED LATIN CAPITAL LETTER M (ALetter_ExtPict) ÷ [0.3]
+÷ 000D ÷ 0041 ÷ # ÷ [0.2] (CR) ÷ [3.1] LATIN CAPITAL LETTER A (ALettermExtPict) ÷ [0.3]
+÷ 000D ÷ 0308 ÷ 0041 ÷ # ÷ [0.2] (CR) ÷ [3.1] COMBINING DIAERESIS (Extend) ÷ [999.0] LATIN CAPITAL LETTER A (ALettermExtPict) ÷ [0.3]
+÷ 000D ÷ 003A ÷ # ÷ [0.2] (CR) ÷ [3.1] COLON (MidLetter) ÷ [0.3]
+÷ 000D ÷ 0308 ÷ 003A ÷ # ÷ [0.2] (CR) ÷ [3.1] COMBINING DIAERESIS (Extend) ÷ [999.0] COLON (MidLetter) ÷ [0.3]
+÷ 000D ÷ 002C ÷ # ÷ [0.2] (CR) ÷ [3.1] COMMA (MidNum) ÷ [0.3]
+÷ 000D ÷ 0308 ÷ 002C ÷ # ÷ [0.2] (CR) ÷ [3.1] COMBINING DIAERESIS (Extend) ÷ [999.0] COMMA (MidNum) ÷ [0.3]
+÷ 000D ÷ 002E ÷ # ÷ [0.2] (CR) ÷ [3.1] FULL STOP (MidNumLet) ÷ [0.3]
+÷ 000D ÷ 0308 ÷ 002E ÷ # ÷ [0.2] (CR) ÷ [3.1] COMBINING DIAERESIS (Extend) ÷ [999.0] FULL STOP (MidNumLet) ÷ [0.3]
+÷ 000D ÷ 0030 ÷ # ÷ [0.2] (CR) ÷ [3.1] DIGIT ZERO (Numeric) ÷ [0.3]
+÷ 000D ÷ 0308 ÷ 0030 ÷ # ÷ [0.2] (CR) ÷ [3.1] COMBINING DIAERESIS (Extend) ÷ [999.0] DIGIT ZERO (Numeric) ÷ [0.3]
+÷ 000D ÷ 005F ÷ # ÷ [0.2] (CR) ÷ [3.1] LOW LINE (ExtendNumLet) ÷ [0.3]
+÷ 000D ÷ 0308 ÷ 005F ÷ # ÷ [0.2] (CR) ÷ [3.1] COMBINING DIAERESIS (Extend) ÷ [999.0] LOW LINE (ExtendNumLet) ÷ [0.3]
+÷ 000D ÷ 1F1E6 ÷ # ÷ [0.2] (CR) ÷ [3.1] REGIONAL INDICATOR SYMBOL LETTER A (RI) ÷ [0.3]
+÷ 000D ÷ 0308 ÷ 1F1E6 ÷ # ÷ [0.2] (CR) ÷ [3.1] COMBINING DIAERESIS (Extend) ÷ [999.0] REGIONAL INDICATOR SYMBOL LETTER A (RI) ÷ [0.3]
+÷ 000D ÷ 05D0 ÷ # ÷ [0.2] (CR) ÷ [3.1] HEBREW LETTER ALEF (Hebrew_Letter) ÷ [0.3]
+÷ 000D ÷ 0308 ÷ 05D0 ÷ # ÷ [0.2] (CR) ÷ [3.1] COMBINING DIAERESIS (Extend) ÷ [999.0] HEBREW LETTER ALEF (Hebrew_Letter) ÷ [0.3]
+÷ 000D ÷ 0022 ÷ # ÷ [0.2] (CR) ÷ [3.1] QUOTATION MARK (Double_Quote) ÷ [0.3]
+÷ 000D ÷ 0308 ÷ 0022 ÷ # ÷ [0.2] (CR) ÷ [3.1] COMBINING DIAERESIS (Extend) ÷ [999.0] QUOTATION MARK (Double_Quote) ÷ [0.3]
+÷ 000D ÷ 0027 ÷ # ÷ [0.2] (CR) ÷ [3.1] APOSTROPHE (Single_Quote) ÷ [0.3]
+÷ 000D ÷ 0308 ÷ 0027 ÷ # ÷ [0.2] (CR) ÷ [3.1] COMBINING DIAERESIS (Extend) ÷ [999.0] APOSTROPHE (Single_Quote) ÷ [0.3]
+÷ 000D ÷ 200D ÷ # ÷ [0.2] (CR) ÷ [3.1] ZERO WIDTH JOINER (ZWJ) ÷ [0.3]
+÷ 000D ÷ 0308 × 200D ÷ # ÷ [0.2] (CR) ÷ [3.1] COMBINING DIAERESIS (Extend) × [4.0] ZERO WIDTH JOINER (ZWJ) ÷ [0.3]
+÷ 000D ÷ 00A9 ÷ # ÷ [0.2] (CR) ÷ [3.1] COPYRIGHT SIGN (ExtPictmALetter) ÷ [0.3]
+÷ 000D ÷ 0308 ÷ 00A9 ÷ # ÷ [0.2] (CR) ÷ [3.1] COMBINING DIAERESIS (Extend) ÷ [999.0] COPYRIGHT SIGN (ExtPictmALetter) ÷ [0.3]
+÷ 000D ÷ 0020 ÷ # ÷ [0.2] (CR) ÷ [3.1] SPACE (WSegSpace) ÷ [0.3]
+÷ 000D ÷ 0308 ÷ 0020 ÷ # ÷ [0.2] (CR) ÷ [3.1] COMBINING DIAERESIS (Extend) ÷ [999.0] SPACE (WSegSpace) ÷ [0.3]
+÷ 000D ÷ 0000 ÷ # ÷ [0.2] (CR) ÷ [3.1] (XXmExtPict) ÷ [0.3]
+÷ 000D ÷ 0308 ÷ 0000 ÷ # ÷ [0.2] (CR) ÷ [3.1] COMBINING DIAERESIS (Extend) ÷ [999.0] (XXmExtPict) ÷ [0.3]
+÷ 000D ÷ 0061 × 2060 ÷ # ÷ [0.2] (CR) ÷ [3.1] LATIN SMALL LETTER A (ALettermExtPict) × [4.0] WORD JOINER (Format) ÷ [0.3]
+÷ 000D ÷ 0308 ÷ 0061 × 2060 ÷ # ÷ [0.2] (CR) ÷ [3.1] COMBINING DIAERESIS (Extend) ÷ [999.0] LATIN SMALL LETTER A (ALettermExtPict) × [4.0] WORD JOINER (Format) ÷ [0.3]
+÷ 000D ÷ 0061 ÷ 003A ÷ # ÷ [0.2] (CR) ÷ [3.1] LATIN SMALL LETTER A (ALettermExtPict) ÷ [999.0] COLON (MidLetter) ÷ [0.3]
+÷ 000D ÷ 0308 ÷ 0061 ÷ 003A ÷ # ÷ [0.2] (CR) ÷ [3.1] COMBINING DIAERESIS (Extend) ÷ [999.0] LATIN SMALL LETTER A (ALettermExtPict) ÷ [999.0] COLON (MidLetter) ÷ [0.3]
+÷ 000D ÷ 0061 ÷ 0027 ÷ # ÷ [0.2] (CR) ÷ [3.1] LATIN SMALL LETTER A (ALettermExtPict) ÷ [999.0] APOSTROPHE (Single_Quote) ÷ [0.3]
+÷ 000D ÷ 0308 ÷ 0061 ÷ 0027 ÷ # ÷ [0.2] (CR) ÷ [3.1] COMBINING DIAERESIS (Extend) ÷ [999.0] LATIN SMALL LETTER A (ALettermExtPict) ÷ [999.0] APOSTROPHE (Single_Quote) ÷ [0.3]
+÷ 000D ÷ 0061 ÷ 0027 × 2060 ÷ # ÷ [0.2] (CR) ÷ [3.1] LATIN SMALL LETTER A (ALettermExtPict) ÷ [999.0] APOSTROPHE (Single_Quote) × [4.0] WORD JOINER (Format) ÷ [0.3]
+÷ 000D ÷ 0308 ÷ 0061 ÷ 0027 × 2060 ÷ # ÷ [0.2] (CR) ÷ [3.1] COMBINING DIAERESIS (Extend) ÷ [999.0] LATIN SMALL LETTER A (ALettermExtPict) ÷ [999.0] APOSTROPHE (Single_Quote) × [4.0] WORD JOINER (Format) ÷ [0.3]
+÷ 000D ÷ 0061 ÷ 002C ÷ # ÷ [0.2] (CR) ÷ [3.1] LATIN SMALL LETTER A (ALettermExtPict) ÷ [999.0] COMMA (MidNum) ÷ [0.3]
+÷ 000D ÷ 0308 ÷ 0061 ÷ 002C ÷ # ÷ [0.2] (CR) ÷ [3.1] COMBINING DIAERESIS (Extend) ÷ [999.0] LATIN SMALL LETTER A (ALettermExtPict) ÷ [999.0] COMMA (MidNum) ÷ [0.3]
+÷ 000D ÷ 0031 ÷ 003A ÷ # ÷ [0.2] (CR) ÷ [3.1] DIGIT ONE (Numeric) ÷ [999.0] COLON (MidLetter) ÷ [0.3]
+÷ 000D ÷ 0308 ÷ 0031 ÷ 003A ÷ # ÷ [0.2] (CR) ÷ [3.1] COMBINING DIAERESIS (Extend) ÷ [999.0] DIGIT ONE (Numeric) ÷ [999.0] COLON (MidLetter) ÷ [0.3]
+÷ 000D ÷ 0031 ÷ 0027 ÷ # ÷ [0.2] (CR) ÷ [3.1] DIGIT ONE (Numeric) ÷ [999.0] APOSTROPHE (Single_Quote) ÷ [0.3]
+÷ 000D ÷ 0308 ÷ 0031 ÷ 0027 ÷ # ÷ [0.2] (CR) ÷ [3.1] COMBINING DIAERESIS (Extend) ÷ [999.0] DIGIT ONE (Numeric) ÷ [999.0] APOSTROPHE (Single_Quote) ÷ [0.3]
+÷ 000D ÷ 0031 ÷ 002C ÷ # ÷ [0.2] (CR) ÷ [3.1] DIGIT ONE (Numeric) ÷ [999.0] COMMA (MidNum) ÷ [0.3]
+÷ 000D ÷ 0308 ÷ 0031 ÷ 002C ÷ # ÷ [0.2] (CR) ÷ [3.1] COMBINING DIAERESIS (Extend) ÷ [999.0] DIGIT ONE (Numeric) ÷ [999.0] COMMA (MidNum) ÷ [0.3]
+÷ 000D ÷ 0031 ÷ 002E × 2060 ÷ # ÷ [0.2] (CR) ÷ [3.1] DIGIT ONE (Numeric) ÷ [999.0] FULL STOP (MidNumLet) × [4.0] WORD JOINER (Format) ÷ [0.3]
+÷ 000D ÷ 0308 ÷ 0031 ÷ 002E × 2060 ÷ # ÷ [0.2] (CR) ÷ [3.1] COMBINING DIAERESIS (Extend) ÷ [999.0] DIGIT ONE (Numeric) ÷ [999.0] FULL STOP (MidNumLet) × [4.0] WORD JOINER (Format) ÷ [0.3]
+÷ 000A ÷ 000D ÷ # ÷ [0.2] (LF) ÷ [3.1] (CR) ÷ [0.3]
+÷ 000A ÷ 0308 ÷ 000D ÷ # ÷ [0.2] (LF) ÷ [3.1] COMBINING DIAERESIS (Extend) ÷ [3.2] (CR) ÷ [0.3]
+÷ 000A ÷ 000A ÷ # ÷ [0.2] (LF) ÷ [3.1] (LF) ÷ [0.3]
+÷ 000A ÷ 0308 ÷ 000A ÷ # ÷ [0.2] (LF) ÷ [3.1] COMBINING DIAERESIS (Extend) ÷ [3.2] (LF) ÷ [0.3]
+÷ 000A ÷ 000B ÷ # ÷ [0.2] (LF) ÷ [3.1] (Newline) ÷ [0.3]
+÷ 000A ÷ 0308 ÷ 000B ÷ # ÷ [0.2] (LF) ÷ [3.1] COMBINING DIAERESIS (Extend) ÷ [3.2] (Newline) ÷ [0.3]
+÷ 000A ÷ 0300 ÷ # ÷ [0.2] (LF) ÷ [3.1] COMBINING GRAVE ACCENT (Extend) ÷ [0.3]
+÷ 000A ÷ 0308 × 0300 ÷ # ÷ [0.2] (LF) ÷ [3.1] COMBINING DIAERESIS (Extend) × [4.0] COMBINING GRAVE ACCENT (Extend) ÷ [0.3]
+÷ 000A ÷ 00AD ÷ # ÷ [0.2] (LF) ÷ [3.1] SOFT HYPHEN (Format) ÷ [0.3]
+÷ 000A ÷ 0308 × 00AD ÷ # ÷ [0.2] (LF) ÷ [3.1] COMBINING DIAERESIS (Extend) × [4.0] SOFT HYPHEN (Format) ÷ [0.3]
+÷ 000A ÷ 3031 ÷ # ÷ [0.2] (LF) ÷ [3.1] VERTICAL KANA REPEAT MARK (Katakana) ÷ [0.3]
+÷ 000A ÷ 0308 ÷ 3031 ÷ # ÷ [0.2] (LF) ÷ [3.1] COMBINING DIAERESIS (Extend) ÷ [999.0] VERTICAL KANA REPEAT MARK (Katakana) ÷ [0.3]
+÷ 000A ÷ 24C2 ÷ # ÷ [0.2] (LF) ÷ [3.1] CIRCLED LATIN CAPITAL LETTER M (ALetter_ExtPict) ÷ [0.3]
+÷ 000A ÷ 0308 ÷ 24C2 ÷ # ÷ [0.2] (LF) ÷ [3.1] COMBINING DIAERESIS (Extend) ÷ [999.0] CIRCLED LATIN CAPITAL LETTER M (ALetter_ExtPict) ÷ [0.3]
+÷ 000A ÷ 0041 ÷ # ÷ [0.2] (LF) ÷ [3.1] LATIN CAPITAL LETTER A (ALettermExtPict) ÷ [0.3]
+÷ 000A ÷ 0308 ÷ 0041 ÷ # ÷ [0.2] (LF) ÷ [3.1] COMBINING DIAERESIS (Extend) ÷ [999.0] LATIN CAPITAL LETTER A (ALettermExtPict) ÷ [0.3]
+÷ 000A ÷ 003A ÷ # ÷ [0.2] (LF) ÷ [3.1] COLON (MidLetter) ÷ [0.3]
+÷ 000A ÷ 0308 ÷ 003A ÷ # ÷ [0.2] (LF) ÷ [3.1] COMBINING DIAERESIS (Extend) ÷ [999.0] COLON (MidLetter) ÷ [0.3]
+÷ 000A ÷ 002C ÷ # ÷ [0.2] (LF) ÷ [3.1] COMMA (MidNum) ÷ [0.3]
+÷ 000A ÷ 0308 ÷ 002C ÷ # ÷ [0.2] (LF) ÷ [3.1] COMBINING DIAERESIS (Extend) ÷ [999.0] COMMA (MidNum) ÷ [0.3]
+÷ 000A ÷ 002E ÷ # ÷ [0.2] (LF) ÷ [3.1] FULL STOP (MidNumLet) ÷ [0.3]
+÷ 000A ÷ 0308 ÷ 002E ÷ # ÷ [0.2] (LF) ÷ [3.1] COMBINING DIAERESIS (Extend) ÷ [999.0] FULL STOP (MidNumLet) ÷ [0.3]
+÷ 000A ÷ 0030 ÷ # ÷ [0.2] (LF) ÷ [3.1] DIGIT ZERO (Numeric) ÷ [0.3]
+÷ 000A ÷ 0308 ÷ 0030 ÷ # ÷ [0.2] (LF) ÷ [3.1] COMBINING DIAERESIS (Extend) ÷ [999.0] DIGIT ZERO (Numeric) ÷ [0.3]
+÷ 000A ÷ 005F ÷ # ÷ [0.2] (LF) ÷ [3.1] LOW LINE (ExtendNumLet) ÷ [0.3]
+÷ 000A ÷ 0308 ÷ 005F ÷ # ÷ [0.2] (LF) ÷ [3.1] COMBINING DIAERESIS (Extend) ÷ [999.0] LOW LINE (ExtendNumLet) ÷ [0.3]
+÷ 000A ÷ 1F1E6 ÷ # ÷ [0.2] (LF) ÷ [3.1] REGIONAL INDICATOR SYMBOL LETTER A (RI) ÷ [0.3]
+÷ 000A ÷ 0308 ÷ 1F1E6 ÷ # ÷ [0.2] (LF) ÷ [3.1] COMBINING DIAERESIS (Extend) ÷ [999.0] REGIONAL INDICATOR SYMBOL LETTER A (RI) ÷ [0.3]
+÷ 000A ÷ 05D0 ÷ # ÷ [0.2] (LF) ÷ [3.1] HEBREW LETTER ALEF (Hebrew_Letter) ÷ [0.3]
+÷ 000A ÷ 0308 ÷ 05D0 ÷ # ÷ [0.2] (LF) ÷ [3.1] COMBINING DIAERESIS (Extend) ÷ [999.0] HEBREW LETTER ALEF (Hebrew_Letter) ÷ [0.3]
+÷ 000A ÷ 0022 ÷ # ÷ [0.2] (LF) ÷ [3.1] QUOTATION MARK (Double_Quote) ÷ [0.3]
+÷ 000A ÷ 0308 ÷ 0022 ÷ # ÷ [0.2] (LF) ÷ [3.1] COMBINING DIAERESIS (Extend) ÷ [999.0] QUOTATION MARK (Double_Quote) ÷ [0.3]
+÷ 000A ÷ 0027 ÷ # ÷ [0.2] (LF) ÷ [3.1] APOSTROPHE (Single_Quote) ÷ [0.3]
+÷ 000A ÷ 0308 ÷ 0027 ÷ # ÷ [0.2] (LF) ÷ [3.1] COMBINING DIAERESIS (Extend) ÷ [999.0] APOSTROPHE (Single_Quote) ÷ [0.3]
+÷ 000A ÷ 200D ÷ # ÷ [0.2] (LF) ÷ [3.1] ZERO WIDTH JOINER (ZWJ) ÷ [0.3]
+÷ 000A ÷ 0308 × 200D ÷ # ÷ [0.2] (LF) ÷ [3.1] COMBINING DIAERESIS (Extend) × [4.0] ZERO WIDTH JOINER (ZWJ) ÷ [0.3]
+÷ 000A ÷ 00A9 ÷ # ÷ [0.2] (LF) ÷ [3.1] COPYRIGHT SIGN (ExtPictmALetter) ÷ [0.3]
+÷ 000A ÷ 0308 ÷ 00A9 ÷ # ÷ [0.2] (LF) ÷ [3.1] COMBINING DIAERESIS (Extend) ÷ [999.0] COPYRIGHT SIGN (ExtPictmALetter) ÷ [0.3]
+÷ 000A ÷ 0020 ÷ # ÷ [0.2] (LF) ÷ [3.1] SPACE (WSegSpace) ÷ [0.3]
+÷ 000A ÷ 0308 ÷ 0020 ÷ # ÷ [0.2] (LF) ÷ [3.1] COMBINING DIAERESIS (Extend) ÷ [999.0] SPACE (WSegSpace) ÷ [0.3]
+÷ 000A ÷ 0000 ÷ # ÷ [0.2] (LF) ÷ [3.1] (XXmExtPict) ÷ [0.3]
+÷ 000A ÷ 0308 ÷ 0000 ÷ # ÷ [0.2] (LF) ÷ [3.1] COMBINING DIAERESIS (Extend) ÷ [999.0] (XXmExtPict) ÷ [0.3]
+÷ 000A ÷ 0061 × 2060 ÷ # ÷ [0.2] (LF) ÷ [3.1] LATIN SMALL LETTER A (ALettermExtPict) × [4.0] WORD JOINER (Format) ÷ [0.3]
+÷ 000A ÷ 0308 ÷ 0061 × 2060 ÷ # ÷ [0.2] (LF) ÷ [3.1] COMBINING DIAERESIS (Extend) ÷ [999.0] LATIN SMALL LETTER A (ALettermExtPict) × [4.0] WORD JOINER (Format) ÷ [0.3]
+÷ 000A ÷ 0061 ÷ 003A ÷ # ÷ [0.2]