apache · krickert · Jun 20, 2026 · Jun 20, 2026 · Jun 21, 2026 · Jun 22, 2026
diff --git a/LICENSE b/LICENSE
@@ -371,9 +371,13 @@ The following license applies to the SLF4J API:
     OF CONTRACT, TORT OR OTHERWISE,  ARISING FROM, OUT OF OR IN CONNECTION
     WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 
-The following license applies to the bundled Unicode data file in
+The following license applies to the bundled Unicode data files in
+opennlp-core/opennlp-runtime/src/main/resources/opennlp/tools/tokenize/uax29
+(WordBreakProperty.txt, ExtendedPictographic.txt),
 opennlp-core/opennlp-runtime/src/main/resources/opennlp/tools/util/normalizer
-(confusables.txt):
+(confusables.txt), and
+opennlp-core/opennlp-runtime/src/test/resources/opennlp/tools/tokenize/uax29
+(WordBreakTest.txt):
 
     UNICODE LICENSE V3
 

diff --git a/NOTICE b/NOTICE
@@ -94,16 +94,30 @@ SOFTWARE.
 
 ============================================================================
 
-This product bundles a data file from the Unicode Security Mechanisms
-(UTS #39), version 17.0.0, published by Unicode, Inc.
-(https://www.unicode.org/Public/).
-
+This product bundles data files from the Unicode Character Database (UCD)
+and the Unicode Security Mechanisms, version 17.0.0, published by Unicode,
+Inc. (https://www.unicode.org/Public/).
+
+  * opennlp-core/opennlp-runtime/src/main/resources/opennlp/tools/tokenize/uax29/WordBreakProperty.txt
+    is the upstream WordBreakProperty-17.0.0.txt, unmodified except for the
+    file name.
+  * opennlp-core/opennlp-runtime/src/test/resources/opennlp/tools/tokenize/uax29/WordBreakTest.txt
+    is the upstream WordBreakTest-17.0.0.txt, unmodified except for the file
+    name.
   * opennlp-core/opennlp-runtime/src/main/resources/opennlp/tools/util/normalizer/confusables.txt
-    is the upstream confusables.txt, unmodified.
+    is the upstream confusables.txt from the Unicode Security Mechanisms
+    (UTS #39), unmodified.
+  * opennlp-core/opennlp-runtime/src/main/resources/opennlp/tools/tokenize/uax29/ExtendedPictographic.txt
+    is derived from the upstream emoji-data.txt (Emoji Data for UTS #51,
+    version 17.0): it keeps only the lines that assign the
+    Extended_Pictographic property and is renamed accordingly. It is a
+    filtered subset; the upstream file additionally carries the Emoji,
+    Emoji_Presentation, Emoji_Modifier, Emoji_Modifier_Base, and
+    Emoji_Component properties, which are not retained.
 
 The original Unicode copyright and license header is preserved verbatim at the
-top of the bundled file. It is distributed under the Unicode License V3, the
-full text of which is reproduced in the LICENSE file accompanying this
+top of each bundled file. These files are distributed under the Unicode License
+V3, the full text of which is reproduced in the LICENSE file accompanying this
 distribution.
 
 Copyright (c) 1991-2025 Unicode, Inc. All rights reserved.

diff --git a/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/tokenize/uax29/ExtendedPictographic.java b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/tokenize/uax29/ExtendedPictographic.java
@@ -0,0 +1,109 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.tools.tokenize.uax29;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.UncheckedIOException;
+import java.nio.charset.StandardCharsets;
+import java.util.BitSet;
+
+/**
+ * Tests the Unicode {@code Extended_Pictographic} property of a code point.
+ *
+ * <p>This is the one extra property the word boundary algorithm needs (rule WB3c), to keep emoji
+ * zero-width-joiner sequences together. The data is loaded lazily on first use from the bundled
+ * {@code ExtendedPictographic.txt} resource (a filtered subset of the upstream
+ * {@code emoji-data.txt}) and cached in a {@link BitSet}, so membership is an O(1) bit test.</p>
+ */
+public final class ExtendedPictographic {
+
+  private static final String RESOURCE = "ExtendedPictographic.txt";
+
+  // Loaded lazily on first use (see members()) so a missing or unreadable resource surfaces as a
+  // catchable exception at call time rather than an ExceptionInInitializerError that permanently
+  // poisons the class -- a real risk in container, OSGi, shaded, or modular setups.
+  private static volatile BitSet members;
+
+  private ExtendedPictographic() {
+  }
+
+  // Double-checked lazy initialization: load() runs once on first use, and a failure leaves the
+  // field null so a later call retries instead of the class being permanently unusable.
+  private static BitSet members() {
+    BitSet set = members;
+    if (set == null) {
+      synchronized (ExtendedPictographic.class) {
+        set = members;
+        if (set == null) {
+          set = load();
+          members = set;
+        }
+      }
+    }
+    return set;
+  }
+
+  private static BitSet load() {
+    final BitSet set = new BitSet();
+    try (InputStream in = ExtendedPictographic.class.getResourceAsStream(RESOURCE)) {
+      if (in == null) { // null means the resource could not be found
+        throw new IllegalStateException("Missing Extended_Pictographic data resource: " + RESOURCE);
+      }
+      parse(in, set);
+    } catch (IOException e) {
+      throw new UncheckedIOException(
+          "Unable to read Extended_Pictographic data resource " + RESOURCE, e);
+    }
+    return set;
+  }
+
+  private static void parse(InputStream in, BitSet set) throws IOException {
+    try (BufferedReader reader =
+             new BufferedReader(new InputStreamReader(in, StandardCharsets.UTF_8))) {
+      String line;
+      while ((line = reader.readLine()) != null) {
+        final int hash = line.indexOf('#'); // '#' starts a comment that runs to the end of the line
+        final String content = (hash < 0 ? line : line.substring(0, hash)).strip();
+        if (content.isEmpty()) { // blank or comment-only line
+          continue;
+        }
+        final int semicolon = content.indexOf(';'); // only the field before the first ';' is read
+        final String codePoints = (semicolon < 0 ? content : content.substring(0, semicolon)).strip();
+        final int dots = codePoints.indexOf("..");
+        if (dots < 0) { // a single hexadecimal code point rather than a start..end range
+          set.set(Integer.parseInt(codePoints, 16));
+        } else {
+          final int start = Integer.parseInt(codePoints.substring(0, dots), 16);
+          final int end = Integer.parseInt(codePoints.substring(dots + 2), 16);
+          set.set(start, end + 1); // end is treated as inclusive; BitSet's toIndex is exclusive
+        }
+      }
+    }
+  }
+
+  /**
+   * {@return whether a code point has the {@code Extended_Pictographic} property}
+   *
+   * @param codePoint The code point. Values outside {@code [0, U+10FFFF]} return {@code false}.
+   */
+  public static boolean is(int codePoint) {
+    return codePoint >= 0 && codePoint <= Character.MAX_CODE_POINT && members().get(codePoint);
+  }
+}
diff --git a/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/tokenize/uax29/WordBreak.java b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/tokenize/uax29/WordBreak.java
@@ -0,0 +1,99 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.tools.tokenize.uax29;
+
+/**
+ * The Unicode {@code Word_Break} property values, used by the UAX #29 word boundary algorithm.
+ *
+ * <p>{@link #OTHER} is the default for code points that carry no {@code Word_Break} value in the
+ * Unicode Character Database. The remaining constants correspond one-to-one to the values in
+ * {@code WordBreakProperty.txt} (see
+ * <a href="https://www.unicode.org/reports/tr29/">UAX #29</a>).</p>
+ */
+public enum WordBreak {
+
+  /** No assigned {@code Word_Break} value (the default). */
+  OTHER,
+  /** Carriage return ({@code U+000D}). */
+  CR,
+  /** Line feed ({@code U+000A}). */
+  LF,
+  /** Other mandatory line breaks (vertical tab, form feed, NEL, line/paragraph separators). */
+  NEWLINE,
+  /** Combining marks and other characters that extend the preceding one. */
+  EXTEND,
+  /** Zero width joiner ({@code U+200D}). */
+  ZWJ,
+  /** Regional indicator symbols (used in pairs for flag emoji). */
+  REGIONAL_INDICATOR,
+  /** Format characters. */
+  FORMAT,
+  /** Katakana letters. */
+  KATAKANA,
+  /** Hebrew letters (distinguished so a single quote may join them). */
+  HEBREW_LETTER,
+  /** Alphabetic letters. */
+  ALETTER,
+  /** The apostrophe ({@code U+0027}). */
+  SINGLE_QUOTE,
+  /** The quotation mark ({@code U+0022}). */
+  DOUBLE_QUOTE,
+  /** Characters that join letters or numbers (for example the full stop). */
+  MID_NUM_LET,
+  /** Characters that join letters (for example the middle dot). */
+  MID_LETTER,
+  /** Characters that join numbers (for example the comma). */
+  MID_NUM,
+  /** Decimal digits. */
+  NUMERIC,
+  /** Characters that extend a number or letter sequence (for example the low line). */
+  EXTEND_NUM_LET,
+  /** Whitespace that segments words ({@code Word_Break=WSegSpace}). */
+  WSEG_SPACE;
+
+  /**
+   * Maps a {@code Word_Break} value name, as written in {@code WordBreakProperty.txt}, to its
+   * constant.
+   *
+   * @param name The property value name, matched case-sensitively (for example {@code ALetter}).
+   * @return The matching constant; never {@link #OTHER}, which has no name mapping here.
+   * @throws IllegalArgumentException Thrown if the name is not a known {@code Word_Break} value.
+   */
+  static WordBreak fromPropertyName(String name) {
+    return switch (name) { // a null name throws NullPointerException: there is no case null
+      case "CR" -> CR;
+      case "LF" -> LF;
+      case "Newline" -> NEWLINE;
+      case "Extend" -> EXTEND;
+      case "ZWJ" -> ZWJ;
+      case "Regional_Indicator" -> REGIONAL_INDICATOR;
+      case "Format" -> FORMAT;
+      case "Katakana" -> KATAKANA;
+      case "Hebrew_Letter" -> HEBREW_LETTER;
+      case "ALetter" -> ALETTER;
+      case "Single_Quote" -> SINGLE_QUOTE;
+      case "Double_Quote" -> DOUBLE_QUOTE;
+      case "MidNumLet" -> MID_NUM_LET;
+      case "MidLetter" -> MID_LETTER;
+      case "MidNum" -> MID_NUM;
+      case "Numeric" -> NUMERIC;
+      case "ExtendNumLet" -> EXTEND_NUM_LET;
+      case "WSegSpace" -> WSEG_SPACE;
+      default -> throw new IllegalArgumentException("Unknown Word_Break value: " + name);
+    };
+  }
+}