Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 6 additions & 2 deletions LICENSE
Original file line number Diff line number Diff line change
Expand Up @@ -371,9 +371,13 @@ The following license applies to the SLF4J API:
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

The following license applies to the bundled Unicode data file in
The following license applies to the bundled Unicode data files in
opennlp-core/opennlp-runtime/src/main/resources/opennlp/tools/tokenize/uax29
(WordBreakProperty.txt, ExtendedPictographic.txt),
opennlp-core/opennlp-runtime/src/main/resources/opennlp/tools/util/normalizer
(confusables.txt):
(confusables.txt), and
opennlp-core/opennlp-runtime/src/test/resources/opennlp/tools/tokenize/uax29
(WordBreakTest.txt):

UNICODE LICENSE V3

Expand Down
28 changes: 21 additions & 7 deletions NOTICE
Original file line number Diff line number Diff line change
Expand Up @@ -94,16 +94,30 @@ SOFTWARE.

============================================================================

This product bundles a data file from the Unicode Security Mechanisms
(UTS #39), version 17.0.0, published by Unicode, Inc.
(https://www.unicode.org/Public/).

This product bundles data files from the Unicode Character Database (UCD)
and the Unicode Security Mechanisms, version 17.0.0, published by Unicode,
Inc. (https://www.unicode.org/Public/).

* opennlp-core/opennlp-runtime/src/main/resources/opennlp/tools/tokenize/uax29/WordBreakProperty.txt
is the upstream WordBreakProperty-17.0.0.txt, unmodified except for the
file name.
* opennlp-core/opennlp-runtime/src/test/resources/opennlp/tools/tokenize/uax29/WordBreakTest.txt
is the upstream WordBreakTest-17.0.0.txt, unmodified except for the file
name.
* opennlp-core/opennlp-runtime/src/main/resources/opennlp/tools/util/normalizer/confusables.txt
is the upstream confusables.txt, unmodified.
is the upstream confusables.txt from the Unicode Security Mechanisms
(UTS #39), unmodified.
* opennlp-core/opennlp-runtime/src/main/resources/opennlp/tools/tokenize/uax29/ExtendedPictographic.txt
is derived from the upstream emoji-data.txt (Emoji Data for UTS #51,
version 17.0): it keeps only the lines that assign the
Extended_Pictographic property and is renamed accordingly. It is a
filtered subset; the upstream file additionally carries the Emoji,
Emoji_Presentation, Emoji_Modifier, Emoji_Modifier_Base, and
Emoji_Component properties, which are not retained.

The original Unicode copyright and license header is preserved verbatim at the
top of the bundled file. It is distributed under the Unicode License V3, the
full text of which is reproduced in the LICENSE file accompanying this
top of each bundled file. These files are distributed under the Unicode License
V3, the full text of which is reproduced in the LICENSE file accompanying this
distribution.

Copyright (c) 1991-2025 Unicode, Inc. All rights reserved.
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package opennlp.tools.tokenize.uax29;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.UncheckedIOException;
import java.nio.charset.StandardCharsets;
import java.util.BitSet;

/**
 * Tests the Unicode {@code Extended_Pictographic} property of a code point.
 *
 * <p>This is the one extra property the word boundary algorithm needs (rule WB3c), to keep emoji
 * zero-width-joiner sequences together. The data is read on first use from the bundled
 * {@code ExtendedPictographic.txt} resource, which uses the {@code emoji-data.txt} line format
 * ({@code <code point or range> ; <property> # comment}), and is stored in a {@link BitSet}, so
 * membership is a constant-time bit test.</p>
 *
 * <p>Only lines that assign {@code Extended_Pictographic} contribute members; lines that assign
 * any other property are ignored, so an unfiltered {@code emoji-data.txt} yields the same set as
 * the filtered subset.</p>
 */
public final class ExtendedPictographic {

  private static final String RESOURCE = "ExtendedPictographic.txt";

  /** Property name, as spelled in the data file, whose lines define the member set. */
  private static final String PROPERTY = "Extended_Pictographic";

  // Loaded lazily on first use (see members()) so a missing or unreadable resource surfaces as a
  // catchable exception at call time rather than an ExceptionInInitializerError that permanently
  // poisons the class -- a real risk in container, OSGi, shaded, or modular setups.
  private static volatile BitSet members;

  private ExtendedPictographic() {
  }

  /**
   * Returns the member set, loading it on the first call.
   *
   * <p>The field is only assigned after {@link #load()} returns normally, so a failed load leaves
   * it {@code null} and the next call tries again.</p>
   */
  private static BitSet members() {
    BitSet set = members;
    if (set == null) {
      synchronized (ExtendedPictographic.class) {
        set = members;
        if (set == null) {
          set = load();
          members = set;
        }
      }
    }
    return set;
  }

  /**
   * Reads the bundled resource into a fresh {@link BitSet}.
   *
   * @throws IllegalStateException Thrown if the resource is missing or malformed.
   * @throws UncheckedIOException Thrown if the resource cannot be read.
   */
  private static BitSet load() {
    final BitSet set = new BitSet();
    try (InputStream in = ExtendedPictographic.class.getResourceAsStream(RESOURCE)) {
      if (in == null) {
        throw new IllegalStateException("Missing Extended_Pictographic data resource: " + RESOURCE);
      }
      parse(in, set);
    } catch (IOException e) {
      throw new UncheckedIOException(
          "Unable to read Extended_Pictographic data resource " + RESOURCE, e);
    }
    return set;
  }

  /**
   * Parses {@code emoji-data.txt}-style lines from {@code in} and sets the bit of every code point
   * that is assigned {@code Extended_Pictographic}.
   *
   * <p>Everything from {@code #} to the end of a line is a comment. Blank lines are skipped. A
   * line that names a different property is skipped. A line with no {@code ;} field separator is
   * treated as a bare code point or range and is accepted.</p>
   *
   * @param in The UTF-8 encoded data; closed when parsing ends.
   * @param set The set that receives the members.
   * @throws IOException Thrown if reading fails.
   * @throws IllegalStateException Thrown if a data line holds an invalid code point or range.
   */
  private static void parse(InputStream in, BitSet set) throws IOException {
    try (BufferedReader reader =
             new BufferedReader(new InputStreamReader(in, StandardCharsets.UTF_8))) {
      String line;
      int lineNumber = 0;
      while ((line = reader.readLine()) != null) {
        lineNumber++;
        final int hash = line.indexOf('#');
        final String content = (hash < 0 ? line : line.substring(0, hash)).strip();
        if (content.isEmpty()) {
          continue;
        }
        final int semicolon = content.indexOf(';');
        final String codePoints;
        if (semicolon < 0) {
          codePoints = content;
        } else {
          // Without this check every property in the file (Emoji, Emoji_Component, ...) would be
          // reported as Extended_Pictographic if the resource were ever replaced by unfiltered data.
          if (!PROPERTY.equals(content.substring(semicolon + 1).strip())) {
            continue;
          }
          codePoints = content.substring(0, semicolon).strip();
        }
        addCodePoints(codePoints, lineNumber, set);
      }
    }
  }

  /**
   * Sets the bits for a single hexadecimal code point ({@code 00A9}) or an inclusive range
   * ({@code 1F600..1F64F}).
   */
  private static void addCodePoints(String codePoints, int lineNumber, BitSet set) {
    final int dots = codePoints.indexOf("..");
    int start;
    int end;
    if (dots < 0) {
      start = parseCodePoint(codePoints, lineNumber);
      end = start;
    } else {
      start = parseCodePoint(codePoints.substring(0, dots), lineNumber);
      end = parseCodePoint(codePoints.substring(dots + 2), lineNumber);
    }
    if (start < 0 || end < start || end > Character.MAX_CODE_POINT) {
      throw malformed(lineNumber, "invalid code point range '" + codePoints + "'", null);
    }
    // BitSet ranges are half-open, the data file's ranges are inclusive.
    set.set(start, end + 1);
  }

  /** Parses one hexadecimal code point, reporting the offending line on failure. */
  private static int parseCodePoint(String hex, int lineNumber) {
    try {
      return Integer.parseInt(hex, 16);
    } catch (NumberFormatException e) {
      throw malformed(lineNumber, "invalid code point '" + hex + "'", e);
    }
  }

  /** Builds the exception used for every data-format error, naming the resource and line. */
  private static IllegalStateException malformed(int lineNumber, String detail, Throwable cause) {
    return new IllegalStateException(
        "Malformed Extended_Pictographic data resource " + RESOURCE + " at line " + lineNumber
            + ": " + detail, cause);
  }

  /**
   * {@return whether a code point has the {@code Extended_Pictographic} property}
   *
   * @param codePoint The code point. Values outside {@code [0, U+10FFFF]} return {@code false}
   *     without touching the data resource.
   * @throws IllegalStateException Thrown if the data resource is missing or malformed.
   * @throws UncheckedIOException Thrown if the data resource cannot be read.
   */
  public static boolean is(int codePoint) {
    return codePoint >= 0 && codePoint <= Character.MAX_CODE_POINT && members().get(codePoint);
  }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package opennlp.tools.tokenize.uax29;

/**
 * The values of the Unicode {@code Word_Break} property that drive the UAX #29 word boundary
 * algorithm.
 *
 * <p>Every constant except {@link #OTHER} mirrors exactly one value name found in
 * {@code WordBreakProperty.txt} (see
 * <a href="https://www.unicode.org/reports/tr29/">UAX #29</a>). {@link #OTHER} stands for code
 * points to which the Unicode Character Database assigns no {@code Word_Break} value.</p>
 */
public enum WordBreak {

  /** Default value: the code point has no assigned {@code Word_Break} value. */
  OTHER(null),
  /** {@code U+000D}, carriage return. */
  CR("CR"),
  /** {@code U+000A}, line feed. */
  LF("LF"),
  /** Remaining mandatory line breaks: vertical tab, form feed, NEL, line/paragraph separators. */
  NEWLINE("Newline"),
  /** Characters, such as combining marks, that extend the character before them. */
  EXTEND("Extend"),
  /** {@code U+200D}, zero width joiner. */
  ZWJ("ZWJ"),
  /** Regional indicator symbols, which pair up to form flag emoji. */
  REGIONAL_INDICATOR("Regional_Indicator"),
  /** Format characters. */
  FORMAT("Format"),
  /** Letters of the Katakana script. */
  KATAKANA("Katakana"),
  /** Hebrew letters, kept separate so that a single quote may join them. */
  HEBREW_LETTER("Hebrew_Letter"),
  /** Alphabetic letters. */
  ALETTER("ALetter"),
  /** {@code U+0027}, the apostrophe. */
  SINGLE_QUOTE("Single_Quote"),
  /** {@code U+0022}, the quotation mark. */
  DOUBLE_QUOTE("Double_Quote"),
  /** Characters, such as the full stop, that join letters or numbers. */
  MID_NUM_LET("MidNumLet"),
  /** Characters, such as the middle dot, that join letters. */
  MID_LETTER("MidLetter"),
  /** Characters, such as the comma, that join numbers. */
  MID_NUM("MidNum"),
  /** Decimal digits. */
  NUMERIC("Numeric"),
  /** Characters, such as the low line, that extend a number or letter sequence. */
  EXTEND_NUM_LET("ExtendNumLet"),
  /** Word-segmenting whitespace ({@code Word_Break=WSegSpace}). */
  WSEG_SPACE("WSegSpace");

  /** Cached constant array, so lookups do not clone {@link #values()} on every call. */
  private static final WordBreak[] CONSTANTS = values();

  /** Spelling used in {@code WordBreakProperty.txt}; {@code null} for {@link #OTHER}. */
  private final String propertyName;

  WordBreak(String propertyName) {
    this.propertyName = propertyName;
  }

  /**
   * Resolves a {@code Word_Break} value name, spelled as in {@code WordBreakProperty.txt}, to the
   * corresponding constant. The comparison is exact and case-sensitive.
   *
   * @param name The property value name (for example {@code ALetter}).
   * @return The matching constant.
   * @throws IllegalArgumentException Thrown if the name is not a known {@code Word_Break} value.
   */
  static WordBreak fromPropertyName(String name) {
    for (WordBreak candidate : CONSTANTS) {
      // OTHER has no file spelling; equals(null) is false, so it can never be matched by name.
      if (name.equals(candidate.propertyName)) {
        return candidate;
      }
    }
    throw new IllegalArgumentException("Unknown Word_Break value: " + name);
  }
}
Loading