From 58cff012054a00fcd451f6dcd59297258a2003d7 Mon Sep 17 00:00:00 2001 From: Kristian Rickert Date: Tue, 23 Jun 2026 10:57:26 -0400 Subject: [PATCH 1/2] OPENNLP-1850 Layered Term model: Term, TermAnalyzer (2b) The token analysis layer split out of the former tokenizer PR (#1104) on review request. A Term is one token projected through the ordered Dimension stack (original, NFC, NFKC, whitespace, dash, case fold, accent fold, confusable fold, stem, lemma), keeping its source Span and every intermediate form; TermAnalyzer segments with the UAX #29 WordTokenizer (from 2a) and applies the configured dimension prefix. Restores Dimension's {@link Term}/{@link TermAnalyzer} javadoc now that they exist. Builds on the tokenizer in 2a. --- .../tools/util/normalizer/Dimension.java | 6 +- .../opennlp/tools/util/normalizer/Term.java | 120 ++++++ .../tools/util/normalizer/TermAnalyzer.java | 368 ++++++++++++++++++ .../util/normalizer/ConfusablesTest.java | 81 ++++ .../util/normalizer/TermAnalyzerTest.java | 211 ++++++++++ 5 files changed, 783 insertions(+), 3 deletions(-) create mode 100644 opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/Term.java create mode 100644 opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/TermAnalyzer.java create mode 100644 opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/util/normalizer/ConfusablesTest.java create mode 100644 opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/util/normalizer/TermAnalyzerTest.java diff --git a/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/Dimension.java b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/Dimension.java index 6ad068471..7caece7a0 100644 --- a/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/Dimension.java +++ b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/Dimension.java @@ -19,13 +19,13 @@ import java.util.function.Supplier; /** - * A layer 
of the {@code Term} normalization stack, in increasing order of aggressiveness. A - * {@code TermAnalyzer} applies a configured prefix of these to each token; the declaration order is + * A layer of the {@link Term} normalization stack, in increasing order of aggressiveness. A + * {@link TermAnalyzer} applies a configured prefix of these to each token; the declaration order is * the canonical pipeline order, because the transforms do not commute (case folding then accent * folding differs from the reverse for Turkish dotted/dotless i and the German eszett). * *

This enum is the single definition of the character-level steps: each one carries its default - * {@link CharSequenceNormalizer}, which both {@code TermAnalyzer} and {@link TextNormalizer} read + * {@link CharSequenceNormalizer}, which both {@link TermAnalyzer} and {@link TextNormalizer} read * from rather than re-listing. The default is resolved lazily, so loading this enum does not eagerly * initialize heavy data such as the confusables table.

* diff --git a/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/Term.java b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/Term.java new file mode 100644 index 000000000..eda3c4107 --- /dev/null +++ b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/Term.java @@ -0,0 +1,120 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package opennlp.tools.util.normalizer; + +import java.util.EnumMap; +import java.util.List; + +import opennlp.tools.util.Span; + +/** + * One token as a stack of normalization layers. The {@link #original()} form is the canonical + * source of truth; the other layers are derived, increasingly aggressive {@link Dimension}s tuned + * for matching and search. The dimensions configured on the producing {@link TermAnalyzer} are + * computed eagerly and cached; any other dimension is computed on first request, applied on top of + * the {@link #normalized() configured form}, and then cached. + * + *

Because the original is always retained, aggressive folding is safe: a match on a derived layer + * can always be reported in original coordinates through {@link #span()}. Querying a configured + * layer, or {@link #peel() peeling} the last-applied one, is O(1); adding an unconfigured dimension + * costs one transform on first touch and is O(1) thereafter.

+ * + *

Instances are created by {@link TermAnalyzer} and are not thread-safe (the lazy cache is + * mutated on first access of an unconfigured dimension).

+ */ +public final class Term { + + private final TermAnalyzer analyzer; + private final Span span; + private final String posTag; + private final EnumMap layers = new EnumMap<>(Dimension.class); + + Term(TermAnalyzer analyzer, String original, Span span, String posTag) { + this.analyzer = analyzer; + this.span = span; + this.posTag = posTag; + String value = original; + layers.put(Dimension.ORIGINAL, value); + for (final Dimension dimension : analyzer.dimensions()) { + value = analyzer.apply(dimension, value, posTag); + layers.put(dimension, value); + } + } + + /** + * {@return the source span of this token, or {@code null} if it was supplied as a pre-tokenized + * string} The span indexes into the text passed to {@link TermAnalyzer#analyze(CharSequence)}. + */ + public Span span() { + return span; + } + + /** + * {@return the original token text} + */ + public String original() { + return layers.get(Dimension.ORIGINAL); + } + + /** + * {@return the token at the analyzer's final configured dimension} Equal to {@link #original()} + * when no dimensions were configured. + */ + public String normalized() { + return at(analyzer.finalDimension()); + } + + /** + * Returns the token at {@code dimension}. Configured dimensions are cached; an unconfigured + * dimension is computed by applying its transform to {@link #normalized()} and then cached. + * + *

Note: an unconfigured dimension is applied on top of {@link #normalized()} (the most + * aggressive configured layer), not spliced into canonical pipeline order. Because the transforms + * do not commute (see {@link Dimension}), requesting a dimension that ranks earlier than + * the configured ones can differ from having configured it. For example, asking for + * {@link Dimension#CASE_FOLD} on an analyzer configured with only {@link Dimension#ACCENT_FOLD} + * case-folds the already accent-folded text, which is not the same as case-folding first. + * Configure the dimension on the analyzer when canonical order matters.

+ * + * @param dimension The dimension to project to. + * @return The token at that dimension. + * @throws IllegalStateException if the dimension needs an engine or tag that was not configured + * (see {@link Dimension#STEM} and {@link Dimension#LEMMA}). + */ + public String at(Dimension dimension) { + final String cached = layers.get(dimension); + if (cached != null) { + return cached; + } + final String value = analyzer.apply(dimension, normalized(), posTag); + layers.put(dimension, value); + return value; + } + + /** + * {@return the token at the dimension just below the final configured one} This is the + * last-applied layer removed (for example the form before stemming when {@link Dimension#STEM} + * is the final dimension); equal to {@link #original()} when at most one dimension is configured. + */ + public String peel() { + final List dimensions = analyzer.dimensions(); + if (dimensions.size() < 2) { + return original(); + } + return at(dimensions.get(dimensions.size() - 2)); + } +} diff --git a/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/TermAnalyzer.java b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/TermAnalyzer.java new file mode 100644 index 000000000..0d9956e8e --- /dev/null +++ b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/TermAnalyzer.java @@ -0,0 +1,368 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package opennlp.tools.util.normalizer; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.EnumMap; +import java.util.EnumSet; +import java.util.List; +import java.util.Locale; +import java.util.Objects; +import java.util.Set; + +import opennlp.tools.lemmatizer.Lemmatizer; +import opennlp.tools.stemmer.Stemmer; +import opennlp.tools.tokenize.uax29.WordTokenizer; +import opennlp.tools.util.Span; + +/** + * Builds {@link Term}s by segmenting text and applying a configured stack of normalization + * {@link Dimension}s to each token. The analyzer is the configuration; each {@link Term} is the + * layered result for one token, with the configured dimensions computed eagerly and any other + * dimension computed lazily on first request. + * + *

Segmentation uses the Unicode {@linkplain WordTokenizer UAX #29 word tokenizer}, so the + * input does not need to be pre-tokenized. The character-level dimensions ({@link Dimension#NFC} + * through {@link Dimension#CONFUSABLE_FOLD}) have built-in defaults; {@link Dimension#STEM} and + * {@link Dimension#LEMMA} are enabled by supplying a {@link Stemmer} or {@link Lemmatizer}.

+ * + *

An instance is immutable and is thread-safe when its configured transforms are. The built-in + * character normalizers are stateless, but the Snowball stemmers are not, so an analyzer configured + * with a {@link Stemmer} (for example through {@link NormalizationProfile#searchAnalyzer()}) should + * not be shared across threads when {@link Dimension#STEM} is used. Build one with + * {@link #builder()}.

+ */ +public final class TermAnalyzer { + + private final List chain; + private final Dimension finalDimension; + private final EnumMap transforms; + private final Stemmer stemmer; + private final Lemmatizer lemmatizer; + private final WordTokenizer tokenizer; + + private TermAnalyzer(Builder builder) { + final List ordered = new ArrayList<>(builder.chain); + Collections.sort(ordered); // canonical pipeline order (enum declaration order) + this.chain = List.copyOf(ordered); + this.finalDimension = ordered.isEmpty() ? Dimension.ORIGINAL : ordered.get(ordered.size() - 1); + // Only the per-analyzer overrides from the builder; the defaults live on Dimension itself. + this.transforms = new EnumMap<>(builder.transforms); + this.stemmer = builder.stemmer; + this.lemmatizer = builder.lemmatizer; + this.tokenizer = builder.tokenizer; + } + + /** + * {@return a new builder} + */ + public static Builder builder() { + return new Builder(); + } + + /** + * Segments {@code text} with the UAX #29 word tokenizer and returns one {@link Term} per + * word token, in order. The terms carry no part-of-speech tag, so {@link Dimension#LEMMA} is not + * available from them. + * + * @param text The text to analyze. + * @return The terms. + */ + public List analyze(CharSequence text) { + final List spans = tokenizer.tokenizeSpans(text); + final List terms = new ArrayList<>(spans.size()); + for (final Span span : spans) { + terms.add(new Term(this, span.getCoveredText(text).toString(), span, null)); + } + return terms; + } + + /** + * Returns one {@link Term} per supplied token, attaching the matching part-of-speech tag so that + * {@link Dimension#LEMMA} can be computed. The terms have no source span. + * + * @param tokens The tokens. + * @param tags The part-of-speech tag for each token; must be the same length as {@code tokens}. + * @return The terms. + * @throws IllegalArgumentException if {@code tokens} and {@code tags} differ in length. 
+ */ + public List analyze(String[] tokens, String[] tags) { + if (tokens.length != tags.length) { + throw new IllegalArgumentException( + "tokens and tags must be the same length, got " + tokens.length + " and " + tags.length); + } + final List terms = new ArrayList<>(tokens.length); + for (int i = 0; i < tokens.length; i++) { + terms.add(new Term(this, tokens[i], null, tags[i])); + } + return terms; + } + + /** + * {@return the configured dimensions that are computed eagerly, in canonical order} The list + * never includes {@link Dimension#ORIGINAL}, which is always present. + */ + public List dimensions() { + return chain; + } + + Dimension finalDimension() { + return finalDimension; + } + + // Applies one dimension's transform to a single token value. Fails loudly when a token-level + // dimension was requested without the engine (or tag) it needs. + String apply(Dimension dimension, String input, String posTag) { + switch (dimension) { + case ORIGINAL: + return input; + case STEM: + if (stemmer == null) { + throw new IllegalStateException( + "Dimension STEM requires a Stemmer; configure it with builder().stem(...)"); + } + return stemmer.stem(input).toString(); + case LEMMA: + if (lemmatizer == null) { + throw new IllegalStateException( + "Dimension LEMMA requires a Lemmatizer; configure it with builder().lemmatize(...)"); + } + if (posTag == null) { + throw new IllegalStateException( + "Dimension LEMMA requires a part-of-speech tag; use analyze(tokens, tags)"); + } + return lemmatizer.lemmatize(new String[] {input}, new String[] {posTag})[0]; + default: + // A builder override wins; otherwise the dimension's own default normalizer. + final CharSequenceNormalizer normalizer = transforms.containsKey(dimension) + ? 
transforms.get(dimension) : dimension.defaultNormalizer(); + if (normalizer == null) { + throw new IllegalStateException("Dimension " + dimension + " has no default normalizer; " + + "configure it with builder().transform(" + dimension + ", ...)"); + } + return normalizer.normalize(input).toString(); + } + } + + /** A builder for {@link TermAnalyzer}. */ + public static final class Builder { + + private final EnumSet chain = EnumSet.noneOf(Dimension.class); + private final EnumMap transforms = + new EnumMap<>(Dimension.class); + private Stemmer stemmer; + private Lemmatizer lemmatizer; + private WordTokenizer tokenizer = new WordTokenizer(); + + private Builder() { + } + + /** + * Enables {@link Dimension#NFC}. + * + * @return this builder + */ + public Builder nfc() { + chain.add(Dimension.NFC); + return this; + } + + /** + * Enables {@link Dimension#NFKC}. + * + * @return this builder + */ + public Builder nfkc() { + chain.add(Dimension.NFKC); + return this; + } + + /** + * Enables {@link Dimension#WHITESPACE}. + * + * @return this builder + */ + public Builder whitespace() { + chain.add(Dimension.WHITESPACE); + return this; + } + + /** + * Enables {@link Dimension#WHITESPACE} with a specific normalizer, choosing the fold target and + * behavior. For a custom class and target use a {@link CharClass} method reference, for example + * {@code whitespace(CharClass.of(members, replacement)::collapse)}. + * + * @param normalizer The whitespace normalizer to use. + * @return this builder + */ + public Builder whitespace(CharSequenceNormalizer normalizer) { + return transform(Dimension.WHITESPACE, normalizer); + } + + /** + * Enables {@link Dimension#DASH}. + * + * @return this builder + */ + public Builder dashes() { + chain.add(Dimension.DASH); + return this; + } + + /** + * Enables {@link Dimension#DASH} with a specific normalizer (a custom dash set or target). + * + * @param normalizer The dash normalizer to use. 
+ * @return this builder + */ + public Builder dashes(CharSequenceNormalizer normalizer) { + return transform(Dimension.DASH, normalizer); + } + + /** + * Enables {@link Dimension#CASE_FOLD}. + * + * @return this builder + */ + public Builder caseFold() { + chain.add(Dimension.CASE_FOLD); + return this; + } + + /** + * Enables {@link Dimension#CASE_FOLD} using the given locale's case rules (for example Turkish + * dotted/dotless i), instead of the default {@link Locale#ROOT}. + * + * @param locale The locale whose case rules to apply. + * @return this builder + */ + public Builder caseFold(Locale locale) { + Objects.requireNonNull(locale, "locale"); + return transform(Dimension.CASE_FOLD, CaseFoldCharSequenceNormalizer.getInstance(locale)); + } + + /** + * Enables {@link Dimension#ACCENT_FOLD}. + * + * @return this builder + */ + public Builder accentFold() { + chain.add(Dimension.ACCENT_FOLD); + return this; + } + + /** + * Enables {@link Dimension#ACCENT_FOLD} restricted to a specific set of scripts, instead of the + * default Latin/Greek/Cyrillic. + * + * @param foldScripts The scripts whose diacritics to fold. + * @param foldStrokeLetters Whether to also fold stroke letters such as o-slash and l-stroke. + * @return this builder + */ + public Builder accentFold(Set foldScripts, boolean foldStrokeLetters) { + return transform(Dimension.ACCENT_FOLD, + new AccentFoldCharSequenceNormalizer(foldScripts, foldStrokeLetters)); + } + + /** + * Enables {@link Dimension#CONFUSABLE_FOLD}. + * + * @return this builder + */ + public Builder confusableFold() { + chain.add(Dimension.CONFUSABLE_FOLD); + return this; + } + + /** + * Enables a character-level dimension with a specific normalizer, overriding its default (for + * example a locale-specific case fold for a language profile). + * + * @param dimension The character-level dimension to enable. + * @param normalizer The normalizer to use for it. 
+ * @return this builder + * @throws IllegalArgumentException if {@code dimension} is {@link Dimension#ORIGINAL}, + * {@link Dimension#STEM}, or {@link Dimension#LEMMA}. + */ + public Builder transform(Dimension dimension, CharSequenceNormalizer normalizer) { + if (dimension == Dimension.ORIGINAL || dimension == Dimension.STEM + || dimension == Dimension.LEMMA) { + throw new IllegalArgumentException( + "transform(...) only applies to character-level dimensions, not " + dimension); + } + transforms.put(dimension, Objects.requireNonNull(normalizer, "normalizer")); + chain.add(dimension); + return this; + } + + /** + * Enables {@link Dimension#STEM} through the given stemmer. + * + * @param value The stemmer. + * @return this builder + */ + public Builder stem(Stemmer value) { + this.stemmer = Objects.requireNonNull(value, "stemmer"); + chain.add(Dimension.STEM); + return this; + } + + /** + * Enables {@link Dimension#LEMMA} through the given lemmatizer. + * + * @param value The lemmatizer. + * @return this builder + */ + public Builder lemmatize(Lemmatizer value) { + this.lemmatizer = Objects.requireNonNull(value, "lemmatizer"); + chain.add(Dimension.LEMMA); + return this; + } + + /** + * Sets the tokenizer used by {@link TermAnalyzer#analyze(CharSequence)}. + * + * @param value The tokenizer. + * @return this builder + */ + public Builder tokenizer(WordTokenizer value) { + this.tokenizer = Objects.requireNonNull(value, "tokenizer"); + return this; + } + + /** + * Sets the maximum token length of the tokenizer used by + * {@link TermAnalyzer#analyze(CharSequence)}. Convenience for + * {@code tokenizer(new WordTokenizer(maxTokenLength))}. + * + * @param maxTokenLength The maximum number of characters in a token. 
+ * @return this builder + */ + public Builder maxTokenLength(int maxTokenLength) { + this.tokenizer = new WordTokenizer(maxTokenLength); + return this; + } + + /** + * {@return a new {@link TermAnalyzer} with this configuration} + */ + public TermAnalyzer build() { + return new TermAnalyzer(this); + } + } +} diff --git a/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/util/normalizer/ConfusablesTest.java b/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/util/normalizer/ConfusablesTest.java new file mode 100644 index 000000000..262fe5aa9 --- /dev/null +++ b/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/util/normalizer/ConfusablesTest.java @@ -0,0 +1,81 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package opennlp.tools.util.normalizer; + +import org.junit.jupiter.api.Test; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertTrue; + +public class ConfusablesTest { + + private static String cp(int codePoint) { + return new String(Character.toChars(codePoint)); + } + + @Test + void testCyrillicLetterIsConfusableWithLatin() { + final String cyrillicA = cp(0x0430); // CYRILLIC SMALL LETTER A, looks like Latin 'a' + assertTrue(Confusables.confusable(cyrillicA, "a")); + assertFalse(Confusables.confusable(cyrillicA, "b")); + } + + @Test + void testHomoglyphSpoofWordReducesToLatinSpelling() { + final String spoof = "p" + cp(0x0430) + "yp" + cp(0x0430) + "l"; // paypal with Cyrillic a's + assertTrue(Confusables.confusable(spoof, "paypal")); + assertEquals(Confusables.skeleton("paypal"), Confusables.skeleton(spoof)); + } + + @Test + void testHorizontalEllipsisFoldsToThreeFullStops() { + assertEquals(Confusables.skeleton("..."), Confusables.skeleton(cp(0x2026))); + assertTrue(Confusables.confusable(cp(0x2026), "...")); + } + + @Test + void testDistinctWordsAreNotConfusable() { + assertFalse(Confusables.confusable("cat", "dog")); + } + + @Test + void testSkeletonIsIdempotent() { + final String skeleton = Confusables.skeleton(cp(0x0430) + "bc"); + assertEquals(skeleton, Confusables.skeleton(skeleton)); + } + + @Test + void testNormalizerProducesTheSkeleton() { + final String spoof = "p" + cp(0x0430) + "yp" + cp(0x0430) + "l"; + assertEquals(Confusables.skeleton(spoof), + ConfusableSkeletonCharSequenceNormalizer.getInstance().normalize(spoof).toString()); + } + + @Test + void testMultipleCyrillicLookalikesFold() { + final String spoof = "d" + cp(0x0430) + "t" + cp(0x0430); // "data" with Cyrillic a's + assertEquals(Confusables.skeleton("data"), Confusables.skeleton(spoof)); + } + + @Test + void testTermConfusableFoldDimension() { + final 
String spoof = "p" + cp(0x0430) + "yp" + cp(0x0430) + "l"; + final TermAnalyzer analyzer = TermAnalyzer.builder().confusableFold().build(); + assertEquals(Confusables.skeleton("paypal"), analyzer.analyze(spoof).get(0).normalized()); + } +} diff --git a/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/util/normalizer/TermAnalyzerTest.java b/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/util/normalizer/TermAnalyzerTest.java new file mode 100644 index 000000000..56f16899d --- /dev/null +++ b/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/util/normalizer/TermAnalyzerTest.java @@ -0,0 +1,211 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package opennlp.tools.util.normalizer; + +import java.util.List; +import java.util.Locale; +import java.util.Set; + +import org.junit.jupiter.api.Test; + +import opennlp.tools.lemmatizer.Lemmatizer; +import opennlp.tools.stemmer.PorterStemmer; +import opennlp.tools.util.Span; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNull; +import static org.junit.jupiter.api.Assertions.assertSame; +import static org.junit.jupiter.api.Assertions.assertThrows; + +public class TermAnalyzerTest { + + private static String cp(int codePoint) { + return new String(Character.toChars(codePoint)); + } + + @Test + void testNoDimensionsLeavesTokenUnchanged() { + final TermAnalyzer analyzer = TermAnalyzer.builder().build(); + final Term term = analyzer.analyze("Hello").get(0); + assertEquals("Hello", term.original()); + assertEquals("Hello", term.normalized()); + assertEquals("Hello", term.peel()); + assertEquals(List.of(), analyzer.dimensions()); + } + + @Test + void testChainAppliesInCanonicalOrderRegardlessOfBuilderOrder() { + // accentFold added before caseFold, but the canonical order is caseFold then accentFold. 
+ final TermAnalyzer analyzer = TermAnalyzer.builder().accentFold().caseFold().build(); + assertEquals(List.of(Dimension.CASE_FOLD, Dimension.ACCENT_FOLD), analyzer.dimensions()); + final String input = "CAF" + cp(0x00C9); // CAFE with capital acute E + final Term term = analyzer.analyze(input).get(0); + assertEquals(input, term.original()); + assertEquals("cafe", term.normalized()); + assertEquals("caf" + cp(0x00E9), term.peel()); // before accent folding: lower-case, acute kept + } + + @Test + void testStemIsTheTopLayer() { + final TermAnalyzer analyzer = + TermAnalyzer.builder().caseFold().stem(new PorterStemmer()).build(); + final Term term = analyzer.analyze("Running").get(0); + assertEquals("running", term.peel()); // case-folded form, before stemming + assertEquals("run", term.normalized()); + assertEquals("run", term.at(Dimension.STEM)); + } + + @Test + void testUnconfiguredCharDimensionComputedLazily() { + final TermAnalyzer analyzer = TermAnalyzer.builder().build(); + final Term term = analyzer.analyze("HELLO").get(0); + assertEquals("HELLO", term.normalized()); + assertEquals("hello", term.at(Dimension.CASE_FOLD)); // lazily added on top of the final form + } + + @Test + void testStemDimensionWithoutStemmerFailsLoudly() { + final TermAnalyzer analyzer = TermAnalyzer.builder().caseFold().build(); + final Term term = analyzer.analyze("running").get(0); + assertThrows(IllegalStateException.class, () -> term.at(Dimension.STEM)); + } + + @Test + void testLemmaWithoutLemmatizerFailsLoudly() { + final TermAnalyzer analyzer = TermAnalyzer.builder().build(); + final Term term = analyzer.analyze("running").get(0); + assertThrows(IllegalStateException.class, () -> term.at(Dimension.LEMMA)); + } + + @Test + void testAnalyzeTextProducesSpans() { + final TermAnalyzer analyzer = TermAnalyzer.builder().caseFold().build(); + final List terms = analyzer.analyze("The Cats"); + assertEquals(2, terms.size()); + assertEquals("The", terms.get(0).original()); + 
assertEquals("the", terms.get(0).normalized()); + assertEquals(new Span(0, 3), terms.get(0).span()); + assertEquals("Cats", terms.get(1).original()); + assertEquals(new Span(4, 8), terms.get(1).span()); + } + + @Test + void testAnalyzeTokensHasNoSpan() { + final TermAnalyzer analyzer = TermAnalyzer.builder().caseFold().build(); + final List terms = analyzer.analyze(new String[] {"Cats"}, new String[] {"NNS"}); + assertNull(terms.get(0).span()); + assertEquals("cats", terms.get(0).normalized()); + } + + @Test + void testAnalyzeTokensRejectsLengthMismatch() { + final TermAnalyzer analyzer = TermAnalyzer.builder().build(); + assertThrows(IllegalArgumentException.class, + () -> analyzer.analyze(new String[] {"a", "b"}, new String[] {"X"})); + } + + @Test + void testTransformRejectsNonCharacterDimension() { + assertThrows(IllegalArgumentException.class, () -> TermAnalyzer.builder() + .transform(Dimension.STEM, CaseFoldCharSequenceNormalizer.getInstance())); + } + + @Test + void testLemmaWithLemmatizerAndTag() { + final Lemmatizer lemmatizer = new Lemmatizer() { + @Override + public String[] lemmatize(String[] tokens, String[] tags) { + return new String[] {"be"}; + } + + @Override + public List> lemmatize(List tokens, List tags) { + return List.of(List.of("be")); + } + }; + final TermAnalyzer analyzer = + TermAnalyzer.builder().caseFold().lemmatize(lemmatizer).build(); + final Term term = analyzer.analyze(new String[] {"was"}, new String[] {"VBD"}).get(0); + assertEquals("be", term.normalized()); + } + + @Test + void testConfusableFoldComposesWithCaseFold() { + final TermAnalyzer analyzer = TermAnalyzer.builder().caseFold().confusableFold().build(); + final String spoof = "P" + cp(0x0430) + "yp" + cp(0x0430) + "l"; // Paypal with Cyrillic a's + assertEquals(Confusables.skeleton("paypal"), analyzer.analyze(spoof).get(0).normalized()); + } + + @Test + void testAtIsMemoized() { + final TermAnalyzer analyzer = TermAnalyzer.builder().build(); + final Term term = 
analyzer.analyze("HELLO").get(0); + final String first = term.at(Dimension.CASE_FOLD); + assertSame(first, term.at(Dimension.CASE_FOLD)); + } + + @Test + void testWhitespaceTargetIsConfigurable() { + final CharClass lineFold = CharClass.of(CodePointSet.of('\n', '\t'), '\n'); + final TermAnalyzer analyzer = TermAnalyzer.builder().whitespace(lineFold::collapse).build(); + final Term term = analyzer.analyze(new String[] {"a\n\n\tb"}, new String[] {"X"}).get(0); + assertEquals("a\nb", term.normalized()); + } + + @Test + void testCaseFoldLocaleAppliesTurkishRules() { + final TermAnalyzer analyzer = + TermAnalyzer.builder().caseFold(Locale.forLanguageTag("tr")).build(); + assertEquals(cp(0x0131), analyzer.analyze("I").get(0).normalized()); // dotless lowercase i + } + + @Test + void testAccentFoldScopeFoldsLatin() { + final TermAnalyzer analyzer = TermAnalyzer.builder() + .accentFold(Set.of(Character.UnicodeScript.LATIN), false).build(); + assertEquals("cafe", analyzer.analyze("caf" + cp(0x00E9)).get(0).normalized()); // cafe + acute + } + + @Test + void testMaxTokenLengthChopsTokens() { + final List terms = TermAnalyzer.builder().maxTokenLength(3).build().analyze("abcdefg"); + assertEquals(3, terms.size()); + assertEquals("abc", terms.get(0).original()); + assertEquals("def", terms.get(1).original()); + assertEquals("g", terms.get(2).original()); + } + + @Test + void testAnalyzeEmptyTextProducesNoTerms() { + assertEquals(List.of(), TermAnalyzer.builder().caseFold().build().analyze("")); + } + + @Test + void testWhitespaceOnlyInputHasNoWordTerms() { + assertEquals(List.of(), TermAnalyzer.builder().build().analyze(" \t ")); + } + + @Test + void testAtDimensionBelowFinalIsAppliedOnTop() { + // Final dimension is STEM; asking for NFC applies it on top of the stem (documented behavior). 
+ final TermAnalyzer analyzer = + TermAnalyzer.builder().caseFold().stem(new PorterStemmer()).build(); + final Term term = analyzer.analyze("Running").get(0); + assertEquals("run", term.normalized()); + assertEquals("run", term.at(Dimension.NFC)); + } +} From a23a51358a8884a7e7eb9fdb1565418cbf836478 Mon Sep 17 00:00:00 2001 From: Kristian Rickert Date: Thu, 25 Jun 2026 13:00:26 -0400 Subject: [PATCH 2/2] OPENNLP-1850 Review nits: rename dashes()->dash(); LEMMA doc+test; soften forward-link (Term) Rename TermAnalyzer.Builder.dashes() -> dash() for consistency with the singular layer-enable methods (nfc/whitespace/caseFold/accentFold) and the DASH enum. Clarify that analyze(CharSequence) fails loud when a lemmatizer is configured (no POS tags) and add a test for it. Soften the NormalizationProfile forward-link to {@code}. --- .../tools/util/normalizer/TermAnalyzer.java | 11 +++++---- .../util/normalizer/TermAnalyzerTest.java | 23 +++++++++++++++++++ 2 files changed, 29 insertions(+), 5 deletions(-) diff --git a/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/TermAnalyzer.java b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/TermAnalyzer.java index 0d9956e8e..749796742 100644 --- a/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/TermAnalyzer.java +++ b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/TermAnalyzer.java @@ -43,7 +43,7 @@ * *

<p>An instance is immutable and is thread-safe when its configured transforms are. The built-in * character normalizers are stateless, but the Snowball stemmers are not, so an analyzer configured - * with a {@link Stemmer} (for example through {@link NormalizationProfile#searchAnalyzer()}) should + * with a {@link Stemmer} (for example through {@code NormalizationProfile.searchAnalyzer()}) should * not be shared across threads when {@link Dimension#STEM} is used. Build one with * {@link #builder()}.

*/ @@ -77,8 +77,9 @@ public static Builder builder() { /** * Segments {@code text} with the UAX #29 word tokenizer and returns one {@link Term} per - * word token, in order. The terms carry no part-of-speech tag, so {@link Dimension#LEMMA} is not - * available from them. + * word token, in order. The terms carry no part-of-speech tag, so {@link Dimension#LEMMA} cannot be + * computed from this entry point: if a lemmatizer is configured, this method throws -- use + * {@link #analyze(String[], String[])} when lemmas are needed. * * @param text The text to analyze. * @return The terms. @@ -219,7 +220,7 @@ public Builder whitespace(CharSequenceNormalizer normalizer) { * * @return this builder */ - public Builder dashes() { + public Builder dash() { chain.add(Dimension.DASH); return this; } @@ -230,7 +231,7 @@ public Builder dashes() { * @param normalizer The dash normalizer to use. * @return this builder */ - public Builder dashes(CharSequenceNormalizer normalizer) { + public Builder dash(CharSequenceNormalizer normalizer) { return transform(Dimension.DASH, normalizer); } diff --git a/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/util/normalizer/TermAnalyzerTest.java b/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/util/normalizer/TermAnalyzerTest.java index 56f16899d..c381f2141 100644 --- a/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/util/normalizer/TermAnalyzerTest.java +++ b/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/util/normalizer/TermAnalyzerTest.java @@ -30,6 +30,7 @@ import static org.junit.jupiter.api.Assertions.assertNull; import static org.junit.jupiter.api.Assertions.assertSame; import static org.junit.jupiter.api.Assertions.assertThrows; +import static org.junit.jupiter.api.Assertions.assertTrue; public class TermAnalyzerTest { @@ -143,6 +144,28 @@ public List<List<String>> lemmatize(List<String> tokens, List<String> tags) { assertEquals("be", term.normalized()); } + @Test + void testAnalyzeCharSequenceFailsLoudlyWhenLemmaConfigured() {
+ // analyze(CharSequence) has no POS tags, so a configured LEMMA layer cannot be satisfied; it + // fails loud rather than silently dropping the layer. Callers needing lemmas use analyze(tokens, + // tags). + final Lemmatizer lemmatizer = new Lemmatizer() { + @Override + public String[] lemmatize(String[] tokens, String[] tags) { + return tokens.clone(); + } + + @Override + public List<List<String>> lemmatize(List<String> tokens, List<String> tags) { + return List.of(tokens); + } + }; + final TermAnalyzer analyzer = TermAnalyzer.builder().lemmatize(lemmatizer).build(); + final IllegalStateException e = assertThrows(IllegalStateException.class, + () -> analyzer.analyze("running")); + assertTrue(e.getMessage().contains("part-of-speech"), e.getMessage()); + } + @Test void testConfusableFoldComposesWithCaseFold() { final TermAnalyzer analyzer = TermAnalyzer.builder().caseFold().confusableFold().build();