diff --git a/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/Dimension.java b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/Dimension.java
index 6ad068471..7caece7a0 100644
--- a/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/Dimension.java
+++ b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/Dimension.java
@@ -19,13 +19,13 @@
import java.util.function.Supplier;
/**
- * A layer of the {@code Term} normalization stack, in increasing order of aggressiveness. A
- * {@code TermAnalyzer} applies a configured prefix of these to each token; the declaration order is
+ * A layer of the {@link Term} normalization stack, in increasing order of aggressiveness. A
+ * {@link TermAnalyzer} applies a configured prefix of these to each token; the declaration order is
* the canonical pipeline order, because the transforms do not commute (case folding then accent
* folding differs from the reverse for Turkish dotted/dotless i and the German eszett).
*
* <p>
* This enum is the single definition of the character-level steps: each one carries its default
- * {@link CharSequenceNormalizer}, which both {@code TermAnalyzer} and {@link TextNormalizer} read
+ * {@link CharSequenceNormalizer}, which both {@link TermAnalyzer} and {@link TextNormalizer} read
* from rather than re-listing. The default is resolved lazily, so loading this enum does not eagerly
* initialize heavy data such as the confusables table.
*
diff --git a/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/Term.java b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/Term.java
new file mode 100644
index 000000000..eda3c4107
--- /dev/null
+++ b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/Term.java
@@ -0,0 +1,120 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.tools.util.normalizer;
+
+import java.util.EnumMap;
+import java.util.List;
+
+import opennlp.tools.util.Span;
+
+/**
+ * One token as a stack of normalization layers. The {@link #original()} form is the canonical
+ * source of truth; the other layers are derived, increasingly aggressive {@link Dimension}s tuned
+ * for matching and search. The dimensions configured on the producing {@link TermAnalyzer} are
+ * computed eagerly and cached; any other dimension is computed on first request, applied on top of
+ * the {@link #normalized() configured form}, and then cached.
+ *
+ * <p>Because the original is always retained, aggressive folding is safe: a match on a derived layer
+ * can always be reported in original coordinates through {@link #span()}. Querying a configured
+ * layer, or {@link #peel() peeling} the last-applied one, is O(1); adding an unconfigured dimension
+ * costs one transform on first touch and is O(1) thereafter.
+ *
+ * <p>Instances are created by {@link TermAnalyzer} and are not thread-safe (the lazy cache is
+ * mutated on first access of an unconfigured dimension).
+ */
+public final class Term {
+
+  private final TermAnalyzer analyzer;
+  private final Span span;
+  private final String posTag;
+  private final EnumMap<Dimension, String> layers = new EnumMap<>(Dimension.class);
+
+  Term(TermAnalyzer analyzer, String original, Span span, String posTag) {
+    this.analyzer = analyzer;
+    this.span = span;
+    this.posTag = posTag;
+    String value = original; // running form: each configured layer builds on the previous one
+    layers.put(Dimension.ORIGINAL, value);
+    for (final Dimension dimension : analyzer.dimensions()) {
+      value = analyzer.apply(dimension, value, posTag);
+      layers.put(dimension, value);
+    }
+  }
+
+  /**
+   * {@return the source span of this token, or {@code null} if it was supplied as a pre-tokenized
+   * string} The span indexes into the text passed to {@link TermAnalyzer#analyze(CharSequence)}.
+   */
+  public Span span() {
+    return span;
+  }
+
+  /**
+   * {@return the original token text}
+   */
+  public String original() {
+    return layers.get(Dimension.ORIGINAL);
+  }
+
+  /**
+   * {@return the token at the analyzer's final configured dimension} Equal to {@link #original()}
+   * when no dimensions were configured.
+   */
+  public String normalized() {
+    return at(analyzer.finalDimension());
+  }
+
+  /**
+   * Returns the token at {@code dimension}. Configured dimensions are cached; an unconfigured
+   * dimension is computed by applying its transform to {@link #normalized()} and then cached.
+   *
+   * <p>Note: an unconfigured dimension is applied on top of {@link #normalized()} (the most
+   * aggressive configured layer), not spliced into canonical pipeline order. Because the transforms
+   * do not commute (see {@link Dimension}), requesting a dimension that ranks earlier than
+   * the configured ones can differ from having configured it. For example, asking for
+   * {@link Dimension#CASE_FOLD} on an analyzer configured only through {@link Dimension#ACCENT_FOLD}
+   * case-folds the already accent-folded text, which is not the same as case-folding first.
+   * Configure the dimension on the analyzer when canonical order matters.
+   *
+   * @param dimension The dimension to project to.
+   * @return The token at that dimension.
+   * @throws IllegalStateException if the dimension needs an engine or tag that was not configured
+   *     (see {@link Dimension#STEM} and {@link Dimension#LEMMA}).
+   */
+  public String at(Dimension dimension) {
+    final String cached = layers.get(dimension);
+    if (cached != null) {
+      return cached;
+    }
+    final String value = analyzer.apply(dimension, normalized(), posTag);
+    layers.put(dimension, value);
+    return value;
+  }
+
+  /**
+   * {@return the token at the dimension just below the final configured one} This is the
+   * last-applied layer removed (for example the form before stemming when {@link Dimension#STEM}
+   * is the final dimension); equal to {@link #original()} when at most one dimension is configured.
+   */
+  public String peel() {
+    final List<Dimension> dimensions = analyzer.dimensions();
+    if (dimensions.size() < 2) {
+      return original();
+    }
+    return at(dimensions.get(dimensions.size() - 2));
+  }
+}
diff --git a/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/TermAnalyzer.java b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/TermAnalyzer.java
new file mode 100644
index 000000000..0d9956e8e
--- /dev/null
+++ b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/TermAnalyzer.java
@@ -0,0 +1,368 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.tools.util.normalizer;
+
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.EnumMap;
+import java.util.EnumSet;
+import java.util.List;
+import java.util.Locale;
+import java.util.Objects;
+import java.util.Set;
+
+import opennlp.tools.lemmatizer.Lemmatizer;
+import opennlp.tools.stemmer.Stemmer;
+import opennlp.tools.tokenize.uax29.WordTokenizer;
+import opennlp.tools.util.Span;
+
+/**
+ * Builds {@link Term}s by segmenting text and applying a configured stack of normalization
+ * {@link Dimension}s to each token. The analyzer is the configuration; each {@link Term} is the
+ * layered result for one token, with the configured dimensions computed eagerly and any other
+ * dimension computed lazily on first request.
+ *
+ * <p>Segmentation uses the Unicode {@linkplain WordTokenizer UAX #29 word tokenizer}, so the
+ * input does not need to be pre-tokenized. The character-level dimensions ({@link Dimension#NFC}
+ * through {@link Dimension#ACCENT_FOLD}) have built-in defaults; {@link Dimension#STEM} and
+ * {@link Dimension#LEMMA} are enabled by supplying a {@link Stemmer} or {@link Lemmatizer}.
+ *
+ * <p>An instance is immutable and is thread-safe when its configured transforms are. The built-in
+ * character normalizers are stateless, but the Snowball stemmers are not, so an analyzer configured
+ * with a {@link Stemmer} (for example through {@link NormalizationProfile#searchAnalyzer()}) should
+ * not be shared across threads when {@link Dimension#STEM} is used. Build one with
+ * {@link #builder()}.
+ */
+public final class TermAnalyzer {
+
+  private final List<Dimension> chain;
+  private final Dimension finalDimension;
+  private final EnumMap<Dimension, CharSequenceNormalizer> transforms;
+  private final Stemmer stemmer;
+  private final Lemmatizer lemmatizer;
+  private final WordTokenizer tokenizer;
+
+  private TermAnalyzer(Builder builder) {
+    final List<Dimension> ordered = new ArrayList<>(builder.chain);
+    Collections.sort(ordered); // canonical pipeline order (enum declaration order)
+    this.chain = List.copyOf(ordered);
+    this.finalDimension = ordered.isEmpty() ? Dimension.ORIGINAL : ordered.get(ordered.size() - 1);
+    // Only the per-analyzer overrides from the builder; the defaults live on Dimension itself.
+    this.transforms = new EnumMap<>(builder.transforms);
+    this.stemmer = builder.stemmer;
+    this.lemmatizer = builder.lemmatizer;
+    this.tokenizer = builder.tokenizer;
+  }
+
+  /**
+   * {@return a new builder}
+   */
+  public static Builder builder() {
+    return new Builder();
+  }
+
+  /**
+   * Segments {@code text} with the UAX #29 word tokenizer and returns one {@link Term} per word
+   * token, in order. The terms carry no part-of-speech tag, so {@link Dimension#LEMMA} cannot be
+   * computed; if it is configured, the first token fails with {@link IllegalStateException}.
+   *
+   * @param text The text to analyze.
+   * @return The terms.
+   */
+  public List<Term> analyze(CharSequence text) {
+    final List<Span> spans = tokenizer.tokenizeSpans(text);
+    final List<Term> terms = new ArrayList<>(spans.size());
+    for (final Span span : spans) {
+      terms.add(new Term(this, span.getCoveredText(text).toString(), span, null));
+    }
+    return terms;
+  }
+
+  /**
+   * Returns one {@link Term} per supplied token, attaching the matching part-of-speech tag so that
+   * {@link Dimension#LEMMA} can be computed. The terms have no source span.
+   *
+   * @param tokens The tokens.
+   * @param tags The part-of-speech tag for each token; must be the same length as {@code tokens}.
+   * @return The terms.
+   * @throws IllegalArgumentException if {@code tokens} and {@code tags} differ in length.
+   */
+  public List<Term> analyze(String[] tokens, String[] tags) {
+    if (tokens.length != tags.length) {
+      throw new IllegalArgumentException(
+          "tokens and tags must be the same length, got " + tokens.length + " and " + tags.length);
+    }
+    final List<Term> terms = new ArrayList<>(tokens.length);
+    for (int i = 0; i < tokens.length; i++) {
+      terms.add(new Term(this, tokens[i], null, tags[i]));
+    }
+    return terms;
+  }
+
+  /**
+   * {@return the configured dimensions that are computed eagerly, in canonical order} The list
+   * never includes {@link Dimension#ORIGINAL}, which is always present.
+   */
+  public List<Dimension> dimensions() {
+    return chain;
+  }
+
+  Dimension finalDimension() {
+    return finalDimension;
+  }
+
+  // Applies one dimension's transform to a single token value. Fails loudly when a token-level
+  // dimension was requested without the engine (or tag) it needs.
+  String apply(Dimension dimension, String input, String posTag) {
+    switch (dimension) {
+      case ORIGINAL:
+        return input;
+      case STEM:
+        if (stemmer == null) {
+          throw new IllegalStateException(
+              "Dimension STEM requires a Stemmer; configure it with builder().stem(...)");
+        }
+        return stemmer.stem(input).toString();
+      case LEMMA:
+        if (lemmatizer == null) {
+          throw new IllegalStateException(
+              "Dimension LEMMA requires a Lemmatizer; configure it with builder().lemmatize(...)");
+        }
+        if (posTag == null) {
+          throw new IllegalStateException(
+              "Dimension LEMMA requires a part-of-speech tag; use analyze(tokens, tags)");
+        }
+        return lemmatizer.lemmatize(new String[] {input}, new String[] {posTag})[0];
+      default:
+        // A builder override wins; otherwise the dimension's own default normalizer.
+        final CharSequenceNormalizer normalizer = transforms.containsKey(dimension)
+            ? transforms.get(dimension) : dimension.defaultNormalizer();
+        if (normalizer == null) {
+          throw new IllegalStateException("Dimension " + dimension + " has no default normalizer; "
+              + "configure it with builder().transform(" + dimension + ", ...)");
+        }
+        return normalizer.normalize(input).toString();
+    }
+  }
+
+  /** A builder for {@link TermAnalyzer}. */
+  public static final class Builder {
+
+    private final EnumSet<Dimension> chain = EnumSet.noneOf(Dimension.class);
+    private final EnumMap<Dimension, CharSequenceNormalizer> transforms =
+        new EnumMap<>(Dimension.class);
+    private Stemmer stemmer;
+    private Lemmatizer lemmatizer;
+    private WordTokenizer tokenizer = new WordTokenizer();
+
+    private Builder() {
+    }
+
+    /**
+     * Enables {@link Dimension#NFC}.
+     *
+     * @return this builder
+     */
+    public Builder nfc() {
+      chain.add(Dimension.NFC);
+      return this;
+    }
+
+    /**
+     * Enables {@link Dimension#NFKC}.
+     *
+     * @return this builder
+     */
+    public Builder nfkc() {
+      chain.add(Dimension.NFKC);
+      return this;
+    }
+
+    /**
+     * Enables {@link Dimension#WHITESPACE}.
+     *
+     * @return this builder
+     */
+    public Builder whitespace() {
+      chain.add(Dimension.WHITESPACE);
+      return this;
+    }
+
+    /**
+     * Enables {@link Dimension#WHITESPACE} with a specific normalizer, choosing the fold target and
+     * behavior. For a custom class and target use a {@link CharClass} method reference, for example
+     * {@code whitespace(CharClass.of(members, replacement)::collapse)}.
+     *
+     * @param normalizer The whitespace normalizer to use.
+     * @return this builder
+     */
+    public Builder whitespace(CharSequenceNormalizer normalizer) {
+      return transform(Dimension.WHITESPACE, normalizer);
+    }
+
+    /**
+     * Enables {@link Dimension#DASH}.
+     *
+     * @return this builder
+     */
+    public Builder dashes() {
+      chain.add(Dimension.DASH);
+      return this;
+    }
+
+    /**
+     * Enables {@link Dimension#DASH} with a specific normalizer (a custom dash set or target).
+     *
+     * @param normalizer The dash normalizer to use.
+     * @return this builder
+     */
+    public Builder dashes(CharSequenceNormalizer normalizer) {
+      return transform(Dimension.DASH, normalizer);
+    }
+
+    /**
+     * Enables {@link Dimension#CASE_FOLD}.
+     *
+     * @return this builder
+     */
+    public Builder caseFold() {
+      chain.add(Dimension.CASE_FOLD);
+      return this;
+    }
+
+    /**
+     * Enables {@link Dimension#CASE_FOLD} using the given locale's case rules (for example Turkish
+     * dotted/dotless i), instead of the default {@link Locale#ROOT}.
+     *
+     * @param locale The locale whose case rules to apply.
+     * @return this builder
+     */
+    public Builder caseFold(Locale locale) {
+      Objects.requireNonNull(locale, "locale");
+      return transform(Dimension.CASE_FOLD, CaseFoldCharSequenceNormalizer.getInstance(locale));
+    }
+
+    /**
+     * Enables {@link Dimension#ACCENT_FOLD}.
+     *
+     * @return this builder
+     */
+    public Builder accentFold() {
+      chain.add(Dimension.ACCENT_FOLD);
+      return this;
+    }
+
+    /**
+     * Enables {@link Dimension#ACCENT_FOLD} restricted to a specific set of scripts, instead of the
+     * default Latin/Greek/Cyrillic.
+     *
+     * @param foldScripts The scripts whose diacritics to fold.
+     * @param foldStrokeLetters Whether to also fold stroke letters such as o-slash and l-stroke.
+     * @return this builder
+     */
+    public Builder accentFold(Set<Character.UnicodeScript> foldScripts, boolean foldStrokeLetters) {
+      return transform(Dimension.ACCENT_FOLD,
+          new AccentFoldCharSequenceNormalizer(foldScripts, foldStrokeLetters));
+    }
+
+    /**
+     * Enables {@link Dimension#CONFUSABLE_FOLD}.
+     *
+     * @return this builder
+     */
+    public Builder confusableFold() {
+      chain.add(Dimension.CONFUSABLE_FOLD);
+      return this;
+    }
+
+    /**
+     * Enables a character-level dimension with a specific normalizer, overriding its default (for
+     * example a locale-specific case fold for a language profile).
+     *
+     * @param dimension The character-level dimension to enable.
+     * @param normalizer The normalizer to use for it.
+     * @return this builder
+     * @throws IllegalArgumentException if {@code dimension} is {@link Dimension#ORIGINAL},
+     *     {@link Dimension#STEM}, or {@link Dimension#LEMMA}.
+     */
+    public Builder transform(Dimension dimension, CharSequenceNormalizer normalizer) {
+      if (dimension == Dimension.ORIGINAL || dimension == Dimension.STEM
+          || dimension == Dimension.LEMMA) {
+        throw new IllegalArgumentException(
+            "transform(...) only applies to character-level dimensions, not " + dimension);
+      }
+      transforms.put(dimension, Objects.requireNonNull(normalizer, "normalizer"));
+      chain.add(dimension);
+      return this;
+    }
+
+    /**
+     * Enables {@link Dimension#STEM} through the given stemmer.
+     *
+     * @param value The stemmer.
+     * @return this builder
+     */
+    public Builder stem(Stemmer value) {
+      this.stemmer = Objects.requireNonNull(value, "stemmer");
+      chain.add(Dimension.STEM);
+      return this;
+    }
+
+    /**
+     * Enables {@link Dimension#LEMMA} through the given lemmatizer.
+     *
+     * @param value The lemmatizer.
+     * @return this builder
+     */
+    public Builder lemmatize(Lemmatizer value) {
+      this.lemmatizer = Objects.requireNonNull(value, "lemmatizer");
+      chain.add(Dimension.LEMMA);
+      return this;
+    }
+
+    /**
+     * Sets the tokenizer used by {@link TermAnalyzer#analyze(CharSequence)}.
+     *
+     * @param value The tokenizer.
+     * @return this builder
+     */
+    public Builder tokenizer(WordTokenizer value) {
+      this.tokenizer = Objects.requireNonNull(value, "tokenizer");
+      return this;
+    }
+
+    /**
+     * Sets the maximum token length of the tokenizer used by
+     * {@link TermAnalyzer#analyze(CharSequence)}. Convenience for
+     * {@code tokenizer(new WordTokenizer(maxTokenLength))}.
+     *
+     * @param maxTokenLength The maximum number of characters in a token.
+     * @return this builder
+     */
+    public Builder maxTokenLength(int maxTokenLength) {
+      this.tokenizer = new WordTokenizer(maxTokenLength);
+      return this;
+    }
+
+    /**
+     * {@return a new {@link TermAnalyzer} with this configuration}
+     */
+    public TermAnalyzer build() {
+      return new TermAnalyzer(this);
+    }
+  }
+}
diff --git a/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/util/normalizer/ConfusablesTest.java b/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/util/normalizer/ConfusablesTest.java
new file mode 100644
index 000000000..262fe5aa9
--- /dev/null
+++ b/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/util/normalizer/ConfusablesTest.java
@@ -0,0 +1,81 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.tools.util.normalizer;
+
+import org.junit.jupiter.api.Test;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertFalse;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
+/** Exercises {@link Confusables} skeleton folding, its normalizer, and the TermAnalyzer fold. */
+public class ConfusablesTest {
+
+  // CYRILLIC SMALL LETTER A (U+0430), looks like Latin 'a'.
+  private static final String CYRILLIC_A = Character.toString(0x0430);
+  // HORIZONTAL ELLIPSIS (U+2026), expected to fold to three full stops.
+  private static final String ELLIPSIS = Character.toString(0x2026);
+  // "paypal" with Cyrillic a's.
+  private static final String PAYPAL_SPOOF = "p" + CYRILLIC_A + "yp" + CYRILLIC_A + "l";
+
+  @Test
+  void testCyrillicLetterIsConfusableWithLatin() {
+    assertTrue(Confusables.confusable(CYRILLIC_A, "a"));
+    assertFalse(Confusables.confusable(CYRILLIC_A, "b"));
+  }
+
+  @Test
+  void testHomoglyphSpoofWordReducesToLatinSpelling() {
+    assertTrue(Confusables.confusable(PAYPAL_SPOOF, "paypal"));
+    assertEquals(Confusables.skeleton("paypal"), Confusables.skeleton(PAYPAL_SPOOF));
+  }
+
+  @Test
+  void testHorizontalEllipsisFoldsToThreeFullStops() {
+    assertEquals(Confusables.skeleton("..."), Confusables.skeleton(ELLIPSIS));
+    assertTrue(Confusables.confusable(ELLIPSIS, "..."));
+  }
+
+  @Test
+  void testDistinctWordsAreNotConfusable() {
+    assertFalse(Confusables.confusable("cat", "dog"));
+  }
+
+  @Test
+  void testSkeletonIsIdempotent() {
+    final String once = Confusables.skeleton(CYRILLIC_A + "bc");
+    assertEquals(once, Confusables.skeleton(once));
+  }
+
+  @Test
+  void testNormalizerProducesTheSkeleton() {
+    assertEquals(Confusables.skeleton(PAYPAL_SPOOF),
+        ConfusableSkeletonCharSequenceNormalizer.getInstance().normalize(PAYPAL_SPOOF).toString());
+  }
+
+  @Test
+  void testMultipleCyrillicLookalikesFold() {
+    final String dataSpoof = "d" + CYRILLIC_A + "t" + CYRILLIC_A; // "data" with Cyrillic a's
+    assertEquals(Confusables.skeleton("data"), Confusables.skeleton(dataSpoof));
+  }
+
+  @Test
+  void testTermConfusableFoldDimension() {
+    final TermAnalyzer analyzer = TermAnalyzer.builder().confusableFold().build();
+    assertEquals(Confusables.skeleton("paypal"), analyzer.analyze(PAYPAL_SPOOF).get(0).normalized());
+  }
+}
diff --git a/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/util/normalizer/TermAnalyzerTest.java b/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/util/normalizer/TermAnalyzerTest.java
new file mode 100644
index 000000000..56f16899d
--- /dev/null
+++ b/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/util/normalizer/TermAnalyzerTest.java
@@ -0,0 +1,211 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.tools.util.normalizer;
+
+import java.util.List;
+import java.util.Locale;
+import java.util.Set;
+
+import org.junit.jupiter.api.Test;
+
+import opennlp.tools.lemmatizer.Lemmatizer;
+import opennlp.tools.stemmer.PorterStemmer;
+import opennlp.tools.util.Span;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertNull;
+import static org.junit.jupiter.api.Assertions.assertSame;
+import static org.junit.jupiter.api.Assertions.assertThrows;
+
+/** Unit tests for {@link TermAnalyzer} and the {@link Term} layers it produces. */
+public class TermAnalyzerTest {
+  private static String cp(int codePoint) {
+    return new String(Character.toChars(codePoint));
+  }
+
+  @Test
+  void testNoDimensionsLeavesTokenUnchanged() {
+    final TermAnalyzer analyzer = TermAnalyzer.builder().build();
+    final Term term = analyzer.analyze("Hello").get(0);
+    assertEquals("Hello", term.original());
+    assertEquals("Hello", term.normalized());
+    assertEquals("Hello", term.peel());
+    assertEquals(List.of(), analyzer.dimensions());
+  }
+
+  @Test
+  void testChainAppliesInCanonicalOrderRegardlessOfBuilderOrder() {
+    // accentFold added before caseFold, but the canonical order is caseFold then accentFold.
+    final TermAnalyzer analyzer = TermAnalyzer.builder().accentFold().caseFold().build();
+    assertEquals(List.of(Dimension.CASE_FOLD, Dimension.ACCENT_FOLD), analyzer.dimensions());
+    final String input = "CAF" + cp(0x00C9); // CAFE with capital acute E
+    final Term term = analyzer.analyze(input).get(0);
+    assertEquals(input, term.original());
+    assertEquals("cafe", term.normalized());
+    assertEquals("caf" + cp(0x00E9), term.peel()); // before accent folding: lower-case, acute kept
+  }
+
+  @Test
+  void testStemIsTheTopLayer() {
+    final TermAnalyzer analyzer =
+        TermAnalyzer.builder().caseFold().stem(new PorterStemmer()).build();
+    final Term term = analyzer.analyze("Running").get(0);
+    assertEquals("running", term.peel()); // case-folded form, before stemming
+    assertEquals("run", term.normalized());
+    assertEquals("run", term.at(Dimension.STEM));
+  }
+
+  @Test
+  void testUnconfiguredCharDimensionComputedLazily() {
+    final TermAnalyzer analyzer = TermAnalyzer.builder().build();
+    final Term term = analyzer.analyze("HELLO").get(0);
+    assertEquals("HELLO", term.normalized());
+    assertEquals("hello", term.at(Dimension.CASE_FOLD)); // lazily added on top of the final form
+  }
+
+  @Test
+  void testStemDimensionWithoutStemmerFailsLoudly() {
+    final TermAnalyzer analyzer = TermAnalyzer.builder().caseFold().build();
+    final Term term = analyzer.analyze("running").get(0);
+    assertThrows(IllegalStateException.class, () -> term.at(Dimension.STEM));
+  }
+
+  @Test
+  void testLemmaWithoutLemmatizerFailsLoudly() {
+    final TermAnalyzer analyzer = TermAnalyzer.builder().build();
+    final Term term = analyzer.analyze("running").get(0);
+    assertThrows(IllegalStateException.class, () -> term.at(Dimension.LEMMA));
+  }
+
+  @Test
+  void testAnalyzeTextProducesSpans() {
+    final TermAnalyzer analyzer = TermAnalyzer.builder().caseFold().build();
+    final List<Term> terms = analyzer.analyze("The Cats");
+    assertEquals(2, terms.size());
+    assertEquals("The", terms.get(0).original());
+    assertEquals("the", terms.get(0).normalized());
+    assertEquals(new Span(0, 3), terms.get(0).span());
+    assertEquals("Cats", terms.get(1).original());
+    assertEquals(new Span(4, 8), terms.get(1).span());
+  }
+
+  @Test
+  void testAnalyzeTokensHasNoSpan() {
+    final TermAnalyzer analyzer = TermAnalyzer.builder().caseFold().build();
+    final List<Term> terms = analyzer.analyze(new String[] {"Cats"}, new String[] {"NNS"});
+    assertNull(terms.get(0).span());
+    assertEquals("cats", terms.get(0).normalized());
+  }
+
+  @Test
+  void testAnalyzeTokensRejectsLengthMismatch() {
+    final TermAnalyzer analyzer = TermAnalyzer.builder().build();
+    assertThrows(IllegalArgumentException.class,
+        () -> analyzer.analyze(new String[] {"a", "b"}, new String[] {"X"}));
+  }
+
+  @Test
+  void testTransformRejectsNonCharacterDimension() {
+    assertThrows(IllegalArgumentException.class, () -> TermAnalyzer.builder()
+        .transform(Dimension.STEM, CaseFoldCharSequenceNormalizer.getInstance()));
+  }
+
+  @Test
+  void testLemmaWithLemmatizerAndTag() {
+    final Lemmatizer lemmatizer = new Lemmatizer() {
+      @Override
+      public String[] lemmatize(String[] tokens, String[] tags) {
+        return new String[] {"be"};
+      }
+
+      @Override
+      public List<List<String>> lemmatize(List<String> tokens, List<String> tags) {
+        return List.of(List.of("be"));
+      }
+    };
+    final TermAnalyzer analyzer =
+        TermAnalyzer.builder().caseFold().lemmatize(lemmatizer).build();
+    final Term term = analyzer.analyze(new String[] {"was"}, new String[] {"VBD"}).get(0);
+    assertEquals("be", term.normalized());
+  }
+
+  @Test
+  void testConfusableFoldComposesWithCaseFold() {
+    final TermAnalyzer analyzer = TermAnalyzer.builder().caseFold().confusableFold().build();
+    final String spoof = "P" + cp(0x0430) + "yp" + cp(0x0430) + "l"; // Paypal with Cyrillic a's
+    assertEquals(Confusables.skeleton("paypal"), analyzer.analyze(spoof).get(0).normalized());
+  }
+
+  @Test
+  void testAtIsMemoized() {
+    final TermAnalyzer analyzer = TermAnalyzer.builder().build();
+    final Term term = analyzer.analyze("HELLO").get(0);
+    final String first = term.at(Dimension.CASE_FOLD);
+    assertSame(first, term.at(Dimension.CASE_FOLD));
+  }
+
+  @Test
+  void testWhitespaceTargetIsConfigurable() {
+    final CharClass lineFold = CharClass.of(CodePointSet.of('\n', '\t'), '\n');
+    final TermAnalyzer analyzer = TermAnalyzer.builder().whitespace(lineFold::collapse).build();
+    final Term term = analyzer.analyze(new String[] {"a\n\n\tb"}, new String[] {"X"}).get(0);
+    assertEquals("a\nb", term.normalized());
+  }
+
+  @Test
+  void testCaseFoldLocaleAppliesTurkishRules() {
+    final TermAnalyzer analyzer =
+        TermAnalyzer.builder().caseFold(Locale.forLanguageTag("tr")).build();
+    assertEquals(cp(0x0131), analyzer.analyze("I").get(0).normalized()); // dotless lowercase i
+  }
+
+  @Test
+  void testAccentFoldScopeFoldsLatin() {
+    final TermAnalyzer analyzer = TermAnalyzer.builder()
+        .accentFold(Set.of(Character.UnicodeScript.LATIN), false).build();
+    assertEquals("cafe", analyzer.analyze("caf" + cp(0x00E9)).get(0).normalized()); // cafe + acute
+  }
+
+  @Test
+  void testMaxTokenLengthChopsTokens() {
+    final List<Term> terms = TermAnalyzer.builder().maxTokenLength(3).build().analyze("abcdefg");
+    assertEquals(3, terms.size());
+    assertEquals("abc", terms.get(0).original());
+    assertEquals("def", terms.get(1).original());
+    assertEquals("g", terms.get(2).original());
+  }
+
+  @Test
+  void testAnalyzeEmptyTextProducesNoTerms() {
+    assertEquals(List.of(), TermAnalyzer.builder().caseFold().build().analyze(""));
+  }
+
+  @Test
+  void testWhitespaceOnlyInputHasNoWordTerms() {
+    assertEquals(List.of(), TermAnalyzer.builder().build().analyze(" \t "));
+  }
+
+  @Test
+  void testAtDimensionBelowFinalIsAppliedOnTop() {
+    // Final dimension is STEM; asking for NFC applies it on top of the stem (documented behavior).
+    final TermAnalyzer analyzer =
+        TermAnalyzer.builder().caseFold().stem(new PorterStemmer()).build();
+    final Term term = analyzer.analyze("Running").get(0);
+    assertEquals("run", term.normalized());
+    assertEquals("run", term.at(Dimension.NFC));
+  }
+}