diff --git a/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/Dimension.java b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/Dimension.java index 6ad068471..7caece7a0 100644 --- a/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/Dimension.java +++ b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/Dimension.java @@ -19,13 +19,13 @@ import java.util.function.Supplier; /** - * A layer of the {@code Term} normalization stack, in increasing order of aggressiveness. A - * {@code TermAnalyzer} applies a configured prefix of these to each token; the declaration order is + * A layer of the {@link Term} normalization stack, in increasing order of aggressiveness. A + * {@link TermAnalyzer} applies a configured prefix of these to each token; the declaration order is * the canonical pipeline order, because the transforms do not commute (case folding then accent * folding differs from the reverse for Turkish dotted/dotless i and the German eszett). * *

<p>This enum is the single definition of the character-level steps: each one carries its default - * {@link CharSequenceNormalizer}, which both {@code TermAnalyzer} and {@link TextNormalizer} read + * {@link CharSequenceNormalizer}, which both {@link TermAnalyzer} and {@link TextNormalizer} read * from rather than re-listing. The default is resolved lazily, so loading this enum does not eagerly * initialize heavy data such as the confusables table.</p>

* diff --git a/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/Term.java b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/Term.java new file mode 100644 index 000000000..eda3c4107 --- /dev/null +++ b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/Term.java @@ -0,0 +1,120 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package opennlp.tools.util.normalizer; + +import java.util.EnumMap; +import java.util.List; + +import opennlp.tools.util.Span; + +/** + * One token as a stack of normalization layers. The {@link #original()} form is the canonical + * source of truth; the other layers are derived, increasingly aggressive {@link Dimension}s tuned + * for matching and search. The dimensions configured on the producing {@link TermAnalyzer} are + * computed eagerly and cached; any other dimension is computed on first request, applied on top of + * the {@link #normalized() configured form}, and then cached. + * + *

<p>Because the original is always retained, aggressive folding is safe: a match on a derived layer + * can always be reported in original coordinates through {@link #span()}. Querying a configured + * layer, or {@link #peel() peeling} the last-applied one, is O(1); adding an unconfigured dimension + * costs one transform on first touch and is O(1) thereafter.</p>

+ * + *

<p>Instances are created by {@link TermAnalyzer} and are not thread-safe (the lazy cache is + * mutated on first access of an unconfigured dimension).</p>

+ */
+public final class Term {
+
+  private final TermAnalyzer analyzer;
+  private final Span span;
+  private final String posTag;
+  private final EnumMap<Dimension, String> layers = new EnumMap<>(Dimension.class);
+
+  Term(TermAnalyzer analyzer, String original, Span span, String posTag) {
+    this.analyzer = analyzer;
+    this.span = span;
+    this.posTag = posTag;
+    String value = original;
+    layers.put(Dimension.ORIGINAL, value);
+    for (final Dimension dimension : analyzer.dimensions()) {
+      value = analyzer.apply(dimension, value, posTag);
+      layers.put(dimension, value);
+    }
+  }
+
+  /**
+   * {@return the source span of this token, or {@code null} if it was supplied as a pre-tokenized
+   * string} The span indexes into the text passed to {@link TermAnalyzer#analyze(CharSequence)}.
+   */
+  public Span span() {
+    return span;
+  }
+
+  /**
+   * {@return the original token text}
+   */
+  public String original() {
+    return layers.get(Dimension.ORIGINAL);
+  }
+
+  /**
+   * {@return the token at the analyzer's final configured dimension} Equal to {@link #original()}
+   * when no dimensions were configured.
+   */
+  public String normalized() {
+    return at(analyzer.finalDimension());
+  }
+
+  /**
+   * Returns the token at {@code dimension}. Configured dimensions are cached; an unconfigured
+   * dimension is computed by applying its transform to {@link #normalized()} and then cached.
+   *
+   * <p>Note: an unconfigured dimension is applied on top of {@link #normalized()} (the most
+   * aggressive configured layer), not spliced into canonical pipeline order. Because the transforms
+   * do not commute (see {@link Dimension}), requesting a dimension that ranks earlier than
+   * the configured ones can differ from having configured it. For example, asking for
+   * {@link Dimension#CASE_FOLD} on an analyzer configured only through {@link Dimension#ACCENT_FOLD}
+   * case-folds the already accent-folded text, which is not the same as case-folding first.
+   * Configure the dimension on the analyzer when canonical order matters.</p>
+   *
+   * @param dimension The dimension to project to.
+   * @return The token at that dimension.
+   * @throws IllegalStateException if the dimension needs an engine or tag that was not configured
+   *     (see {@link Dimension#STEM} and {@link Dimension#LEMMA}).
+   */
+  public String at(Dimension dimension) {
+    final String cached = layers.get(dimension);
+    if (cached != null) {
+      return cached;
+    }
+    final String value = analyzer.apply(dimension, normalized(), posTag);
+    layers.put(dimension, value);
+    return value;
+  }
+
+  /**
+   * {@return the token at the dimension just below the final configured one} This is the
+   * last-applied layer removed (for example the form before stemming when {@link Dimension#STEM}
+   * is the final dimension); equal to {@link #original()} when at most one dimension is configured.
+   */
+  public String peel() {
+    final List<Dimension> dimensions = analyzer.dimensions();
+    if (dimensions.size() < 2) {
+      return original();
+    }
+    return at(dimensions.get(dimensions.size() - 2));
+  }
+}
diff --git a/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/TermAnalyzer.java b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/TermAnalyzer.java
new file mode 100644
index 000000000..0d9956e8e
--- /dev/null
+++ b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/TermAnalyzer.java
@@ -0,0 +1,368 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package opennlp.tools.util.normalizer; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.EnumMap; +import java.util.EnumSet; +import java.util.List; +import java.util.Locale; +import java.util.Objects; +import java.util.Set; + +import opennlp.tools.lemmatizer.Lemmatizer; +import opennlp.tools.stemmer.Stemmer; +import opennlp.tools.tokenize.uax29.WordTokenizer; +import opennlp.tools.util.Span; + +/** + * Builds {@link Term}s by segmenting text and applying a configured stack of normalization + * {@link Dimension}s to each token. The analyzer is the configuration; each {@link Term} is the + * layered result for one token, with the configured dimensions computed eagerly and any other + * dimension computed lazily on first request. + * + *

<p>Segmentation uses the Unicode {@linkplain WordTokenizer UAX #29 word tokenizer}, so the + * input does not need to be pre-tokenized. The character-level dimensions ({@link Dimension#NFC} + * through {@link Dimension#ACCENT_FOLD}) have built-in defaults; {@link Dimension#STEM} and + * {@link Dimension#LEMMA} are enabled by supplying a {@link Stemmer} or {@link Lemmatizer}.</p>

+ * + *

<p>An instance is immutable and is thread-safe when its configured transforms are. The built-in + * character normalizers are stateless, but the Snowball stemmers are not, so an analyzer configured + * with a {@link Stemmer} (for example through {@link NormalizationProfile#searchAnalyzer()}) should + * not be shared across threads when {@link Dimension#STEM} is used. Build one with + * {@link #builder()}.</p>

+ */
+public final class TermAnalyzer {
+
+  private final List<Dimension> chain;
+  private final Dimension finalDimension;
+  private final EnumMap<Dimension, CharSequenceNormalizer> transforms;
+  private final Stemmer stemmer;
+  private final Lemmatizer lemmatizer;
+  private final WordTokenizer tokenizer;
+
+  private TermAnalyzer(Builder builder) {
+    final List<Dimension> ordered = new ArrayList<>(builder.chain);
+    Collections.sort(ordered); // canonical pipeline order (enum declaration order)
+    this.chain = List.copyOf(ordered);
+    this.finalDimension = ordered.isEmpty() ? Dimension.ORIGINAL : ordered.get(ordered.size() - 1);
+    // Only the per-analyzer overrides from the builder; the defaults live on Dimension itself.
+    this.transforms = new EnumMap<>(builder.transforms);
+    this.stemmer = builder.stemmer;
+    this.lemmatizer = builder.lemmatizer;
+    this.tokenizer = builder.tokenizer;
+  }
+
+  /**
+   * {@return a new builder}
+   */
+  public static Builder builder() {
+    return new Builder();
+  }
+
+  /**
+   * Segments {@code text} with the UAX #29 word tokenizer and returns one {@link Term} per
+   * word token, in order. The terms carry no part-of-speech tag, so {@link Dimension#LEMMA} is not
+   * available from them.
+   *
+   * @param text The text to analyze.
+   * @return The terms.
+   */
+  public List<Term> analyze(CharSequence text) {
+    final List<Span> spans = tokenizer.tokenizeSpans(text);
+    final List<Term> terms = new ArrayList<>(spans.size());
+    for (final Span span : spans) {
+      terms.add(new Term(this, span.getCoveredText(text).toString(), span, null));
+    }
+    return terms;
+  }
+
+  /**
+   * Returns one {@link Term} per supplied token, attaching the matching part-of-speech tag so that
+   * {@link Dimension#LEMMA} can be computed. The terms have no source span.
+   *
+   * @param tokens The tokens.
+   * @param tags The part-of-speech tag for each token; must be the same length as {@code tokens}.
+   * @return The terms.
+   * @throws IllegalArgumentException if {@code tokens} and {@code tags} differ in length.
+   */
+  public List<Term> analyze(String[] tokens, String[] tags) {
+    if (tokens.length != tags.length) {
+      throw new IllegalArgumentException(
+          "tokens and tags must be the same length, got " + tokens.length + " and " + tags.length);
+    }
+    final List<Term> terms = new ArrayList<>(tokens.length);
+    for (int i = 0; i < tokens.length; i++) {
+      terms.add(new Term(this, tokens[i], null, tags[i]));
+    }
+    return terms;
+  }
+
+  /**
+   * {@return the configured dimensions that are computed eagerly, in canonical order} The list
+   * never includes {@link Dimension#ORIGINAL}, which is always present.
+   */
+  public List<Dimension> dimensions() {
+    return chain;
+  }
+
+  Dimension finalDimension() {
+    return finalDimension;
+  }
+
+  // Applies one dimension's transform to a single token value. Fails loudly when a token-level
+  // dimension was requested without the engine (or tag) it needs.
+  String apply(Dimension dimension, String input, String posTag) {
+    switch (dimension) {
+      case ORIGINAL:
+        return input;
+      case STEM:
+        if (stemmer == null) {
+          throw new IllegalStateException(
+              "Dimension STEM requires a Stemmer; configure it with builder().stem(...)");
+        }
+        return stemmer.stem(input).toString();
+      case LEMMA:
+        if (lemmatizer == null) {
+          throw new IllegalStateException(
+              "Dimension LEMMA requires a Lemmatizer; configure it with builder().lemmatize(...)");
+        }
+        if (posTag == null) {
+          throw new IllegalStateException(
+              "Dimension LEMMA requires a part-of-speech tag; use analyze(tokens, tags)");
+        }
+        return lemmatizer.lemmatize(new String[] {input}, new String[] {posTag})[0];
+      default:
+        // A builder override wins; otherwise the dimension's own default normalizer.
+        final CharSequenceNormalizer normalizer = transforms.containsKey(dimension)
+            ? transforms.get(dimension) : dimension.defaultNormalizer();
+        if (normalizer == null) {
+          throw new IllegalStateException("Dimension " + dimension + " has no default normalizer; "
+              + "configure it with builder().transform(" + dimension + ", ...)");
+        }
+        return normalizer.normalize(input).toString();
+    }
+  }
+
+  /** A builder for {@link TermAnalyzer}. */
+  public static final class Builder {
+
+    private final EnumSet<Dimension> chain = EnumSet.noneOf(Dimension.class);
+    private final EnumMap<Dimension, CharSequenceNormalizer> transforms =
+        new EnumMap<>(Dimension.class);
+    private Stemmer stemmer;
+    private Lemmatizer lemmatizer;
+    private WordTokenizer tokenizer = new WordTokenizer();
+
+    private Builder() {
+    }
+
+    /**
+     * Enables {@link Dimension#NFC}.
+     *
+     * @return this builder
+     */
+    public Builder nfc() {
+      chain.add(Dimension.NFC);
+      return this;
+    }
+
+    /**
+     * Enables {@link Dimension#NFKC}.
+     *
+     * @return this builder
+     */
+    public Builder nfkc() {
+      chain.add(Dimension.NFKC);
+      return this;
+    }
+
+    /**
+     * Enables {@link Dimension#WHITESPACE}.
+     *
+     * @return this builder
+     */
+    public Builder whitespace() {
+      chain.add(Dimension.WHITESPACE);
+      return this;
+    }
+
+    /**
+     * Enables {@link Dimension#WHITESPACE} with a specific normalizer, choosing the fold target and
+     * behavior. For a custom class and target use a {@link CharClass} method reference, for example
+     * {@code whitespace(CharClass.of(members, replacement)::collapse)}.
+     *
+     * @param normalizer The whitespace normalizer to use.
+     * @return this builder
+     */
+    public Builder whitespace(CharSequenceNormalizer normalizer) {
+      return transform(Dimension.WHITESPACE, normalizer);
+    }
+
+    /**
+     * Enables {@link Dimension#DASH}.
+     *
+     * @return this builder
+     */
+    public Builder dashes() {
+      chain.add(Dimension.DASH);
+      return this;
+    }
+
+    /**
+     * Enables {@link Dimension#DASH} with a specific normalizer (a custom dash set or target).
+     *
+     * @param normalizer The dash normalizer to use.
+     * @return this builder
+     */
+    public Builder dashes(CharSequenceNormalizer normalizer) {
+      return transform(Dimension.DASH, normalizer);
+    }
+
+    /**
+     * Enables {@link Dimension#CASE_FOLD}.
+     *
+     * @return this builder
+     */
+    public Builder caseFold() {
+      chain.add(Dimension.CASE_FOLD);
+      return this;
+    }
+
+    /**
+     * Enables {@link Dimension#CASE_FOLD} using the given locale's case rules (for example Turkish
+     * dotted/dotless i), instead of the default {@link Locale#ROOT}.
+     *
+     * @param locale The locale whose case rules to apply.
+     * @return this builder
+     */
+    public Builder caseFold(Locale locale) {
+      Objects.requireNonNull(locale, "locale");
+      return transform(Dimension.CASE_FOLD, CaseFoldCharSequenceNormalizer.getInstance(locale));
+    }
+
+    /**
+     * Enables {@link Dimension#ACCENT_FOLD}.
+     *
+     * @return this builder
+     */
+    public Builder accentFold() {
+      chain.add(Dimension.ACCENT_FOLD);
+      return this;
+    }
+
+    /**
+     * Enables {@link Dimension#ACCENT_FOLD} restricted to a specific set of scripts, instead of the
+     * default Latin/Greek/Cyrillic.
+     *
+     * @param foldScripts The scripts whose diacritics to fold.
+     * @param foldStrokeLetters Whether to also fold stroke letters such as o-slash and l-stroke.
+     * @return this builder
+     */
+    public Builder accentFold(Set<Character.UnicodeScript> foldScripts, boolean foldStrokeLetters) {
+      return transform(Dimension.ACCENT_FOLD,
+          new AccentFoldCharSequenceNormalizer(foldScripts, foldStrokeLetters));
+    }
+
+    /**
+     * Enables {@link Dimension#CONFUSABLE_FOLD}.
+     *
+     * @return this builder
+     */
+    public Builder confusableFold() {
+      chain.add(Dimension.CONFUSABLE_FOLD);
+      return this;
+    }
+
+    /**
+     * Enables a character-level dimension with a specific normalizer, overriding its default (for
+     * example a locale-specific case fold for a language profile).
+     *
+     * @param dimension The character-level dimension to enable.
+     * @param normalizer The normalizer to use for it.
+     * @return this builder
+     * @throws IllegalArgumentException if {@code dimension} is {@link Dimension#ORIGINAL},
+     *     {@link Dimension#STEM}, or {@link Dimension#LEMMA}.
+     */
+    public Builder transform(Dimension dimension, CharSequenceNormalizer normalizer) {
+      if (dimension == Dimension.ORIGINAL || dimension == Dimension.STEM
+          || dimension == Dimension.LEMMA) {
+        throw new IllegalArgumentException(
+            "transform(...) only applies to character-level dimensions, not " + dimension);
+      }
+      transforms.put(dimension, Objects.requireNonNull(normalizer, "normalizer"));
+      chain.add(dimension);
+      return this;
+    }
+
+    /**
+     * Enables {@link Dimension#STEM} through the given stemmer.
+     *
+     * @param value The stemmer.
+     * @return this builder
+     */
+    public Builder stem(Stemmer value) {
+      this.stemmer = Objects.requireNonNull(value, "stemmer");
+      chain.add(Dimension.STEM);
+      return this;
+    }
+
+    /**
+     * Enables {@link Dimension#LEMMA} through the given lemmatizer.
+     *
+     * @param value The lemmatizer.
+     * @return this builder
+     */
+    public Builder lemmatize(Lemmatizer value) {
+      this.lemmatizer = Objects.requireNonNull(value, "lemmatizer");
+      chain.add(Dimension.LEMMA);
+      return this;
+    }
+
+    /**
+     * Sets the tokenizer used by {@link TermAnalyzer#analyze(CharSequence)}.
+     *
+     * @param value The tokenizer.
+     * @return this builder
+     */
+    public Builder tokenizer(WordTokenizer value) {
+      this.tokenizer = Objects.requireNonNull(value, "tokenizer");
+      return this;
+    }
+
+    /**
+     * Sets the maximum token length of the tokenizer used by
+     * {@link TermAnalyzer#analyze(CharSequence)}. Convenience for
+     * {@code tokenizer(new WordTokenizer(maxTokenLength))}.
+     *
+     * @param maxTokenLength The maximum number of characters in a token.
+     * @return this builder
+     */
+    public Builder maxTokenLength(int maxTokenLength) {
+      this.tokenizer = new WordTokenizer(maxTokenLength);
+      return this;
+    }
+
+    /**
+     * {@return a new {@link TermAnalyzer} with this configuration}
+     */
+    public TermAnalyzer build() {
+      return new TermAnalyzer(this);
+    }
+  }
+}
diff --git a/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/util/normalizer/ConfusablesTest.java b/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/util/normalizer/ConfusablesTest.java
new file mode 100644
index 000000000..262fe5aa9
--- /dev/null
+++ b/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/util/normalizer/ConfusablesTest.java
@@ -0,0 +1,81 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License. 
+ */
+package opennlp.tools.util.normalizer;
+
+import org.junit.jupiter.api.Test;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertFalse;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
+public class ConfusablesTest {
+
+  private static String cp(int codePoint) {
+    return String.valueOf(Character.toChars(codePoint));
+  }
+
+  @Test
+  void testCyrillicLetterIsConfusableWithLatin() {
+    assertTrue(Confusables.confusable(cp(0x0430), "a")); // U+0430 looks like Latin 'a'
+    assertFalse(Confusables.confusable(cp(0x0430), "b"));
+  }
+
+  @Test
+  void testHomoglyphSpoofWordReducesToLatinSpelling() {
+    final String disguised = "p" + cp(0x0430) + "yp" + cp(0x0430) + "l"; // Cyrillic a's in paypal
+    assertTrue(Confusables.confusable(disguised, "paypal"));
+    assertEquals(Confusables.skeleton("paypal"), Confusables.skeleton(disguised));
+  }
+
+  @Test
+  void testHorizontalEllipsisFoldsToThreeFullStops() {
+    final String ellipsis = cp(0x2026); // HORIZONTAL ELLIPSIS
+    assertEquals(Confusables.skeleton("..."), Confusables.skeleton(ellipsis));
+    assertTrue(Confusables.confusable(ellipsis, "..."));
+  }
+
+  @Test
+  void testDistinctWordsAreNotConfusable() {
+    assertFalse(Confusables.confusable("cat", "dog"));
+  }
+
+  @Test
+  void testSkeletonIsIdempotent() {
+    final String once = Confusables.skeleton(cp(0x0430) + "bc");
+    assertEquals(once, Confusables.skeleton(once));
+  }
+
+  @Test
+  void testNormalizerProducesTheSkeleton() {
+    final String disguised = "p" + cp(0x0430) + "yp" + cp(0x0430) + "l";
+    assertEquals(Confusables.skeleton(disguised),
+        ConfusableSkeletonCharSequenceNormalizer.getInstance().normalize(disguised).toString());
+  }
+
+  @Test
+  void testMultipleCyrillicLookalikesFold() {
+    final String disguised = "d" + cp(0x0430) + "t" + cp(0x0430); // Cyrillic a's in "data"
+    assertEquals(Confusables.skeleton("data"), Confusables.skeleton(disguised));
+  }
+
+  @Test
+  void testTermConfusableFoldDimension() {
+    final String disguised = "p" + cp(0x0430) + "yp" + cp(0x0430) + "l";
+    final TermAnalyzer folding = TermAnalyzer.builder().confusableFold().build();
+    assertEquals(Confusables.skeleton("paypal"), folding.analyze(disguised).get(0).normalized());
+  }
+}
diff --git a/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/util/normalizer/TermAnalyzerTest.java b/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/util/normalizer/TermAnalyzerTest.java
new file mode 100644
index 000000000..56f16899d
--- /dev/null
+++ b/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/util/normalizer/TermAnalyzerTest.java
@@ -0,0 +1,211 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License. 
+ */
+package opennlp.tools.util.normalizer;
+
+import java.util.List;
+import java.util.Locale;
+import java.util.Set;
+
+import org.junit.jupiter.api.Test;
+
+import opennlp.tools.lemmatizer.Lemmatizer;
+import opennlp.tools.stemmer.PorterStemmer;
+import opennlp.tools.util.Span;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertNull;
+import static org.junit.jupiter.api.Assertions.assertSame;
+import static org.junit.jupiter.api.Assertions.assertThrows;
+
+public class TermAnalyzerTest {
+
+  private static String cp(int codePoint) {
+    return new String(Character.toChars(codePoint));
+  }
+
+  @Test
+  void testNoDimensionsLeavesTokenUnchanged() {
+    final TermAnalyzer analyzer = TermAnalyzer.builder().build();
+    final Term term = analyzer.analyze("Hello").get(0);
+    assertEquals("Hello", term.original());
+    assertEquals("Hello", term.normalized());
+    assertEquals("Hello", term.peel());
+    assertEquals(List.of(), analyzer.dimensions());
+  }
+
+  @Test
+  void testChainAppliesInCanonicalOrderRegardlessOfBuilderOrder() {
+    // accentFold added before caseFold, but the canonical order is caseFold then accentFold.
+    final TermAnalyzer analyzer = TermAnalyzer.builder().accentFold().caseFold().build();
+    assertEquals(List.of(Dimension.CASE_FOLD, Dimension.ACCENT_FOLD), analyzer.dimensions());
+    final String input = "CAF" + cp(0x00C9); // CAFE with capital acute E
+    final Term term = analyzer.analyze(input).get(0);
+    assertEquals(input, term.original());
+    assertEquals("cafe", term.normalized());
+    assertEquals("caf" + cp(0x00E9), term.peel()); // before accent folding: lower-case, acute kept
+  }
+
+  @Test
+  void testStemIsTheTopLayer() {
+    final TermAnalyzer analyzer =
+        TermAnalyzer.builder().caseFold().stem(new PorterStemmer()).build();
+    final Term term = analyzer.analyze("Running").get(0);
+    assertEquals("running", term.peel()); // case-folded form, before stemming
+    assertEquals("run", term.normalized());
+    assertEquals("run", term.at(Dimension.STEM));
+  }
+
+  @Test
+  void testUnconfiguredCharDimensionComputedLazily() {
+    final TermAnalyzer analyzer = TermAnalyzer.builder().build();
+    final Term term = analyzer.analyze("HELLO").get(0);
+    assertEquals("HELLO", term.normalized());
+    assertEquals("hello", term.at(Dimension.CASE_FOLD)); // lazily added on top of the final form
+  }
+
+  @Test
+  void testStemDimensionWithoutStemmerFailsLoudly() {
+    final TermAnalyzer analyzer = TermAnalyzer.builder().caseFold().build();
+    final Term term = analyzer.analyze("running").get(0);
+    assertThrows(IllegalStateException.class, () -> term.at(Dimension.STEM));
+  }
+
+  @Test
+  void testLemmaWithoutLemmatizerFailsLoudly() {
+    final TermAnalyzer analyzer = TermAnalyzer.builder().build();
+    final Term term = analyzer.analyze("running").get(0);
+    assertThrows(IllegalStateException.class, () -> term.at(Dimension.LEMMA));
+  }
+
+  @Test
+  void testAnalyzeTextProducesSpans() {
+    final TermAnalyzer analyzer = TermAnalyzer.builder().caseFold().build();
+    final List<Term> terms = analyzer.analyze("The Cats");
+    assertEquals(2, terms.size());
+    assertEquals("The", terms.get(0).original());
+    assertEquals("the", terms.get(0).normalized());
+    assertEquals(new Span(0, 3), terms.get(0).span());
+    assertEquals("Cats", terms.get(1).original());
+    assertEquals(new Span(4, 8), terms.get(1).span());
+  }
+
+  @Test
+  void testAnalyzeTokensHasNoSpan() {
+    final TermAnalyzer analyzer = TermAnalyzer.builder().caseFold().build();
+    final List<Term> terms = analyzer.analyze(new String[] {"Cats"}, new String[] {"NNS"});
+    assertNull(terms.get(0).span());
+    assertEquals("cats", terms.get(0).normalized());
+  }
+
+  @Test
+  void testAnalyzeTokensRejectsLengthMismatch() {
+    final TermAnalyzer analyzer = TermAnalyzer.builder().build();
+    assertThrows(IllegalArgumentException.class,
+        () -> analyzer.analyze(new String[] {"a", "b"}, new String[] {"X"}));
+  }
+
+  @Test
+  void testTransformRejectsNonCharacterDimension() {
+    assertThrows(IllegalArgumentException.class, () -> TermAnalyzer.builder()
+        .transform(Dimension.STEM, CaseFoldCharSequenceNormalizer.getInstance()));
+  }
+
+  @Test
+  void testLemmaWithLemmatizerAndTag() {
+    final Lemmatizer lemmatizer = new Lemmatizer() {
+      @Override
+      public String[] lemmatize(String[] tokens, String[] tags) {
+        return new String[] {"be"};
+      }
+
+      @Override
+      public List<List<String>> lemmatize(List<String> tokens, List<String> tags) {
+        return List.of(List.of("be"));
+      }
+    };
+    final TermAnalyzer analyzer =
+        TermAnalyzer.builder().caseFold().lemmatize(lemmatizer).build();
+    final Term term = analyzer.analyze(new String[] {"was"}, new String[] {"VBD"}).get(0);
+    assertEquals("be", term.normalized());
+  }
+
+  @Test
+  void testConfusableFoldComposesWithCaseFold() {
+    final TermAnalyzer analyzer = TermAnalyzer.builder().caseFold().confusableFold().build();
+    final String spoof = "P" + cp(0x0430) + "yp" + cp(0x0430) + "l"; // Paypal with Cyrillic a's
+    assertEquals(Confusables.skeleton("paypal"), analyzer.analyze(spoof).get(0).normalized());
+  }
+
+  @Test
+  void testAtIsMemoized() {
+    final TermAnalyzer analyzer = TermAnalyzer.builder().build();
+    final Term term = analyzer.analyze("HELLO").get(0);
+    final String first = term.at(Dimension.CASE_FOLD);
+    assertSame(first, term.at(Dimension.CASE_FOLD));
+  }
+
+  @Test
+  void testWhitespaceTargetIsConfigurable() {
+    final CharClass lineFold = CharClass.of(CodePointSet.of('\n', '\t'), '\n');
+    final TermAnalyzer analyzer = TermAnalyzer.builder().whitespace(lineFold::collapse).build();
+    final Term term = analyzer.analyze(new String[] {"a\n\n\tb"}, new String[] {"X"}).get(0);
+    assertEquals("a\nb", term.normalized());
+  }
+
+  @Test
+  void testCaseFoldLocaleAppliesTurkishRules() {
+    final TermAnalyzer analyzer =
+        TermAnalyzer.builder().caseFold(Locale.forLanguageTag("tr")).build();
+    assertEquals(cp(0x0131), analyzer.analyze("I").get(0).normalized()); // dotless lowercase i
+  }
+
+  @Test
+  void testAccentFoldScopeFoldsLatin() {
+    final TermAnalyzer analyzer = TermAnalyzer.builder()
+        .accentFold(Set.of(Character.UnicodeScript.LATIN), false).build();
+    assertEquals("cafe", analyzer.analyze("caf" + cp(0x00E9)).get(0).normalized()); // cafe + acute
+  }
+
+  @Test
+  void testMaxTokenLengthChopsTokens() {
+    final List<Term> terms = TermAnalyzer.builder().maxTokenLength(3).build().analyze("abcdefg");
+    assertEquals(3, terms.size());
+    assertEquals("abc", terms.get(0).original());
+    assertEquals("def", terms.get(1).original());
+    assertEquals("g", terms.get(2).original());
+  }
+
+  @Test
+  void testAnalyzeEmptyTextProducesNoTerms() {
+    assertEquals(List.of(), TermAnalyzer.builder().caseFold().build().analyze(""));
+  }
+
+  @Test
+  void testWhitespaceOnlyInputHasNoWordTerms() {
+    assertEquals(List.of(), TermAnalyzer.builder().build().analyze(" \t "));
+  }
+
+  @Test
+  void testAtDimensionBelowFinalIsAppliedOnTop() {
+    // Final dimension is STEM; asking for NFC applies it on top of the stem (documented behavior).
+    final TermAnalyzer analyzer =
+        TermAnalyzer.builder().caseFold().stem(new PorterStemmer()).build();
+    final Term term = analyzer.analyze("Running").get(0);
+    assertEquals("run", term.normalized());
+    assertEquals("run", term.at(Dimension.NFC));
+  }
+}