Skip to content

Commit 17119b5

Browse files
authored
Enrich typo dictionary (#8741)
* Add
* Use StringInfo
* nit
1 parent 89fc141 commit 17119b5

4 files changed

Lines changed: 129 additions & 51 deletions

File tree

src/NuGetGallery/Services/TyposquattingDistanceCalculation.cs

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -30,7 +30,7 @@ private enum PathInfo
3030
Insert,
3131
}
3232

33-
public static bool IsDistanceLessThanThreshold(string str1, string str2, int threshold)
33+
public static bool IsDistanceLessThanOrEqualToThreshold(string str1, string str2, int threshold)
3434
{
3535
if (str1 == null)
3636
{
@@ -50,6 +50,7 @@ public static bool IsDistanceLessThanThreshold(string str1, string str2, int thr
5050

5151
return GetDistance(str1, str2, threshold) <= threshold;
5252
}
53+
5354
private static int GetDistance(string str1, string str2, int threshold)
5455
{
5556
var basicEditDistanceInfo = GetBasicEditDistanceWithPath(str1, str2);

src/NuGetGallery/Services/TyposquattingService.cs

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -76,7 +76,7 @@ public bool IsUploadedPackageIdTyposquatting(string uploadedPackageId, User uplo
7676
Parallel.ForEach(packageIdsCheckList, (packageId, loopState) =>
7777
{
7878
string normalizedPackageId = TyposquattingStringNormalization.NormalizeString(packageId);
79-
if (TyposquattingDistanceCalculation.IsDistanceLessThanThreshold(normalizedUploadedPackageId, normalizedPackageId, threshold))
79+
if (TyposquattingDistanceCalculation.IsDistanceLessThanOrEqualToThreshold(normalizedUploadedPackageId, normalizedPackageId, threshold))
8080
{
8181
collisionIds.Add(packageId);
8282
}

src/NuGetGallery/Services/TyposquattingStringNormalization.cs

Lines changed: 50 additions & 42 deletions
Original file line number | Diff line number | Diff line change
@@ -1,8 +1,9 @@
11
// Copyright (c) .NET Foundation. All rights reserved.
22
// Licensed under the Apache License, Version 2.0. See License.txt in the project root for license information.
33

4-
using System.Collections.Generic;
54
using System.Text;
5+
using System.Globalization;
6+
using System.Collections.Generic;
67

78
namespace NuGetGallery
89
{
@@ -12,63 +13,70 @@ public static class TyposquattingStringNormalization
1213
/// The following dictionary is built through picking up similar characters manually from wiki unicode page.
1314
/// https://en.wikipedia.org/wiki/List_of_Unicode_characters
1415
/// </summary>
15-
private static readonly IReadOnlyDictionary<char, string> SimilarCharacterDictionary = new Dictionary<char, string>()
16+
private static readonly IReadOnlyDictionary<string, string> SimilarCharacterDictionary = new Dictionary<string, string>()
1617
{
17-
{'a', "AÀÁÂÃÄÅàáâãäåĀāĂ㥹ǍǎǞǟǠǡǺǻȀȁȂȃȦȧȺΆΑάαἀἁἂἃἄἅἆἇἈἉἊἋἌἍἎἏӐӑӒӓὰάᾀᾁᾂᾃᾄᾅᾆᾇᾈᾊᾋᾌᾍᾎᾏᾰᾱᾲᾳᾴᾶᾷᾸᾹᾺΆᾼАДад"},
18-
{'b', "BƀƁƂƃƄƅɃḂḃΒϦЂБВЪЬвъьѢѣҌҍႦႪხҔҕӃӄ" },
19-
{'c', "CÇçĆćĈĉĊċČčƇƈȻȼϲϹСсҪҫ𐒨"},
20-
{'d', "DÐĎďĐđƉƊƋƌǷḊḋԀԁԂԃ"},
21-
{'e', "EÈÉÊËèéêëĒēĔĕĖėĘęĚěȄȅȆȇȨȩɆɇΈΕЀЁЄѐёҼҽҾҿӖӗἘἙἚἛἜἝῈΈЕе"},
22-
{'f', "FƑƒḞḟϜϝҒғӺӻ"},
23-
{'g', "GĜĝĞğĠġĢģƓǤǥǦǧǴǵԌԍ"},
24-
{'h', "HĤĥħǶȞȟΉΗἨἩἪἫἬἭἮἯᾘᾙᾚᾛᾜᾝᾞᾟῊΉῌЋНнћҢңҤҥҺһӇӈӉӊԊԋԦԧԨԩհႬႹ𐒅𐒌𐒎𐒣"},
25-
{'i', "I¡ìíîïǐȉȋΐίιϊіїὶίῐῑῒΐῖῗΊΙΪȊȈἰἱἲἳἴἵἶἷἸἹἺἻἼἽἾἿῘῙῚΊІЇӀӏÌÍÎÏĨĩĪīĬĭĮįİǏ"},
26-
{'j', "JĴĵǰȷͿϳЈ"},
27-
{'k', "KĶķĸƘƙǨǩΚκϏЌКкќҚқҜҝҞҟҠҡԞԟ"},
28-
{'l', "LĹĺĻļĽľĿŀŁłſƖƪȴẛ"},
29-
{'m', "MṀṁΜϺϻМмӍӎ𐒄"},
30-
{'n', "NÑñŃńŅņŇňʼnƝǸǹΝᾐᾑᾒᾓᾔᾕᾖᾗῂῃῄῆῇпԤԥԦԧԮԯ𐒐"},
31-
{'o', "OÒÓÔÕÖðòóôõöøŌōŎŏŐőƠơǑǒǪǫǬǭȌȍȎȏȪȫȬȭȮȯȰȱΌΟδοόϘϙὀὁὂὃὄὅὈὉὊὋὌὍὸόῸΌОоӦӧՕჿჾ𐒆𐒠0"},
32-
{'p', "PÞþƤƥƿṖṗΡρϷϸῤῥῬРрҎҏႲႼ"},
33-
{'q', "QȡɊɋԚԛգႭႳ"},
34-
{'r', "RŔŕŖŗŘřƦȐȑȒȓɌɼгѓ"},
35-
{'s', "SŚśŜŝŞşŠšȘșȿṠṡЅѕՏႽჽ𐒖𐒡"},
36-
{'t', "TŢţŤťŦŧƬƭƮȚțȾṪṫͲͳΤτТтҬҭէ"},
37-
{'u', "UÙÚÛÜùúûüŨũŪūŬŭŮůŰűŲųƯưǓǔǕǖǗǘǙǚǛǜȔȕȖȗμυϋύὐὑὒὓὔὕὖὗὺύῠῡῢΰῦῧՍႮ𐒩"},
38-
{'v', "VƔƲνѴѵѶѷ"},
39-
{'w', "WŴŵƜẀẁẂẃẄẅωώШЩшщѡѿὠὡὢὣὤὥὦὧὼώᾠᾡᾢᾣᾤᾥᾦᾧῲῳῴῶῷԜԝ"},
40-
{'x', "X×ΧχХхҲҳӼӽӾӿჯ"},
41-
{'y', "YÝýÿŶŷŸƳƴȲȳɎɏỲỳΎΥΫγϒϓϔЎУЧуўҮүҶҷҸҹӋӌӮӯӰӱӲӳӴӵὙὛὝὟῨῩῪΎႯႸ𐒋𐒦"},
42-
{'z', "ZŹźŻżŽžƵƶȤȥΖჍ"},
43-
{'3', "ƷǮǯȜȝʒЗзэӞӟӠӡჳ"},
44-
{'8', "Ȣȣ"},
45-
{'_', ".-" }
18+
{ "a", "AΑАαаÀÁÂÃÄÅàáâãäåĀāĂ㥹ǍǎǞǟǠǡǺǻȀȁȂȃȦȧȺΆάἀἁἂἃἄἅἆἇἈἉἊἋἌΆἍἎἏӐӑӒӓὰάᾀᾁᾂᾃᾄᾅᾆᾇᾈᾊᾋᾌᾍᾎᾏᾰᾱᾲᾳᾴᾶᾷᾸᾹᾺᾼДд"},
19+
{ "b", "BΒВЪЬƀƁƂƃƄƅɃḂḃϦЂБвъьѢѣҌҍႦႪხҔҕӃӄ"},
20+
{ "c", "CСсϹϲÇçĆćĈĉĊċČčƇƈȻȼҪҫ𐒨"},
21+
{ "d", "DƊԁÐĎďĐđƉƋƌǷḊḋԀԂԃ"},
22+
{ "e", "EΕЕеÈÉÊËèéêëĒēĔĕĖėĘęĚěȄȅȆȇȨȩɆɇΈЀЁЄѐёҼҽҾҿӖӗἘἙἚἛἜἝῈΈ"},
23+
{ "f", "FϜƑƒḞḟϝҒғӺӻ"},
24+
{ "g", "GǤԌĜĝĞğĠġĢģƓǥǦǧǴǵԍ"},
25+
{ "h", "HΗНһհҺĤĥħǶȞȟΉἨἩἪἫἬἭἮἯᾘᾙᾚᾛᾜᾝᾞᾟῊΉῌЋнћҢңҤҥӇӈӉӊԊԋԦԧԨԩႬႹ𐒅𐒌𐒎𐒣"},
26+
{ "i", "IΙІӀ¡ìíîïǐȉȋΐίιϊіїὶίῐῑῒΐῖῗΊΪȊȈἰἱἲἳἴἵἶἷἸἹἺἻἼἽἾἿῘῙῚΊЇӏÌÍÎÏĨĩĪīĬĭĮįİǏ"},
27+
{ "j", "JЈͿϳĴĵǰȷ"},
28+
{ "k", "KΚКKĶķĸƘƙǨǩκϏЌкќҚқҜҝҞҟҠҡԞԟ"},
29+
{ "l", "LĹĺĻļĽľĿŀŁłſƖƪȴẛ"},
30+
{ "m", "MΜМṀṁϺϻмӍӎ𐒄"},
31+
{ "n", "NΝпÑñŃńŅņŇňʼnƝǸǹᾐᾑᾒᾓᾔᾕᾖᾗῂῃῄῆῇԤԥԮԯ𐒐"},
32+
{ "o", "OΟОՕჿоοÒÓÔÕÖðòóôõöøŌōŎŏŐőƠơǑǒǪǫǬǭȌȍȎȏȪȫȬȭȮȯȰȱΌδόϘϙὀὁὂὃὄὅὈὉὊὋὌὍὸόῸΌӦӧჾ𐒆𐒠0"},
33+
{ "p", "PΡРрρÞþƤƥƿṖṗϷϸῤῥῬҎҏႲႼ"},
34+
{ "q", "QգԛȡɊɋԚႭႳ"},
35+
{ "r", "RгŔŕŖŗŘřƦȐȑȒȓɌɼѓ"},
36+
{ "s", "SЅѕՏႽჽŚśŜŝŞşŠšȘșȿṠṡ𐒖𐒡"},
37+
{ "t", "TΤТͲͳŢţŤťŦŧƬƭƮȚțȾṪṫτтҬҭէ"},
38+
{ "u", "UՍႮÙÚÛÜùúûüŨũŪūŬŭŮůŰűŲųƯưǓǔǕǖǗǘǙǚǛǜȔȕȖȗμυϋύὐὑὒὓὔὕὖὗὺύῠῡῢΰῦῧ𐒩"},
39+
{ "v", "VνѴѵƔƲѶѷ"},
40+
{ "w", "WωшԜԝŴŵƜẀẁẂẃẄẅώШЩщѡѿὠὡὢὣὤὥὦὧὼώᾠᾡᾢᾣᾤᾥᾦᾧῲῳῴῶῷ"},
41+
{ "x", "XХΧх×χҲҳӼӽӾӿჯ"},
42+
{ "y", "YΥҮƳуУÝýÿŶŷŸƴȲȳɎɏỲỳΎΫγϒϓϔЎЧўүҶҷҸҹӋӌӮӯӰӱӲӳӴӵὙὛὝὟῨῩῪΎႯႸ𐒋𐒦"},
43+
{ "z", "ZΖჍŹźŻżŽžƵƶȤȥ"},
44+
{ "3", "ƷЗʒӡჳǮǯȜȝзэӞӟӠ"},
45+
{ "8", "Ȣȣ"},
46+
{ "_", ".-" }
4647
};
47-
48-
private static readonly IReadOnlyDictionary<char, char> NormalizedMappingDictionary = GetNormalizedMappingDictionary(SimilarCharacterDictionary);
48+
49+
private static readonly IReadOnlyDictionary<string, string> NormalizedMappingDictionary = GetNormalizedMappingDictionary(SimilarCharacterDictionary);
4950

5051
public static string NormalizeString(string str)
5152
{
52-
var normalizedStr = new StringBuilder(str);
53-
for (var i = 0; i < normalizedStr.Length; i++)
53+
var normalizedString = new StringBuilder();
54+
var textElementEnumerator = StringInfo.GetTextElementEnumerator(str);
55+
while (textElementEnumerator.MoveNext())
5456
{
55-
if (NormalizedMappingDictionary.TryGetValue(normalizedStr[i], out var normalizedCharacter))
57+
var textElement = textElementEnumerator.GetTextElement();
58+
if (NormalizedMappingDictionary.TryGetValue(textElement, out var normalizedTextElement))
59+
{
60+
normalizedString.Append(normalizedTextElement);
61+
}
62+
else
5663
{
57-
normalizedStr[i] = normalizedCharacter;
64+
normalizedString.Append(textElement);
5865
}
5966
}
6067

61-
return normalizedStr.ToString();
68+
return normalizedString.ToString();
6269
}
6370

64-
private static Dictionary<char, char> GetNormalizedMappingDictionary(IReadOnlyDictionary<char, string> similarCharacterDictionary)
71+
private static Dictionary<string, string> GetNormalizedMappingDictionary(IReadOnlyDictionary<string, string> similarCharacterDictionary)
6572
{
66-
var normalizedMappingDictionary = new Dictionary<char, char>();
73+
var normalizedMappingDictionary = new Dictionary<string, string>();
6774
foreach (var item in similarCharacterDictionary)
6875
{
69-
foreach (var c in item.Value)
76+
var textElementEnumerator = StringInfo.GetTextElementEnumerator(item.Value);
77+
while (textElementEnumerator.MoveNext())
7078
{
71-
normalizedMappingDictionary[c] = item.Key;
79+
normalizedMappingDictionary[textElementEnumerator.GetTextElement()] = item.Key;
7280
}
7381
}
7482

tests/NuGetGallery.Facts/Services/TyposquattingServiceFacts.cs

Lines changed: 76 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -2,11 +2,12 @@
22
// Licensed under the Apache License, Version 2.0. See License.txt in the project root for license information.
33

44
using System;
5-
using System.Collections.Generic;
65
using System.Linq;
6+
using System.Globalization;
7+
using System.Collections.Generic;
78
using Moq;
8-
using NuGet.Services.Entities;
99
using Xunit;
10+
using NuGet.Services.Entities;
1011

1112
namespace NuGetGallery
1213
{
@@ -451,8 +452,8 @@ public void CheckTyposquattingDistance(string str1, string str2, int threshold)
451452
str2 = TyposquattingStringNormalization.NormalizeString(str2);
452453

453454
// Act
454-
var checkResult = TyposquattingDistanceCalculation.IsDistanceLessThanThreshold(str1, str2, threshold);
455-
455+
var checkResult = TyposquattingDistanceCalculation.IsDistanceLessThanOrEqualToThreshold(str1, str2, threshold);
456+
456457
// Assert
457458
Assert.True(checkResult);
458459
}
@@ -470,13 +471,26 @@ public void CheckNotTyposquattingDistance(string str1, string str2, int threshol
470471
str2 = TyposquattingStringNormalization.NormalizeString(str2);
471472

472473
// Act
473-
var checkResult = TyposquattingDistanceCalculation.IsDistanceLessThanThreshold(str1, str2, threshold);
474-
474+
var checkResult = TyposquattingDistanceCalculation.IsDistanceLessThanOrEqualToThreshold(str1, str2, threshold);
475+
475476
// Assert
476477
Assert.False(checkResult);
477478
}
478-
479+
479480
[Theory]
481+
[InlineData("ă", "a")]
482+
[InlineData("aă", "aa")]
483+
[InlineData("aăăa", "aaaa")]
484+
[InlineData("𐒎", "h")]
485+
[InlineData("h𐒎", "hh")]
486+
[InlineData("h𐒎𐒎h", "hhhh")]
487+
[InlineData("aă𐒎a", "aaha")]
488+
[InlineData("a𐒎ăa", "ahaa")]
489+
[InlineData("aă𐒎ăa", "aahaa")]
490+
[InlineData("a𐒎ă𐒎a", "ahaha")]
491+
[InlineData("aă𐒎ă𐒎a", "aahaha")]
492+
[InlineData("aă𐒎𐒎ăă𐒎a", "aahhaaha")]
493+
[InlineData("aă𐒎𐒎a𐒎ăă𐒎ă𐒎a", "aahhahaahaha")]
480494
[InlineData("Microsoft_NetFramework_v1", "microsoft_netframework_v1")]
481495
[InlineData("Microsoft.netframework-v1", "microsoft_netframework_v1")]
482496
[InlineData("mícr0s0ft.nёtFrǎmȇwὀrk.v1", "microsoft_netframework_v1")]
@@ -488,5 +502,60 @@ public void CheckNormalization(string str1, string str2)
488502
// Assert
489503
Assert.Equal(str1, str2);
490504
}
505+
506+
[Fact]
507+
public void CheckNormalizationDictionary()
508+
{
509+
// Arrange
510+
var similarCharacterDictionary = new Dictionary<string, string>()
511+
{
512+
{ "a", "AΑАαаÀÁÂÃÄÅàáâãäåĀāĂ㥹ǍǎǞǟǠǡǺǻȀȁȂȃȦȧȺΆάἀἁἂἃἄἅἆἇἈἉἊἋἌΆἍἎἏӐӑӒӓὰάᾀᾁᾂᾃᾄᾅᾆᾇᾈᾊᾋᾌᾍᾎᾏᾰᾱᾲᾳᾴᾶᾷᾸᾹᾺᾼДд"},
513+
{ "b", "BΒВЪЬƀƁƂƃƄƅɃḂḃϦЂБвъьѢѣҌҍႦႪხҔҕӃӄ"},
514+
{ "c", "CСсϹϲÇçĆćĈĉĊċČčƇƈȻȼҪҫ𐒨"},
515+
{ "d", "DƊԁÐĎďĐđƉƋƌǷḊḋԀԂԃ"},
516+
{ "e", "EΕЕеÈÉÊËèéêëĒēĔĕĖėĘęĚěȄȅȆȇȨȩɆɇΈЀЁЄѐёҼҽҾҿӖӗἘἙἚἛἜἝῈΈ"},
517+
{ "f", "FϜƑƒḞḟϝҒғӺӻ"},
518+
{ "g", "GǤԌĜĝĞğĠġĢģƓǥǦǧǴǵԍ"},
519+
{ "h", "HΗНһհҺĤĥħǶȞȟΉἨἩἪἫἬἭἮἯᾘᾙᾚᾛᾜᾝᾞᾟῊΉῌЋнћҢңҤҥӇӈӉӊԊԋԦԧԨԩႬႹ𐒅𐒌𐒎𐒣"},
520+
{ "i", "IΙІӀ¡ìíîïǐȉȋΐίιϊіїὶίῐῑῒΐῖῗΊΪȊȈἰἱἲἳἴἵἶἷἸἹἺἻἼἽἾἿῘῙῚΊЇӏÌÍÎÏĨĩĪīĬĭĮįİǏ"},
521+
{ "j", "JЈͿϳĴĵǰȷ"},
522+
{ "k", "KΚКKĶķĸƘƙǨǩκϏЌкќҚқҜҝҞҟҠҡԞԟ"},
523+
{ "l", "LĹĺĻļĽľĿŀŁłſƖƪȴẛ"},
524+
{ "m", "MΜМṀṁϺϻмӍӎ𐒄"},
525+
{ "n", "NΝпÑñŃńŅņŇňʼnƝǸǹᾐᾑᾒᾓᾔᾕᾖᾗῂῃῄῆῇԤԥԮԯ𐒐"},
526+
{ "o", "OΟОՕჿоοÒÓÔÕÖðòóôõöøŌōŎŏŐőƠơǑǒǪǫǬǭȌȍȎȏȪȫȬȭȮȯȰȱΌδόϘϙὀὁὂὃὄὅὈὉὊὋὌὍὸόῸΌӦӧჾ𐒆𐒠0"},
527+
{ "p", "PΡРрρÞþƤƥƿṖṗϷϸῤῥῬҎҏႲႼ"},
528+
{ "q", "QգԛȡɊɋԚႭႳ"},
529+
{ "r", "RгŔŕŖŗŘřƦȐȑȒȓɌɼѓ"},
530+
{ "s", "SЅѕՏႽჽŚśŜŝŞşŠšȘșȿṠṡ𐒖𐒡"},
531+
{ "t", "TΤТͲͳŢţŤťŦŧƬƭƮȚțȾṪṫτтҬҭէ"},
532+
{ "u", "UՍႮÙÚÛÜùúûüŨũŪūŬŭŮůŰűŲųƯưǓǔǕǖǗǘǙǚǛǜȔȕȖȗμυϋύὐὑὒὓὔὕὖὗὺύῠῡῢΰῦῧ𐒩"},
533+
{ "v", "VνѴѵƔƲѶѷ"},
534+
{ "w", "WωшԜԝŴŵƜẀẁẂẃẄẅώШЩщѡѿὠὡὢὣὤὥὦὧὼώᾠᾡᾢᾣᾤᾥᾦᾧῲῳῴῶῷ"},
535+
{ "x", "XХΧх×χҲҳӼӽӾӿჯ"},
536+
{ "y", "YΥҮƳуУÝýÿŶŷŸƴȲȳɎɏỲỳΎΫγϒϓϔЎЧўүҶҷҸҹӋӌӮӯӰӱӲӳӴӵὙὛὝὟῨῩῪΎႯႸ𐒋𐒦"},
537+
{ "z", "ZΖჍŹźŻżŽžƵƶȤȥ"},
538+
{ "3", "ƷЗʒӡჳǮǯȜȝзэӞӟӠ"},
539+
{ "8", "Ȣȣ"},
540+
{ "_", ".-" }
541+
};
542+
543+
var testPackageName = "testpackage";
544+
foreach (var item in similarCharacterDictionary)
545+
{
546+
var textElementEnumerator = StringInfo.GetTextElementEnumerator(item.Value);
547+
while (textElementEnumerator.MoveNext())
548+
{
549+
var typoString = testPackageName + textElementEnumerator.GetTextElement();
550+
var baseString = testPackageName + item.Key;
551+
552+
// Act
553+
var normalizedString = TyposquattingStringNormalization.NormalizeString(typoString);
554+
555+
// Assert
556+
Assert.Equal(baseString, normalizedString);
557+
}
558+
}
559+
}
491560
}
492561
}

0 commit comments

Comments
 (0)