Skip to content

Commit b1c2839

Browse files
UserAgentParser parsing (#10114)
A building block for NuGet/Engineering#5082
1 parent 920fa54 commit b1c2839

9 files changed

Lines changed: 447 additions & 4 deletions

File tree

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
{
2+
"version": "0.2.0",
3+
"configurations": [
4+
{
5+
"name": "Debug User Agent Test Module",
6+
"type": "debugpy",
7+
"request": "launch",
8+
"module": "tests.test_useragentparser"
9+
},
10+
{
11+
"name": "Python Debugger: Current File",
12+
"type": "debugpy",
13+
"request": "launch",
14+
"program": "${file}",
15+
"console": "integratedTerminal"
16+
}
17+
]
18+
}

python/StatsLogParser/MANIFEST.in

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
include loginterpretation/knownclients.yaml

python/StatsLogParser/StatsLogParser.pyproj

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@
3131
<SubType>Code</SubType>
3232
</Compile>
3333
<Compile Include="loginterpretation\semanticversion.py" />
34+
<Compile Include="loginterpretation\useragentparser.py" />
3435
<Compile Include="loginterpretation\__init__.py">
3536
<SubType>Code</SubType>
3637
</Compile>
@@ -51,6 +52,7 @@
5152
</Interpreter>
5253
</ItemGroup>
5354
<ItemGroup>
55+
<Content Include="loginterpretation\knownclients.yaml" />
5456
<Content Include="requirements.txt" />
5557
</ItemGroup>
5658
<ItemGroup>
Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +0,0 @@
1-
"""
2-
Log interpretation package
3-
"""
Lines changed: 222 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,222 @@
1+
user_agent_parsers:
2+
# NuGet MSBuild Task
3+
- regex: '(NuGet MSBuild Task)/(\d+)\.(\d+)\.?(\d+)?'
4+
family_replacement: 'NuGet MSBuild Task'
5+
6+
- regex: '(NuGet .NET Core MSBuild Task)/(\d+)\.(\d+)\.?(\d+)?'
7+
family_replacement: 'NuGet .NET Core MSBuild Task'
8+
9+
- regex: '(NuGet Desktop MSBuild Task)/(\d+)\.(\d+)\.?(\d+)?'
10+
family_replacement: 'NuGet Desktop MSBuild Task'
11+
12+
# NuGet Client V3
13+
- regex: '(NuGet Client V3)/(\d+)\.(\d+)\.?(\d+)?'
14+
family_replacement: 'NuGet Client V3'
15+
16+
# NuGet VS PowerShell Console (NuGet 2.8+)
17+
- regex: '(NuGet VS PowerShell Console)/(\d+)\.(\d+)\.?(\d+)?'
18+
family_replacement: 'NuGet VS PowerShell Console'
19+
20+
# NuGet VS Packages Dialog - Solution (NuGet 2.8+)
21+
- regex: '(NuGet VS Packages Dialog - Solution)/(\d+)\.(\d+)\.?(\d+)?'
22+
family_replacement: 'NuGet VS Packages Dialog - Solution'
23+
24+
# NuGet VS Packages Dialog (NuGet 2.8+)
25+
- regex: '(NuGet VS Packages Dialog)/(\d+)\.(\d+)\.?(\d+)?'
26+
family_replacement: 'NuGet VS Packages Dialog - Solution'
27+
28+
# NuGet Add Package Dialog (pre-NuGet 2.8)
29+
- regex: '(NuGet Add Package Dialog)/(\d+)\.(\d+)\.?(\d+)?'
30+
family_replacement: 'NuGet Add Package Dialog'
31+
32+
# NuGet Package Manager Console (pre-NuGet 2.8)
33+
- regex: '(NuGet Package Manager Console)/(\d+)\.(\d+)\.?(\d+)?'
34+
family_replacement: 'NuGet Package Manager Console'
35+
36+
# NuGet Visual Studio Extension (pre-NuGet 2.8)
37+
- regex: '(NuGet Visual Studio Extension)/(\d+)\.(\d+)\.?(\d+)?'
38+
family_replacement: 'NuGet Visual Studio Extension'
39+
40+
# Package-Installer (pre-NuGet 2.8)
41+
- regex: '(Package-Installer)/(\d+)\.(\d+)\.?(\d+)?'
42+
family_replacement: 'Package-Installer'
43+
44+
# NuGet Command Line
45+
- regex: '(NuGet Command Line)/(\d+)\.(\d+)\.?(\d+)?'
46+
family_replacement: 'NuGet Command Line'
47+
48+
# NuGet xplat CLI
49+
- regex: '(NuGet xplat)/(\d+)\.(\d+)\.?(\d+)?'
50+
family_replacement: 'NuGet Cross-Platform Command Line'
51+
52+
# NuGet Core
53+
- regex: '(NuGet Core)/?(\d+)\.(\d+)\.?(\d+)?'
54+
family_replacement: 'NuGet'
55+
56+
# WebMatrix includes its own core version number as part of the client name, before the slash
57+
- regex: '(WebMatrix) (\d+)\.(\d+)\.?(\d+)?'
58+
family_replacement: 'WebMatrix'
59+
60+
# NuGet Package Explorer (npe.codeplex.com)
61+
- regex: '(NuGet Package Explorer Metro)/(\d+)\.(\d+)\.?(\d+)?'
62+
family_replacement: 'NuGet Package Explorer Metro'
63+
- regex: '(NuGet Package Explorer)/(\d+)\.(\d+)\.?(\d+)?'
64+
family_replacement: 'NuGet Package Explorer'
65+
66+
# JetBrains TeamCity - uses a space to separate the client from the version instead of slash
67+
- regex: '(JetBrains TeamCity) (\d+)\.(\d+)\.?(\d+)?'
68+
family_replacement: 'JetBrains TeamCity'
69+
70+
# JetBrains ReSharper Platform for VS10
71+
- regex: '(ReSharperPlatformVs10)/?(\d+)\.(\d+)\.?(\d+)?'
72+
family_replacement: 'JetBrains ReSharper Platform VS2010'
73+
74+
# JetBrains ReSharper Platform for VS11
75+
- regex: '(ReSharperPlatformVs11)/?(\d+)\.(\d+)\.?(\d+)?'
76+
family_replacement: 'JetBrains ReSharper Platform VS2012'
77+
78+
# JetBrains ReSharper Platform for VS12
79+
- regex: '(ReSharperPlatformVs12)/?(\d+)\.(\d+)\.?(\d+)?'
80+
family_replacement: 'JetBrains ReSharper Platform VS2013'
81+
82+
# JetBrains ReSharper Platform for VS14
83+
- regex: '(ReSharperPlatformVs14)/?(\d+)\.(\d+)\.?(\d+)?'
84+
family_replacement: 'JetBrains ReSharper Platform VS2015'
85+
86+
# JetBrains ReSharper
87+
- regex: '(ReSharper)/(\d+)?\.?(\d+)?\.?(\d+)?'
88+
family_replacement: 'JetBrains ReSharper'
89+
90+
# JetBrains dotPeek
91+
- regex: '(dotPeek)/(\d+)\.(\d+)\.(\d+)\.?(\d+)?\)'
92+
family_replacement: 'JetBrains dotPeek'
93+
94+
# JetBrains ReSharper Extension Manager
95+
- regex: '(ReSharper Extension Manager)/(\d+)?\.?(\d+)?\.?(\d+)?'
96+
family_replacement: 'JetBrains ReSharper Extension Manager'
97+
98+
# Sonatype Nexus (www.sonatype.com)
99+
- regex: '(Nexus)/(\d+)\.(\d+)\.?(\d+)?'
100+
family_replacement: 'Sonatype Nexus'
101+
102+
# JFrog Artifactory (www.jfrog.com)
103+
- regex: '(Artifactory)/(\d+)\.(\d+)\.?(\d+)?'
104+
family_replacement: 'JFrog Artifactory'
105+
106+
# MyGet (www.myget.org)
107+
- regex: '(MyGet)/?(\d+)?\.?(\d+)?\.?(\d+)?'
108+
family_replacement: 'MyGet'
109+
110+
# ProGet (www.inedo.com)
111+
- regex: '(ProGet)/(\d+)\.(\d+)\.?(\d+)?'
112+
family_replacement: 'Inedo ProGet'
113+
114+
# Paket (http://fsprojects.github.io/Paket)
115+
- regex: '(Paket)/?(\d+)?\.?(\d+)?\.?(\d+)?'
116+
family_replacement: 'Paket'
117+
118+
# Xamarin Studio (www.xamarin.com)
119+
- regex: '(Xamarin Studio)/(\d+)\.(\d+)\.?(\d+)?'
120+
family_replacement: 'Xamarin Studio'
121+
122+
# MonoDevelop
123+
- regex: '(MonoDevelop)/(\d+)\.(\d+)\.?(\d+)?'
124+
family_replacement: 'MonoDevelop'
125+
126+
# MonoDevelop-Unity
127+
- regex: '(MonoDevelop-Unity)/(\d+)\.(\d+)\.?(\d+)?'
128+
family_replacement: 'MonoDevelop'
129+
130+
# SharpDevelop
131+
- regex: '(SharpDevelop)/(\d+)\.(\d+)\.?(\d+)?'
132+
family_replacement: 'SharpDevelop'
133+
134+
# DNX
135+
- regex: '(Microsoft_.NET_Development_Utility)/(\d+)\.(\d+)\.?(\d+)?'
136+
family_replacement: 'DNX Utility'
137+
- regex: '(NuGet Shim)/?(\d+)?\.?(\d+)?\.?(\d+)?'
138+
family_replacement: 'NuGet Shim'
139+
140+
# PowerShell
141+
- regex: '(WindowsPowerShell)/(\d+)\.(\d+)\.?(\d+)?'
142+
family_replacement: 'Windows PowerShell'
143+
144+
# PowerShell Core
145+
- regex: 'Mozilla.*(PowerShell)/(\d+)\.(\d+)\.?(\d+)?'
146+
family_replacement: 'PowerShell Core'
147+
148+
# Fiddler
149+
- regex: '(Fiddler)/?(\d+)?\.?(\d+)?\.?(\d+)?'
150+
family_replacement: 'Fiddler'
151+
152+
# curl
153+
- regex: '(curl)/?(\d+)?\.?(\d+)?\.?(\d+)?'
154+
family_replacement: 'curl'
155+
156+
# Java
157+
- regex: '(Java)/(\d+)\.(\d+)\.?(\d+)?'
158+
family_replacement: 'Java'
159+
160+
# NuGet Test Client - to be used when making test-calls to nuget.org endpoints
161+
- regex: '(NuGet Test Client)/?(\d+)?\.?(\d+)?\.?(\d+)?'
162+
family_replacement: 'NuGet Test Client'
163+
164+
# Cake NuGet Client
165+
- regex: '(Cake NuGet Client)/(\d+)\.(\d+)\.?(\d+)?'
166+
family_replacement: 'Cake NuGet Client'
167+
168+
# Cake
169+
- regex: '(Cake)/(\d+)\.(\d+)\.?(\d+)?'
170+
family_replacement: 'Cake'
171+
172+
# NuGet VS VSIX
173+
- regex: '(NuGet VS VSIX)/(\d+)\.(\d+)\.?(\d+)?'
174+
family_replacement: 'NuGet VS VSIX'
175+
176+
# Xamarin Updater
177+
- regex: '(Xamarin Updater).*?\(Version: (\d+)\.(\d+)\.?(\d+)?'
178+
family_replacement: 'Xamarin Updater'
179+
180+
# vsts-task-installer
181+
- regex: '(vsts-task-installer)/(\d+)\.(\d+)\.?(\d+)?'
182+
family_replacement: 'vsts-task-installer'
183+
184+
# Checkmarx 1
185+
- regex: '(Checkmarx-NugetSourceCodePriorityCollector)'
186+
family_replacement: 'Checkmarx NugetSourceCodePriorityCollector'
187+
188+
# Checkmarx 2
189+
- regex: '(Checkmarx-NugetShaCollector)'
190+
family_replacement: 'Checkmarx NugetShaCollector'
191+
192+
# Checkmarx 3
193+
- regex: '(Checkmarx-NugetDllShaCollector)'
194+
family_replacement: 'Checkmarx NugetDllShaCollector'
195+
196+
# Checkmarx 4
197+
- regex: '(Checkmarx-SourceCodeDownloader)'
198+
family_replacement: 'Checkmarx SourceCodeDownloader'
199+
200+
# Azure artifacts
201+
- regex: '(AzureArtifacts)/(\d+)\.(\d+)\.?(\d+)?'
202+
family_replacement: 'Azure artifacts'
203+
204+
# Bazel
205+
- regex: '(Bazel)/.*?(\d+)\.(\d+)\.?(\d+)?'
206+
family_replacement: 'Bazel'
207+
208+
# Visual Studio
209+
- regex: '(Visual Studio)/(\d+)\.(\d+)\.?(\d+)?'
210+
family_replacement: 'Visual Studio'
211+
212+
# NuGetMirror
213+
- regex: '(NuGetMirror)/(\d+)\.(\d+)\.?(\d+)?'
214+
family_replacement: 'NuGetMirror'
215+
216+
# BaGet
217+
- regex: '(BaGet)/(\d+)\.(\d+)\.?(\d+)?'
218+
family_replacement: 'BaGet'
219+
220+
# NuGet - Keep this one at the bottom of this file as a catch-all resolver
221+
- regex: '(NuGet)/?(\d+)\.(\d+)\.?(\d+)?'
222+
family_replacement: 'NuGet'
Lines changed: 122 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,122 @@
1+
from __future__ import annotations
2+
from collections import namedtuple
3+
from typing import Optional
4+
import re
5+
import pkg_resources
6+
from ua_parser import user_agent_parser
7+
from ua_parser._regexes import USER_AGENT_PARSERS
8+
import yaml
9+
10+
# Parse result: the client family plus its major/minor/patch version parts
# (a version part is None when it was not captured).
UserAgent = namedtuple(
    typename='UserAgent',
    field_names=['family', 'major', 'minor', 'patch'],
)
11+
12+
class UserAgentParser:
    """Parse user agent strings into ``UserAgent`` tuples.

    Three parser lists are tried in order until one yields a family other
    than "Other":

    1. ``KNOWN_CLIENTS_DATA`` - built from the packaged ``knownclients.yaml``.
    2. ``KNOWN_CLIENTS_IN_CHINA_DATA`` - the same YAML, patched so that spaces
       inside the leading capture group of each regex match a literal ``+``.
    3. ``DEFAULT_PARSER_DATA`` - the stock ``ua_parser`` regex set.

    Results are memoized in a small class-level cache keyed by the raw
    user agent string.
    """

    DEFAULT_PARSER_DATA = USER_AGENT_PARSERS
    # Both lists are empty until __static_init__() has run.
    KNOWN_CLIENTS_DATA: list[user_agent_parser.UserAgentParser] = []
    KNOWN_CLIENTS_IN_CHINA_DATA: list[user_agent_parser.UserAgentParser] = []

    # Parse-result cache; see _lookup() for the eviction rule.
    _MAX_CACHE_SIZE = 200
    _PARSE_CACHE: dict[str, UserAgent] = {}

    @classmethod
    def __static_init__(cls):
        """Load both known-clients parser lists into the class attributes."""
        cls.KNOWN_CLIENTS_DATA = cls._load_known_clients_parser()
        cls.KNOWN_CLIENTS_IN_CHINA_DATA = cls._load_known_clients_in_china_parser()

    @staticmethod
    def _load_known_clients_parser():
        """Build the parser list from the unmodified ``knownclients.yaml``."""
        yaml_content = UserAgentParser._read_known_clients_yaml()
        return UserAgentParser._create_parser_data_from_yaml(yaml_content)

    @staticmethod
    def _load_known_clients_in_china_parser():
        """Build the parser list from ``knownclients.yaml`` after the China CDN patch."""
        yaml_content = UserAgentParser._read_known_clients_yaml()
        patched_yaml = UserAgentParser._add_support_for_china_cdn(yaml_content)
        return UserAgentParser._create_parser_data_from_yaml(patched_yaml)

    @staticmethod
    def _add_support_for_china_cdn(yaml_content):
        """Return ``yaml_content`` with spaces in leading regex groups turned into ``\\+``.

        Only a capture group that directly follows ``: '`` (i.e. the first
        group of a ``regex:`` value) and consists solely of word characters,
        ``-``, ``.`` and whitespace is rewritten; everything else is left
        untouched.
        NOTE(review): presumably user agents logged through the China CDN
        carry ``+`` where the client sent a space - confirm against real logs.
        """
        # re.DOTALL has no effect on this pattern (its only "." sits inside a
        # character class); it is kept to leave the call unchanged.
        patched_yaml = re.sub(
            r"(?:[:]\s'\()+([\w\-.\s]+)(?:\))+",
            UserAgentParser._replace_whitespace_with_plus_sign,
            yaml_content,
            flags=re.DOTALL
        )
        return patched_yaml

    @staticmethod
    def _replace_whitespace_with_plus_sign(match):
        """``re.sub`` callback: rebuild the group with each space replaced by ``\\+``."""
        return ": '(" + match.group(1).replace(" ", r"\+") + ")"

    @staticmethod
    def _read_known_clients_yaml() -> str:
        """Return the text of the packaged ``knownclients.yaml``.

        The file is opened as ``utf-8-sig`` so a leading BOM, if present, is
        dropped before the text reaches the YAML loader.
        """
        file_name = pkg_resources.resource_filename(__name__, 'knownclients.yaml')
        with open(file_name, 'r', encoding='utf-8-sig') as file:
            yaml_file = file.read()

        return yaml_file

    @staticmethod
    def _create_parser_data_from_yaml(yaml_content) -> list[user_agent_parser.UserAgentParser]:
        """Turn YAML text into a list of ``ua_parser`` parser objects.

        The YAML must contain a top-level ``user_agent_parsers`` sequence whose
        entries each have a ``regex`` key; ``family_replacement``,
        ``v1_replacement`` and ``v2_replacement`` are optional.

        Raises:
            KeyError: if ``user_agent_parsers`` or an entry's ``regex`` is missing.
        """
        data = yaml.safe_load(yaml_content)

        parsers: list[user_agent_parser.UserAgentParser] = []

        for parser in data["user_agent_parsers"]:
            regex = parser["regex"]

            family_replacement = parser.get("family_replacement")
            v1_replacement = parser.get("v1_replacement")
            v2_replacement = parser.get("v2_replacement")

            parsers.append(
                user_agent_parser.UserAgentParser(
                    regex, family_replacement, v1_replacement, v2_replacement
                )
            )

        return parsers

    @staticmethod
    def _lookup(ua: str) -> Optional[UserAgent]:
        """Return the cached result for ``ua``, or ``None`` on a cache miss.

        On a miss, if the cache already holds ``_MAX_CACHE_SIZE`` entries it is
        emptied completely, so the caller's subsequent insert starts a fresh
        cache.
        """
        entry = UserAgentParser._PARSE_CACHE.get(ua)
        if entry is not None:
            return entry

        if len(UserAgentParser._PARSE_CACHE) >= UserAgentParser._MAX_CACHE_SIZE:
            UserAgentParser._PARSE_CACHE.clear()

        return None

    @staticmethod
    def parse(user_agent_string):
        """Parse using known clients parser, then known clients in China parser, then default parser.

        Args:
            user_agent_string: The raw user agent string to classify.

        Returns:
            A ``UserAgent`` tuple. ``family`` is "Other" when no parser
            matched; version parts that were not captured are ``None``.
        """
        entry = UserAgentParser._lookup(user_agent_string)

        if entry is not None:
            return entry

        # Try known clients parser
        entry = UserAgentParser._parse_user_agent_with_parsers(user_agent_string, UserAgentParser.KNOWN_CLIENTS_DATA)

        if entry.family.lower() == 'other':  # Try China parser
            entry = UserAgentParser._parse_user_agent_with_parsers(user_agent_string, UserAgentParser.KNOWN_CLIENTS_IN_CHINA_DATA)

        if entry.family.lower() == 'other':  # Try default parser
            entry = UserAgentParser._parse_user_agent_with_parsers(user_agent_string, UserAgentParser.DEFAULT_PARSER_DATA)

        UserAgentParser._PARSE_CACHE[user_agent_string] = entry
        return entry

    @staticmethod
    def _parse_user_agent_with_parsers(user_agent_string: str, parsers: list[user_agent_parser.UserAgentParser]) -> UserAgent:
        """Run ``parsers`` in order and return the first match as a ``UserAgent``.

        Args:
            user_agent_string: The raw user agent string.
            parsers: Parser objects exposing ``Parse(str)``, which yields a
                ``(family, v1, v2, v3)`` tuple.

        Returns:
            The first result with a truthy family, or a ``UserAgent`` whose
            family is "Other" when nothing matched or ``parsers`` is empty.
        """
        # Pre-bind the results so an empty parser list falls through to
        # "Other" instead of raising UnboundLocalError after the loop.
        family = v1 = v2 = v3 = None

        for ua_parser in parsers:
            family, v1, v2, v3 = ua_parser.Parse(user_agent_string)
            if family:
                break

        family = family or "Other"
        # Falsy version parts (e.g. empty strings) are normalized to None.
        return UserAgent(family, v1 or None, v2 or None, v3 or None)
121+
122+
# Populate the known-clients parser lists once, when this module is imported.
UserAgentParser.__static_init__()

0 commit comments

Comments
 (0)