Skip to content

Commit e6b0102

Browse files
authored
Merge pull request CloudBotIRC#141 from linuxdaemon/gonzobot+fix-url-regex
Update link announcer regex to be more precise
2 parents 74c9c76 + 1c99c78 commit e6b0102

2 files changed

Lines changed: 109 additions & 4 deletions

File tree

plugins/link_announcer.py

Lines changed: 35 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,36 @@
88
from cloudbot.hook import Priority, Action
99

1010
# This will match any URL, blacklist removed and abstracted to a priority/halting system
11-
url_re = re.compile(r'https?://(?:[a-zA-Z]|[0-9]|[$-_@.&+~]|[!*(),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', re.I)
11+
# Matches http/https URLs: optional "user[:password]@" userinfo, a host
# (hostname, dotted IPv4 or bracketed IPv6), an optional port, then any
# number of path segments, an optional query and an optional fragment.
#
# The pattern is compiled with re.VERBOSE, so whitespace and "#" comments
# outside character classes are ignored; a literal "#" is written as "\#"
# or placed inside a character class. re.IGNORECASE makes the scheme and
# the hex-digit classes case-insensitive.
url_re = re.compile(
    r"""
    https? # Scheme
    ://

    # Username and Password
    (?:
        (?:[^\[\]?/<~#`!@$%^&*()=+}|:";',>{\s]|%[0-9A-F]{2})*
        (?::(?:[^\[\]?/<~#`!@$%^&*()=+}|:";',>{\s]|%[0-9A-F]{2})*)?
        @
    )?

    # Domain
    (?:
        # TODO Add support for IDNA hostnames as specified by RFC5891
        [\-.0-9A-Za-z]+| # host
        \d{1,3}(?:\.\d{1,3}){3}| # IPv4
        \[[A-F0-9]{0,4}(?::[A-F0-9]{0,4}){2,7}\] # IPv6
    )

    (?::\d*)? # port

    (?:/(?:[A-Za-z0-9!$&-.:;=@_~\u00A0-\U0010FFFD]|%[A-F0-9]{2})*)* # Path segment

    (?:\?(?:[A-Za-z0-9!$&-;=@_~\u00A0-\U0010FFFD]|%[A-F0-9]{2})*)? # Query

    (?:\#(?:[A-Za-z0-9!$&-;=@_~\u00A0-\U0010FFFD]|%[A-F0-9]{2})*)? # Fragment
    """,
    re.IGNORECASE | re.VERBOSE
)
1241

1342
HEADERS = {
1443
'Accept-Language': 'en-US,en;q=0.5',
@@ -32,6 +61,8 @@ def print_url_title(message, match):
3261
return
3362

3463
html = BeautifulSoup(content, "lxml", from_encoding=encoding)
35-
title = " ".join(html.title.text.strip().splitlines())
36-
out = "Title: \x02{}\x02".format(title)
37-
message(out)
64+
65+
if html.title:
66+
title = html.title.text
67+
out = "Title: \x02{}\x02".format(title)
68+
message(out)
Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,74 @@
1+
from plugins.link_announcer import url_re
2+
3+
# URLs that url_re is expected to match in full (checked with fullmatch).
# NOTE(review): the userinfo entries were mangled to "[email protected]" by
# e-mail obfuscation in the page this was recovered from; they are restored
# here as userid[:password]@example.com -- confirm against the repository.
MATCHES = (
    "http://foo.com/blah_blah",
    "http://foo.com/blah_blah/",
    "http://foo.com/blah_blah_(wikipedia)",
    "http://foo.com/blah_blah_(wikipedia)_(again)",
    "http://www.example.com/wpstyle/?p=364",
    "https://www.example.com/foo/?bar=baz&inga=42&quux",
    "http://userid:password@example.com:8080",
    "http://userid:password@example.com:8080/",
    # NOTE(review): this entry was blank in the recovered text; presumably
    # the bare userid form, the only combination otherwise missing -- verify.
    "http://userid@example.com",
    "http://userid@example.com/",
    "http://userid@example.com:8080",
    "http://userid@example.com:8080/",
    "http://userid:password@example.com",
    "http://userid:password@example.com/",
    "http://142.42.1.1/",
    "http://142.42.1.1:8080/",
    "http://foo.com/blah_(wikipedia)#cite-1",
    "http://foo.com/blah_(wikipedia)_blah#cite-1",
    "http://foo.com/unicode_(✪)_in_parens",
    "http://foo.com/(something)?after=parens",
    "http://code.google.com/events/#&product=browser",
    "http://j.mp",
    "http://foo.bar/?q=Test%20URL-encoded%20stuff",
    "http://1337.net",
    "http://a.b-c.de",
    "http://223.255.255.254",
)
31+
32+
# Strings that url_re must NOT match in full: missing or empty hosts,
# unsupported schemes, and URLs containing unencoded spaces.
FAILS = (
    "http://",
    "http://?",
    "http://??",
    "http://??/",
    "http://#",
    "http://##",
    "http://##/",
    "http://foo.bar?q=Spaces should be encoded",
    "//",
    "//a",
    "///a",
    "///",
    "http:///a",
    "foo.com",
    "rdar://1234",
    "h://test",
    "http:// shouldfail.com",
    ":// should fail",
    "http://foo.bar/foo(bar)baz quux",
    "ftps://foo.bar/",
)
54+
55+
# (text, expected) pairs: url_re.search(text) must find exactly `expected`,
# i.e. the URL without the surrounding brackets, quotes or angle brackets.
SEARCH = (
    ("[https://example.com]", "https://example.com"),
    ("<a hreh=\"https://example.com/test.page?#test\">", "https://example.com/test.page?#test"),
    ("<https://www.example.com/this.is.a.test/blah.txt?a=1#123>", "https://www.example.com/this.is.a.test/blah.txt?a=1#123"),
)
60+
61+
62+
def test_urls():
    """Check url_re.fullmatch against the MATCHES and FAILS fixtures.

    Every entry in MATCHES must match in full; no entry in FAILS may.
    The assertion message is the offending URL (or the text that was
    unexpectedly matched) so a failure identifies the fixture entry.
    """
    for candidate in MATCHES:
        assert url_re.fullmatch(candidate) is not None, candidate

    for candidate in FAILS:
        hit = url_re.fullmatch(candidate)
        # The message is only evaluated when the assertion fails, i.e. when
        # `hit` is a real match object, so `.group()` is safe here.
        assert hit is None, hit.group()
69+
70+
71+
def test_search():
    """Check that url_re.search extracts the expected URL from each SEARCH text."""
    for haystack, expected in SEARCH:
        found = url_re.search(haystack)
        assert found is not None and found.group() == expected

0 commit comments

Comments
 (0)