Skip to content

Commit 8fc4ba1

Browse files
authored
Make link_announcer respect page encoding headers (CloudBotIRC#206)
* Make link_announcer.py respect the HTML-specified page encoding. * Clean up the re-encoding logic. * Add unit tests for parsing the page encoding in the link announcer.
1 parent 3308aec commit 8fc4ba1

2 files changed

Lines changed: 56 additions & 3 deletions

File tree

plugins/link_announcer.py

Lines changed: 29 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -60,20 +60,48 @@ def no_parens(pattern):
6060
MAX_RECV = 1000000
6161

6262

63+
def get_encoding(soup):
    """Return the character encoding declared inside an HTML document.

    Checks for a ``<meta charset="...">`` tag first; failing that, falls
    back to a ``<meta http-equiv="Content-Type" content="...">`` tag and
    lets requests parse the charset out of the content-type value.
    Returns ``None`` when the page declares no encoding.
    """
    charset_tag = soup.find('meta', charset=True)
    if charset_tag:
        return charset_tag['charset']

    content_type_tag = soup.find(
        'meta',
        {'http-equiv': lambda v: v and v.lower() == 'content-type', 'content': True}
    )
    if content_type_tag:
        # Reuse requests' header parser on the meta tag's content value.
        return requests.utils.get_encoding_from_headers(
            {'content-type': content_type_tag['content']}
        )

    return None
76+
77+
78+
def parse_content(content, encoding=None):
    """Parse raw HTML bytes, honoring any encoding declared in the page.

    The content is first parsed using the caller-supplied *encoding*
    (typically taken from the HTTP headers). If the document itself
    declares a different encoding, the content is re-parsed with the
    declared one, since the page's own declaration is authoritative.
    Returns the resulting BeautifulSoup document.
    """
    initial_encoding = encoding
    html = BeautifulSoup(content, "lxml", from_encoding=initial_encoding)

    declared_encoding = get_encoding(html)
    if declared_encoding is not None and declared_encoding != initial_encoding:
        # Re-parse only when the in-page declaration differs from the hint.
        html = BeautifulSoup(content, "lxml", from_encoding=declared_encoding)

    return html
88+
89+
6390
@hook.regex(url_re, priority=Priority.LOW, action=Action.HALTTYPE, only_no_match=True)
6491
def print_url_title(message, match):
6592
with closing(requests.get(match.group(), headers=HEADERS, stream=True, timeout=3)) as r:
6693
r.raise_for_status()
6794
if not r.encoding:
6895
return
6996

97+
# TODO Switch to reading chunks until full title is found, up to MAX_RECV bytes
7098
content = r.raw.read(MAX_RECV + 1, decode_content=True)
7199
encoding = r.encoding
72100

73101
if len(content) > MAX_RECV:
74102
return
75103

76-
html = BeautifulSoup(content, "lxml", from_encoding=encoding)
104+
html = parse_content(content, encoding)
77105

78106
if html.title:
79107
title = html.title.text

tests/plugin_tests/test_link_announcer.py

Lines changed: 27 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,8 @@
1-
from plugins.link_announcer import url_re
1+
import codecs
2+
3+
from bs4 import BeautifulSoup
4+
5+
from plugins.link_announcer import url_re, get_encoding
26

37
MATCHES = (
48
"http://foo.com/blah_blah",
@@ -65,7 +69,8 @@
6569
("(https://foo.bar)", "https://foo.bar"),
6670
("[https://example.com]", "https://example.com"),
6771
("<a hreh=\"https://example.com/test.page?#test\">", "https://example.com/test.page?#test"),
68-
("<https://www.example.com/this.is.a.test/blah.txt?a=1#123>", "https://www.example.com/this.is.a.test/blah.txt?a=1#123"),
72+
("<https://www.example.com/this.is.a.test/blah.txt?a=1#123>",
73+
"https://www.example.com/this.is.a.test/blah.txt?a=1#123"),
6974
)
7075

7176

@@ -82,3 +87,23 @@ def test_search():
8287
for text, out in SEARCH:
8388
match = url_re.search(text)
8489
assert match and match.group() == out
90+
91+
92+
# (raw HTML snippet, expected codec) pairs for test_encoding_parse.
# codecs.lookup() normalizes aliases, so 'utf8' and 'utf-8' both resolve
# to the same cached CodecInfo object, making equality comparison valid.
ENCODINGS = (
    (b'<meta charset="utf8">', codecs.lookup('utf8')),
    (b'', None),
    (b'<meta http-equiv="Content-Type" content="text/html; charset=utf-8">', codecs.lookup('utf8')),
)
97+
98+
99+
def test_encoding_parse():
    """Verify get_encoding() extracts the declared charset from sample HTML.

    Each ENCODINGS entry pairs an HTML snippet with the expected codec
    (a codecs.CodecInfo) or None when no encoding is declared.
    """
    for text, enc in ENCODINGS:
        soup = BeautifulSoup(text, "lxml")
        encoding = get_encoding(soup)
        if encoding is None:
            assert enc is None, "Got empty encoding from {!r} expected {!r}".format(text, enc)
            continue

        enc_obj = codecs.lookup(encoding)

        # Bug fix: the original `assert enc, enc_obj` only checked that
        # `enc` was truthy (enc_obj was merely the failure message) and
        # never compared the parsed encoding to the expected one.
        assert enc == enc_obj, "Got encoding {!r} from {!r} expected {!r}".format(enc_obj, text, enc)

0 commit comments

Comments
 (0)