Skip to content

Commit 6090064

Browse files
committed
soup_adapter.py: migrate to BeautifulSoup4
1 parent: 190d7ae; commit: 6090064

1 file changed

Lines changed: 9 additions & 34 deletions

File tree

python/gumbo/soup_adapter.py

Lines changed: 9 additions & 34 deletions
Original file line number | Diff line number | Diff line change
@@ -13,15 +13,15 @@
1313
# limitations under the License.
1414
#
1515

16-
"""Adapter between Gumbo and BeautifulSoup.
16+
"""Adapter between Gumbo and BeautifulSoup4.
1717
1818
This parses an HTML document and gives back a BeautifulSoup object, which you
1919
can then manipulate like a normal BeautifulSoup parse tree.
2020
"""
2121

2222
__author__ = '[email protected] (Jonathan Tang)'
2323

24-
import BeautifulSoup
24+
import bs4
2525

2626
import gumboc
2727

@@ -45,7 +45,7 @@ def _convert_attrs(attrs):
4545
# TODO(jdtang): Ideally attributes would pass along their positions as well,
4646
# but I can't extend the built in str objects with new attributes. Maybe work
4747
# around this with a subclass in some way...
48-
return [(_utf8(attr.name), _utf8(attr.value)) for attr in attrs]
48+
return {_utf8(attr.name): _utf8(attr.value) for attr in attrs}
4949

5050

5151
def _add_document(soup, element):
@@ -57,8 +57,7 @@ def _add_document(soup, element):
5757
def _add_element(soup, element):
5858
# TODO(jdtang): Expose next/previous in gumbo so they can be passed along to
5959
# BeautifulSoup.
60-
tag = BeautifulSoup.Tag(
61-
soup, _utf8(element.tag_name), _convert_attrs(element.attributes))
60+
tag = bs4.Tag(parser = soup, name = _utf8(element.tag_name), attrs = _convert_attrs(element.attributes))
6261
for child in element.children:
6362
tag.append(_add_node(soup, child))
6463
_add_source_info(
@@ -78,10 +77,10 @@ def add_text_internal(soup, element):
7877
_HANDLERS = [
7978
_add_document,
8079
_add_element,
81-
_add_text(BeautifulSoup.NavigableString),
82-
_add_text(BeautifulSoup.CData),
83-
_add_text(BeautifulSoup.Comment),
84-
_add_text(BeautifulSoup.NavigableString),
80+
_add_text(bs4.NavigableString),
81+
_add_text(bs4.CData),
82+
_add_text(bs4.Comment),
83+
_add_text(bs4.NavigableString),
8584
_add_element,
8685
]
8786

@@ -90,32 +89,8 @@ def _add_node(soup, node):
9089
return _HANDLERS[node.type.value](soup, node.contents)
9190

9291

93-
def _add_next_prev_pointers(soup):
94-
def _traverse(node):
95-
# .findAll requires the .next pointer, which is what we're trying to add
96-
# when we call this, and so we manually supply a generator to yield the
97-
# nodes in DOM order.
98-
yield node
99-
try:
100-
for child in node.contents:
101-
for descendant in _traverse(child):
102-
yield descendant
103-
except AttributeError:
104-
# Not an element.
105-
return
106-
107-
nodes = sorted(_traverse(soup), key=lambda node: node.offset)
108-
if nodes:
109-
nodes[0].previous = None
110-
nodes[-1].next = None
111-
for i, node in enumerate(nodes[1:-1], 1):
112-
nodes[i-1].next = node
113-
node.previous = nodes[i-1]
114-
115-
11692
def parse(text, **kwargs):
11793
with gumboc.parse(text, **kwargs) as output:
118-
soup = BeautifulSoup.BeautifulSoup()
94+
soup = bs4.BeautifulSoup()
11995
soup.append(_add_node(soup, output.contents.root.contents))
120-
_add_next_prev_pointers(soup)
12196
return soup

0 commit comments

Comments (0)