1313# limitations under the License.
1414#
1515
16- """Adapter between Gumbo and BeautifulSoup .
16+ """Adapter between Gumbo and BeautifulSoup4 .
1717
1818This parses an HTML document and gives back a BeautifulSoup object, which you
1919can then manipulate like a normal BeautifulSoup parse tree.
2020"""
2121
2222__author__ = '[email protected] (Jonathan Tang)' 2323
24- import BeautifulSoup
24+ import bs4
2525
2626import gumboc
2727
@@ -45,7 +45,7 @@ def _convert_attrs(attrs):
4545 # TODO(jdtang): Ideally attributes would pass along their positions as well,
4646 # but I can't extend the built in str objects with new attributes. Maybe work
4747 # around this with a subclass in some way...
48- return [( _utf8 (attr .name ), _utf8 (attr .value )) for attr in attrs ]
48+ return { _utf8 (attr .name ): _utf8 (attr .value ) for attr in attrs }
4949
5050
5151def _add_document (soup , element ):
@@ -57,8 +57,7 @@ def _add_document(soup, element):
5757def _add_element (soup , element ):
5858 # TODO(jdtang): Expose next/previous in gumbo so they can be passed along to
5959 # BeautifulSoup.
60- tag = BeautifulSoup .Tag (
61- soup , _utf8 (element .tag_name ), _convert_attrs (element .attributes ))
60+ tag = bs4 .Tag (parser = soup , name = _utf8 (element .tag_name ), attrs = _convert_attrs (element .attributes ))
6261 for child in element .children :
6362 tag .append (_add_node (soup , child ))
6463 _add_source_info (
@@ -78,10 +77,10 @@ def add_text_internal(soup, element):
7877_HANDLERS = [
7978 _add_document ,
8079 _add_element ,
81- _add_text (BeautifulSoup .NavigableString ),
82- _add_text (BeautifulSoup .CData ),
83- _add_text (BeautifulSoup .Comment ),
84- _add_text (BeautifulSoup .NavigableString ),
80+ _add_text (bs4 .NavigableString ),
81+ _add_text (bs4 .CData ),
82+ _add_text (bs4 .Comment ),
83+ _add_text (bs4 .NavigableString ),
8584 _add_element ,
8685 ]
8786
@@ -90,32 +89,8 @@ def _add_node(soup, node):
9089 return _HANDLERS [node .type .value ](soup , node .contents )
9190
9291
93- def _add_next_prev_pointers (soup ):
94- def _traverse (node ):
95- # .findAll requires the .next pointer, which is what we're trying to add
96- # when we call this, and so we manually supply a generator to yield the
97- # nodes in DOM order.
98- yield node
99- try :
100- for child in node .contents :
101- for descendant in _traverse (child ):
102- yield descendant
103- except AttributeError :
104- # Not an element.
105- return
106-
107- nodes = sorted (_traverse (soup ), key = lambda node : node .offset )
108- if nodes :
109- nodes [0 ].previous = None
110- nodes [- 1 ].next = None
111- for i , node in enumerate (nodes [1 :- 1 ], 1 ):
112- nodes [i - 1 ].next = node
113- node .previous = nodes [i - 1 ]
114-
115-
11692def parse (text , ** kwargs ):
11793 with gumboc .parse (text , ** kwargs ) as output :
118- soup = BeautifulSoup .BeautifulSoup ()
94+ soup = bs4 .BeautifulSoup ()
11995 soup .append (_add_node (soup , output .contents .root .contents ))
120- _add_next_prev_pointers (soup )
12196 return soup
0 commit comments