Skip to content

Commit 7811944

Browse files
committed
Fix parser to comply with the most recent spec changes to <select> logic.
Now we pass 25 more tests from the updated html5lib-tests (see whatwg/html#10557).
1 parent 0d5cb1d commit 7811944

2 files changed

Lines changed: 68 additions & 194 deletions

File tree

src/insertion_mode.h

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -40,8 +40,6 @@ typedef enum {
4040
GUMBO_INSERTION_MODE_IN_TABLE_BODY,
4141
GUMBO_INSERTION_MODE_IN_ROW,
4242
GUMBO_INSERTION_MODE_IN_CELL,
43-
GUMBO_INSERTION_MODE_IN_SELECT,
44-
GUMBO_INSERTION_MODE_IN_SELECT_IN_TABLE,
4543
GUMBO_INSERTION_MODE_IN_TEMPLATE,
4644
GUMBO_INSERTION_MODE_AFTER_BODY,
4745
GUMBO_INSERTION_MODE_IN_FRAMESET,

src/parser.c

Lines changed: 68 additions & 192 deletions
Original file line numberDiff line numberDiff line change
@@ -579,21 +579,6 @@ static GumboInsertionMode get_appropriate_insertion_mode(
579579
GUMBO_INSERTION_MODE_IN_BODY : GUMBO_INSERTION_MODE_INITIAL;
580580

581581
switch (node->v.element.tag) {
582-
case GUMBO_TAG_SELECT: {
583-
if (is_last) {
584-
return GUMBO_INSERTION_MODE_IN_SELECT;
585-
}
586-
for (int i = index; i > 0; --i) {
587-
const GumboNode* ancestor = open_elements->data[i];
588-
if (node_html_tag_is(ancestor, GUMBO_TAG_TEMPLATE)) {
589-
return GUMBO_INSERTION_MODE_IN_SELECT;
590-
}
591-
if (node_html_tag_is(ancestor, GUMBO_TAG_TABLE)) {
592-
return GUMBO_INSERTION_MODE_IN_SELECT_IN_TABLE;
593-
}
594-
}
595-
return GUMBO_INSERTION_MODE_IN_SELECT;
596-
}
597582
case GUMBO_TAG_TD:
598583
case GUMBO_TAG_TH:
599584
if (!is_last) return GUMBO_INSERTION_MODE_IN_CELL;
@@ -1467,12 +1452,6 @@ static bool has_an_element_in_table_scope(GumboParser* parser, GumboTag tag) {
14671452
(gumbo_tagset){TAG(HTML), TAG(TABLE), TAG(TEMPLATE)});
14681453
}
14691454

1470-
// http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#has-an-element-in-select-scope
1471-
static bool has_an_element_in_select_scope(GumboParser* parser, GumboTag tag) {
1472-
return has_an_element_in_specific_scope(
1473-
parser, 1, &tag, true, (gumbo_tagset){TAG(OPTGROUP), TAG(OPTION)});
1474-
}
1475-
14761455
// http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#generate-implied-end-tags
14771456
// "exception" is the "element to exclude from the process" listed in the spec.
14781457
// Pass GUMBO_TAG_LAST to not exclude any of them.
@@ -1546,18 +1525,6 @@ static bool close_current_cell(GumboParser* parser, const GumboToken* token) {
15461525
}
15471526
}
15481527

1549-
// This factors out the "act as if an end tag of tag name 'select' had been
1550-
// seen" clause of the spec, since it's referenced in several places. It pops
1551-
// all nodes from the stack until the current <select> has been closed, then
1552-
// resets the insertion mode appropriately.
1553-
static void close_current_select(GumboParser* parser) {
1554-
GumboNode* node = pop_current_node(parser);
1555-
while (!node_html_tag_is(node, GUMBO_TAG_SELECT)) {
1556-
node = pop_current_node(parser);
1557-
}
1558-
reset_insertion_mode_appropriately(parser);
1559-
}
1560-
15611528
// The list of nodes in the "special" category:
15621529
// http://www.whatwg.org/specs/web-apps/current-work/complete/parsing.html#special
15631530
static bool is_special_node(const GumboNode* node) {
@@ -2798,6 +2765,12 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) {
27982765
set_frameset_not_ok(parser);
27992766
return success;
28002767
} else if (tag_is(token, kStartTag, GUMBO_TAG_INPUT)) {
2768+
if (has_an_element_in_scope(parser, GUMBO_TAG_SELECT)) {
2769+
parser_add_parse_error(parser, token);
2770+
while (!node_html_tag_is(pop_current_node(parser), GUMBO_TAG_SELECT));
2771+
parser->_parser_state->_reprocess_current_token = true;
2772+
return false;
2773+
}
28012774
if (!attribute_matches(&token->v.start_tag.attributes, "type", "hidden")) {
28022775
// Must be before the element is inserted, as that takes ownership of the
28032776
// token's attribute vector.
@@ -2816,6 +2789,12 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) {
28162789
return true;
28172790
} else if (tag_is(token, kStartTag, GUMBO_TAG_HR)) {
28182791
bool result = maybe_implicitly_close_p_tag(parser, token);
2792+
if (has_an_element_in_scope(parser, GUMBO_TAG_SELECT)) {
2793+
generate_implied_end_tags(parser, GUMBO_TAG_LAST);
2794+
if (has_an_element_in_scope_with_tagname(parser, 2, (GumboTag[]) {GUMBO_TAG_OPTION, GUMBO_TAG_OPTGROUP})) {
2795+
parser_add_parse_error(parser, token);
2796+
}
2797+
}
28192798
insert_element_from_token(parser, token);
28202799
pop_current_node(parser);
28212800
acknowledge_self_closing_tag(parser);
@@ -2848,24 +2827,42 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) {
28482827
run_generic_parsing_algorithm(parser, token, GUMBO_LEX_RAWTEXT);
28492828
return true;
28502829
} else if (tag_is(token, kStartTag, GUMBO_TAG_SELECT)) {
2851-
reconstruct_active_formatting_elements(parser);
2852-
insert_element_from_token(parser, token);
2853-
set_frameset_not_ok(parser);
2854-
GumboInsertionMode state = parser->_parser_state->_insertion_mode;
2855-
if (state == GUMBO_INSERTION_MODE_IN_TABLE ||
2856-
state == GUMBO_INSERTION_MODE_IN_CAPTION ||
2857-
state == GUMBO_INSERTION_MODE_IN_TABLE_BODY ||
2858-
state == GUMBO_INSERTION_MODE_IN_ROW ||
2859-
state == GUMBO_INSERTION_MODE_IN_CELL) {
2860-
set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_SELECT_IN_TABLE);
2830+
if (has_an_element_in_scope(parser, GUMBO_TAG_SELECT)) {
2831+
parser_add_parse_error(parser, token);
2832+
ignore_token(parser);
2833+
while (!node_html_tag_is(pop_current_node(parser), GUMBO_TAG_SELECT));
28612834
} else {
2862-
set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_SELECT);
2835+
reconstruct_active_formatting_elements(parser);
2836+
insert_element_from_token(parser, token);
2837+
set_frameset_not_ok(parser);
28632838
}
28642839
return true;
2865-
} else if (tag_in(token, kStartTag,
2866-
(gumbo_tagset){TAG(OPTION), TAG(OPTGROUP)})) {
2867-
if (node_html_tag_is(get_current_node(parser), GUMBO_TAG_OPTION)) {
2868-
pop_current_node(parser);
2840+
} else if (tag_is(token, kStartTag, GUMBO_TAG_OPTION)) {
2841+
if (has_an_element_in_scope(parser, GUMBO_TAG_SELECT)) {
2842+
generate_implied_end_tags(parser, GUMBO_TAG_OPTGROUP);
2843+
if (has_an_element_in_scope(parser, GUMBO_TAG_OPTION)) {
2844+
parser_add_parse_error(parser, token);
2845+
}
2846+
} else {
2847+
if (node_html_tag_is(get_current_node(parser), GUMBO_TAG_OPTION)) {
2848+
pop_current_node(parser);
2849+
}
2850+
}
2851+
reconstruct_active_formatting_elements(parser);
2852+
insert_element_from_token(parser, token);
2853+
return true;
2854+
} else if (tag_is(token, kStartTag, GUMBO_TAG_OPTGROUP)) {
2855+
if (has_an_element_in_scope(parser, GUMBO_TAG_SELECT)) {
2856+
generate_implied_end_tags(parser, GUMBO_TAG_LAST);
2857+
if (has_an_element_in_scope_with_tagname(parser, 2, (GumboTag[]) {
2858+
GUMBO_TAG_OPTION, GUMBO_TAG_OPTGROUP
2859+
})) {
2860+
parser_add_parse_error(parser, token);
2861+
}
2862+
} else {
2863+
if (node_html_tag_is(get_current_node(parser), GUMBO_TAG_OPTION)) {
2864+
pop_current_node(parser);
2865+
}
28692866
}
28702867
reconstruct_active_formatting_elements(parser);
28712868
insert_element_from_token(parser, token);
@@ -3372,142 +3369,6 @@ static bool handle_in_cell(GumboParser* parser, GumboToken* token) {
33723369
}
33733370
}
33743371

3375-
// http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-inselect
3376-
static bool handle_in_select(GumboParser* parser, GumboToken* token) {
3377-
if (token->type == GUMBO_TOKEN_NULL) {
3378-
parser_add_parse_error(parser, token);
3379-
ignore_token(parser);
3380-
return false;
3381-
} else if (token->type == GUMBO_TOKEN_CHARACTER ||
3382-
token->type == GUMBO_TOKEN_WHITESPACE) {
3383-
insert_text_token(parser, token);
3384-
return true;
3385-
} else if (token->type == GUMBO_TOKEN_DOCTYPE) {
3386-
parser_add_parse_error(parser, token);
3387-
ignore_token(parser);
3388-
return false;
3389-
} else if (token->type == GUMBO_TOKEN_COMMENT) {
3390-
append_comment_node(parser, get_current_node(parser), token);
3391-
return true;
3392-
} else if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
3393-
return handle_in_body(parser, token);
3394-
} else if (tag_is(token, kStartTag, GUMBO_TAG_OPTION)) {
3395-
if (node_html_tag_is(get_current_node(parser), GUMBO_TAG_OPTION)) {
3396-
pop_current_node(parser);
3397-
}
3398-
insert_element_from_token(parser, token);
3399-
return true;
3400-
} else if (tag_is(token, kStartTag, GUMBO_TAG_OPTGROUP)) {
3401-
if (node_html_tag_is(get_current_node(parser), GUMBO_TAG_OPTION)) {
3402-
pop_current_node(parser);
3403-
}
3404-
if (node_html_tag_is(get_current_node(parser), GUMBO_TAG_OPTGROUP)) {
3405-
pop_current_node(parser);
3406-
}
3407-
insert_element_from_token(parser, token);
3408-
return true;
3409-
} else if (tag_is(token, kStartTag, GUMBO_TAG_HR)) {
3410-
if (node_html_tag_is(get_current_node(parser), GUMBO_TAG_OPTION)) {
3411-
pop_current_node(parser);
3412-
}
3413-
if (node_html_tag_is(get_current_node(parser), GUMBO_TAG_OPTGROUP)) {
3414-
pop_current_node(parser);
3415-
}
3416-
insert_element_from_token(parser, token);
3417-
pop_current_node(parser);
3418-
acknowledge_self_closing_tag(parser);
3419-
return true;
3420-
} else if (tag_is(token, kEndTag, GUMBO_TAG_OPTGROUP)) {
3421-
GumboVector* open_elements = &parser->_parser_state->_open_elements;
3422-
if (node_html_tag_is(get_current_node(parser), GUMBO_TAG_OPTION) &&
3423-
node_html_tag_is(open_elements->data[open_elements->length - 2],
3424-
GUMBO_TAG_OPTGROUP)) {
3425-
pop_current_node(parser);
3426-
}
3427-
if (node_html_tag_is(get_current_node(parser), GUMBO_TAG_OPTGROUP)) {
3428-
pop_current_node(parser);
3429-
return true;
3430-
} else {
3431-
parser_add_parse_error(parser, token);
3432-
ignore_token(parser);
3433-
return false;
3434-
}
3435-
} else if (tag_is(token, kEndTag, GUMBO_TAG_OPTION)) {
3436-
if (node_html_tag_is(get_current_node(parser), GUMBO_TAG_OPTION)) {
3437-
pop_current_node(parser);
3438-
return true;
3439-
} else {
3440-
parser_add_parse_error(parser, token);
3441-
ignore_token(parser);
3442-
return false;
3443-
}
3444-
} else if (tag_is(token, kEndTag, GUMBO_TAG_SELECT)) {
3445-
if (!has_an_element_in_select_scope(parser, GUMBO_TAG_SELECT)) {
3446-
parser_add_parse_error(parser, token);
3447-
ignore_token(parser);
3448-
return false;
3449-
}
3450-
close_current_select(parser);
3451-
return true;
3452-
} else if (tag_is(token, kStartTag, GUMBO_TAG_SELECT)) {
3453-
parser_add_parse_error(parser, token);
3454-
ignore_token(parser);
3455-
if (has_an_element_in_select_scope(parser, GUMBO_TAG_SELECT)) {
3456-
close_current_select(parser);
3457-
}
3458-
return false;
3459-
} else if (tag_in(token, kStartTag,
3460-
(gumbo_tagset){TAG(INPUT), TAG(KEYGEN), TAG(TEXTAREA)})) {
3461-
parser_add_parse_error(parser, token);
3462-
if (!has_an_element_in_select_scope(parser, GUMBO_TAG_SELECT)) {
3463-
ignore_token(parser);
3464-
} else {
3465-
close_current_select(parser);
3466-
parser->_parser_state->_reprocess_current_token = true;
3467-
}
3468-
return false;
3469-
} else if (tag_in(token, kStartTag,
3470-
(gumbo_tagset){TAG(SCRIPT), TAG(TEMPLATE)}) ||
3471-
tag_is(token, kEndTag, GUMBO_TAG_TEMPLATE)) {
3472-
return handle_in_head(parser, token);
3473-
} else if (token->type == GUMBO_TOKEN_EOF) {
3474-
return handle_in_body(parser, token);
3475-
} else {
3476-
parser_add_parse_error(parser, token);
3477-
ignore_token(parser);
3478-
return false;
3479-
}
3480-
}
3481-
3482-
// http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-inselectintable
3483-
static bool handle_in_select_in_table(GumboParser* parser, GumboToken* token) {
3484-
if (tag_in(token, kStartTag,
3485-
(gumbo_tagset){TAG(CAPTION), TAG(TABLE), TAG(TBODY), TAG(TFOOT),
3486-
TAG(THEAD), TAG(TR), TAG(TD), TAG(TH)})) {
3487-
parser_add_parse_error(parser, token);
3488-
close_current_select(parser);
3489-
parser->_parser_state->_reprocess_current_token = true;
3490-
return false;
3491-
} else if (tag_in(token, kEndTag,
3492-
(gumbo_tagset){TAG(CAPTION), TAG(TABLE), TAG(TBODY),
3493-
TAG(TFOOT), TAG(THEAD), TAG(TR), TAG(TD), TAG(TH)})) {
3494-
parser_add_parse_error(parser, token);
3495-
if (!has_an_element_in_table_scope(parser, token->v.end_tag)) {
3496-
ignore_token(parser);
3497-
return false;
3498-
} else {
3499-
close_current_select(parser);
3500-
// close_current_select already does the
3501-
// reset_insertion_mode_appropriately
3502-
// reset_insertion_mode_appropriately(parser);
3503-
parser->_parser_state->_reprocess_current_token = true;
3504-
return false;
3505-
}
3506-
} else {
3507-
return handle_in_select(parser, token);
3508-
}
3509-
}
3510-
35113372
// http://www.whatwg.org/specs/web-apps/current-work/multipage/tree-construction.html#parsing-main-intemplate
35123373
static bool handle_in_template(GumboParser* parser, GumboToken* token) {
35133374
GumboParserState* state = parser->_parser_state;
@@ -3738,14 +3599,29 @@ static bool handle_after_after_frameset(
37383599
// Function pointers for each insertion mode. Keep in sync with
37393600
// insertion_mode.h.
37403601
typedef bool (*TokenHandler)(GumboParser* parser, GumboToken* token);
3741-
static const TokenHandler kTokenHandlers[] = {handle_initial,
3742-
handle_before_html, handle_before_head, handle_in_head,
3743-
handle_in_head_noscript, handle_after_head, handle_in_body, handle_text,
3744-
handle_in_table, handle_in_table_text, handle_in_caption,
3745-
handle_in_column_group, handle_in_table_body, handle_in_row, handle_in_cell,
3746-
handle_in_select, handle_in_select_in_table, handle_in_template,
3747-
handle_after_body, handle_in_frameset, handle_after_frameset,
3748-
handle_after_after_body, handle_after_after_frameset};
3602+
static const TokenHandler kTokenHandlers[] = {
3603+
handle_initial,
3604+
handle_before_html,
3605+
handle_before_head,
3606+
handle_in_head,
3607+
handle_in_head_noscript,
3608+
handle_after_head,
3609+
handle_in_body,
3610+
handle_text,
3611+
handle_in_table,
3612+
handle_in_table_text,
3613+
handle_in_caption,
3614+
handle_in_column_group,
3615+
handle_in_table_body,
3616+
handle_in_row,
3617+
handle_in_cell,
3618+
handle_in_template,
3619+
handle_after_body,
3620+
handle_in_frameset,
3621+
handle_after_frameset,
3622+
handle_after_after_body,
3623+
handle_after_after_frameset,
3624+
};
37493625

37503626
static bool handle_html_content(GumboParser* parser, GumboToken* token) {
37513627
return kTokenHandlers[(unsigned int) parser->_parser_state->_insertion_mode](

0 commit comments

Comments
 (0)