Skip to content

Commit fb69bf2

Browse files
committed
Comment updates and a fix to text node appending in wp_html_split
1 parent d3dd98e commit fb69bf2

1 file changed

Lines changed: 42 additions & 9 deletions

File tree

src/wp-includes/formatting.php

Lines changed: 42 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -643,7 +643,7 @@ function wpautop( $text, $br = true ) {
643643
* array( '&#60;3' ) === wp_html_split( '<3' );
644644
*
645645
* @since 4.2.4
646-
* @since 6.9.0 Reliably parses HTML via the HTML API.
646+
* @since 7.0.0 Reliably parses HTML via the HTML API.
647647
*
648648
* @param string $input HTML document to split, one item for every token.
649649
* These can be text nodes, tags, comments, or doctype declarations.
@@ -668,7 +668,14 @@ public function extract_raw_token() {
668668
$next_at += strlen( $raw_token );
669669
$is_text = '#text' === $token_reporter->get_token_name();
670670

671+
// This is a tag, comment, DOCTYPE declaration, malformed comment, etc…
671672
if ( ! $is_text ) {
673+
/*
674+
* Code relies on the fact that this function always returns text
675+
* in even indices and non-text tokens in odd indices. If there
676+
* isn’t preceding text then an artificial and empty span needs
677+
* to be added.
678+
*/
672679
if ( ! $was_text ) {
673680
$tokens[] = '';
674681
}
@@ -681,14 +688,29 @@ public function extract_raw_token() {
681688
/*
682689
* WordPress looks for shortcodes and escaped shortcodes within the HTML
683690
* where they look like tags but HTML wouldn’t consider them tags, such
684-
* as in "<[header level=2]>". Look for these and artificially split the
685-
* text nodes where it looks like shortcodes reside inside.
691+
* as in "<[header level=2]>".
692+
*
693+
* This means that something WordPress wants to consider a tag might
694+
* appear in the middle of a larger text span. To preserve that behavior
695+
* it’s essential to look inside text nodes for these shortcode instances,
696+
* and if found, split the string around them.
697+
*
698+
* Example:
699+
*
700+
* // HTML sees a single text span here.
701+
* "This is <[tag-name]>important!"
702+
*
703+
* // It needs to break into three segments.
704+
* "This is ", "<[tag-name]>", "important!"
705+
*
706+
* As with the rest of this function, text nodes must appear between these,
707+
* implying the creation of empty nodes where they don’t already exist.
686708
*/
687709
$shortcode_pattern = get_shortcode_regex();
688710
$text_chunks = preg_split( "~(<{$shortcode_pattern}>)~", $raw_token, -1, PREG_SPLIT_DELIM_CAPTURE );
689711
foreach ( $text_chunks as $i => $token ) {
690712
// The preg_split() always puts captured delimiters in the odd indices.
691-
$is_shortcode_tag = 0x01 === $i & 0x01;
713+
$is_shortcode_tag = 1 === $i % 2;
692714

693715
if ( $is_shortcode_tag && ! $was_text ) {
694716
$tokens[] = '';
@@ -698,8 +720,13 @@ public function extract_raw_token() {
698720
* Some legacy code assumes that text nodes will never start with a
699721
* less-than sign (<) but this isn’t the case, as some text nodes do
700722
* if the less-than sign doesn’t introduce a syntax token. To avoid
701-
* further corruption a leading less-than sign is replaced by its
702-
* encoded equivalent numeric character reference.
723+
* further corruption, a leading less-than sign is replaced by its
724+
* equivalent numeric character reference.
725+
*
726+
* Example:
727+
*
728+
* input: "<3 the shortcodes like <[emoji-tag name=heart]>"
729+
* output: "&#60;3 the shortcodes like <[emoji-tag name=heart]>"
703730
*/
704731
if ( ! $is_shortcode_tag && '<' === ( $token[0] ?? '' ) ) {
705732
$token = '&#60;' . substr( $token, 1 );
@@ -719,11 +746,17 @@ public function extract_raw_token() {
719746
* segment needs to appear.
720747
*/
721748
if ( $token_reporter->paused_at_incomplete_token() ) {
722-
if ( ! $was_text ) {
723-
$tokens[] = '';
749+
$token = substr( $input, $next_at );
750+
$syntax_like = '<' === ( $token[0] ?? '' );
751+
$token = $syntax_like ? ( '&#60;' . substr( $token, 1 ) ) : $token;
752+
753+
if ( $was_text ) {
754+
$tokens[ count( $tokens ) - 1 ] .= $token;
755+
} else {
756+
$tokens[] = $token;
724757
}
758+
725759
$was_text = false;
726-
$tokens[] = substr( $input, $next_at );
727760
}
728761

729762
if ( ! $was_text ) {

0 commit comments

Comments
 (0)