@@ -643,7 +643,7 @@ function wpautop( $text, $br = true ) {
643643 * array( '<3' ) === wp_split_html( '<3' );
644644 *
645645 * @since 4.2.4
646- * @since 6.9.0 Reliably parses HTML via the HTML API.
646+ * @since 7.0.0 Reliably parses HTML via the HTML API.
647647 *
648648 * @param string $input HTML document to split, one item for every token.
649649 * These can be text nodes, tags, comments, or doctype declarations.
@@ -668,7 +668,14 @@ public function extract_raw_token() {
668668 $ next_at += strlen ( $ raw_token );
669669 $ is_text = '#text ' === $ token_reporter ->get_token_name ();
670670
671+ // This is a tag, comment, DOCTYPE declaration, malformed comment, etc…
671672 if ( ! $ is_text ) {
673+ /*
674+ * Code relies on the fact that this function always returns text
675+ * in even indices and non-text tokens in odd indices. If there
676+ * isn’t preceding text then an artificial and empty span needs
677+ * to be added.
678+ */
672679 if ( ! $ was_text ) {
673680 $ tokens [] = '' ;
674681 }
@@ -681,14 +688,29 @@ public function extract_raw_token() {
681688 /*
682689 * WordPress looks for shortcodes and escaped shortcodes within the HTML
683690 * where they look like tags but HTML wouldn’t consider them tags, such
684- * as in "<[header level=2]>". Look for these and artificially split the
685- * text nodes where it looks like shortcodes reside inside.
691+ * as in "<[header level=2]>".
692+ *
693+ * This means that something WordPress wants to consider a tag might
694+ * appear in the middle of a larger text span. To preserve that behavior
695+ * it’s essential to look inside text nodes for these shortcode instances,
696+ * and if found, split the string around them.
697+ *
698+ * Example:
699+ *
700+ * // HTML sees a single text span here.
701+ * "This is <[tag-name]>important!"
702+ *
703+ * // It needs to break into three segments.
704+ * "This is ", "<[tag-name]>", "important!"
705+ *
706+ * As with the rest of this function, text nodes must appear between these,
707+ * implying the creation of empty nodes where they don’t already exist.
686708 */
687709 $ shortcode_pattern = get_shortcode_regex ();
688710 $ text_chunks = preg_split ( "~(< {$ shortcode_pattern }>)~ " , $ raw_token , -1 , PREG_SPLIT_DELIM_CAPTURE );
689711 foreach ( $ text_chunks as $ i => $ token ) {
690712 // The preg_split() always puts captured delimiters in the odd indices.
691- $ is_shortcode_tag = 0x01 === $ i & 0x01 ;
713+ $ is_shortcode_tag = 1 === $ i % 2 ;
692714
693715 if ( $ is_shortcode_tag && ! $ was_text ) {
694716 $ tokens [] = '' ;
@@ -698,8 +720,13 @@ public function extract_raw_token() {
698720 * Some legacy code assumes that text nodes will never start with a
699721 * less-than sign (<) but this isn’t the case, as some text nodes do
700722 * if the less-than sign doesn’t introduce a syntax token. To avoid
701- * further corruption a leading less-than sign is replaced by its
702- * encoded equivalent numeric character reference.
723+ * further corruption, a leading less-than sign is replaced by its
724+ * equivalent numeric character reference.
725+ *
726+ * Example:
727+ *
728+ * input: "<3 the shortcodes like <[emoji-tag name=heart]>"
729+ * output: "&#60;3 the shortcodes like <[emoji-tag name=heart]>"
703730 */
704731 if ( ! $ is_shortcode_tag && '< ' === ( $ token [0 ] ?? '' ) ) {
705732 $ token = '< ' . substr ( $ token , 1 );
@@ -719,11 +746,17 @@ public function extract_raw_token() {
719746 * segment needs to appear.
720747 */
721748 if ( $ token_reporter ->paused_at_incomplete_token () ) {
722- if ( ! $ was_text ) {
723- $ tokens [] = '' ;
749+ $ token = substr ( $ input , $ next_at );
750+ $ syntax_like = '< ' === ( $ token [0 ] ?? '' );
751+ $ token = $ syntax_like ? ( '< ' . substr ( $ token , 1 ) ) : $ token ;
752+
753+ if ( $ was_text ) {
754+ $ tokens [ count ( $ tokens ) - 1 ] .= $ token ;
755+ } else {
756+ $ tokens [] = $ token ;
724757 }
758+
725759 $ was_text = false ;
726- $ tokens [] = substr ( $ input , $ next_at );
727760 }
728761
729762 if ( ! $ was_text ) {
0 commit comments