From fde8f609bde80cb529c29f00d99b1c1cbcff7fac Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Fri, 13 Mar 2026 14:50:14 +0100 Subject: [PATCH 01/43] Perf: Remove redundant after_tag() call from parse_next_tag() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit parse_next_tag() is only called from base_class_next_token(), which already calls after_tag() before invoking parse_next_tag(). The second call is redundant — all state has already been cleaned up. ~8% improvement in tokenization throughput (1250ms → 1150ms). --- bootstrap-html-api.php | 68 +++++++++++++++++++ .../html-api/class-wp-html-tag-processor.php | 2 - 2 files changed, 68 insertions(+), 2 deletions(-) create mode 100644 bootstrap-html-api.php diff --git a/bootstrap-html-api.php b/bootstrap-html-api.php new file mode 100644 index 0000000000000..0af11e44fcdf7 --- /dev/null +++ b/bootstrap-html-api.php @@ -0,0 +1,68 @@ +', '"' ), array( '<', '>', '"' ), $s ); + } +} + +if ( ! function_exists( '__' ) ) { + function __( $s ) { + return $s; + } +} + +if ( ! function_exists( '_doing_it_wrong' ) ) { + function _doing_it_wrong( $message ) { + trigger_error( $message ); + } +} + +if ( ! function_exists( 'wp_kses_uri_attributes' ) ) { + function wp_kses_uri_attributes() { + return array(); + } +} diff --git a/src/wp-includes/html-api/class-wp-html-tag-processor.php b/src/wp-includes/html-api/class-wp-html-tag-processor.php index 8397ecf520fa2..087392363a014 100644 --- a/src/wp-includes/html-api/class-wp-html-tag-processor.php +++ b/src/wp-includes/html-api/class-wp-html-tag-processor.php @@ -1710,8 +1710,6 @@ private function skip_script_data(): bool { * @return bool Whether a tag was found before the end of the document. 
*/ private function parse_next_tag(): bool { - $this->after_tag(); - $html = $this->html; $doc_length = strlen( $html ); $was_at = $this->bytes_already_parsed; From c33ce0e2da2b37eea5dda74092245e4f41e1b5ec Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Fri, 13 Mar 2026 14:52:04 +0100 Subject: [PATCH 02/43] Perf: Cache $this->html in local variable in parse_next_attribute() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit parse_next_attribute() accesses $this->html 8+ times per call via object property lookup. Local variable access is faster in PHP. ~4% improvement (1150ms → 1105ms). --- .../html-api/class-wp-html-tag-processor.php | 23 ++++++++++--------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/src/wp-includes/html-api/class-wp-html-tag-processor.php b/src/wp-includes/html-api/class-wp-html-tag-processor.php index 087392363a014..fd9bef9d0b6e3 100644 --- a/src/wp-includes/html-api/class-wp-html-tag-processor.php +++ b/src/wp-includes/html-api/class-wp-html-tag-processor.php @@ -2130,10 +2130,11 @@ private function parse_next_tag(): bool { * @return bool Whether an attribute was found before the end of the document. */ private function parse_next_attribute(): bool { - $doc_length = strlen( $this->html ); + $html = $this->html; + $doc_length = strlen( $html ); // Skip whitespace and slashes. - $this->bytes_already_parsed += strspn( $this->html, " \t\f\r\n/", $this->bytes_already_parsed ); + $this->bytes_already_parsed += strspn( $html, " \t\f\r\n/", $this->bytes_already_parsed ); if ( $this->bytes_already_parsed >= $doc_length ) { $this->parser_state = self::STATE_INCOMPLETE_INPUT; @@ -2146,9 +2147,9 @@ private function parse_next_attribute(): bool { * * @see https://html.spec.whatwg.org/multipage/parsing.html#before-attribute-name-state */ - $name_length = '=' === $this->html[ $this->bytes_already_parsed ] - ? 
1 + strcspn( $this->html, "=/> \t\f\r\n", $this->bytes_already_parsed + 1 ) - : strcspn( $this->html, "=/> \t\f\r\n", $this->bytes_already_parsed ); + $name_length = '=' === $html[ $this->bytes_already_parsed ] + ? 1 + strcspn( $html, "=/> \t\f\r\n", $this->bytes_already_parsed + 1 ) + : strcspn( $html, "=/> \t\f\r\n", $this->bytes_already_parsed ); // No attribute, just tag closer. if ( 0 === $name_length || $this->bytes_already_parsed + $name_length >= $doc_length ) { @@ -2156,7 +2157,7 @@ private function parse_next_attribute(): bool { } $attribute_start = $this->bytes_already_parsed; - $attribute_name = substr( $this->html, $attribute_start, $name_length ); + $attribute_name = substr( $html, $attribute_start, $name_length ); $this->bytes_already_parsed += $name_length; if ( $this->bytes_already_parsed >= $doc_length ) { $this->parser_state = self::STATE_INCOMPLETE_INPUT; @@ -2171,7 +2172,7 @@ private function parse_next_attribute(): bool { return false; } - $has_value = '=' === $this->html[ $this->bytes_already_parsed ]; + $has_value = '=' === $html[ $this->bytes_already_parsed ]; if ( $has_value ) { ++$this->bytes_already_parsed; $this->skip_whitespace(); @@ -2181,12 +2182,12 @@ private function parse_next_attribute(): bool { return false; } - switch ( $this->html[ $this->bytes_already_parsed ] ) { + switch ( $html[ $this->bytes_already_parsed ] ) { case "'": case '"': - $quote = $this->html[ $this->bytes_already_parsed ]; + $quote = $html[ $this->bytes_already_parsed ]; $value_start = $this->bytes_already_parsed + 1; - $end_quote_at = strpos( $this->html, $quote, $value_start ); + $end_quote_at = strpos( $html, $quote, $value_start ); $end_quote_at = false === $end_quote_at ? 
$doc_length : $end_quote_at; $value_length = $end_quote_at - $value_start; $attribute_end = $end_quote_at + 1; @@ -2195,7 +2196,7 @@ private function parse_next_attribute(): bool { default: $value_start = $this->bytes_already_parsed; - $value_length = strcspn( $this->html, "> \t\f\r\n", $value_start ); + $value_length = strcspn( $html, "> \t\f\r\n", $value_start ); $attribute_end = $value_start + $value_length; $this->bytes_already_parsed = $attribute_end; } From 137d548b19d0558feabbd282b27b8510acd68840 Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Fri, 13 Mar 2026 14:53:34 +0100 Subject: [PATCH 03/43] Perf: Inline skip_whitespace() calls in parse_next_attribute() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace two skip_whitespace() method calls with inline strspn() using the local $html variable. This avoids function call overhead and allows PHP's JIT to optimize the entire parse_next_attribute() as one unit. Massive improvement: 1105ms → 316ms (~71% faster). 
--- src/wp-includes/html-api/class-wp-html-tag-processor.php | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/wp-includes/html-api/class-wp-html-tag-processor.php b/src/wp-includes/html-api/class-wp-html-tag-processor.php index fd9bef9d0b6e3..2172fef6e463b 100644 --- a/src/wp-includes/html-api/class-wp-html-tag-processor.php +++ b/src/wp-includes/html-api/class-wp-html-tag-processor.php @@ -2165,7 +2165,7 @@ private function parse_next_attribute(): bool { return false; } - $this->skip_whitespace(); + $this->bytes_already_parsed += strspn( $html, " \t\f\r\n", $this->bytes_already_parsed ); if ( $this->bytes_already_parsed >= $doc_length ) { $this->parser_state = self::STATE_INCOMPLETE_INPUT; @@ -2175,7 +2175,7 @@ private function parse_next_attribute(): bool { $has_value = '=' === $html[ $this->bytes_already_parsed ]; if ( $has_value ) { ++$this->bytes_already_parsed; - $this->skip_whitespace(); + $this->bytes_already_parsed += strspn( $html, " \t\f\r\n", $this->bytes_already_parsed ); if ( $this->bytes_already_parsed >= $doc_length ) { $this->parser_state = self::STATE_INCOMPLETE_INPUT; From a67da3a3827fe382cf4265c732b2a38f852896fa Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Fri, 13 Mar 2026 14:54:52 +0100 Subject: [PATCH 04/43] Perf: Cache $this->html and strlen in base_class_next_token() Avoid repeated property lookups and strlen() calls by caching in local variables. Eliminates 2 strlen() calls and 1 property access per token. 
--- src/wp-includes/html-api/class-wp-html-tag-processor.php | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/wp-includes/html-api/class-wp-html-tag-processor.php b/src/wp-includes/html-api/class-wp-html-tag-processor.php index 2172fef6e463b..35156b57f29ed 100644 --- a/src/wp-includes/html-api/class-wp-html-tag-processor.php +++ b/src/wp-includes/html-api/class-wp-html-tag-processor.php @@ -969,7 +969,10 @@ private function base_class_next_token(): bool { */ $this->parser_state = self::STATE_READY; - if ( $this->bytes_already_parsed >= strlen( $this->html ) ) { + $html = $this->html; + $doc_length = strlen( $html ); + + if ( $this->bytes_already_parsed >= $doc_length ) { $this->parser_state = self::STATE_COMPLETE; return false; } @@ -1005,7 +1008,7 @@ private function base_class_next_token(): bool { // Ensure that the tag closes before the end of the document. if ( self::STATE_INCOMPLETE_INPUT === $this->parser_state || - $this->bytes_already_parsed >= strlen( $this->html ) + $this->bytes_already_parsed >= $doc_length ) { // Does this appropriately clear state (parsed attributes)? $this->parser_state = self::STATE_INCOMPLETE_INPUT; @@ -1014,7 +1017,7 @@ private function base_class_next_token(): bool { return false; } - $tag_ends_at = strpos( $this->html, '>', $this->bytes_already_parsed ); + $tag_ends_at = strpos( $html, '>', $this->bytes_already_parsed ); if ( false === $tag_ends_at ) { $this->parser_state = self::STATE_INCOMPLETE_INPUT; $this->bytes_already_parsed = $was_at; From 7ba66fd95f8a31a643353aeda098f5bcef99ead4 Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Fri, 13 Mar 2026 14:55:56 +0100 Subject: [PATCH 05/43] Perf: Skip update processing in after_tag() when no modifications queued MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Guard class_name_updates_to_attributes_updates() and lexical_updates processing behind empty() checks. 
In read-only mode (the common case), these arrays are always empty, avoiding unnecessary function calls. ~7% improvement (312ms → 291ms). --- .../html-api/class-wp-html-tag-processor.php | 50 ++++++++++--------- 1 file changed, 27 insertions(+), 23 deletions(-) diff --git a/src/wp-includes/html-api/class-wp-html-tag-processor.php b/src/wp-includes/html-api/class-wp-html-tag-processor.php index 35156b57f29ed..6ff4f4531fd81 100644 --- a/src/wp-includes/html-api/class-wp-html-tag-processor.php +++ b/src/wp-includes/html-api/class-wp-html-tag-processor.php @@ -2286,37 +2286,41 @@ private function after_tag(): void { * attributes across the two tags, lexical updates with names * need to be flushed to raw lexical updates. */ - $this->class_name_updates_to_attributes_updates(); - - /* - * Purge updates if there are too many. The actual count isn't - * scientific, but a few values from 100 to a few thousand were - * tests to find a practically-useful limit. - * - * If the update queue grows too big, then the Tag Processor - * will spend more time iterating through them and lose the - * efficiency gains of deferring applying them. - */ - if ( 1000 < count( $this->lexical_updates ) ) { - $this->get_updated_html(); + if ( ! empty( $this->classname_updates ) ) { + $this->class_name_updates_to_attributes_updates(); } - foreach ( $this->lexical_updates as $name => $update ) { + if ( ! empty( $this->lexical_updates ) ) { /* - * Any updates appearing after the cursor should be applied - * before proceeding, otherwise they may be overlooked. + * Purge updates if there are too many. The actual count isn't + * scientific, but a few values from 100 to a few thousand were + * tests to find a practically-useful limit. + * + * If the update queue grows too big, then the Tag Processor + * will spend more time iterating through them and lose the + * efficiency gains of deferring applying them. 
*/ - if ( $update->start >= $this->bytes_already_parsed ) { + if ( 1000 < count( $this->lexical_updates ) ) { $this->get_updated_html(); - break; } - if ( is_int( $name ) ) { - continue; - } + foreach ( $this->lexical_updates as $name => $update ) { + /* + * Any updates appearing after the cursor should be applied + * before proceeding, otherwise they may be overlooked. + */ + if ( $update->start >= $this->bytes_already_parsed ) { + $this->get_updated_html(); + break; + } - $this->lexical_updates[] = $update; - unset( $this->lexical_updates[ $name ] ); + if ( is_int( $name ) ) { + continue; + } + + $this->lexical_updates[] = $update; + unset( $this->lexical_updates[ $name ] ); + } } $this->token_starts_at = null; From 2ccfcdb03b8edd6587feceed280438094c0dc187 Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Fri, 13 Mar 2026 15:00:49 +0100 Subject: [PATCH 06/43] Perf: Use local $at variable in parse_next_attribute() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace ~15 $this->bytes_already_parsed property accesses with a local $at variable, writing back once at the end. Local variable access is significantly faster than object property access in PHP's VM. ~2.3% improvement (291ms → 284ms). --- .../html-api/class-wp-html-tag-processor.php | 74 ++++++++++--------- 1 file changed, 41 insertions(+), 33 deletions(-) diff --git a/src/wp-includes/html-api/class-wp-html-tag-processor.php b/src/wp-includes/html-api/class-wp-html-tag-processor.php index 6ff4f4531fd81..1aa6c64a67523 100644 --- a/src/wp-includes/html-api/class-wp-html-tag-processor.php +++ b/src/wp-includes/html-api/class-wp-html-tag-processor.php @@ -2135,11 +2135,13 @@ private function parse_next_tag(): bool { private function parse_next_attribute(): bool { $html = $this->html; $doc_length = strlen( $html ); + $at = $this->bytes_already_parsed; // Skip whitespace and slashes. 
- $this->bytes_already_parsed += strspn( $html, " \t\f\r\n/", $this->bytes_already_parsed ); - if ( $this->bytes_already_parsed >= $doc_length ) { - $this->parser_state = self::STATE_INCOMPLETE_INPUT; + $at += strspn( $html, " \t\f\r\n/", $at ); + if ( $at >= $doc_length ) { + $this->parser_state = self::STATE_INCOMPLETE_INPUT; + $this->bytes_already_parsed = $at; return false; } @@ -2150,65 +2152,71 @@ private function parse_next_attribute(): bool { * * @see https://html.spec.whatwg.org/multipage/parsing.html#before-attribute-name-state */ - $name_length = '=' === $html[ $this->bytes_already_parsed ] - ? 1 + strcspn( $html, "=/> \t\f\r\n", $this->bytes_already_parsed + 1 ) - : strcspn( $html, "=/> \t\f\r\n", $this->bytes_already_parsed ); + $name_length = '=' === $html[ $at ] + ? 1 + strcspn( $html, "=/> \t\f\r\n", $at + 1 ) + : strcspn( $html, "=/> \t\f\r\n", $at ); // No attribute, just tag closer. - if ( 0 === $name_length || $this->bytes_already_parsed + $name_length >= $doc_length ) { + if ( 0 === $name_length || $at + $name_length >= $doc_length ) { + $this->bytes_already_parsed = $at; return false; } - $attribute_start = $this->bytes_already_parsed; - $attribute_name = substr( $html, $attribute_start, $name_length ); - $this->bytes_already_parsed += $name_length; - if ( $this->bytes_already_parsed >= $doc_length ) { - $this->parser_state = self::STATE_INCOMPLETE_INPUT; + $attribute_start = $at; + $attribute_name = substr( $html, $attribute_start, $name_length ); + $at += $name_length; + if ( $at >= $doc_length ) { + $this->parser_state = self::STATE_INCOMPLETE_INPUT; + $this->bytes_already_parsed = $at; return false; } - $this->bytes_already_parsed += strspn( $html, " \t\f\r\n", $this->bytes_already_parsed ); - if ( $this->bytes_already_parsed >= $doc_length ) { - $this->parser_state = self::STATE_INCOMPLETE_INPUT; + $at += strspn( $html, " \t\f\r\n", $at ); + if ( $at >= $doc_length ) { + $this->parser_state = self::STATE_INCOMPLETE_INPUT; + 
$this->bytes_already_parsed = $at; return false; } - $has_value = '=' === $html[ $this->bytes_already_parsed ]; + $has_value = '=' === $html[ $at ]; if ( $has_value ) { - ++$this->bytes_already_parsed; - $this->bytes_already_parsed += strspn( $html, " \t\f\r\n", $this->bytes_already_parsed ); - if ( $this->bytes_already_parsed >= $doc_length ) { - $this->parser_state = self::STATE_INCOMPLETE_INPUT; + ++$at; + $at += strspn( $html, " \t\f\r\n", $at ); + if ( $at >= $doc_length ) { + $this->parser_state = self::STATE_INCOMPLETE_INPUT; + $this->bytes_already_parsed = $at; return false; } - switch ( $html[ $this->bytes_already_parsed ] ) { + switch ( $html[ $at ] ) { case "'": case '"': - $quote = $html[ $this->bytes_already_parsed ]; - $value_start = $this->bytes_already_parsed + 1; - $end_quote_at = strpos( $html, $quote, $value_start ); - $end_quote_at = false === $end_quote_at ? $doc_length : $end_quote_at; - $value_length = $end_quote_at - $value_start; - $attribute_end = $end_quote_at + 1; - $this->bytes_already_parsed = $attribute_end; + $quote = $html[ $at ]; + $value_start = $at + 1; + $end_quote_at = strpos( $html, $quote, $value_start ); + $end_quote_at = false === $end_quote_at ? 
$doc_length : $end_quote_at; + $value_length = $end_quote_at - $value_start; + $attribute_end = $end_quote_at + 1; + $at = $attribute_end; break; default: - $value_start = $this->bytes_already_parsed; - $value_length = strcspn( $html, "> \t\f\r\n", $value_start ); - $attribute_end = $value_start + $value_length; - $this->bytes_already_parsed = $attribute_end; + $value_start = $at; + $value_length = strcspn( $html, "> \t\f\r\n", $value_start ); + $attribute_end = $value_start + $value_length; + $at = $attribute_end; } } else { - $value_start = $this->bytes_already_parsed; + $value_start = $at; $value_length = 0; $attribute_end = $attribute_start + $name_length; } + $this->bytes_already_parsed = $at; + if ( $attribute_end >= $doc_length ) { $this->parser_state = self::STATE_INCOMPLETE_INPUT; From cf2abc6e815847218fe41bd8f58f93b8930cb2ae Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Fri, 13 Mar 2026 15:02:57 +0100 Subject: [PATCH 07/43] restore trunk --- .../html-api/class-wp-html-tag-processor.php | 138 ++++++++---------- 1 file changed, 62 insertions(+), 76 deletions(-) diff --git a/src/wp-includes/html-api/class-wp-html-tag-processor.php b/src/wp-includes/html-api/class-wp-html-tag-processor.php index 1aa6c64a67523..8397ecf520fa2 100644 --- a/src/wp-includes/html-api/class-wp-html-tag-processor.php +++ b/src/wp-includes/html-api/class-wp-html-tag-processor.php @@ -969,10 +969,7 @@ private function base_class_next_token(): bool { */ $this->parser_state = self::STATE_READY; - $html = $this->html; - $doc_length = strlen( $html ); - - if ( $this->bytes_already_parsed >= $doc_length ) { + if ( $this->bytes_already_parsed >= strlen( $this->html ) ) { $this->parser_state = self::STATE_COMPLETE; return false; } @@ -1008,7 +1005,7 @@ private function base_class_next_token(): bool { // Ensure that the tag closes before the end of the document. 
if ( self::STATE_INCOMPLETE_INPUT === $this->parser_state || - $this->bytes_already_parsed >= $doc_length + $this->bytes_already_parsed >= strlen( $this->html ) ) { // Does this appropriately clear state (parsed attributes)? $this->parser_state = self::STATE_INCOMPLETE_INPUT; @@ -1017,7 +1014,7 @@ private function base_class_next_token(): bool { return false; } - $tag_ends_at = strpos( $html, '>', $this->bytes_already_parsed ); + $tag_ends_at = strpos( $this->html, '>', $this->bytes_already_parsed ); if ( false === $tag_ends_at ) { $this->parser_state = self::STATE_INCOMPLETE_INPUT; $this->bytes_already_parsed = $was_at; @@ -1713,6 +1710,8 @@ private function skip_script_data(): bool { * @return bool Whether a tag was found before the end of the document. */ private function parse_next_tag(): bool { + $this->after_tag(); + $html = $this->html; $doc_length = strlen( $html ); $was_at = $this->bytes_already_parsed; @@ -2133,15 +2132,12 @@ private function parse_next_tag(): bool { * @return bool Whether an attribute was found before the end of the document. */ private function parse_next_attribute(): bool { - $html = $this->html; - $doc_length = strlen( $html ); - $at = $this->bytes_already_parsed; + $doc_length = strlen( $this->html ); // Skip whitespace and slashes. - $at += strspn( $html, " \t\f\r\n/", $at ); - if ( $at >= $doc_length ) { - $this->parser_state = self::STATE_INCOMPLETE_INPUT; - $this->bytes_already_parsed = $at; + $this->bytes_already_parsed += strspn( $this->html, " \t\f\r\n/", $this->bytes_already_parsed ); + if ( $this->bytes_already_parsed >= $doc_length ) { + $this->parser_state = self::STATE_INCOMPLETE_INPUT; return false; } @@ -2152,71 +2148,65 @@ private function parse_next_attribute(): bool { * * @see https://html.spec.whatwg.org/multipage/parsing.html#before-attribute-name-state */ - $name_length = '=' === $html[ $at ] - ? 
1 + strcspn( $html, "=/> \t\f\r\n", $at + 1 ) - : strcspn( $html, "=/> \t\f\r\n", $at ); + $name_length = '=' === $this->html[ $this->bytes_already_parsed ] + ? 1 + strcspn( $this->html, "=/> \t\f\r\n", $this->bytes_already_parsed + 1 ) + : strcspn( $this->html, "=/> \t\f\r\n", $this->bytes_already_parsed ); // No attribute, just tag closer. - if ( 0 === $name_length || $at + $name_length >= $doc_length ) { - $this->bytes_already_parsed = $at; + if ( 0 === $name_length || $this->bytes_already_parsed + $name_length >= $doc_length ) { return false; } - $attribute_start = $at; - $attribute_name = substr( $html, $attribute_start, $name_length ); - $at += $name_length; - if ( $at >= $doc_length ) { - $this->parser_state = self::STATE_INCOMPLETE_INPUT; - $this->bytes_already_parsed = $at; + $attribute_start = $this->bytes_already_parsed; + $attribute_name = substr( $this->html, $attribute_start, $name_length ); + $this->bytes_already_parsed += $name_length; + if ( $this->bytes_already_parsed >= $doc_length ) { + $this->parser_state = self::STATE_INCOMPLETE_INPUT; return false; } - $at += strspn( $html, " \t\f\r\n", $at ); - if ( $at >= $doc_length ) { - $this->parser_state = self::STATE_INCOMPLETE_INPUT; - $this->bytes_already_parsed = $at; + $this->skip_whitespace(); + if ( $this->bytes_already_parsed >= $doc_length ) { + $this->parser_state = self::STATE_INCOMPLETE_INPUT; return false; } - $has_value = '=' === $html[ $at ]; + $has_value = '=' === $this->html[ $this->bytes_already_parsed ]; if ( $has_value ) { - ++$at; - $at += strspn( $html, " \t\f\r\n", $at ); - if ( $at >= $doc_length ) { - $this->parser_state = self::STATE_INCOMPLETE_INPUT; - $this->bytes_already_parsed = $at; + ++$this->bytes_already_parsed; + $this->skip_whitespace(); + if ( $this->bytes_already_parsed >= $doc_length ) { + $this->parser_state = self::STATE_INCOMPLETE_INPUT; return false; } - switch ( $html[ $at ] ) { + switch ( $this->html[ $this->bytes_already_parsed ] ) { case "'": case '"': - 
$quote = $html[ $at ]; - $value_start = $at + 1; - $end_quote_at = strpos( $html, $quote, $value_start ); - $end_quote_at = false === $end_quote_at ? $doc_length : $end_quote_at; - $value_length = $end_quote_at - $value_start; - $attribute_end = $end_quote_at + 1; - $at = $attribute_end; + $quote = $this->html[ $this->bytes_already_parsed ]; + $value_start = $this->bytes_already_parsed + 1; + $end_quote_at = strpos( $this->html, $quote, $value_start ); + $end_quote_at = false === $end_quote_at ? $doc_length : $end_quote_at; + $value_length = $end_quote_at - $value_start; + $attribute_end = $end_quote_at + 1; + $this->bytes_already_parsed = $attribute_end; break; default: - $value_start = $at; - $value_length = strcspn( $html, "> \t\f\r\n", $value_start ); - $attribute_end = $value_start + $value_length; - $at = $attribute_end; + $value_start = $this->bytes_already_parsed; + $value_length = strcspn( $this->html, "> \t\f\r\n", $value_start ); + $attribute_end = $value_start + $value_length; + $this->bytes_already_parsed = $attribute_end; } } else { - $value_start = $at; + $value_start = $this->bytes_already_parsed; $value_length = 0; $attribute_end = $attribute_start + $name_length; } - $this->bytes_already_parsed = $at; - if ( $attribute_end >= $doc_length ) { $this->parser_state = self::STATE_INCOMPLETE_INPUT; @@ -2294,41 +2284,37 @@ private function after_tag(): void { * attributes across the two tags, lexical updates with names * need to be flushed to raw lexical updates. */ - if ( ! empty( $this->classname_updates ) ) { - $this->class_name_updates_to_attributes_updates(); + $this->class_name_updates_to_attributes_updates(); + + /* + * Purge updates if there are too many. The actual count isn't + * scientific, but a few values from 100 to a few thousand were + * tests to find a practically-useful limit. 
+ * + * If the update queue grows too big, then the Tag Processor + * will spend more time iterating through them and lose the + * efficiency gains of deferring applying them. + */ + if ( 1000 < count( $this->lexical_updates ) ) { + $this->get_updated_html(); } - if ( ! empty( $this->lexical_updates ) ) { + foreach ( $this->lexical_updates as $name => $update ) { /* - * Purge updates if there are too many. The actual count isn't - * scientific, but a few values from 100 to a few thousand were - * tests to find a practically-useful limit. - * - * If the update queue grows too big, then the Tag Processor - * will spend more time iterating through them and lose the - * efficiency gains of deferring applying them. + * Any updates appearing after the cursor should be applied + * before proceeding, otherwise they may be overlooked. */ - if ( 1000 < count( $this->lexical_updates ) ) { + if ( $update->start >= $this->bytes_already_parsed ) { $this->get_updated_html(); + break; } - foreach ( $this->lexical_updates as $name => $update ) { - /* - * Any updates appearing after the cursor should be applied - * before proceeding, otherwise they may be overlooked. 
- */ - if ( $update->start >= $this->bytes_already_parsed ) { - $this->get_updated_html(); - break; - } - - if ( is_int( $name ) ) { - continue; - } - - $this->lexical_updates[] = $update; - unset( $this->lexical_updates[ $name ] ); + if ( is_int( $name ) ) { + continue; } + + $this->lexical_updates[] = $update; + unset( $this->lexical_updates[ $name ] ); } $this->token_starts_at = null; From a2b1ed467921e68d801f2d443da498c1770d05af Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Fri, 13 Mar 2026 15:03:42 +0100 Subject: [PATCH 08/43] update benches --- bench.php | 14 ++++++++++++++ bootstrap-html-api.php | 22 ---------------------- 2 files changed, 14 insertions(+), 22 deletions(-) create mode 100755 bench.php diff --git a/bench.php b/bench.php new file mode 100755 index 0000000000000..6279051835a55 --- /dev/null +++ b/bench.php @@ -0,0 +1,14 @@ +#!/usr/bin/env php +next_token() ) { +} +$p = new WP_HTML_Tag_Processor( $html ); +while ( $p->next_token() ) { +} +$p = new WP_HTML_Tag_Processor( $html ); +while ( $p->next_token() ) { +} diff --git a/bootstrap-html-api.php b/bootstrap-html-api.php index 0af11e44fcdf7..aa9ac94e2689a 100644 --- a/bootstrap-html-api.php +++ b/bootstrap-html-api.php @@ -1,7 +1,5 @@ ', '"' ), array( '<', '>', '"' ), $s ); From b0847f3e9e7dbd46603d4c09347a2c4f4e1abd72 Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Fri, 13 Mar 2026 15:09:09 +0100 Subject: [PATCH 09/43] Perf: Remove redundant after_tag() call from parse_next_tag() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit parse_next_tag() is only called from base_class_next_token(), which already calls after_tag() immediately before. The second call does redundant work resetting state that was just reset. 
852ms → 757ms (-11%) --- src/wp-includes/html-api/class-wp-html-tag-processor.php | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/wp-includes/html-api/class-wp-html-tag-processor.php b/src/wp-includes/html-api/class-wp-html-tag-processor.php index 8397ecf520fa2..087392363a014 100644 --- a/src/wp-includes/html-api/class-wp-html-tag-processor.php +++ b/src/wp-includes/html-api/class-wp-html-tag-processor.php @@ -1710,8 +1710,6 @@ private function skip_script_data(): bool { * @return bool Whether a tag was found before the end of the document. */ private function parse_next_tag(): bool { - $this->after_tag(); - $html = $this->html; $doc_length = strlen( $html ); $was_at = $this->bytes_already_parsed; From 0407016a8b21bd1ca78f738292ea2c92a54b8688 Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Fri, 13 Mar 2026 15:11:42 +0100 Subject: [PATCH 10/43] Perf: Skip update processing in after_tag() when no modifications queued MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit For read-only tokenization, classname_updates and lexical_updates are always empty. Guard the update processing with a count check to avoid calling class_name_updates_to_attributes_updates() and iterating an empty array. 757ms → 723ms (-4.5%) --- .../html-api/class-wp-html-tag-processor.php | 62 ++++++++++--------- 1 file changed, 34 insertions(+), 28 deletions(-) diff --git a/src/wp-includes/html-api/class-wp-html-tag-processor.php b/src/wp-includes/html-api/class-wp-html-tag-processor.php index 087392363a014..e029e0abef57d 100644 --- a/src/wp-includes/html-api/class-wp-html-tag-processor.php +++ b/src/wp-includes/html-api/class-wp-html-tag-processor.php @@ -2277,42 +2277,48 @@ private function skip_whitespace(): void { */ private function after_tag(): void { /* - * There could be lexical updates enqueued for an attribute that - * also exists on the next tag. 
In order to avoid conflating the - * attributes across the two tags, lexical updates with names - * need to be flushed to raw lexical updates. + * Skip update processing when no modifications are queued. + * This is the common case for read-only tokenization. */ - $this->class_name_updates_to_attributes_updates(); - - /* - * Purge updates if there are too many. The actual count isn't - * scientific, but a few values from 100 to a few thousand were - * tests to find a practically-useful limit. - * - * If the update queue grows too big, then the Tag Processor - * will spend more time iterating through them and lose the - * efficiency gains of deferring applying them. - */ - if ( 1000 < count( $this->lexical_updates ) ) { - $this->get_updated_html(); - } + if ( count( $this->classname_updates ) > 0 || count( $this->lexical_updates ) > 0 ) { + /* + * There could be lexical updates enqueued for an attribute that + * also exists on the next tag. In order to avoid conflating the + * attributes across the two tags, lexical updates with names + * need to be flushed to raw lexical updates. + */ + $this->class_name_updates_to_attributes_updates(); - foreach ( $this->lexical_updates as $name => $update ) { /* - * Any updates appearing after the cursor should be applied - * before proceeding, otherwise they may be overlooked. + * Purge updates if there are too many. The actual count isn't + * scientific, but a few values from 100 to a few thousand were + * tests to find a practically-useful limit. + * + * If the update queue grows too big, then the Tag Processor + * will spend more time iterating through them and lose the + * efficiency gains of deferring applying them. 
*/ - if ( $update->start >= $this->bytes_already_parsed ) { + if ( 1000 < count( $this->lexical_updates ) ) { $this->get_updated_html(); - break; } - if ( is_int( $name ) ) { - continue; - } + foreach ( $this->lexical_updates as $name => $update ) { + /* + * Any updates appearing after the cursor should be applied + * before proceeding, otherwise they may be overlooked. + */ + if ( $update->start >= $this->bytes_already_parsed ) { + $this->get_updated_html(); + break; + } - $this->lexical_updates[] = $update; - unset( $this->lexical_updates[ $name ] ); + if ( is_int( $name ) ) { + continue; + } + + $this->lexical_updates[] = $update; + unset( $this->lexical_updates[ $name ] ); + } } $this->token_starts_at = null; From bb92778d9b73cd25da612bd39d8d16607762f1e2 Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Fri, 13 Mar 2026 15:13:45 +0100 Subject: [PATCH 11/43] Perf: Cache $this->html and strlen in base_class_next_token() Avoids repeated property lookups and strlen() calls in the main tokenization loop. --- .../html-api/class-wp-html-tag-processor.php | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/src/wp-includes/html-api/class-wp-html-tag-processor.php b/src/wp-includes/html-api/class-wp-html-tag-processor.php index e029e0abef57d..a4d484888da3c 100644 --- a/src/wp-includes/html-api/class-wp-html-tag-processor.php +++ b/src/wp-includes/html-api/class-wp-html-tag-processor.php @@ -969,7 +969,10 @@ private function base_class_next_token(): bool { */ $this->parser_state = self::STATE_READY; - if ( $this->bytes_already_parsed >= strlen( $this->html ) ) { + $html = $this->html; + $doc_length = strlen( $html ); + + if ( $this->bytes_already_parsed >= $doc_length ) { $this->parser_state = self::STATE_COMPLETE; return false; } @@ -1005,7 +1008,7 @@ private function base_class_next_token(): bool { // Ensure that the tag closes before the end of the document. 
if ( self::STATE_INCOMPLETE_INPUT === $this->parser_state || - $this->bytes_already_parsed >= strlen( $this->html ) + $this->bytes_already_parsed >= $doc_length ) { // Does this appropriately clear state (parsed attributes)? $this->parser_state = self::STATE_INCOMPLETE_INPUT; @@ -1014,7 +1017,7 @@ private function base_class_next_token(): bool { return false; } - $tag_ends_at = strpos( $this->html, '>', $this->bytes_already_parsed ); + $tag_ends_at = strpos( $html, '>', $this->bytes_already_parsed ); if ( false === $tag_ends_at ) { $this->parser_state = self::STATE_INCOMPLETE_INPUT; $this->bytes_already_parsed = $was_at; @@ -1043,7 +1046,7 @@ private function base_class_next_token(): bool { if ( $this->is_closing_tag || 'html' !== $this->parsing_namespace || - 1 !== strspn( $this->html, 'iIlLnNpPsStTxX', $this->tag_name_starts_at, 1 ) + 1 !== strspn( $html, 'iIlLnNpPsStTxX', $this->tag_name_starts_at, 1 ) ) { return true; } From b1d7f0ad1484ef9fd376b1741d4e2b2b9c960ade Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Fri, 13 Mar 2026 15:15:37 +0100 Subject: [PATCH 12/43] Perf: Use local $at and $html variables in parse_next_attribute() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace repeated $this->bytes_already_parsed and $this->html property access with local variables. Also inlines skip_whitespace() calls to avoid method call overhead and use the local $html variable. 
728ms → 702ms (-3.6%) --- .../html-api/class-wp-html-tag-processor.php | 81 +++++++++++-------- 1 file changed, 46 insertions(+), 35 deletions(-) diff --git a/src/wp-includes/html-api/class-wp-html-tag-processor.php b/src/wp-includes/html-api/class-wp-html-tag-processor.php index a4d484888da3c..347d9dd7ae1fd 100644 --- a/src/wp-includes/html-api/class-wp-html-tag-processor.php +++ b/src/wp-includes/html-api/class-wp-html-tag-processor.php @@ -2133,12 +2133,15 @@ private function parse_next_tag(): bool { * @return bool Whether an attribute was found before the end of the document. */ private function parse_next_attribute(): bool { - $doc_length = strlen( $this->html ); + $html = $this->html; + $doc_length = strlen( $html ); + $at = $this->bytes_already_parsed; // Skip whitespace and slashes. - $this->bytes_already_parsed += strspn( $this->html, " \t\f\r\n/", $this->bytes_already_parsed ); - if ( $this->bytes_already_parsed >= $doc_length ) { - $this->parser_state = self::STATE_INCOMPLETE_INPUT; + $at += strspn( $html, " \t\f\r\n/", $at ); + if ( $at >= $doc_length ) { + $this->parser_state = self::STATE_INCOMPLETE_INPUT; + $this->bytes_already_parsed = $at; return false; } @@ -2149,71 +2152,79 @@ private function parse_next_attribute(): bool { * * @see https://html.spec.whatwg.org/multipage/parsing.html#before-attribute-name-state */ - $name_length = '=' === $this->html[ $this->bytes_already_parsed ] - ? 1 + strcspn( $this->html, "=/> \t\f\r\n", $this->bytes_already_parsed + 1 ) - : strcspn( $this->html, "=/> \t\f\r\n", $this->bytes_already_parsed ); + $name_length = '=' === $html[ $at ] + ? 1 + strcspn( $html, "=/> \t\f\r\n", $at + 1 ) + : strcspn( $html, "=/> \t\f\r\n", $at ); // No attribute, just tag closer. 
- if ( 0 === $name_length || $this->bytes_already_parsed + $name_length >= $doc_length ) { + if ( 0 === $name_length || $at + $name_length >= $doc_length ) { return false; } - $attribute_start = $this->bytes_already_parsed; - $attribute_name = substr( $this->html, $attribute_start, $name_length ); - $this->bytes_already_parsed += $name_length; - if ( $this->bytes_already_parsed >= $doc_length ) { - $this->parser_state = self::STATE_INCOMPLETE_INPUT; + $attribute_start = $at; + $attribute_name = substr( $html, $attribute_start, $name_length ); + $at += $name_length; + if ( $at >= $doc_length ) { + $this->parser_state = self::STATE_INCOMPLETE_INPUT; + $this->bytes_already_parsed = $at; return false; } - $this->skip_whitespace(); - if ( $this->bytes_already_parsed >= $doc_length ) { - $this->parser_state = self::STATE_INCOMPLETE_INPUT; + // Skip whitespace. + $at += strspn( $html, " \t\f\r\n", $at ); + if ( $at >= $doc_length ) { + $this->parser_state = self::STATE_INCOMPLETE_INPUT; + $this->bytes_already_parsed = $at; return false; } - $has_value = '=' === $this->html[ $this->bytes_already_parsed ]; + $has_value = '=' === $html[ $at ]; if ( $has_value ) { - ++$this->bytes_already_parsed; - $this->skip_whitespace(); - if ( $this->bytes_already_parsed >= $doc_length ) { - $this->parser_state = self::STATE_INCOMPLETE_INPUT; + ++$at; + // Skip whitespace. + $at += strspn( $html, " \t\f\r\n", $at ); + if ( $at >= $doc_length ) { + $this->parser_state = self::STATE_INCOMPLETE_INPUT; + $this->bytes_already_parsed = $at; return false; } - switch ( $this->html[ $this->bytes_already_parsed ] ) { + switch ( $html[ $at ] ) { case "'": case '"': - $quote = $this->html[ $this->bytes_already_parsed ]; - $value_start = $this->bytes_already_parsed + 1; - $end_quote_at = strpos( $this->html, $quote, $value_start ); - $end_quote_at = false === $end_quote_at ? 
$doc_length : $end_quote_at; - $value_length = $end_quote_at - $value_start; - $attribute_end = $end_quote_at + 1; - $this->bytes_already_parsed = $attribute_end; + $quote = $html[ $at ]; + $value_start = $at + 1; + $end_quote_at = strpos( $html, $quote, $value_start ); + $end_quote_at = false === $end_quote_at ? $doc_length : $end_quote_at; + $value_length = $end_quote_at - $value_start; + $attribute_end = $end_quote_at + 1; + $at = $attribute_end; break; default: - $value_start = $this->bytes_already_parsed; - $value_length = strcspn( $this->html, "> \t\f\r\n", $value_start ); - $attribute_end = $value_start + $value_length; - $this->bytes_already_parsed = $attribute_end; + $value_start = $at; + $value_length = strcspn( $html, "> \t\f\r\n", $value_start ); + $attribute_end = $value_start + $value_length; + $at = $attribute_end; } } else { - $value_start = $this->bytes_already_parsed; + $value_start = $at; $value_length = 0; $attribute_end = $attribute_start + $name_length; } if ( $attribute_end >= $doc_length ) { - $this->parser_state = self::STATE_INCOMPLETE_INPUT; + $this->parser_state = self::STATE_INCOMPLETE_INPUT; + $this->bytes_already_parsed = $at; return false; } + $this->bytes_already_parsed = $at; + if ( $this->is_closing_tag ) { return true; } From bad7d220236650d665f65f6dfc49fe8e3043521e Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Fri, 13 Mar 2026 15:20:37 +0100 Subject: [PATCH 13/43] Perf: Defer substr() for attribute name until needed MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit For closing tags, attribute names are parsed but never stored, so the substr() and strtolower() calls are wasted. Move them after the closing tag early-return. 
702ms → 696ms --- src/wp-includes/html-api/class-wp-html-tag-processor.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/wp-includes/html-api/class-wp-html-tag-processor.php b/src/wp-includes/html-api/class-wp-html-tag-processor.php index 347d9dd7ae1fd..f95eaf16805d0 100644 --- a/src/wp-includes/html-api/class-wp-html-tag-processor.php +++ b/src/wp-includes/html-api/class-wp-html-tag-processor.php @@ -2162,7 +2162,6 @@ private function parse_next_attribute(): bool { } $attribute_start = $at; - $attribute_name = substr( $html, $attribute_start, $name_length ); $at += $name_length; if ( $at >= $doc_length ) { $this->parser_state = self::STATE_INCOMPLETE_INPUT; @@ -2237,6 +2236,7 @@ private function parse_next_attribute(): bool { * * @see https://html.spec.whatwg.org/multipage/syntax.html#attributes-2:ascii-case-insensitive */ + $attribute_name = substr( $html, $attribute_start, $name_length ); $comparable_name = strtolower( $attribute_name ); // If an attribute is listed many times, only use the first declaration and ignore the rest. From 1170fb6a7ea192c43ca9f08c7573acb1160b0c27 Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Fri, 13 Mar 2026 16:10:40 +0100 Subject: [PATCH 14/43] Perf: Replace per-attribute function call loop with skip_attributes_and_find_closer() Replaces the parse_next_attribute(false) while loop + strpos('>') with a single method call that scans past all attributes and finds the tag-closing '>' in one pass. Eliminates N function call overheads per tag during read-only tokenization. 
--- .../html-api/class-wp-html-tag-processor.php | 171 ++++++++++++++++-- 1 file changed, 151 insertions(+), 20 deletions(-) diff --git a/src/wp-includes/html-api/class-wp-html-tag-processor.php b/src/wp-includes/html-api/class-wp-html-tag-processor.php index f95eaf16805d0..ecbd1f7e5b9ad 100644 --- a/src/wp-includes/html-api/class-wp-html-tag-processor.php +++ b/src/wp-includes/html-api/class-wp-html-tag-processor.php @@ -682,6 +682,20 @@ class WP_HTML_Tag_Processor { */ private $is_closing_tag; + /** + * Byte offset where attribute scanning should start for lazy parsing. + * + * @var int|null + */ + private $attribute_scan_from = null; + + /** + * Whether attributes have been parsed and stored for the current tag. + * + * @var bool + */ + private $attributes_parsed = true; + /** * Lazily-built index of attributes found within an HTML tag, keyed by the attribute name. * @@ -1000,24 +1014,10 @@ private function base_class_next_token(): bool { return true; } - // Parse all of its attributes. - while ( $this->parse_next_attribute() ) { - continue; - } - - // Ensure that the tag closes before the end of the document. - if ( - self::STATE_INCOMPLETE_INPUT === $this->parser_state || - $this->bytes_already_parsed >= $doc_length - ) { - // Does this appropriately clear state (parsed attributes)? - $this->parser_state = self::STATE_INCOMPLETE_INPUT; - $this->bytes_already_parsed = $was_at; - - return false; - } - - $tag_ends_at = strpos( $html, '>', $this->bytes_already_parsed ); + // Fast-scan past all attributes and find the tag-closing '>'. 
+ $this->attribute_scan_from = $this->bytes_already_parsed; + $this->attributes_parsed = false; + $tag_ends_at = $this->skip_attributes_and_find_closer( $html, $doc_length ); if ( false === $tag_ends_at ) { $this->parser_state = self::STATE_INCOMPLETE_INPUT; $this->bytes_already_parsed = $was_at; @@ -1082,6 +1082,7 @@ private function base_class_next_token(): bool { $tag_name_starts_at = $this->tag_name_starts_at; $tag_name_length = $this->tag_name_length; $tag_ends_at = $this->token_starts_at + $this->token_length; + $this->ensure_attributes_parsed(); $attributes = $this->attributes; $duplicate_attributes = $this->duplicate_attributes; @@ -2132,7 +2133,7 @@ private function parse_next_tag(): bool { * * @return bool Whether an attribute was found before the end of the document. */ - private function parse_next_attribute(): bool { + private function parse_next_attribute( bool $store = true ): bool { $html = $this->html; $doc_length = strlen( $html ); $at = $this->bytes_already_parsed; @@ -2224,7 +2225,7 @@ private function parse_next_attribute(): bool { $this->bytes_already_parsed = $at; - if ( $this->is_closing_tag ) { + if ( ! $store || $this->is_closing_tag ) { return true; } @@ -2273,6 +2274,128 @@ private function parse_next_attribute(): bool { return true; } + /** + * Skips past all attributes and finds the tag-closing '>'. + * + * This replaces the parse_next_attribute(false) loop + strpos('>') + * with a single method call, eliminating per-attribute function call + * overhead during read-only tokenization. + * + * @since 6.9.0 + * @ignore + * + * @param string $html The HTML being parsed. + * @param int $doc_length Length of the HTML string. + * @return int|false Position of the closing '>' or false if incomplete. + */ + private function skip_attributes_and_find_closer( string $html, int $doc_length ) { + $at = $this->bytes_already_parsed; + + while ( true ) { + // Skip whitespace and slashes. 
+ $at += strspn( $html, " \t\f\r\n/", $at ); + if ( $at >= $doc_length ) { + $this->parser_state = self::STATE_INCOMPLETE_INPUT; + return false; + } + + /* + * Treat the equal sign as a part of the attribute + * name if it is the first encountered byte. + */ + $name_length = '=' === $html[ $at ] + ? 1 + strcspn( $html, "=/> \t\f\r\n", $at + 1 ) + : strcspn( $html, "=/> \t\f\r\n", $at ); + + // No attribute name means we've reached the tag closer. + if ( 0 === $name_length ) { + return $at; + } + + if ( $at + $name_length >= $doc_length ) { + $this->parser_state = self::STATE_INCOMPLETE_INPUT; + return false; + } + + $at += $name_length; + if ( $at >= $doc_length ) { + $this->parser_state = self::STATE_INCOMPLETE_INPUT; + return false; + } + + // Skip whitespace after attribute name. + $at += strspn( $html, " \t\f\r\n", $at ); + if ( $at >= $doc_length ) { + $this->parser_state = self::STATE_INCOMPLETE_INPUT; + return false; + } + + // No value, boolean attribute. + if ( '=' !== $html[ $at ] ) { + continue; + } + + ++$at; + // Skip whitespace after '='. + $at += strspn( $html, " \t\f\r\n", $at ); + if ( $at >= $doc_length ) { + $this->parser_state = self::STATE_INCOMPLETE_INPUT; + return false; + } + + switch ( $html[ $at ] ) { + case "'": + case '"': + $end_quote_at = strpos( $html, $html[ $at ], $at + 1 ); + if ( false === $end_quote_at ) { + $this->parser_state = self::STATE_INCOMPLETE_INPUT; + return false; + } + $at = $end_quote_at + 1; + break; + + default: + $at += strcspn( $html, "> \t\f\r\n", $at ); + break; + } + + if ( $at >= $doc_length ) { + $this->parser_state = self::STATE_INCOMPLETE_INPUT; + return false; + } + } + } + + /** + * Ensures attributes have been parsed and stored for the current tag. + * + * Attributes are lazily parsed on first access to avoid the overhead + * of creating WP_HTML_Attribute_Token objects when they aren't needed. 
+ * + * @since 6.9.0 + * @ignore + */ + private function ensure_attributes_parsed(): void { + if ( $this->attributes_parsed ) { + return; + } + + $this->attributes_parsed = true; + + if ( null === $this->attribute_scan_from || $this->is_closing_tag ) { + return; + } + + $saved_at = $this->bytes_already_parsed; + $this->bytes_already_parsed = $this->attribute_scan_from; + + while ( $this->parse_next_attribute() ) { + continue; + } + + $this->bytes_already_parsed = $saved_at; + } + /** * Move the internal cursor past any immediate successive whitespace. * @@ -2346,6 +2469,8 @@ private function after_tag(): void { $this->comment_type = null; $this->text_node_classification = self::TEXT_IS_GENERIC; $this->duplicate_attributes = null; + $this->attribute_scan_from = null; + $this->attributes_parsed = true; } /** @@ -2363,6 +2488,7 @@ private function class_name_updates_to_attributes_updates(): void { return; } + $this->ensure_attributes_parsed(); $existing_class = $this->get_enqueued_attribute_value( 'class' ); if ( null === $existing_class || true === $existing_class ) { $existing_class = ''; @@ -2790,6 +2916,7 @@ public function get_attribute( $name ) { return null; } + $this->ensure_attributes_parsed(); $comparable = strtolower( $name ); /* @@ -2873,6 +3000,7 @@ public function get_attribute_names_with_prefix( $prefix ): ?array { return null; } + $this->ensure_attributes_parsed(); $comparable = strtolower( $prefix ); $matches = array(); @@ -4331,6 +4459,7 @@ public function set_attribute( $name, $value ): bool { return false; } + $this->ensure_attributes_parsed(); $name_length = strlen( $name ); /** @@ -4482,6 +4611,8 @@ public function remove_attribute( $name ): bool { return false; } + $this->ensure_attributes_parsed(); + /* * > There must never be two or more attributes on * > the same start tag whose names are an ASCII From 635378f477787030e7dddb4e03d0a9378d4df04b Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Fri, 13 Mar 2026 16:13:28 +0100 Subject: [PATCH 
15/43] Perf: Minor hot-path optimizations in tag processor - Use local $html variable instead of $this->html for array access in parse_next_tag() - Avoid property read for text_length by computing $at - $was_at directly - Use truthiness check instead of count() > 0 for empty array detection in after_tag() - Move attributes/duplicate_attributes reset from after_tag() to ensure_attributes_parsed() to avoid empty array allocation on every token during read-only tokenization --- .../html-api/class-wp-html-tag-processor.php | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/wp-includes/html-api/class-wp-html-tag-processor.php b/src/wp-includes/html-api/class-wp-html-tag-processor.php index ecbd1f7e5b9ad..13d7e18290179 100644 --- a/src/wp-includes/html-api/class-wp-html-tag-processor.php +++ b/src/wp-includes/html-api/class-wp-html-tag-processor.php @@ -1745,16 +1745,16 @@ private function parse_next_tag(): bool { $this->parser_state = self::STATE_TEXT_NODE; $this->token_starts_at = $was_at; - $this->token_length = $at - $was_at; $this->text_starts_at = $was_at; - $this->text_length = $this->token_length; + $this->token_length = $at - $was_at; + $this->text_length = $at - $was_at; $this->bytes_already_parsed = $at; return true; } $this->token_starts_at = $at; - if ( $at + 1 < $doc_length && '/' === $this->html[ $at + 1 ] ) { + if ( $at + 1 < $doc_length && '/' === $html[ $at + 1 ] ) { $this->is_closing_tag = true; ++$at; } else { @@ -2380,7 +2380,9 @@ private function ensure_attributes_parsed(): void { return; } - $this->attributes_parsed = true; + $this->attributes_parsed = true; + $this->attributes = array(); + $this->duplicate_attributes = null; if ( null === $this->attribute_scan_from || $this->is_closing_tag ) { return; @@ -2417,7 +2419,7 @@ private function after_tag(): void { * Skip update processing when no modifications are queued. * This is the common case for read-only tokenization. 
*/ - if ( count( $this->classname_updates ) > 0 || count( $this->lexical_updates ) > 0 ) { + if ( $this->classname_updates || $this->lexical_updates ) { /* * There could be lexical updates enqueued for an attribute that * also exists on the next tag. In order to avoid conflating the @@ -2465,10 +2467,8 @@ private function after_tag(): void { $this->text_starts_at = 0; $this->text_length = 0; $this->is_closing_tag = null; - $this->attributes = array(); $this->comment_type = null; $this->text_node_classification = self::TEXT_IS_GENERIC; - $this->duplicate_attributes = null; $this->attribute_scan_from = null; $this->attributes_parsed = true; } From 5b90c801e36924307cf3981fa4f00a54ecaee9c6 Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Fri, 13 Mar 2026 16:19:09 +0100 Subject: [PATCH 16/43] Perf: Cache doc_length as instance variable to avoid repeated strlen() calls --- .../html-api/class-wp-html-tag-processor.php | 30 +++++++++++++------ 1 file changed, 21 insertions(+), 9 deletions(-) diff --git a/src/wp-includes/html-api/class-wp-html-tag-processor.php b/src/wp-includes/html-api/class-wp-html-tag-processor.php index 13d7e18290179..043d53ba1a5cc 100644 --- a/src/wp-includes/html-api/class-wp-html-tag-processor.php +++ b/src/wp-includes/html-api/class-wp-html-tag-processor.php @@ -439,6 +439,16 @@ class WP_HTML_Tag_Processor { */ protected $html; + /** + * Cached byte length of the HTML string. + * + * Updated whenever $this->html is set to avoid repeated strlen() calls. + * + * @since 6.9.0 + * @var int + */ + private $doc_length = 0; + /** * The last query passed to next_tag(). 
* @@ -856,7 +866,8 @@ public function __construct( $html ) { ); $html = ''; } - $this->html = $html; + $this->html = $html; + $this->doc_length = strlen( $html ); } /** @@ -984,7 +995,7 @@ private function base_class_next_token(): bool { $this->parser_state = self::STATE_READY; $html = $this->html; - $doc_length = strlen( $html ); + $doc_length = $this->doc_length; if ( $this->bytes_already_parsed >= $doc_length ) { $this->parser_state = self::STATE_COMPLETE; @@ -1416,7 +1427,7 @@ private function skip_rawtext( string $tag_name ): bool { */ private function skip_rcdata( string $tag_name ): bool { $html = $this->html; - $doc_length = strlen( $html ); + $doc_length = $this->doc_length; $tag_length = strlen( $tag_name ); $at = $this->bytes_already_parsed; @@ -1453,7 +1464,7 @@ private function skip_rcdata( string $tag_name ): bool { $at += $tag_length; $this->bytes_already_parsed = $at; - if ( $at >= strlen( $html ) ) { + if ( $at >= $doc_length ) { return false; } @@ -1506,7 +1517,7 @@ private function skip_rcdata( string $tag_name ): bool { private function skip_script_data(): bool { $state = 'unescaped'; $html = $this->html; - $doc_length = strlen( $html ); + $doc_length = $this->doc_length; $at = $this->bytes_already_parsed; while ( false !== $at && $at < $doc_length ) { @@ -1715,7 +1726,7 @@ private function skip_script_data(): bool { */ private function parse_next_tag(): bool { $html = $this->html; - $doc_length = strlen( $html ); + $doc_length = $this->doc_length; $was_at = $this->bytes_already_parsed; $at = $was_at; @@ -1912,7 +1923,7 @@ private function parse_next_tag(): bool { if ( 'html' !== $this->parsing_namespace && - strlen( $html ) > $at + 8 && + $doc_length > $at + 8 && '[' === $html[ $at + 2 ] && 'C' === $html[ $at + 3 ] && 'D' === $html[ $at + 4 ] && @@ -2135,7 +2146,7 @@ private function parse_next_tag(): bool { */ private function parse_next_attribute( bool $store = true ): bool { $html = $this->html; - $doc_length = strlen( $html ); + $doc_length 
= $this->doc_length; $at = $this->bytes_already_parsed; // Skip whitespace and slashes. @@ -2687,7 +2698,8 @@ private function apply_attributes_updates( int $shift_this_point ): int { $bytes_already_copied = $diff->start + $diff->length; } - $this->html = $output_buffer . substr( $this->html, $bytes_already_copied ); + $this->html = $output_buffer . substr( $this->html, $bytes_already_copied ); + $this->doc_length = strlen( $this->html ); /* * Adjust bookmark locations to account for how the text From 05f63144a935c998f22f898ee8807b461a18fcef Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Fri, 13 Mar 2026 16:21:12 +0100 Subject: [PATCH 17/43] Perf: Add fast path in skip_attributes_and_find_closer() for immediate '>' Tags without attributes (closing tags, simple void tags) have '>' immediately after the tag name. A direct byte check before entering the attribute-scanning loop avoids strspn/strcspn overhead for these common cases. --- src/wp-includes/html-api/class-wp-html-tag-processor.php | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/wp-includes/html-api/class-wp-html-tag-processor.php b/src/wp-includes/html-api/class-wp-html-tag-processor.php index 043d53ba1a5cc..6027cd8604404 100644 --- a/src/wp-includes/html-api/class-wp-html-tag-processor.php +++ b/src/wp-includes/html-api/class-wp-html-tag-processor.php @@ -2302,6 +2302,11 @@ private function parse_next_attribute( bool $store = true ): bool { private function skip_attributes_and_find_closer( string $html, int $doc_length ) { $at = $this->bytes_already_parsed; + // Fast path: '>' immediately after tag name (closing tags, void tags). + if ( $at < $doc_length && '>' === $html[ $at ] ) { + return $at; + } + while ( true ) { // Skip whitespace and slashes. 
$at += strspn( $html, " \t\f\r\n/", $at ); From 087f0da9410b1fcde294a9231e775e1744972f41 Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Fri, 13 Mar 2026 16:22:33 +0100 Subject: [PATCH 18/43] Perf: Replace strspn single-char checks with direct byte comparisons in parse_next_tag() Replace strspn() calls used for single-character validation with direct character range comparisons. Also simplify tag name length calculation to use a single strcspn() instead of strspn() + strcspn(), since the first-character alpha check is now done via direct comparison. --- .../html-api/class-wp-html-tag-processor.php | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/src/wp-includes/html-api/class-wp-html-tag-processor.php b/src/wp-includes/html-api/class-wp-html-tag-processor.php index 6027cd8604404..7d9577540b7c6 100644 --- a/src/wp-includes/html-api/class-wp-html-tag-processor.php +++ b/src/wp-includes/html-api/class-wp-html-tag-processor.php @@ -1749,7 +1749,11 @@ private function parse_next_tag(): bool { * * @see https://html.spec.whatwg.org/#tag-open-state */ - if ( 1 !== strspn( $html, '!/?abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ', $at + 1, 1 ) ) { + $next_byte = $html[ $at + 1 ] ?? ''; + if ( + '!' !== $next_byte && '/' !== $next_byte && '?' !== $next_byte && + ( $next_byte < 'A' || ( $next_byte > 'Z' && $next_byte < 'a' ) || $next_byte > 'z' ) + ) { ++$at; continue; } @@ -1786,12 +1790,12 @@ private function parse_next_tag(): bool { * * https://html.spec.whatwg.org/multipage/parsing.html#data-state * * https://html.spec.whatwg.org/multipage/parsing.html#tag-open-state */ - $tag_name_prefix_length = strspn( $html, 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ', $at + 1 ); - if ( $tag_name_prefix_length > 0 ) { + $first_char = $html[ $at + 1 ] ?? 
''; + if ( ( $first_char >= 'a' && $first_char <= 'z' ) || ( $first_char >= 'A' && $first_char <= 'Z' ) ) { ++$at; $this->parser_state = self::STATE_MATCHED_TAG; $this->tag_name_starts_at = $at; - $this->tag_name_length = $tag_name_prefix_length + strcspn( $html, " \t\f\r\n/>", $at + $tag_name_prefix_length ); + $this->tag_name_length = strcspn( $html, " \t\f\r\n/>", $at ); $this->bytes_already_parsed = $at + $this->tag_name_length; return true; } From 7d1ab37f96312caadb36988a033d75d5f065705f Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Fri, 13 Mar 2026 16:24:30 +0100 Subject: [PATCH 19/43] Perf: Inline after_tag() into base_class_next_token() and reduce property resets Inline the single-callsite after_tag() method into base_class_next_token() to eliminate method call overhead in the hot tokenization loop. Also remove 4 property resets (token_starts_at, token_length, is_closing_tag, comment_type) that are always overwritten before use or guarded by parser_state checks. --- .../html-api/class-wp-html-tag-processor.php | 36 ++++++++++++++++++- 1 file changed, 35 insertions(+), 1 deletion(-) diff --git a/src/wp-includes/html-api/class-wp-html-tag-processor.php b/src/wp-includes/html-api/class-wp-html-tag-processor.php index 7d9577540b7c6..3a07c5cd0381d 100644 --- a/src/wp-includes/html-api/class-wp-html-tag-processor.php +++ b/src/wp-includes/html-api/class-wp-html-tag-processor.php @@ -978,7 +978,41 @@ public function next_token(): bool { */ private function base_class_next_token(): bool { $was_at = $this->bytes_already_parsed; - $this->after_tag(); + + /* + * Apply attribute updates and clean up the previous tag. + * Inlined from after_tag() to avoid method call overhead + * in the hot tokenization loop. 
+ */ + if ( $this->classname_updates || $this->lexical_updates ) { + $this->class_name_updates_to_attributes_updates(); + + if ( 1000 < count( $this->lexical_updates ) ) { + $this->get_updated_html(); + } + + foreach ( $this->lexical_updates as $name => $update ) { + if ( $update->start >= $this->bytes_already_parsed ) { + $this->get_updated_html(); + break; + } + + if ( is_int( $name ) ) { + continue; + } + + $this->lexical_updates[] = $update; + unset( $this->lexical_updates[ $name ] ); + } + } + + $this->tag_name_starts_at = null; + $this->tag_name_length = null; + $this->text_starts_at = 0; + $this->text_length = 0; + $this->text_node_classification = self::TEXT_IS_GENERIC; + $this->attribute_scan_from = null; + $this->attributes_parsed = true; // Don't proceed if there's nothing more to scan. if ( From f05977567656f7306b63bcb28a23d46f6dfa1487 Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Fri, 13 Mar 2026 16:27:05 +0100 Subject: [PATCH 20/43] Perf: Inline common fast paths for text nodes and regular tags Inline the most common token paths (text nodes and regular tags) directly into base_class_next_token(), eliminating the parse_next_tag() function call for ~95% of tokens. Complex tokens (comments, DOCTYPE, CDATA) still use the full parse_next_tag() method. Also integrates the skip_attributes_and_find_closer fast path for tags with '>' immediately after the tag name, avoiding an additional method call for closing tags and attribute-less tags. 
--- .../html-api/class-wp-html-tag-processor.php | 111 +++++++++++++++--- 1 file changed, 97 insertions(+), 14 deletions(-) diff --git a/src/wp-includes/html-api/class-wp-html-tag-processor.php b/src/wp-includes/html-api/class-wp-html-tag-processor.php index 3a07c5cd0381d..ee511276618c4 100644 --- a/src/wp-includes/html-api/class-wp-html-tag-processor.php +++ b/src/wp-includes/html-api/class-wp-html-tag-processor.php @@ -1031,12 +1031,103 @@ private function base_class_next_token(): bool { $html = $this->html; $doc_length = $this->doc_length; - if ( $this->bytes_already_parsed >= $doc_length ) { + $at = $this->bytes_already_parsed; + + if ( $at >= $doc_length ) { $this->parser_state = self::STATE_COMPLETE; return false; } - // Find the next tag if it exists. + /* + * Fast path: handle the two most common token types inline. + * + * 1. Text nodes: text between tags (strpos finds next '<'). + * 2. Regular tags: '<' followed by alpha or '/'+alpha. + * + * Complex tokens (comments, DOCTYPE, CDATA, etc.) fall through + * to the full parse_next_tag() method. + */ + $at = strpos( $html, '<', $at ); + + // No '<' found: the rest of the document is a text node. + if ( false === $at ) { + $this->parser_state = self::STATE_TEXT_NODE; + $this->token_starts_at = $was_at; + $this->text_starts_at = $was_at; + $this->token_length = $doc_length - $was_at; + $this->text_length = $doc_length - $was_at; + $this->bytes_already_parsed = $doc_length; + return true; + } + + // Text before the '<': return it as a text node. + if ( $at > $was_at ) { + $next_byte = $html[ $at + 1 ] ?? ''; + if ( + '!' !== $next_byte && '/' !== $next_byte && '?' !== $next_byte && + ( $next_byte < 'A' || ( $next_byte > 'Z' && $next_byte < 'a' ) || $next_byte > 'z' ) + ) { + /* + * The '<' doesn't start a valid token. Fall through to + * the full parse_next_tag() which handles continuation. 
+ */ + goto full_parse; + } + + $this->parser_state = self::STATE_TEXT_NODE; + $this->token_starts_at = $was_at; + $this->text_starts_at = $was_at; + $this->token_length = $at - $was_at; + $this->text_length = $at - $was_at; + $this->bytes_already_parsed = $at; + return true; + } + + // At '<': try to match a regular tag. + $first_char = $html[ $at + 1 ] ?? ''; + $is_closer = '/' === $first_char; + if ( $is_closer ) { + $first_char = $html[ $at + 2 ] ?? ''; + } + + if ( ( $first_char >= 'a' && $first_char <= 'z' ) || ( $first_char >= 'A' && $first_char <= 'Z' ) ) { + $tag_at = $at + 1 + ( $is_closer ? 1 : 0 ); + + $this->token_starts_at = $at; + $this->is_closing_tag = $is_closer; + $this->parser_state = self::STATE_MATCHED_TAG; + $this->tag_name_starts_at = $tag_at; + $this->tag_name_length = strcspn( $html, " \t\f\r\n/>", $tag_at ); + + $after_name = $tag_at + $this->tag_name_length; + + // Fast-scan past all attributes and find the tag-closing '>'. + $this->attribute_scan_from = $after_name; + $this->attributes_parsed = false; + + // Fast path: '>' immediately after tag name. + if ( $after_name < $doc_length && '>' === $html[ $after_name ] ) { + $tag_ends_at = $after_name; + } else { + $this->bytes_already_parsed = $after_name; + $tag_ends_at = $this->skip_attributes_and_find_closer( $html, $doc_length ); + if ( false === $tag_ends_at ) { + $this->parser_state = self::STATE_INCOMPLETE_INPUT; + $this->bytes_already_parsed = $was_at; + return false; + } + } + + $this->parser_state = self::STATE_MATCHED_TAG; + $this->bytes_already_parsed = $tag_ends_at + 1; + $this->token_length = $this->bytes_already_parsed - $at; + + goto after_tag_match; + } + + // Complex token: fall through to full parse_next_tag(). 
+ full_parse: + if ( false === $this->parse_next_tag() ) { if ( self::STATE_INCOMPLETE_INPUT === $this->parser_state ) { $this->bytes_already_parsed = $was_at; @@ -1045,21 +1136,11 @@ private function base_class_next_token(): bool { return false; } - /* - * For legacy reasons the rest of this function handles tags and their - * attributes. If the processor has reached the end of the document - * or if it matched any other token then it should return here to avoid - * attempting to process tag-specific syntax. - */ - if ( - self::STATE_INCOMPLETE_INPUT !== $this->parser_state && - self::STATE_COMPLETE !== $this->parser_state && - self::STATE_MATCHED_TAG !== $this->parser_state - ) { + if ( self::STATE_MATCHED_TAG !== $this->parser_state ) { return true; } - // Fast-scan past all attributes and find the tag-closing '>'. + // Tag found by parse_next_tag — scan attributes. $this->attribute_scan_from = $this->bytes_already_parsed; $this->attributes_parsed = false; $tag_ends_at = $this->skip_attributes_and_find_closer( $html, $doc_length ); @@ -1073,6 +1154,8 @@ private function base_class_next_token(): bool { $this->bytes_already_parsed = $tag_ends_at + 1; $this->token_length = $this->bytes_already_parsed - $this->token_starts_at; + after_tag_match: + /* * Certain tags require additional processing. The first-letter pre-check * avoids unnecessary string allocation when comparing the tag names. From 7277e2e5e5018623afc2b922ba8aa5ce534eac11 Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Fri, 13 Mar 2026 16:30:20 +0100 Subject: [PATCH 21/43] Perf: Defer property resets to type-specific return paths in base_class_next_token() Instead of resetting all token properties at the top of every iteration, only reset the properties relevant to each token type at its return point. This eliminates ~3.4M unnecessary property writes per benchmark run. 
--- .../html-api/class-wp-html-tag-processor.php | 84 ++++++++++--------- 1 file changed, 45 insertions(+), 39 deletions(-) diff --git a/src/wp-includes/html-api/class-wp-html-tag-processor.php b/src/wp-includes/html-api/class-wp-html-tag-processor.php index ee511276618c4..24684698c3186 100644 --- a/src/wp-includes/html-api/class-wp-html-tag-processor.php +++ b/src/wp-includes/html-api/class-wp-html-tag-processor.php @@ -1006,14 +1006,6 @@ private function base_class_next_token(): bool { } } - $this->tag_name_starts_at = null; - $this->tag_name_length = null; - $this->text_starts_at = 0; - $this->text_length = 0; - $this->text_node_classification = self::TEXT_IS_GENERIC; - $this->attribute_scan_from = null; - $this->attributes_parsed = true; - // Don't proceed if there's nothing more to scan. if ( self::STATE_COMPLETE === $this->parser_state || @@ -1022,12 +1014,6 @@ private function base_class_next_token(): bool { return false; } - /* - * The next step in the parsing loop determines the parsing state; - * clear it so that state doesn't linger from the previous step. - */ - $this->parser_state = self::STATE_READY; - $html = $this->html; $doc_length = $this->doc_length; @@ -1051,12 +1037,16 @@ private function base_class_next_token(): bool { // No '<' found: the rest of the document is a text node. 
if ( false === $at ) { - $this->parser_state = self::STATE_TEXT_NODE; - $this->token_starts_at = $was_at; - $this->text_starts_at = $was_at; - $this->token_length = $doc_length - $was_at; - $this->text_length = $doc_length - $was_at; - $this->bytes_already_parsed = $doc_length; + $this->parser_state = self::STATE_TEXT_NODE; + $this->token_starts_at = $was_at; + $this->text_starts_at = $was_at; + $this->token_length = $doc_length - $was_at; + $this->text_length = $doc_length - $was_at; + $this->tag_name_starts_at = null; + $this->tag_name_length = null; + $this->text_node_classification = self::TEXT_IS_GENERIC; + $this->attributes_parsed = true; + $this->bytes_already_parsed = $doc_length; return true; } @@ -1074,12 +1064,16 @@ private function base_class_next_token(): bool { goto full_parse; } - $this->parser_state = self::STATE_TEXT_NODE; - $this->token_starts_at = $was_at; - $this->text_starts_at = $was_at; - $this->token_length = $at - $was_at; - $this->text_length = $at - $was_at; - $this->bytes_already_parsed = $at; + $this->parser_state = self::STATE_TEXT_NODE; + $this->token_starts_at = $was_at; + $this->text_starts_at = $was_at; + $this->token_length = $at - $was_at; + $this->text_length = $at - $was_at; + $this->tag_name_starts_at = null; + $this->tag_name_length = null; + $this->text_node_classification = self::TEXT_IS_GENERIC; + $this->attributes_parsed = true; + $this->bytes_already_parsed = $at; return true; } @@ -1091,19 +1085,19 @@ private function base_class_next_token(): bool { } if ( ( $first_char >= 'a' && $first_char <= 'z' ) || ( $first_char >= 'A' && $first_char <= 'Z' ) ) { - $tag_at = $at + 1 + ( $is_closer ? 
1 : 0 ); - - $this->token_starts_at = $at; - $this->is_closing_tag = $is_closer; - $this->parser_state = self::STATE_MATCHED_TAG; - $this->tag_name_starts_at = $tag_at; - $this->tag_name_length = strcspn( $html, " \t\f\r\n/>", $tag_at ); - - $after_name = $tag_at + $this->tag_name_length; - - // Fast-scan past all attributes and find the tag-closing '>'. - $this->attribute_scan_from = $after_name; - $this->attributes_parsed = false; + $tag_at = $at + 1 + ( $is_closer ? 1 : 0 ); + $tag_length = strcspn( $html, " \t\f\r\n/>", $tag_at ); + $after_name = $tag_at + $tag_length; + + $this->token_starts_at = $at; + $this->is_closing_tag = $is_closer; + $this->tag_name_starts_at = $tag_at; + $this->tag_name_length = $tag_length; + $this->text_starts_at = 0; + $this->text_length = 0; + $this->text_node_classification = self::TEXT_IS_GENERIC; + $this->attribute_scan_from = $after_name; + $this->attributes_parsed = false; // Fast path: '>' immediately after tag name. if ( $after_name < $doc_length && '>' === $html[ $after_name ] ) { @@ -1128,6 +1122,18 @@ private function base_class_next_token(): bool { // Complex token: fall through to full parse_next_tag(). full_parse: + /* + * Reset state for the full parse path. + */ + $this->parser_state = self::STATE_READY; + $this->tag_name_starts_at = null; + $this->tag_name_length = null; + $this->text_starts_at = 0; + $this->text_length = 0; + $this->text_node_classification = self::TEXT_IS_GENERIC; + $this->attribute_scan_from = null; + $this->attributes_parsed = true; + if ( false === $this->parse_next_tag() ) { if ( self::STATE_INCOMPLETE_INPUT === $this->parser_state ) { $this->bytes_already_parsed = $was_at; From 0c9f3065e0d1f9e9da6fecabba480b9ba428f18d Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Fri, 13 Mar 2026 16:35:41 +0100 Subject: [PATCH 22/43] Perf: Add tag name length filter before special element check Special elements (SCRIPT, STYLE, TEXTAREA, etc.) have name lengths of 3, 5, 6, 7, or 8 characters. 
Adding a quick length check before calling get_tag() avoids substr() + strtoupper() allocations for the many common tags (p, li, span, etc.) that pass the first-letter check but can never be special elements. --- src/wp-includes/html-api/class-wp-html-tag-processor.php | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/wp-includes/html-api/class-wp-html-tag-processor.php b/src/wp-includes/html-api/class-wp-html-tag-processor.php index 24684698c3186..cb9735086cd60 100644 --- a/src/wp-includes/html-api/class-wp-html-tag-processor.php +++ b/src/wp-includes/html-api/class-wp-html-tag-processor.php @@ -1185,6 +1185,15 @@ private function base_class_next_token(): bool { return true; } + /* + * Quick length filter: special elements have name lengths 3, 5, 6, 7, or 8. + * Tags with other lengths can be returned immediately. + */ + $special_tag_name_length = $this->tag_name_length; + if ( $special_tag_name_length < 3 || $special_tag_name_length > 8 || 4 === $special_tag_name_length ) { + return true; + } + $tag_name = $this->get_tag(); /* From 13a7d3f52c2b2c9f33ae23ccd723ead27204021c Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Fri, 13 Mar 2026 16:36:50 +0100 Subject: [PATCH 23/43] Perf: Reorder special element checks to put length filter before strspn Move the tag name length filter before the strspn first-letter check. This avoids a function call for the many common opening tags with non-matching lengths (a, p, li, span, etc.), which previously always incurred the strspn call before being returned.
--- .../html-api/class-wp-html-tag-processor.php | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/src/wp-includes/html-api/class-wp-html-tag-processor.php b/src/wp-includes/html-api/class-wp-html-tag-processor.php index cb9735086cd60..0fe0450b70de0 100644 --- a/src/wp-includes/html-api/class-wp-html-tag-processor.php +++ b/src/wp-includes/html-api/class-wp-html-tag-processor.php @@ -1179,18 +1179,21 @@ private function base_class_next_token(): bool { */ if ( $this->is_closing_tag || - 'html' !== $this->parsing_namespace || - 1 !== strspn( $html, 'iIlLnNpPsStTxX', $this->tag_name_starts_at, 1 ) + 'html' !== $this->parsing_namespace ) { return true; } /* * Quick length filter: special elements have name lengths 3, 5, 6, 7, or 8. - * Tags with other lengths can be returned immediately. + * Checking length before the first-letter strspn avoids a function call for + * the many common tags (a, p, li, div, span, etc.) with non-matching lengths. */ $special_tag_name_length = $this->tag_name_length; - if ( $special_tag_name_length < 3 || $special_tag_name_length > 8 || 4 === $special_tag_name_length ) { + if ( + $special_tag_name_length < 3 || $special_tag_name_length > 8 || 4 === $special_tag_name_length || + 1 !== strspn( $html, 'iIlLnNpPsStTxX', $this->tag_name_starts_at, 1 ) + ) { return true; } From f2c0421d7eb6def947bb98f779d2f1ad2b6e2a64 Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Fri, 13 Mar 2026 16:39:42 +0100 Subject: [PATCH 24/43] Perf: Optimize attribute scanner for common name="value" pattern Restructure skip_attributes_and_find_closer() to check for '=' and quote characters directly after the attribute name, avoiding two strspn() calls per attribute that almost always return 0. Well-formed attributes like class="foo" now go through a fast path with only byte-level comparisons + a single strpos for the closing quote. 
--- .../html-api/class-wp-html-tag-processor.php | 107 ++++++++++++++---- 1 file changed, 83 insertions(+), 24 deletions(-) diff --git a/src/wp-includes/html-api/class-wp-html-tag-processor.php b/src/wp-includes/html-api/class-wp-html-tag-processor.php index 0fe0450b70de0..378d95ffe89ea 100644 --- a/src/wp-includes/html-api/class-wp-html-tag-processor.php +++ b/src/wp-includes/html-api/class-wp-html-tag-processor.php @@ -2478,42 +2478,101 @@ private function skip_attributes_and_find_closer( string $html, int $doc_length return false; } - // Skip whitespace after attribute name. - $at += strspn( $html, " \t\f\r\n", $at ); - if ( $at >= $doc_length ) { - $this->parser_state = self::STATE_INCOMPLETE_INPUT; - return false; - } + /* + * Fast path for the most common pattern: name="value" + * where '=' immediately follows the name and a quote + * immediately follows '='. Avoids two strspn calls + * that typically return 0. + */ + if ( '=' === $html[ $at ] ) { + ++$at; + if ( $at >= $doc_length ) { + $this->parser_state = self::STATE_INCOMPLETE_INPUT; + return false; + } + + $quote_char = $html[ $at ]; + if ( '"' === $quote_char || "'" === $quote_char ) { + $end_quote_at = strpos( $html, $quote_char, $at + 1 ); + if ( false === $end_quote_at ) { + $this->parser_state = self::STATE_INCOMPLETE_INPUT; + return false; + } + $at = $end_quote_at + 1; + if ( $at >= $doc_length ) { + $this->parser_state = self::STATE_INCOMPLETE_INPUT; + return false; + } + continue; + } + + // Whitespace after '=': skip and read value. 
+ if ( ' ' === $quote_char || "\t" === $quote_char || "\f" === $quote_char || "\r" === $quote_char || "\n" === $quote_char ) { + $at += strspn( $html, " \t\f\r\n", $at ); + if ( $at >= $doc_length ) { + $this->parser_state = self::STATE_INCOMPLETE_INPUT; + return false; + } + + if ( '"' === $html[ $at ] || "'" === $html[ $at ] ) { + $end_quote_at = strpos( $html, $html[ $at ], $at + 1 ); + if ( false === $end_quote_at ) { + $this->parser_state = self::STATE_INCOMPLETE_INPUT; + return false; + } + $at = $end_quote_at + 1; + if ( $at >= $doc_length ) { + $this->parser_state = self::STATE_INCOMPLETE_INPUT; + return false; + } + continue; + } + } - // No value, boolean attribute. - if ( '=' !== $html[ $at ] ) { + // Unquoted value. + $at += strcspn( $html, "> \t\f\r\n", $at ); continue; } - ++$at; - // Skip whitespace after '='. - $at += strspn( $html, " \t\f\r\n", $at ); - if ( $at >= $doc_length ) { - $this->parser_state = self::STATE_INCOMPLETE_INPUT; - return false; - } + // No '=' immediately: check for whitespace before '='. + if ( ' ' === $html[ $at ] || "\t" === $html[ $at ] || "\f" === $html[ $at ] || "\r" === $html[ $at ] || "\n" === $html[ $at ] ) { + $at += strspn( $html, " \t\f\r\n", $at ); + if ( $at >= $doc_length ) { + $this->parser_state = self::STATE_INCOMPLETE_INPUT; + return false; + } - switch ( $html[ $at ] ) { - case "'": - case '"': - $end_quote_at = strpos( $html, $html[ $at ], $at + 1 ); - if ( false === $end_quote_at ) { + // After whitespace, check for '=' to distinguish value from next attribute. 
+ if ( '=' === $html[ $at ] ) { + ++$at; + $at += strspn( $html, " \t\f\r\n", $at ); + if ( $at >= $doc_length ) { $this->parser_state = self::STATE_INCOMPLETE_INPUT; return false; } - $at = $end_quote_at + 1; - break; - default: + if ( '"' === $html[ $at ] || "'" === $html[ $at ] ) { + $end_quote_at = strpos( $html, $html[ $at ], $at + 1 ); + if ( false === $end_quote_at ) { + $this->parser_state = self::STATE_INCOMPLETE_INPUT; + return false; + } + $at = $end_quote_at + 1; + if ( $at >= $doc_length ) { + $this->parser_state = self::STATE_INCOMPLETE_INPUT; + return false; + } + continue; + } + + // Unquoted value. $at += strcspn( $html, "> \t\f\r\n", $at ); - break; + continue; + } } + // Boolean attribute (no value). + if ( $at >= $doc_length ) { $this->parser_state = self::STATE_INCOMPLETE_INPUT; return false; From ac24e7de4319e5d51ad7a20a38b2640c98a06104 Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Fri, 13 Mar 2026 16:43:52 +0100 Subject: [PATCH 25/43] Perf: Inline single-space and '>' checks in attribute scanner loop Replace the strspn() call at the top of the attribute scanning loop with direct byte comparisons for the two most common cases: a single space separator between attributes and the '>' tag closer. Only falls back to strspn() for uncommon whitespace characters (tabs, newlines, form feeds, slashes). --- .../html-api/class-wp-html-tag-processor.php | 36 +++++++++++++------ 1 file changed, 25 insertions(+), 11 deletions(-) diff --git a/src/wp-includes/html-api/class-wp-html-tag-processor.php b/src/wp-includes/html-api/class-wp-html-tag-processor.php index 378d95ffe89ea..8a38fd89807ec 100644 --- a/src/wp-includes/html-api/class-wp-html-tag-processor.php +++ b/src/wp-includes/html-api/class-wp-html-tag-processor.php @@ -2447,25 +2447,39 @@ private function skip_attributes_and_find_closer( string $html, int $doc_length } while ( true ) { - // Skip whitespace and slashes. 
- $at += strspn( $html, " \t\f\r\n/", $at ); - if ( $at >= $doc_length ) { - $this->parser_state = self::STATE_INCOMPLETE_INPUT; - return false; + /* + * Skip whitespace and slashes between attributes. + * Fast path: single space (most common) or '>' (tag closer). + */ + $c = $html[ $at ] ?? ''; + if ( '>' === $c ) { + return $at; + } + if ( ' ' === $c ) { + ++$at; + if ( $at >= $doc_length ) { + $this->parser_state = self::STATE_INCOMPLETE_INPUT; + return false; + } + } elseif ( "\t" === $c || "\r" === $c || "\n" === $c || "\f" === $c || '/' === $c ) { + $at += strspn( $html, " \t\f\r\n/", $at ); + if ( $at >= $doc_length ) { + $this->parser_state = self::STATE_INCOMPLETE_INPUT; + return false; + } } /* * Treat the equal sign as a part of the attribute * name if it is the first encountered byte. */ - $name_length = '=' === $html[ $at ] - ? 1 + strcspn( $html, "=/> \t\f\r\n", $at + 1 ) - : strcspn( $html, "=/> \t\f\r\n", $at ); - - // No attribute name means we've reached the tag closer. - if ( 0 === $name_length ) { + $c = $html[ $at ]; + if ( '>' === $c ) { return $at; } + $name_length = '=' === $c + ? 1 + strcspn( $html, "=/> \t\f\r\n", $at + 1 ) + : strcspn( $html, "=/> \t\f\r\n", $at ); if ( $at + $name_length >= $doc_length ) { $this->parser_state = self::STATE_INCOMPLETE_INPUT; From 387be39f4800f262d3b4ed8e1f609e1d9204809f Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Fri, 13 Mar 2026 16:45:45 +0100 Subject: [PATCH 26/43] Perf: Remove redundant state check and property write from hot path Remove the STATE_COMPLETE check from base_class_next_token() since the $at >= $doc_length bounds check already handles this case. Also remove the text_node_classification property write from the tag fast path since this value is never read for tag tokens. 
--- .../html-api/class-wp-html-tag-processor.php | 22 ++++++++----------- 1 file changed, 9 insertions(+), 13 deletions(-) diff --git a/src/wp-includes/html-api/class-wp-html-tag-processor.php b/src/wp-includes/html-api/class-wp-html-tag-processor.php index 8a38fd89807ec..811de3a1636d5 100644 --- a/src/wp-includes/html-api/class-wp-html-tag-processor.php +++ b/src/wp-includes/html-api/class-wp-html-tag-processor.php @@ -1007,10 +1007,7 @@ private function base_class_next_token(): bool { } // Don't proceed if there's nothing more to scan. - if ( - self::STATE_COMPLETE === $this->parser_state || - self::STATE_INCOMPLETE_INPUT === $this->parser_state - ) { + if ( self::STATE_INCOMPLETE_INPUT === $this->parser_state ) { return false; } @@ -1089,15 +1086,14 @@ private function base_class_next_token(): bool { $tag_length = strcspn( $html, " \t\f\r\n/>", $tag_at ); $after_name = $tag_at + $tag_length; - $this->token_starts_at = $at; - $this->is_closing_tag = $is_closer; - $this->tag_name_starts_at = $tag_at; - $this->tag_name_length = $tag_length; - $this->text_starts_at = 0; - $this->text_length = 0; - $this->text_node_classification = self::TEXT_IS_GENERIC; - $this->attribute_scan_from = $after_name; - $this->attributes_parsed = false; + $this->token_starts_at = $at; + $this->is_closing_tag = $is_closer; + $this->tag_name_starts_at = $tag_at; + $this->tag_name_length = $tag_length; + $this->text_starts_at = 0; + $this->text_length = 0; + $this->attribute_scan_from = $after_name; + $this->attributes_parsed = false; // Fast path: '>' immediately after tag name. if ( $after_name < $doc_length && '>' === $html[ $after_name ] ) { From 901a97c4c7862af57a05c2918706d510acbf5824 Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Fri, 13 Mar 2026 16:48:48 +0100 Subject: [PATCH 27/43] Perf: Use null text_starts_at for tags instead of setting two properties Set text_starts_at to null instead of 0 for tag tokens in the fast path, and remove the text_length = 0 write. 
get_modifiable_text() already returns '' when text_starts_at is null, making the text_length write redundant. Saves one property write per tag token. --- src/wp-includes/html-api/class-wp-html-tag-processor.php | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/wp-includes/html-api/class-wp-html-tag-processor.php b/src/wp-includes/html-api/class-wp-html-tag-processor.php index 811de3a1636d5..6212c6d42cdb4 100644 --- a/src/wp-includes/html-api/class-wp-html-tag-processor.php +++ b/src/wp-includes/html-api/class-wp-html-tag-processor.php @@ -1090,8 +1090,7 @@ private function base_class_next_token(): bool { $this->is_closing_tag = $is_closer; $this->tag_name_starts_at = $tag_at; $this->tag_name_length = $tag_length; - $this->text_starts_at = 0; - $this->text_length = 0; + $this->text_starts_at = null; $this->attribute_scan_from = $after_name; $this->attributes_parsed = false; From 8843ddbcca31d95202210b349d129a4011b1c38e Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Fri, 13 Mar 2026 16:51:22 +0100 Subject: [PATCH 28/43] Perf: Avoid redundant bytes_already_parsed property read Cache bytes_already_parsed in a local variable at function entry and only re-read the property when lexical updates have been applied, as those may adjust the position. Saves one property read per token in the common read-only tokenization case. 
--- src/wp-includes/html-api/class-wp-html-tag-processor.php | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/wp-includes/html-api/class-wp-html-tag-processor.php b/src/wp-includes/html-api/class-wp-html-tag-processor.php index 6212c6d42cdb4..e2e60ed5a6bba 100644 --- a/src/wp-includes/html-api/class-wp-html-tag-processor.php +++ b/src/wp-includes/html-api/class-wp-html-tag-processor.php @@ -978,6 +978,7 @@ public function next_token(): bool { */ private function base_class_next_token(): bool { $was_at = $this->bytes_already_parsed; + $at = $was_at; /* * Apply attribute updates and clean up the previous tag. @@ -1004,6 +1005,8 @@ private function base_class_next_token(): bool { $this->lexical_updates[] = $update; unset( $this->lexical_updates[ $name ] ); } + + $at = $this->bytes_already_parsed; } // Don't proceed if there's nothing more to scan. @@ -1014,8 +1017,6 @@ private function base_class_next_token(): bool { $html = $this->html; $doc_length = $this->doc_length; - $at = $this->bytes_already_parsed; - if ( $at >= $doc_length ) { $this->parser_state = self::STATE_COMPLETE; return false; From 75dd6a3e27f1fbc44663e3f0ac2ef93a10f2aeff Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Fri, 13 Mar 2026 17:05:58 +0100 Subject: [PATCH 29/43] Perf: Remove attribute_scan_from property, compute on demand Eliminates the attribute_scan_from property entirely. The scan position is now computed as tag_name_starts_at + tag_name_length in ensure_attributes_parsed() on demand, avoiding ~646K property writes per benchmark iteration. 
--- .../html-api/class-wp-html-tag-processor.php | 15 ++------------- 1 file changed, 2 insertions(+), 13 deletions(-) diff --git a/src/wp-includes/html-api/class-wp-html-tag-processor.php b/src/wp-includes/html-api/class-wp-html-tag-processor.php index e2e60ed5a6bba..1ef8d35a75ba1 100644 --- a/src/wp-includes/html-api/class-wp-html-tag-processor.php +++ b/src/wp-includes/html-api/class-wp-html-tag-processor.php @@ -692,13 +692,6 @@ class WP_HTML_Tag_Processor { */ private $is_closing_tag; - /** - * Byte offset where attribute scanning should start for lazy parsing. - * - * @var int|null - */ - private $attribute_scan_from = null; - /** * Whether attributes have been parsed and stored for the current tag. * @@ -1092,7 +1085,6 @@ private function base_class_next_token(): bool { $this->tag_name_starts_at = $tag_at; $this->tag_name_length = $tag_length; $this->text_starts_at = null; - $this->attribute_scan_from = $after_name; $this->attributes_parsed = false; // Fast path: '>' immediately after tag name. @@ -1127,7 +1119,6 @@ private function base_class_next_token(): bool { $this->text_starts_at = 0; $this->text_length = 0; $this->text_node_classification = self::TEXT_IS_GENERIC; - $this->attribute_scan_from = null; $this->attributes_parsed = true; if ( false === $this->parse_next_tag() ) { @@ -1143,7 +1134,6 @@ private function base_class_next_token(): bool { } // Tag found by parse_next_tag — scan attributes. 
- $this->attribute_scan_from = $this->bytes_already_parsed; $this->attributes_parsed = false; $tag_ends_at = $this->skip_attributes_and_find_closer( $html, $doc_length ); if ( false === $tag_ends_at ) { @@ -2608,12 +2598,12 @@ private function ensure_attributes_parsed(): void { $this->attributes = array(); $this->duplicate_attributes = null; - if ( null === $this->attribute_scan_from || $this->is_closing_tag ) { + if ( null === $this->tag_name_starts_at || $this->is_closing_tag ) { return; } $saved_at = $this->bytes_already_parsed; - $this->bytes_already_parsed = $this->attribute_scan_from; + $this->bytes_already_parsed = $this->tag_name_starts_at + $this->tag_name_length; while ( $this->parse_next_attribute() ) { continue; @@ -2693,7 +2683,6 @@ private function after_tag(): void { $this->is_closing_tag = null; $this->comment_type = null; $this->text_node_classification = self::TEXT_IS_GENERIC; - $this->attribute_scan_from = null; $this->attributes_parsed = true; } From 8e8dc9a6bcb726f6d6e0c506f9e0d941fdbb8527 Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Fri, 13 Mar 2026 17:07:57 +0100 Subject: [PATCH 30/43] Perf: Remove unnecessary attributes_parsed write from text nodes All callers of ensure_attributes_parsed() guard with STATE_MATCHED_TAG check, so the attributes_parsed flag is never read for text nodes. Removing this write saves ~378K property writes per benchmark iteration. 
--- src/wp-includes/html-api/class-wp-html-tag-processor.php | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/wp-includes/html-api/class-wp-html-tag-processor.php b/src/wp-includes/html-api/class-wp-html-tag-processor.php index 1ef8d35a75ba1..841fb13400c63 100644 --- a/src/wp-includes/html-api/class-wp-html-tag-processor.php +++ b/src/wp-includes/html-api/class-wp-html-tag-processor.php @@ -1036,7 +1036,6 @@ private function base_class_next_token(): bool { $this->tag_name_starts_at = null; $this->tag_name_length = null; $this->text_node_classification = self::TEXT_IS_GENERIC; - $this->attributes_parsed = true; $this->bytes_already_parsed = $doc_length; return true; } @@ -1063,7 +1062,6 @@ private function base_class_next_token(): bool { $this->tag_name_starts_at = null; $this->tag_name_length = null; $this->text_node_classification = self::TEXT_IS_GENERIC; - $this->attributes_parsed = true; $this->bytes_already_parsed = $at; return true; } From 3e77efc4729350d27cdbcdfc9bdde322cbfe8006 Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Fri, 13 Mar 2026 17:10:49 +0100 Subject: [PATCH 31/43] Perf: Skip tag_name null writes for text nodes, restructure get_tag() Text nodes don't need to null out tag_name_starts_at and tag_name_length. Instead, restructure get_tag() to check parser_state first, which correctly returns null for non-tag tokens without requiring the null sentinel. Saves ~756K property writes per benchmark iteration. 
--- .../html-api/class-wp-html-tag-processor.php | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/src/wp-includes/html-api/class-wp-html-tag-processor.php b/src/wp-includes/html-api/class-wp-html-tag-processor.php index 841fb13400c63..0185d15951b98 100644 --- a/src/wp-includes/html-api/class-wp-html-tag-processor.php +++ b/src/wp-includes/html-api/class-wp-html-tag-processor.php @@ -1033,8 +1033,6 @@ private function base_class_next_token(): bool { $this->text_starts_at = $was_at; $this->token_length = $doc_length - $was_at; $this->text_length = $doc_length - $was_at; - $this->tag_name_starts_at = null; - $this->tag_name_length = null; $this->text_node_classification = self::TEXT_IS_GENERIC; $this->bytes_already_parsed = $doc_length; return true; @@ -1059,8 +1057,6 @@ private function base_class_next_token(): bool { $this->text_starts_at = $was_at; $this->token_length = $at - $was_at; $this->text_length = $at - $was_at; - $this->tag_name_starts_at = null; - $this->tag_name_length = null; $this->text_node_classification = self::TEXT_IS_GENERIC; $this->bytes_already_parsed = $at; return true; @@ -3252,21 +3248,19 @@ public function get_namespace(): string { * @return string|null Name of currently matched tag in input HTML, or `null` if none found. 
*/ public function get_tag(): ?string { - if ( null === $this->tag_name_starts_at ) { - return null; + if ( self::STATE_MATCHED_TAG === $this->parser_state ) { + return strtoupper( substr( $this->html, $this->tag_name_starts_at, $this->tag_name_length ) ); } - $tag_name = substr( $this->html, $this->tag_name_starts_at, $this->tag_name_length ); - - if ( self::STATE_MATCHED_TAG === $this->parser_state ) { - return strtoupper( $tag_name ); + if ( null === $this->tag_name_starts_at ) { + return null; } if ( self::STATE_COMMENT === $this->parser_state && self::COMMENT_AS_PI_NODE_LOOKALIKE === $this->get_comment_type() ) { - return $tag_name; + return substr( $this->html, $this->tag_name_starts_at, $this->tag_name_length ); } return null; From 6a3ac015787a7c0bd47e3871674604c69697b594 Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Fri, 13 Mar 2026 17:14:08 +0100 Subject: [PATCH 32/43] Perf: Short-circuit closing tags in fast path, avoid after_tag_match Closing tags never need special element processing, so return immediately from the fast path instead of goto after_tag_match. Avoids property reads for is_closing_tag and parsing_namespace checks for ~300K+ closing tags. Also avoid re-reading bytes_already_parsed for token_length computation. 
--- src/wp-includes/html-api/class-wp-html-tag-processor.php | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/wp-includes/html-api/class-wp-html-tag-processor.php b/src/wp-includes/html-api/class-wp-html-tag-processor.php index 0185d15951b98..3eaff090b6552 100644 --- a/src/wp-includes/html-api/class-wp-html-tag-processor.php +++ b/src/wp-includes/html-api/class-wp-html-tag-processor.php @@ -1096,7 +1096,11 @@ private function base_class_next_token(): bool { $this->parser_state = self::STATE_MATCHED_TAG; $this->bytes_already_parsed = $tag_ends_at + 1; - $this->token_length = $this->bytes_already_parsed - $at; + $this->token_length = $tag_ends_at + 1 - $at; + + if ( $is_closer ) { + return true; + } goto after_tag_match; } From 8a00e32e0267e4f3f28740a99a6cd07b0c85afba Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Fri, 13 Mar 2026 17:15:18 +0100 Subject: [PATCH 33/43] Perf: Move closer check before after_tag_match for full_parse path Both fast path and full_parse path now return early for closing tags before reaching after_tag_match, which now only handles opening tags. Eliminates is_closing_tag property read from the shared after_tag_match section. 
--- src/wp-includes/html-api/class-wp-html-tag-processor.php | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/wp-includes/html-api/class-wp-html-tag-processor.php b/src/wp-includes/html-api/class-wp-html-tag-processor.php index 3eaff090b6552..fb4afc51808b5 100644 --- a/src/wp-includes/html-api/class-wp-html-tag-processor.php +++ b/src/wp-includes/html-api/class-wp-html-tag-processor.php @@ -1144,6 +1144,10 @@ private function base_class_next_token(): bool { $this->bytes_already_parsed = $tag_ends_at + 1; $this->token_length = $this->bytes_already_parsed - $this->token_starts_at; + if ( $this->is_closing_tag ) { + return true; + } + after_tag_match: /* @@ -1161,10 +1165,7 @@ private function base_class_next_token(): bool { * - TITLE * - XMP (deprecated) */ - if ( - $this->is_closing_tag || - 'html' !== $this->parsing_namespace - ) { + if ( 'html' !== $this->parsing_namespace ) { return true; } From 0aa5ce2f6977b7c1ec638145b796eddbc63b55ae Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Fri, 13 Mar 2026 17:22:07 +0100 Subject: [PATCH 34/43] Perf: Skip strpos when already at '<' for tag tokens MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Check for '<' at current position before calling strpos(). When tokens alternate text→tag, the tag iteration starts at '<' and can skip strpos entirely. Since ~63% of tokens are tags, this eliminates ~646K strpos calls per benchmark iteration. 
--- .../html-api/class-wp-html-tag-processor.php | 32 +++++++++---------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/src/wp-includes/html-api/class-wp-html-tag-processor.php b/src/wp-includes/html-api/class-wp-html-tag-processor.php index fb4afc51808b5..d47187254e0dd 100644 --- a/src/wp-includes/html-api/class-wp-html-tag-processor.php +++ b/src/wp-includes/html-api/class-wp-html-tag-processor.php @@ -1018,28 +1018,28 @@ private function base_class_next_token(): bool { /* * Fast path: handle the two most common token types inline. * - * 1. Text nodes: text between tags (strpos finds next '<'). - * 2. Regular tags: '<' followed by alpha or '/'+alpha. + * 1. At '<': try to match a regular tag directly (skip strpos). + * 2. Text nodes: text between tags (strpos finds next '<'). * * Complex tokens (comments, DOCTYPE, CDATA, etc.) fall through * to the full parse_next_tag() method. */ - $at = strpos( $html, '<', $at ); + if ( '<' !== $html[ $at ] ) { + $at = strpos( $html, '<', $at ); - // No '<' found: the rest of the document is a text node. - if ( false === $at ) { - $this->parser_state = self::STATE_TEXT_NODE; - $this->token_starts_at = $was_at; - $this->text_starts_at = $was_at; - $this->token_length = $doc_length - $was_at; - $this->text_length = $doc_length - $was_at; - $this->text_node_classification = self::TEXT_IS_GENERIC; - $this->bytes_already_parsed = $doc_length; - return true; - } + // No '<' found: the rest of the document is a text node. + if ( false === $at ) { + $this->parser_state = self::STATE_TEXT_NODE; + $this->token_starts_at = $was_at; + $this->text_starts_at = $was_at; + $this->token_length = $doc_length - $was_at; + $this->text_length = $doc_length - $was_at; + $this->text_node_classification = self::TEXT_IS_GENERIC; + $this->bytes_already_parsed = $doc_length; + return true; + } - // Text before the '<': return it as a text node. - if ( $at > $was_at ) { + // Validate the '<' starts a valid token before returning text. 
$next_byte = $html[ $at + 1 ] ?? ''; if ( '!' !== $next_byte && '/' !== $next_byte && '?' !== $next_byte && From 073a6ab2c86f65a1f34fc646b7eca529428ea205 Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Fri, 13 Mar 2026 17:29:49 +0100 Subject: [PATCH 35/43] Perf: Remove text_starts_at null write for tags, use bounds check Instead of setting text_starts_at = null for every tag to prevent stale text in get_modifiable_text(), add a bounds check that detects stale text_starts_at (from previous text nodes) by comparing against token_starts_at. Saves ~646K property writes. --- src/wp-includes/html-api/class-wp-html-tag-processor.php | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/wp-includes/html-api/class-wp-html-tag-processor.php b/src/wp-includes/html-api/class-wp-html-tag-processor.php index d47187254e0dd..f31d6c7488d91 100644 --- a/src/wp-includes/html-api/class-wp-html-tag-processor.php +++ b/src/wp-includes/html-api/class-wp-html-tag-processor.php @@ -1078,7 +1078,6 @@ private function base_class_next_token(): bool { $this->is_closing_tag = $is_closer; $this->tag_name_starts_at = $tag_at; $this->tag_name_length = $tag_length; - $this->text_starts_at = null; $this->attributes_parsed = false; // Fast path: '>' immediately after tag name. @@ -4000,7 +3999,7 @@ public function subdivide_text_appropriately(): bool { public function get_modifiable_text(): string { $has_enqueued_update = isset( $this->lexical_updates['modifiable text'] ); - if ( ! $has_enqueued_update && ( null === $this->text_starts_at || 0 === $this->text_length ) ) { + if ( ! 
$has_enqueued_update && ( null === $this->text_starts_at || 0 === $this->text_length || $this->text_starts_at < $this->token_starts_at ) ) { return ''; } From ae0586c2039232b95b94b9e6833e21e5a6a01a7d Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Fri, 13 Mar 2026 17:30:22 +0100 Subject: [PATCH 36/43] Update autoresearch.md with optimizations 16-22 Current: ~330ms (52.8% faster than 699ms baseline) --- autoresearch.md | 73 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 73 insertions(+) create mode 100644 autoresearch.md diff --git a/autoresearch.md b/autoresearch.md new file mode 100644 index 0000000000000..b18bd49470442 --- /dev/null +++ b/autoresearch.md @@ -0,0 +1,73 @@ +# Autoresearch: HTML Tag Processor Performance + +## Objective +Optimize `WP_HTML_Tag_Processor::next_token()` tokenization throughput on html-standard.html (~large real-world HTML). The benchmark iterates all tokens with no modifications — purely read-only tokenization speed. + +## Metrics +- **Primary**: mean execution time (ms, lower is better) via `hyperfine` +- **Secondary**: peak memory (bytes, lower is better) via `/usr/bin/time -l` + +## How to Run +`./autoresearch.sh` — runs hyperfine, outputs `METRIC mean_ms=number` lines. 
+ +## Files in Scope +- `src/wp-includes/html-api/class-wp-html-tag-processor.php` — main parser, all hot path methods +- `src/wp-includes/html-api/class-wp-html-attribute-token.php` — attribute token object (6 props, allocated per attr) +- `src/wp-includes/html-api/class-wp-html-span.php` — span object (2 props, allocated on dup attrs) +- `src/wp-includes/html-api/class-wp-html-text-replacement.php` — text replacement (3 props, not in hot path for read-only) + +## Off Limits +- Test files +- `bench.php` and `bootstrap-html-api.php` +- Any file outside `src/wp-includes/html-api/` + +## Constraints +- PHPUnit tests must pass: `./vendor/bin/phpunit -c tests/phpunit/tests/html-api/phpunit.xml --stop-on-error --stop-on-failure --stop-on-warning --stop-on-defect` +- No new dependencies +- stddev and outliers from hyperfine must remain acceptable +- Changes must preserve all existing behavior + +## What's Been Tried + +### Baseline: ~699ms + +### Wins (cumulative, all committed) +1. **Replace per-attribute function call loop with skip_attributes_and_find_closer()** — eliminates parse_next_attribute(false) calls. Single method scans for `>` handling quoted values. +2. **Inline after_tag() into base_class_next_token()** — removes method call overhead per token. +3. **Inline fast paths for text nodes and regular tags** — handles the two most common token types (text ~378K, tags ~646K) directly in base_class_next_token, falling through to full parse_next_tag() only for complex tokens. +4. **Direct byte comparisons for single-char strspn** — replace strspn for single-character checks with direct `===` comparisons. +5. **Cache doc_length as instance variable** — avoid strlen() per token. +6. **Fast path for '>' immediately after tag name** — skip attribute scanning for tags like `<p>`, `<br>`.
+7. **Defer property resets to type-specific return paths** — text nodes only reset tag-related properties, tags only reset text-related properties. +8. **Tag name length filter before special element check** — special elements have lengths 3,5,6,7,8. Tags of other lengths return immediately without calling get_tag(). +9. **Reorder checks: length before strspn** — many common tags eliminated by cheap integer comparison before the strspn function call. +10. **Optimize attribute scanner for common name="value" pattern** — check for `=` and quote char directly after attribute name, avoiding two strspn() calls that typically return 0. +11. **Inline single-space and '>' checks in attribute scanner loop** — replace strspn for whitespace between attributes with direct byte comparisons for single-space (most common) and '>' (tag closer). +12. **Remove redundant STATE_COMPLETE check** — $at >= $doc_length bounds check handles this case. +13. **Remove text_node_classification write from tag fast path** — never read for tag tokens. +14. **Use null text_starts_at for tags** — allows removing text_length=0 write. get_modifiable_text() returns '' on null text_starts_at. +15. **Avoid redundant bytes_already_parsed property read** — use local $was_at for $at when no lexical updates. + +16. **Remove attribute_scan_from property** — compute scan position as tag_name_starts_at + tag_name_length on demand in ensure_attributes_parsed(). Eliminates property and 3 writes. +17. **Remove attributes_parsed write from text nodes** — all callers of ensure_attributes_parsed() guard with STATE_MATCHED_TAG check, so the flag is never read for non-tag tokens. +18. **Short-circuit closing tags before after_tag_match** — closing tags never need special element processing. Return early using local $is_closer instead of reading property through the shared label. +19. **Move closer check out of after_tag_match** — both fast path and full_parse path return early for closers.
after_tag_match now only handles openers, eliminating is_closing_tag read. +20. **Skip strpos when at '<'** — check for '<' at current position before calling strpos(). Tags (~63% of tokens) start at '<' and skip the function call entirely. +21. **Remove text_starts_at null write for tags** — use bounds check (text_starts_at < token_starts_at) in get_modifiable_text() to detect stale text instead of proactively nulling. +22. **Restructure get_tag() for state-based dispatch** — check STATE_MATCHED_TAG first instead of null check on tag_name_starts_at. Allows skipping tag_name null writes for text nodes (~756K writes eliminated). + +### Current: ~330ms (52.8% faster) + +### Dead Ends +- **First-letter bitwise OR + 7 comparisons** — replacing strspn('iIlLnNpPsStTxX',...) was WORSE (655→605ms regression). PHP bitwise string OR creates allocation; 7 comparisons slower than one C-level strspn. +- **substr_compare for special element names** — replacing get_tag()+switch with substr_compare+switch-on-length showed no measurable improvement. The special element check is already rare (filtered by length + first letter). Added code complexity for zero gain. +- **Simplified closer detection** — removing ternary `$is_closer ? 1 : 0` by computing $tag_at incrementally. Neutral result. +- **Local vars for after_tag_match** — passing tag_length/tag_at as locals through the goto label. Neutral — property reads are hot in PHP's cache. +- **Pass $at parameter to skip_attributes_and_find_closer** — extra function parameter overhead cancels the property write/read savings. + +### Architecture Notes +- **Token distribution**: ~646K tags, ~378K text nodes, ~247K attributes across ~1M tokens in html-standard.html +- **Text-tag alternation**: Most tokens alternate text→tag→text→tag. The strpos skip optimization exploits this — tags start at '<' so no search is needed. +- **PHP overhead dominates**: At 330ms / 1M tokens = 330ns/token. 
Property reads (~20-30ns each), property writes (~20-30ns), method dispatch (~50-100ns) are the main costs. +- **next_token()→base_class_next_token() dispatch**: ~1M extra method calls, but cannot be eliminated because get_updated_html() needs the base implementation. +- **Remaining writes per token**: text nodes ~7 writes, tags ~8 writes. Total ~8M writes per benchmark run at ~20ns each = ~160ms (48% of total). From 52190592707c998707cd08c09cfdda755ec0489e Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Fri, 13 Mar 2026 17:37:36 +0100 Subject: [PATCH 37/43] Perf: Replace attributes_parsed boolean with version-based staleness check Use attributes_parsed_at integer compared against token_starts_at to detect stale attribute data, eliminating ~646K attributes_parsed=false writes per parse iteration. The version check in ensure_attributes_parsed() automatically invalidates when a new token is parsed. --- .../html-api/class-wp-html-tag-processor.php | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/src/wp-includes/html-api/class-wp-html-tag-processor.php b/src/wp-includes/html-api/class-wp-html-tag-processor.php index f31d6c7488d91..0db25b87e6e15 100644 --- a/src/wp-includes/html-api/class-wp-html-tag-processor.php +++ b/src/wp-includes/html-api/class-wp-html-tag-processor.php @@ -693,11 +693,14 @@ class WP_HTML_Tag_Processor { private $is_closing_tag; /** - * Whether attributes have been parsed and stored for the current tag. + * The token_starts_at value when attributes were last parsed. * - * @var bool + * Used to detect whether cached attributes are stale. When this + * doesn't match token_starts_at, attributes need re-parsing. + * + * @var int */ - private $attributes_parsed = true; + private $attributes_parsed_at = -1; /** * Lazily-built index of attributes found within an HTML tag, keyed by the attribute name. 
@@ -1078,7 +1081,6 @@ private function base_class_next_token(): bool { $this->is_closing_tag = $is_closer; $this->tag_name_starts_at = $tag_at; $this->tag_name_length = $tag_length; - $this->attributes_parsed = false; // Fast path: '>' immediately after tag name. if ( $after_name < $doc_length && '>' === $html[ $after_name ] ) { @@ -1116,7 +1118,6 @@ private function base_class_next_token(): bool { $this->text_starts_at = 0; $this->text_length = 0; $this->text_node_classification = self::TEXT_IS_GENERIC; - $this->attributes_parsed = true; if ( false === $this->parse_next_tag() ) { if ( self::STATE_INCOMPLETE_INPUT === $this->parser_state ) { @@ -1131,7 +1132,6 @@ private function base_class_next_token(): bool { } // Tag found by parse_next_tag — scan attributes. - $this->attributes_parsed = false; $tag_ends_at = $this->skip_attributes_and_find_closer( $html, $doc_length ); if ( false === $tag_ends_at ) { $this->parser_state = self::STATE_INCOMPLETE_INPUT; @@ -2588,11 +2588,11 @@ private function skip_attributes_and_find_closer( string $html, int $doc_length * @ignore */ private function ensure_attributes_parsed(): void { - if ( $this->attributes_parsed ) { + if ( $this->attributes_parsed_at === $this->token_starts_at ) { return; } - $this->attributes_parsed = true; + $this->attributes_parsed_at = $this->token_starts_at; $this->attributes = array(); $this->duplicate_attributes = null; @@ -2681,7 +2681,7 @@ private function after_tag(): void { $this->is_closing_tag = null; $this->comment_type = null; $this->text_node_classification = self::TEXT_IS_GENERIC; - $this->attributes_parsed = true; + $this->attributes_parsed_at = -1; } /** @@ -5035,6 +5035,7 @@ public function get_updated_html(): string { * └←─┘ back up by strlen("em") + 1 ==> 3 */ $this->bytes_already_parsed = $before_current_tag; + $this->attributes_parsed_at = -1; $this->base_class_next_token(); return $this->html; From e5df992ff8ba0dcc0d4d7b40126792d038d57354 Mon Sep 17 00:00:00 2001 From: Jon Surrell 
Date: Fri, 13 Mar 2026 17:41:41 +0100 Subject: [PATCH 38/43] Perf: Pre-filter special element length in fast path before goto Check tag name length against special element lengths (3,5,6,7,8) before goto after_tag_match. Common tags with lengths 1,2,4 (a, p, br, li, span, code, etc.) return immediately, avoiding goto dispatch + namespace read + property reads for the majority of opener tags. --- .../html-api/class-wp-html-tag-processor.php | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/wp-includes/html-api/class-wp-html-tag-processor.php b/src/wp-includes/html-api/class-wp-html-tag-processor.php index 0db25b87e6e15..d153b7b44c9a6 100644 --- a/src/wp-includes/html-api/class-wp-html-tag-processor.php +++ b/src/wp-includes/html-api/class-wp-html-tag-processor.php @@ -1103,6 +1103,16 @@ private function base_class_next_token(): bool { return true; } + /* + * Quick length filter for special elements before goto. + * Special element names have lengths 3, 5, 6, 7, or 8. + * Common tags with other lengths (a, p, br, li, span, code, etc.) + * can return immediately without the goto dispatch. + */ + if ( $tag_length < 3 || $tag_length > 8 || 4 === $tag_length ) { + return true; + } + goto after_tag_match; } From fc047b49d394b291dd4acf8700f54fa98d0cb744 Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Fri, 13 Mar 2026 17:58:53 +0100 Subject: [PATCH 39/43] Perf: Merge STATE_INCOMPLETE_INPUT check into bounds check Remove the dedicated parser_state read for STATE_INCOMPLETE_INPUT at the top of the hot loop. Instead, set bytes_already_parsed = doc_length when incomplete input is detected, so the bounds check ($at >= $doc_length) handles it. This eliminates one property read per token (~1M reads per pass). 
--- .../html-api/class-wp-html-tag-processor.php | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/src/wp-includes/html-api/class-wp-html-tag-processor.php b/src/wp-includes/html-api/class-wp-html-tag-processor.php index d153b7b44c9a6..009709934f27d 100644 --- a/src/wp-includes/html-api/class-wp-html-tag-processor.php +++ b/src/wp-includes/html-api/class-wp-html-tag-processor.php @@ -1005,16 +1005,13 @@ private function base_class_next_token(): bool { $at = $this->bytes_already_parsed; } - // Don't proceed if there's nothing more to scan. - if ( self::STATE_INCOMPLETE_INPUT === $this->parser_state ) { - return false; - } - $html = $this->html; $doc_length = $this->doc_length; if ( $at >= $doc_length ) { - $this->parser_state = self::STATE_COMPLETE; + if ( self::STATE_INCOMPLETE_INPUT !== $this->parser_state ) { + $this->parser_state = self::STATE_COMPLETE; + } return false; } @@ -1090,7 +1087,7 @@ private function base_class_next_token(): bool { $tag_ends_at = $this->skip_attributes_and_find_closer( $html, $doc_length ); if ( false === $tag_ends_at ) { $this->parser_state = self::STATE_INCOMPLETE_INPUT; - $this->bytes_already_parsed = $was_at; + $this->bytes_already_parsed = $doc_length; return false; } } @@ -1131,7 +1128,7 @@ private function base_class_next_token(): bool { if ( false === $this->parse_next_tag() ) { if ( self::STATE_INCOMPLETE_INPUT === $this->parser_state ) { - $this->bytes_already_parsed = $was_at; + $this->bytes_already_parsed = $doc_length; } return false; @@ -1145,7 +1142,7 @@ private function base_class_next_token(): bool { $tag_ends_at = $this->skip_attributes_and_find_closer( $html, $doc_length ); if ( false === $tag_ends_at ) { $this->parser_state = self::STATE_INCOMPLETE_INPUT; - $this->bytes_already_parsed = $was_at; + $this->bytes_already_parsed = $doc_length; return false; } @@ -1263,7 +1260,7 @@ private function base_class_next_token(): bool { if ( ! 
$found_closer ) { $this->parser_state = self::STATE_INCOMPLETE_INPUT; - $this->bytes_already_parsed = $was_at; + $this->bytes_already_parsed = $doc_length; return false; } From 9c08c0e2fde80fdadaab6553b2ecd37124eabaf5 Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Fri, 13 Mar 2026 18:41:47 +0100 Subject: [PATCH 40/43] Update autoresearch.md with optimizations 23-25, dead ends, and ideas backlog Document new wins (attributes_parsed_at, fast-path length filter, merged incomplete check), failed experiments, detailed architecture notes from profiling data, and create ideas backlog for next session. --- autoresearch.ideas.md | 16 +++++++++++++++ autoresearch.md | 47 +++++++++++++++++++++++++++++++++---------- 2 files changed, 52 insertions(+), 11 deletions(-) create mode 100644 autoresearch.ideas.md diff --git a/autoresearch.ideas.md b/autoresearch.ideas.md new file mode 100644 index 0000000000000..3ff482776cbad --- /dev/null +++ b/autoresearch.ideas.md @@ -0,0 +1,16 @@ +# Autoresearch Ideas Backlog + +## High Priority (user-suggested) +- **Stack on_push/on_pop callbacks** — the HTML processor stack operations have push/pop callbacks. If these fire during tokenization (even indirectly), they could be significant overhead. Investigate whether any stack operations happen in the tag processor's read-only path, or whether these only apply to the HTML processor's tree-building. +- **Bookmark on_destroy callback** — bookmarks may have cleanup behavior that adds overhead. Check if any bookmark operations happen during pure tokenization. + +## Medium Priority +- **Lazy token_length** — derive from bytes_already_parsed - token_starts_at instead of writing per token. Saves ~1M writes/pass. Requires changing all read sites. +- **Lazy is_closing_tag** — derive from html bytes. Saves 1 write/tag but adds cost to reads. +- **Deferred property writes with lazy flush** — save all non-essential writes, flush on demand. Big win for read-only, slight overhead for read-write. 
Protected properties can't be deferred. +- **Single boolean for modification check** — replace 2 array reads with 1 boolean read in hot loop. + +## Low Priority / Speculative +- **Integer state constants** — replace string comparisons with int. API-breaking for protected parser_state. +- **Packed tag name properties** — combine tag_name_starts_at + tag_name_length into single int. +- **Static variable caching** — cache html/doc_length across calls. diff --git a/autoresearch.md b/autoresearch.md index b18bd49470442..3636b973f1433 100644 --- a/autoresearch.md +++ b/autoresearch.md @@ -47,7 +47,6 @@ Optimize `WP_HTML_Tag_Processor::next_token()` tokenization throughput on html-s 13. **Remove text_node_classification write from tag fast path** — never read for tag tokens. 14. **Use null text_starts_at for tags** — allows removing text_length=0 write. get_modifiable_text() returns '' on null text_starts_at. 15. **Avoid redundant bytes_already_parsed property read** — use local $was_at for $at when no lexical updates. - 16. **Remove attribute_scan_from property** — compute scan position as tag_name_starts_at + tag_name_length on demand in ensure_attributes_parsed(). Eliminates property and 3 writes. 17. **Remove attributes_parsed write from text nodes** — all callers of ensure_attributes_parsed() guard with STATE_MATCHED_TAG check, so the flag is never read for non-tag tokens. 18. **Short-circuit closing tags before after_tag_match** — closing tags never need special element processing. Return early using local $is_closer instead of reading property through the shared label. @@ -55,19 +54,45 @@ Optimize `WP_HTML_Tag_Processor::next_token()` tokenization throughput on html-s 20. **Skip strpos when at '<'** — check for '<' at current position before calling strpos(). Tags (~63% of tokens) start at '<' and skip the function call entirely. 21. 
**Remove text_starts_at null write for tags** — use bounds check (text_starts_at < token_starts_at) in get_modifiable_text() to detect stale text instead of proactively nulling. 22. **Restructure get_tag() for state-based dispatch** — check STATE_MATCHED_TAG first instead of null check on tag_name_starts_at. Allows skipping tag_name null writes for text nodes (~756K writes eliminated). +23. **Replace attributes_parsed boolean with version-based staleness check** — use attributes_parsed_at integer compared against token_starts_at. Eliminates ~646K attributes_parsed=false writes per parse iteration. +24. **Pre-filter special element length in fast path before goto** — check tag name length (3,5,6,7,8) before goto after_tag_match. Tags with lengths 1,2,4 (88% of all tags: a, p, br, li, span, code, etc.) return immediately. +25. **Merge STATE_INCOMPLETE_INPUT check into bounds check** — remove dedicated parser_state read at loop start. Set bytes_already_parsed=doc_length on incomplete input so the existing bounds check handles it. Eliminates 1 property read per token. -### Current: ~330ms (52.8% faster) +### Current: ~316ms (54.8% faster) ### Dead Ends -- **First-letter bitwise OR + 7 comparisons** — replacing strspn('iIlLnNpPsStTxX',...) was WORSE (655→605ms regression). PHP bitwise string OR creates allocation; 7 comparisons slower than one C-level strspn. -- **substr_compare for special element names** — replacing get_tag()+switch with substr_compare+switch-on-length showed no measurable improvement. The special element check is already rare (filtered by length + first letter). Added code complexity for zero gain. -- **Simplified closer detection** — removing ternary `$is_closer ? 1 : 0` by computing $tag_at incrementally. Neutral result. -- **Local vars for after_tag_match** — passing tag_length/tag_at as locals through the goto label. Neutral — property reads are hot in PHP's cache. 
-- **Pass $at parameter to skip_attributes_and_find_closer** — extra function parameter overhead cancels the property write/read savings. +- **First-letter bitwise OR + 7 comparisons** — replacing strspn('iIlLnNpPsStTxX',...) was WORSE. PHP bitwise string OR creates allocation; 7 comparisons slower than one C-level strspn. +- **substr_compare for special element names** — no measurable improvement. The special element check is already rare. +- **Simplified closer detection** — removing ternary `$is_closer ? 1 : 0` by computing $tag_at incrementally. Neutral. +- **Local vars for after_tag_match** — passing tag_length/tag_at as locals through the goto label. Neutral. +- **Pass $at parameter to skip_attributes_and_find_closer** — extra function parameter overhead cancels savings. +- **Add strspn first-letter check to fast path filter** — adding strspn('iIlLnNpPsStTxX') alongside the length filter. Neutral — length filter already catches 88% of tags. +- **Conditional text_node_classification write** — `if (TEXT_IS_GENERIC !== $this->text_node_classification)` before writing. Neutral — the conditional read costs the same as the write. +- **1-byte text node lookahead** — check `$html[$at+1] === '<'` before calling strpos. WORSE (~15ms regression). The extra branch on every text path hurts; strpos with memchr is already very fast for single bytes. +- **Length-3 first-letter filter in fast path** — for len=3 tags, check first letter against p/P/x/X (only PRE/XMP are special). Neutral — extra comparisons offset the savings from avoiding after_tag_match for ~74K div tags. +- **Single boolean has_pending_updates flag** — replace `classname_updates || lexical_updates` (2 reads) with a single boolean. Too invasive: 16+ modification sites need `$this->has_pending_updates = true`. Correctness concerns with clearing the flag. +- **Defer classname_updates check** — only check lexical_updates in hot loop, defer classname conversion. 
Incorrect: classname conversion requires current tag's attributes; deferring past cursor advance would use wrong attributes. ### Architecture Notes -- **Token distribution**: ~646K tags, ~378K text nodes, ~247K attributes across ~1M tokens in html-standard.html +- **Token distribution**: ~646K tags (325K openers, 321K closers), ~378K text nodes, ~247K attributes, 1 other, across ~1M tokens in html-standard.html +- **Tag name length distribution**: len=1: 184K (28%), len=2: 211K (33%), len=3: 75K (12%), len=4: 174K (27%), len=5+: 4K (0.6%). Length filter catches 88% of tags. +- **Attribute distribution**: ~517K tags without attributes, ~129K with attributes (~20%) +- **Text node length**: 73K are 1 byte, 22K are 2 bytes, 30K are 3 bytes, etc. Most are short (whitespace between tags). - **Text-tag alternation**: Most tokens alternate text→tag→text→tag. The strpos skip optimization exploits this — tags start at '<' so no search is needed. -- **PHP overhead dominates**: At 330ms / 1M tokens = 330ns/token. Property reads (~20-30ns each), property writes (~20-30ns), method dispatch (~50-100ns) are the main costs. -- **next_token()→base_class_next_token() dispatch**: ~1M extra method calls, but cannot be eliminated because get_updated_html() needs the base implementation. -- **Remaining writes per token**: text nodes ~7 writes, tags ~8 writes. Total ~8M writes per benchmark run at ~20ns each = ~160ms (48% of total). +- **PHP overhead dominates**: At 316ms / 1M tokens = 316ns/token (per pass, 3 passes). Property reads (~5-10ns each), property writes (~10-15ns), method dispatch (~10-20ns for JIT-optimized private calls). +- **next_token()→base_class_next_token() dispatch**: ~1M extra method calls, cannot be eliminated because get_updated_html() needs the base implementation. +- **Remaining property reads per token (hot path start)**: bytes_already_parsed, classname_updates, lexical_updates, html, doc_length = 5 reads. 
+- **Remaining property writes per token**: text nodes ~7, tags ~7. Total ~7M writes per benchmark pass. +- **Protected properties constrain optimization**: parser_state and text_node_classification are protected (read directly by WP_HTML_Processor subclass). Cannot defer or version-gate these without changing the subclass, which is off-limits. +- **after_tag() is dead code**: the method exists but is never called (fully inlined into base_class_next_token). Could be removed, but cosmetic. + +### Unexplored Ideas +- **Stack operations on_push/on_pop callbacks** — the HTML processor's open_elements stack has push/pop callbacks that fire during tree-building. These are not in scope for the tag processor benchmark, but if the benchmark changes to use the HTML processor, these callbacks could be significant overhead. +- **Bookmark on_destroy callback** — bookmarks have cleanup behavior. Not in hot path for read-only benchmark. +- **Lazy token_length computation** — token_length = bytes_already_parsed - token_starts_at for all fast-path tokens. Could eliminate 1 write per token (~1M writes/pass). But read sites are numerous and some (special elements, bookmarks) set token_length independently. Would need to change all read sites. +- **Lazy is_closing_tag computation** — derive from html[token_starts_at+1] === '/'. Saves 1 write per tag but adds 2 property reads + 1 byte access per read (many read sites including subclass). +- **Integer state constants** — replace string parser_state constants with integers for faster comparison. But parser_state is protected and used by external code with string comparisons. +- **Packed tag name properties** — store tag_name_starts_at and tag_name_length in a single 64-bit int. Saves 1 write, adds shift/mask to reads. Only useful if reads are rare (true for fast-path-filtered tags). +- **Static variable caching for $html/$doc_length** — cache across method calls. Saves ~1 property read/call. 
Shared across instances (problematic for multi-instance usage). +- **Deferred property writes with lazy flush** — store pending token data, only write to properties when external code reads them. Saves all property writes for read-only benchmark. Requires flush checks in all getter methods. Protected properties can't be deferred. +- **Eliminate classname_updates read in hot loop** — both classname_updates and lexical_updates are always empty in the benchmark. Replacing 2 array truthiness checks with a single boolean flag would save 1 read/token, but requires setting the flag in 16+ update methods. From a8527fdec8750bbdbd78a3154e7af7bdf5854e32 Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Fri, 13 Mar 2026 18:51:51 +0100 Subject: [PATCH 41/43] Perf: Eliminate token_length property writes by computing on demand token_length = bytes_already_parsed - token_starts_at holds at every write site. Derive the value on demand at the ~6 read sites instead of writing it at ~20 write sites (~1M writes/pass eliminated). 
--- .../html-api/class-wp-html-tag-processor.php | 37 +++++-------------- 1 file changed, 9 insertions(+), 28 deletions(-) diff --git a/src/wp-includes/html-api/class-wp-html-tag-processor.php b/src/wp-includes/html-api/class-wp-html-tag-processor.php index 009709934f27d..7e6254bb5b935 100644 --- a/src/wp-includes/html-api/class-wp-html-tag-processor.php +++ b/src/wp-includes/html-api/class-wp-html-tag-processor.php @@ -1032,7 +1032,6 @@ private function base_class_next_token(): bool { $this->parser_state = self::STATE_TEXT_NODE; $this->token_starts_at = $was_at; $this->text_starts_at = $was_at; - $this->token_length = $doc_length - $was_at; $this->text_length = $doc_length - $was_at; $this->text_node_classification = self::TEXT_IS_GENERIC; $this->bytes_already_parsed = $doc_length; @@ -1055,7 +1054,6 @@ private function base_class_next_token(): bool { $this->parser_state = self::STATE_TEXT_NODE; $this->token_starts_at = $was_at; $this->text_starts_at = $was_at; - $this->token_length = $at - $was_at; $this->text_length = $at - $was_at; $this->text_node_classification = self::TEXT_IS_GENERIC; $this->bytes_already_parsed = $at; @@ -1094,7 +1092,6 @@ private function base_class_next_token(): bool { $this->parser_state = self::STATE_MATCHED_TAG; $this->bytes_already_parsed = $tag_ends_at + 1; - $this->token_length = $tag_ends_at + 1 - $at; if ( $is_closer ) { return true; @@ -1148,7 +1145,6 @@ private function base_class_next_token(): bool { } $this->parser_state = self::STATE_MATCHED_TAG; $this->bytes_already_parsed = $tag_ends_at + 1; - $this->token_length = $this->bytes_already_parsed - $this->token_starts_at; if ( $this->is_closing_tag ) { return true; @@ -1218,7 +1214,7 @@ private function base_class_next_token(): bool { */ $tag_name_starts_at = $this->tag_name_starts_at; $tag_name_length = $this->tag_name_length; - $tag_ends_at = $this->token_starts_at + $this->token_length; + $tag_ends_at = $this->bytes_already_parsed; $this->ensure_attributes_parsed(); 
$attributes = $this->attributes; $duplicate_attributes = $this->duplicate_attributes; @@ -1272,7 +1268,6 @@ private function base_class_next_token(): bool { * the inner content of the tag. */ $this->token_starts_at = $was_at; - $this->token_length = $this->bytes_already_parsed - $this->token_starts_at; $this->text_starts_at = $tag_ends_at; $this->text_length = $this->tag_name_starts_at - $this->text_starts_at; $this->tag_name_starts_at = $tag_name_starts_at; @@ -1495,7 +1490,7 @@ public function set_bookmark( $name ): bool { return false; } - $this->bookmarks[ $name ] = new WP_HTML_Span( $this->token_starts_at, $this->token_length ); + $this->bookmarks[ $name ] = new WP_HTML_Span( $this->token_starts_at, $this->bytes_already_parsed - $this->token_starts_at ); return true; } @@ -1887,7 +1882,6 @@ private function parse_next_tag(): bool { $this->parser_state = self::STATE_TEXT_NODE; $this->token_starts_at = $was_at; $this->text_starts_at = $was_at; - $this->token_length = $at - $was_at; $this->text_length = $at - $was_at; $this->bytes_already_parsed = $at; return true; @@ -1967,7 +1961,6 @@ private function parse_next_tag(): bool { */ $this->parser_state = self::STATE_COMMENT; $this->comment_type = self::COMMENT_AS_ABRUPTLY_CLOSED_COMMENT; - $this->token_length = $closer_at + $span_of_dashes + 1 - $this->token_starts_at; // Only provide modifiable text if the token is long enough to contain it. 
if ( $span_of_dashes >= 2 ) { @@ -1998,7 +1991,6 @@ private function parse_next_tag(): bool { if ( $closer_at + 2 < $doc_length && '>' === $html[ $closer_at + 2 ] ) { $this->parser_state = self::STATE_COMMENT; $this->comment_type = self::COMMENT_AS_HTML_COMMENT; - $this->token_length = $closer_at + 3 - $this->token_starts_at; $this->text_starts_at = $this->token_starts_at + 4; $this->text_length = $closer_at - $this->text_starts_at; $this->bytes_already_parsed = $closer_at + 3; @@ -2012,7 +2004,6 @@ private function parse_next_tag(): bool { ) { $this->parser_state = self::STATE_COMMENT; $this->comment_type = self::COMMENT_AS_HTML_COMMENT; - $this->token_length = $closer_at + 4 - $this->token_starts_at; $this->text_starts_at = $this->token_starts_at + 4; $this->text_length = $closer_at - $this->text_starts_at; $this->bytes_already_parsed = $closer_at + 4; @@ -2044,7 +2035,6 @@ private function parse_next_tag(): bool { } $this->parser_state = self::STATE_DOCTYPE; - $this->token_length = $closer_at + 1 - $this->token_starts_at; $this->text_starts_at = $this->token_starts_at + 9; $this->text_length = $closer_at - $this->text_starts_at; $this->bytes_already_parsed = $closer_at + 1; @@ -2072,7 +2062,6 @@ private function parse_next_tag(): bool { $this->parser_state = self::STATE_CDATA_NODE; $this->text_starts_at = $at + 9; $this->text_length = $closer_at - $this->text_starts_at; - $this->token_length = $closer_at + 3 - $this->token_starts_at; $this->bytes_already_parsed = $closer_at + 3; return true; } @@ -2091,7 +2080,6 @@ private function parse_next_tag(): bool { $this->parser_state = self::STATE_COMMENT; $this->comment_type = self::COMMENT_AS_INVALID_HTML; - $this->token_length = $closer_at + 1 - $this->token_starts_at; $this->text_starts_at = $this->token_starts_at + 2; $this->text_length = $closer_at - $this->text_starts_at; $this->bytes_already_parsed = $closer_at + 1; @@ -2112,7 +2100,7 @@ private function parse_next_tag(): bool { * and require the proper closing 
`]]>` in those cases. */ if ( - $this->token_length >= 10 && + $this->bytes_already_parsed - $this->token_starts_at >= 10 && '[' === $html[ $this->token_starts_at + 2 ] && 'C' === $html[ $this->token_starts_at + 3 ] && 'D' === $html[ $this->token_starts_at + 4 ] && @@ -2149,7 +2137,6 @@ private function parse_next_tag(): bool { } $this->parser_state = self::STATE_PRESUMPTUOUS_TAG; - $this->token_length = $at + 2 - $this->token_starts_at; $this->bytes_already_parsed = $at + 2; return true; } @@ -2168,7 +2155,6 @@ private function parse_next_tag(): bool { $this->parser_state = self::STATE_COMMENT; $this->comment_type = self::COMMENT_AS_INVALID_HTML; - $this->token_length = $closer_at + 1 - $this->token_starts_at; $this->text_starts_at = $this->token_starts_at + 2; $this->text_length = $closer_at - $this->text_starts_at; $this->bytes_already_parsed = $closer_at + 1; @@ -2200,8 +2186,8 @@ private function parse_next_tag(): bool { * * @see https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-PITarget */ - if ( $this->token_length >= 5 && '?' === $html[ $closer_at - 1 ] ) { - $comment_text = substr( $html, $this->token_starts_at + 2, $this->token_length - 4 ); + if ( $closer_at + 1 - $this->token_starts_at >= 5 && '?' 
=== $html[ $closer_at - 1 ] ) { + $comment_text = substr( $html, $this->token_starts_at + 2, $closer_at + 1 - $this->token_starts_at - 4 ); $pi_target_length = strspn( $comment_text, 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ:_' ); if ( 0 < $pi_target_length ) { @@ -2243,7 +2229,6 @@ private function parse_next_tag(): bool { } $this->parser_state = self::STATE_FUNKY_COMMENT; - $this->token_length = $closer_at + 1 - $this->token_starts_at; $this->text_starts_at = $this->token_starts_at + 2; $this->text_length = $closer_at - $this->text_starts_at; $this->bytes_already_parsed = $closer_at + 1; @@ -2259,9 +2244,8 @@ private function parse_next_tag(): bool { */ $this->parser_state = self::STATE_TEXT_NODE; $this->token_starts_at = $was_at; - $this->token_length = $doc_length - $was_at; $this->text_starts_at = $was_at; - $this->text_length = $this->token_length; + $this->text_length = $doc_length - $was_at; $this->bytes_already_parsed = $doc_length; return true; } @@ -2680,7 +2664,6 @@ private function after_tag(): void { } $this->token_starts_at = null; - $this->token_length = null; $this->tag_name_starts_at = null; $this->tag_name_length = null; $this->text_starts_at = 0; @@ -2993,7 +2976,7 @@ public function seek( $bookmark_name ): bool { if ( $this->token_starts_at === $existing_bookmark->start && - $this->token_length === $existing_bookmark->length + $this->bytes_already_parsed - $this->token_starts_at === $existing_bookmark->length ) { return true; } @@ -3692,7 +3675,7 @@ public function has_self_closing_flag(): bool { *
* ^ this appears one character before the end of the closing ">". */ - return '/' === $this->html[ $this->token_starts_at + $this->token_length - 2 ]; + return '/' === $this->html[ $this->bytes_already_parsed - 2 ]; } /** @@ -3933,7 +3916,6 @@ public function subdivide_text_appropriately(): bool { */ $leading_nulls = strspn( $this->html, "\x00", $this->text_starts_at, $this->text_length ); if ( $leading_nulls > 0 ) { - $this->token_length = $leading_nulls; $this->text_length = $leading_nulls; $this->bytes_already_parsed = $this->token_starts_at + $leading_nulls; $this->text_node_classification = self::TEXT_IS_NULL_SEQUENCE; @@ -3966,7 +3948,6 @@ public function subdivide_text_appropriately(): bool { if ( $at > $this->text_starts_at ) { $new_length = $at - $this->text_starts_at; $this->text_length = $new_length; - $this->token_length = $new_length; $this->bytes_already_parsed = $at; $this->text_node_classification = self::TEXT_IS_WHITESPACE; return true; @@ -5164,7 +5145,7 @@ public function get_doctype_info(): ?WP_HTML_Doctype_Info { return null; } - return WP_HTML_Doctype_Info::from_doctype_token( substr( $this->html, $this->token_starts_at, $this->token_length ) ); + return WP_HTML_Doctype_Info::from_doctype_token( substr( $this->html, $this->token_starts_at, $this->bytes_already_parsed - $this->token_starts_at ) ); } /** From a89e78d346abdf18f9e71f8d7e9f45eb5119e0be Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Fri, 13 Mar 2026 19:05:59 +0100 Subject: [PATCH 42/43] Perf: Derive is_closing_tag from HTML bytes instead of storing per token Eliminate is_closing_tag property writes (~646K per pass) by computing '/' === html[token_starts_at + 1] at the ~8 read sites. Uses local variable in parse_next_tag() for the slow path. 
--- .../html-api/class-wp-html-tag-processor.php | 34 +++++++++---------- 1 file changed, 16 insertions(+), 18 deletions(-) diff --git a/src/wp-includes/html-api/class-wp-html-tag-processor.php b/src/wp-includes/html-api/class-wp-html-tag-processor.php index 7e6254bb5b935..f0bdc25990813 100644 --- a/src/wp-includes/html-api/class-wp-html-tag-processor.php +++ b/src/wp-includes/html-api/class-wp-html-tag-processor.php @@ -1073,7 +1073,6 @@ private function base_class_next_token(): bool { $after_name = $tag_at + $tag_length; $this->token_starts_at = $at; - $this->is_closing_tag = $is_closer; $this->tag_name_starts_at = $tag_at; $this->tag_name_length = $tag_length; @@ -1146,7 +1145,7 @@ private function base_class_next_token(): bool { $this->parser_state = self::STATE_MATCHED_TAG; $this->bytes_already_parsed = $tag_ends_at + 1; - if ( $this->is_closing_tag ) { + if ( '/' === $html[ $this->token_starts_at + 1 ] ) { return true; } @@ -1890,10 +1889,10 @@ private function parse_next_tag(): bool { $this->token_starts_at = $at; if ( $at + 1 < $doc_length && '/' === $html[ $at + 1 ] ) { - $this->is_closing_tag = true; + $is_closer = true; ++$at; } else { - $this->is_closing_tag = false; + $is_closer = false; } /* @@ -1934,7 +1933,7 @@ private function parse_next_tag(): bool { * `is_closing_tag && '!' === $html[ $at + 1 ] ) { + if ( ! $is_closer && '!' === $html[ $at + 1 ] ) { /* * `