From 46c2a66855b433422b561eec48dc9293b5880318 Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Thu, 4 Dec 2025 17:49:28 -0700 Subject: [PATCH 1/3] Formatting: Introduce normalizing function for escaped HTML. (#10600) Like `wp_kses_normalize_entities()` but built for UTF-8 and HTML5 and relying on the HTML API for reliability. --- src/wp-includes/formatting.php | 142 ++++++++++++++++++ .../formatting/normalizeEscapedHtmlText.php | 58 +++++++ 2 files changed, 200 insertions(+) create mode 100644 tests/phpunit/tests/formatting/normalizeEscapedHtmlText.php diff --git a/src/wp-includes/formatting.php b/src/wp-includes/formatting.php index 3b546c30eebd0..5a58fefcbadfe 100644 --- a/src/wp-includes/formatting.php +++ b/src/wp-includes/formatting.php @@ -992,6 +992,148 @@ function _wp_specialchars( $text, $quote_style = ENT_NOQUOTES, $charset = false, return $text; } +/** + * Normalize the escaping for content within an HTML string. + * + * @since {WP_VERSION} + * + * @param string $context "attribute" for strings comprising a full HTML attribute value, + * or "data" for text nodes. + * @param string $text string containing HTML-escaped or escapable content, in UTF-8. + * @return string version of input where all appropriate characters and escapes + * are standard and predictable. + */ +function wp_normalize_escaped_html_text( string $context, string $text ): string { + $normalized = array(); + $end = strlen( $text ); + $at = 0; + $was_at = 0; + $token_length = 0; + + while ( $at < $end ) { + $next_character_reference_at = strpos( $text, '&', $at ); + if ( false === $next_character_reference_at ) { + break; + } + + $character_reference = WP_HTML_Decoder::read_character_reference( $context, $text, $next_character_reference_at, $token_length ); + + // This is an un-escaped ampersand character, so encode it. + if ( ! isset( $character_reference ) ) { + $normalized[] = substr( $text, $was_at, $next_character_reference_at - $was_at ) . 
'&'; + $at = $next_character_reference_at + 1; + $was_at = $at; + continue; + } + + // Some characters are best left visible to the human mind. + $should_unhide = 1 === strspn( $character_reference, ',%()0123456789:[]ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz{}' ); + if ( $should_unhide ) { + $normalized[] = substr( $text, $was_at, $next_character_reference_at - $was_at ) . $character_reference; + $at = $next_character_reference_at + $token_length; + $was_at = $at; + continue; + } + + $is_syntax = 1 === strspn( $character_reference, '&"\'<>' ); + if ( $is_syntax && '#' === $text[ $next_character_reference_at + 1 ] ) { + $named_form = strtr( + $character_reference, + array( + '&' => '&', + '"' => '"', + "'" => ''', + '<' => '<', + '>' => '>', + ) + ); + $normalized[] = substr( $text, $was_at, $next_character_reference_at - $was_at ) . $named_form; + $at = $next_character_reference_at + $token_length; + $was_at = $at; + continue; + } + + // This is a valid character reference, but it might not be normative. + $needs_semicolon = ';' !== $text[ $next_character_reference_at + $token_length - 1 ]; + + // This is a named character reference. + if ( '#' !== $text[ $next_character_reference_at + 1 ] ) { + // Nothing to do for already-normalized named character references. + if ( ! $needs_semicolon ) { + $at = $next_character_reference_at + $token_length; + continue; + } + + // Add the missing semicolon. + $normalized[] = substr( $text, $was_at, $next_character_reference_at - $was_at + $token_length ) . ';'; + $at = $next_character_reference_at + $token_length; + $was_at = $at; + continue; + } + + /* + * While named character references have only a single form and are case sensitive, + * numeric character references may contain upper or lowercase hex values and may + * contain unlimited preceding zeros. 
+ */ + $is_hex = 'x' === $text[ $next_character_reference_at + 2 ] || 'X' === $text[ $next_character_reference_at + 2 ]; + $digits_at = $next_character_reference_at + ( $is_hex ? 3 : 2 ); + $leading_zeros = '0' === $text[ $digits_at ] ? strspn( $text, '0', $digits_at ) : 0; + + if ( ! $needs_semicolon && ! $is_hex && 0 === $leading_zeros ) { + // Nothing to do for already-normalized decimal numeric character references. + $at = $next_character_reference_at + $token_length; + continue; + } + + $digits = substr( $text, $digits_at + $leading_zeros, $next_character_reference_at + $token_length - $digits_at - $leading_zeros - ( $needs_semicolon ? 0 : 1 ) ); + if ( $is_hex ) { + $lower_digits = strtolower( $digits ); + + // Nothing to do for already-normalized hexadecimal numeric character references. + if ( $lower_digits === $digits && ! $needs_semicolon && 0 === $leading_zeros ) { + $at = $next_character_reference_at + $token_length; + continue; + } + + $normalized[] = substr( $text, $was_at, $next_character_reference_at - $was_at ) . "&#x{$lower_digits};"; + $at = $next_character_reference_at + $token_length; + $was_at = $at; + continue; + } else { + $normalized[] = substr( $text, $was_at, $next_character_reference_at - $was_at ) . "&#{$digits};"; + $at = $next_character_reference_at + $token_length; + $was_at = $at; + continue; + } + + die( 'should not have arrived here' ); + ++$at; + } + + if ( 0 === $was_at ) { + $normalized_text = strtr( $text, '&', '&' ); + } else { + $normalized[] = substr( $text, $was_at, $end - $was_at ); + $normalized_text = implode( '', $normalized ); + } + + return strtr( + $normalized_text, + array( + '<' => '<', + '>' => '>', + '"' => '"', + "'" => ''', + /* + * Stray ampersand "&" characters have already been replaced above, + * so it’s inappropriate to replace again here, as all remaining + * instances should be part of a normalized character reference. 
+ */ + ) + ); +} + /** * Converts a number of HTML entities into their special characters. * diff --git a/tests/phpunit/tests/formatting/normalizeEscapedHtmlText.php b/tests/phpunit/tests/formatting/normalizeEscapedHtmlText.php new file mode 100644 index 0000000000000..f7b5da2a4578b --- /dev/null +++ b/tests/phpunit/tests/formatting/normalizeEscapedHtmlText.php @@ -0,0 +1,58 @@ +assertEquals( + $expected, + wp_normalize_escaped_html_text( $context, $text ) + ); + } + + public static function data_example_datasets() { + return array( + array( 'attribute', 'test', 'test' ), + array( 'attribute', 'test & done', 'test & done' ), + array( 'attribute', 'þ is not iron', 'þ is not iron' ), + array( 'attribute', 'spec > guess', 'spec > guess' ), + array( 'attribute', 'art & copy', 'art & copy' ), + array( 'attribute', '🅰', '🅰' ), + array( 'attribute', '🅰 ', '🅰 ' ), + + array( 'data', 'test', 'test' ), + array( 'data', 'test & done', 'test & done' ), + array( 'data', 'þ is not iron', 'þ is not iron' ), + array( 'data', 'spec > guess', 'spec > guess' ), + array( 'data', 'art & copy', 'art & copy' ), + array( 'data', '🅰', '🅰' ), + array( 'data', '🅰 ', '🅰 ' ), + + // The “ambiguous ampersand” has different rules in the attribute value and data states. + array( 'attribute', '¬myproblem', '&notmyproblem' ), + array( 'data', '¬myproblem', '¬myproblem' ), + + // Certain characters should remain plaintext. + array( 'attribute', 'eat 3 apples', 'eat 3 apples' ), + array( 'data', 'eat 3 apples', 'eat 3 apples' ), + array( 'data', '<script>', '<script>' ), + array( 'attribute', 'javascript:alert({"test"})', 'javascript:alert({"test"})' ), + + // Syntax characters should be represented uniformly. + array( 'attribute', '<IMG>', '<IMG>' ), + array( 'data', '<IMG>', '<IMG>' ), + ); + } +} From 6c2541f442182762c7d928eb49902c2df431e0e3 Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Thu, 26 Feb 2026 22:40:03 -0600 Subject: [PATCH 2/3] Feedback on normalization; add a filter for unhiding. 
--- src/wp-includes/formatting.php | 54 ++++++++++++++++--- .../formatting/normalizeEscapedHtmlText.php | 12 ++--- 2 files changed, 53 insertions(+), 13 deletions(-) diff --git a/src/wp-includes/formatting.php b/src/wp-includes/formatting.php index 5a58fefcbadfe..349834f48e13c 100644 --- a/src/wp-includes/formatting.php +++ b/src/wp-includes/formatting.php @@ -1026,8 +1026,43 @@ function wp_normalize_escaped_html_text( string $context, string $text ): string continue; } - // Some characters are best left visible to the human mind. - $should_unhide = 1 === strspn( $character_reference, ',%()0123456789:[]ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz{}' ); + // Some characters are best left visible to the human mind (as well as to downstream parsing code). + $default_to_unhide = '0123456789:ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz'; + + /** + * Selects which US-ASCII characters to enforce rendering as the byte itself + * rather than as any HTML character reference. + * + * Must be single-byte US-ASCII characters only. The default value unhides + * digits, letters, and the colon character. Set to empty string to prevent + * unhiding altogether. + * + * If non-US-ASCII characters are in the results or the result isn’t a string + * then the default set of characters will be unhidden instead. + * + * @since {WP_VERSION} + * + * @param string $unhidden_ascii These characters will be represented in HTML as themselves, not as + * any character references. E.g. 'a' as 'a' and not as '&#x61;'. + */ + $unhidden_ascii = apply_filters( 'always_raw_escaped_html_ascii', $default_to_unhide ); + if ( $unhidden_ascii !== $default_to_unhide ) { + $all_ascii_chars = ( + "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f" . + "\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f" . 
+ " !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\x7f" + ); + + $is_all_ascii = ( + is_string( $unhidden_ascii ) && + strlen( $unhidden_ascii ) === strspn( $unhidden_ascii, $all_ascii_chars ) + ); + + if ( ! $is_all_ascii ) { + $unhidden_ascii = $default_to_unhide; + } + } + $should_unhide = 1 === strspn( $character_reference, $unhidden_ascii ); if ( $should_unhide ) { $normalized[] = substr( $text, $was_at, $next_character_reference_at - $was_at ) . $character_reference; $at = $next_character_reference_at + $token_length; @@ -1076,7 +1111,9 @@ function wp_normalize_escaped_html_text( string $context, string $text ): string * numeric character references may contain upper or lowercase hex values and may * contain unlimited preceding zeros. */ - $is_hex = 'x' === $text[ $next_character_reference_at + 2 ] || 'X' === $text[ $next_character_reference_at + 2 ]; + $is_small_hex = 'x' === $text[ $next_character_reference_at + 2 ]; + $is_big_hex = 'X' === $text[ $next_character_reference_at + 2 ]; + $is_hex = $is_small_hex || $is_big_hex; $digits_at = $next_character_reference_at + ( $is_hex ? 3 : 2 ); $leading_zeros = '0' === $text[ $digits_at ] ? strspn( $text, '0', $digits_at ) : 0; @@ -1086,21 +1123,24 @@ function wp_normalize_escaped_html_text( string $context, string $text ): string continue; } - $digits = substr( $text, $digits_at + $leading_zeros, $next_character_reference_at + $token_length - $digits_at - $leading_zeros - ( $needs_semicolon ? 0 : 1 ) ); + $nonzero_at = $digits_at + $leading_zeros; + $digit_length = $next_character_reference_at + $token_length - $digits_at - $leading_zeros - ( $needs_semicolon ? 0 : 1 ); if ( $is_hex ) { - $lower_digits = strtolower( $digits ); + $all_uppercase = strspn( $text, '0123456789ABCDEF', $nonzero_at, $digit_length ) === $digit_length; // Nothing to do for already-normalized hexadecimal numeric character references. - if ( $lower_digits === $digits && ! 
$needs_semicolon && 0 === $leading_zeros ) { + if ( $is_small_hex && $all_uppercase && ! $needs_semicolon && 0 === $leading_zeros ) { $at = $next_character_reference_at + $token_length; continue; } - $normalized[] = substr( $text, $was_at, $next_character_reference_at - $was_at ) . "&#x{$lower_digits};"; + $digits = strtoupper( substr( $text, $nonzero_at, $digit_length ) ); + $normalized[] = substr( $text, $was_at, $next_character_reference_at - $was_at ) . "&#x{$digits};"; $at = $next_character_reference_at + $token_length; $was_at = $at; continue; } else { + $digits = substr( $text, $nonzero_at, $digit_length ); $normalized[] = substr( $text, $was_at, $next_character_reference_at - $was_at ) . "&#{$digits};"; $at = $next_character_reference_at + $token_length; $was_at = $at; diff --git a/tests/phpunit/tests/formatting/normalizeEscapedHtmlText.php b/tests/phpunit/tests/formatting/normalizeEscapedHtmlText.php index f7b5da2a4578b..8a73f3f9e723a 100644 --- a/tests/phpunit/tests/formatting/normalizeEscapedHtmlText.php +++ b/tests/phpunit/tests/formatting/normalizeEscapedHtmlText.php @@ -26,18 +26,18 @@ public static function data_example_datasets() { return array( array( 'attribute', 'test', 'test' ), array( 'attribute', 'test & done', 'test & done' ), - array( 'attribute', 'þ is not iron', 'þ is not iron' ), + array( 'attribute', 'þ is not iron', 'þ is not iron' ), array( 'attribute', 'spec > guess', 'spec > guess' ), array( 'attribute', 'art & copy', 'art & copy' ), - array( 'attribute', '🅰', '🅰' ), - array( 'attribute', '🅰 ', '🅰 ' ), + array( 'attribute', '🅰', '🅰' ), + array( 'attribute', '🅰 ', '🅰 ' ), array( 'data', 'test', 'test' ), array( 'data', 'test & done', 'test & done' ), - array( 'data', 'þ is not iron', 'þ is not iron' ), + array( 'data', 'þ is not iron', 'þ is not iron' ), array( 'data', 'spec > guess', 'spec > guess' ), array( 'data', 'art & copy', 'art & copy' ), - array( 'data', '🅰', '🅰' ), + array( 'data', '🅰', '🅰' ), array( 'data', '🅰 ', '🅰 ' ), // 
The “ambiguous ampersand” has different rules in the attribute value and data states. @@ -48,7 +48,7 @@ public static function data_example_datasets() { array( 'attribute', 'eat 3 apples', 'eat 3 apples' ), array( 'data', 'eat 3 apples', 'eat 3 apples' ), array( 'data', '<script>', '<script>' ), - array( 'attribute', 'javascript:alert({"test"})', 'javascript:alert({"test"})' ), + array( 'attribute', 'javascript:alert({"test"})', 'javascript:alert({"test"})' ), // Syntax characters should be represented uniformly. array( 'attribute', '<IMG>', '<IMG>' ), From 3a731e2f16572a594d4b1674066db9fa1aea521c Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Fri, 27 Feb 2026 10:47:06 -0600 Subject: [PATCH 3/3] Update those tests that I missed --- tests/phpunit/tests/formatting/normalizeEscapedHtmlText.php | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/phpunit/tests/formatting/normalizeEscapedHtmlText.php b/tests/phpunit/tests/formatting/normalizeEscapedHtmlText.php index 8a73f3f9e723a..7fa32d69300e2 100644 --- a/tests/phpunit/tests/formatting/normalizeEscapedHtmlText.php +++ b/tests/phpunit/tests/formatting/normalizeEscapedHtmlText.php @@ -38,7 +38,7 @@ public static function data_example_datasets() { array( 'data', 'spec > guess', 'spec > guess' ), array( 'data', 'art & copy', 'art & copy' ), array( 'data', '🅰', '🅰' ), - array( 'data', '🅰 ', '🅰 ' ), + array( 'data', '🅰 ', '🅰 ' ), // The “ambiguous ampersand” has different rules in the attribute value and data states. array( 'attribute', '¬myproblem', '&notmyproblem' ), @@ -48,7 +48,7 @@ public static function data_example_datasets() { array( 'attribute', 'eat 3 apples', 'eat 3 apples' ), array( 'data', 'eat 3 apples', 'eat 3 apples' ), array( 'data', '<script>', '<script>' ), - array( 'attribute', 'javascript:alert({"test"})', 'javascript:alert({"test"})' ), + array( 'attribute', 'javascript:alert({"test"})', 'javascript:alert({"test"})' ), // Syntax characters should be represented uniformly. 
array( 'attribute', '<IMG>', '<IMG>' ),