From 46c2a66855b433422b561eec48dc9293b5880318 Mon Sep 17 00:00:00 2001
From: Dennis Snell <dennis.snell@automattic.com>
Date: Thu, 4 Dec 2025 17:49:28 -0700
Subject: [PATCH 1/3] Formatting: Introduce normalizing function for escaped
 HTML. (#10600)

Like `wp_kses_normalize_entities()` but built for UTF-8 and HTML5 and
relying on the HTML API for reliability.
---
 src/wp-includes/formatting.php                | 142 ++++++++++++++++++
 .../formatting/normalizeEscapedHtmlText.php   |  58 +++++++
 2 files changed, 200 insertions(+)
 create mode 100644 tests/phpunit/tests/formatting/normalizeEscapedHtmlText.php

diff --git a/src/wp-includes/formatting.php b/src/wp-includes/formatting.php
index 3b546c30eebd0..5a58fefcbadfe 100644
--- a/src/wp-includes/formatting.php
+++ b/src/wp-includes/formatting.php
@@ -992,6 +992,148 @@ function _wp_specialchars( $text, $quote_style = ENT_NOQUOTES, $charset = false,
 	return $text;
 }
 
+/**
+ * Normalize the escaping for content within an HTML string.
+ *
+ * @since {WP_VERSION}
+ *
+ * @param string $context "attribute" for strings comprising a full HTML attribute value,
+ *                        or "data" for text nodes.
+ * @param string $text    string containing HTML-escaped or escapable content, in UTF-8.
+ * @return string         version of input where all appropriate characters and escapes
+ *                        are standard and predictable.
+ */
+function wp_normalize_escaped_html_text( string $context, string $text ): string {
+	$normalized   = array();
+	$end          = strlen( $text );
+	$at           = 0;
+	$was_at       = 0;
+	$token_length = 0;
+
+	while ( $at < $end ) {
+		$next_character_reference_at = strpos( $text, '&', $at );
+		if ( false === $next_character_reference_at ) {
+			break;
+		}
+
+		$character_reference = WP_HTML_Decoder::read_character_reference( $context, $text, $next_character_reference_at, $token_length );
+
+		// This is an un-escaped ampersand character, so encode it.
+		if ( ! isset( $character_reference ) ) {
+			$normalized[] = substr( $text, $was_at, $next_character_reference_at - $was_at ) . '&amp;';
+			$at           = $next_character_reference_at + 1;
+			$was_at       = $at;
+			continue;
+		}
+
+		// Some characters are best left visible to the human mind.
+		$should_unhide = 1 === strspn( $character_reference, ',%()0123456789:[]ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz{}' );
+		if ( $should_unhide ) {
+			$normalized[] = substr( $text, $was_at, $next_character_reference_at - $was_at ) . $character_reference;
+			$at           = $next_character_reference_at + $token_length;
+			$was_at       = $at;
+			continue;
+		}
+
+		$is_syntax = 1 === strspn( $character_reference, '&"\'<>' );
+		if ( $is_syntax && '#' === $text[ $next_character_reference_at + 1 ] ) {
+			$named_form   = strtr(
+				$character_reference,
+				array(
+					'&' => '&amp;',
+					'"' => '&quot;',
+					"'" => '&apos;',
+					'<' => '&lt;',
+					'>' => '&gt;',
+				)
+			);
+			$normalized[] = substr( $text, $was_at, $next_character_reference_at - $was_at ) . $named_form;
+			$at           = $next_character_reference_at + $token_length;
+			$was_at       = $at;
+			continue;
+		}
+
+		// This is a valid character reference, but it might not be normative.
+		$needs_semicolon = ';' !== $text[ $next_character_reference_at + $token_length - 1 ];
+
+		// This is a named character reference.
+		if ( '#' !== $text[ $next_character_reference_at + 1 ] ) {
+			// Nothing to do for already-normalized named character references.
+			if ( ! $needs_semicolon ) {
+				$at = $next_character_reference_at + $token_length;
+				continue;
+			}
+
+			// Add the missing semicolon.
+			$normalized[] = substr( $text, $was_at, $next_character_reference_at - $was_at + $token_length ) . ';';
+			$at           = $next_character_reference_at + $token_length;
+			$was_at       = $at;
+			continue;
+		}
+
+		/*
+		 * While named character references have only a single form and are case sensitive,
+		 * numeric character references may contain upper or lowercase hex values and may
+		 * contain unlimited preceding zeros.
+		 */
+		$is_hex        = 'x' === $text[ $next_character_reference_at + 2 ] || 'X' === $text[ $next_character_reference_at + 2 ];
+		$digits_at     = $next_character_reference_at + ( $is_hex ? 3 : 2 );
+		$leading_zeros = '0' === $text[ $digits_at ] ? strspn( $text, '0', $digits_at ) : 0;
+
+		if ( ! $needs_semicolon && ! $is_hex && 0 === $leading_zeros ) {
+			// Nothing to do for already-normalized decimal numeric character references.
+			$at = $next_character_reference_at + $token_length;
+			continue;
+		}
+
+		$digits = substr( $text, $digits_at + $leading_zeros, $next_character_reference_at + $token_length - $digits_at - $leading_zeros - ( $needs_semicolon ? 0 : 1 ) );
+		if ( $is_hex ) {
+			$lower_digits = strtolower( $digits );
+
+			// Nothing to do for already-normalized hexadecimal numeric character references.
+			if ( $lower_digits === $digits && ! $needs_semicolon && 0 === $leading_zeros ) {
+				$at = $next_character_reference_at + $token_length;
+				continue;
+			}
+
+			$normalized[] = substr( $text, $was_at, $next_character_reference_at - $was_at ) . "&#x{$lower_digits};";
+			$at           = $next_character_reference_at + $token_length;
+			$was_at       = $at;
+			continue;
+		} else {
+			$normalized[] = substr( $text, $was_at, $next_character_reference_at - $was_at ) . "&#{$digits};";
+			$at           = $next_character_reference_at + $token_length;
+			$was_at       = $at;
+			continue;
+		}
+
+		die( 'should not have arrived here' );
+		++$at;
+	}
+
+	if ( 0 === $was_at ) {
+		$normalized_text = strtr( $text, '&', '&amp;' );
+	} else {
+		$normalized[]    = substr( $text, $was_at, $end - $was_at );
+		$normalized_text = implode( '', $normalized );
+	}
+
+	return strtr(
+		$normalized_text,
+		array(
+			'<' => '&lt;',
+			'>' => '&gt;',
+			'"' => '&quot;',
+			"'" => '&apos;',
+			/*
+			 * Stray ampersand "&" characters have already been replaced above,
+			 * so it’s inappropriate to replace again here, as all remaining
+			 * instances should be part of a normalized character reference.
+			 */
+		)
+	);
+}
+
 /**
  * Converts a number of HTML entities into their special characters.
  *
diff --git a/tests/phpunit/tests/formatting/normalizeEscapedHtmlText.php b/tests/phpunit/tests/formatting/normalizeEscapedHtmlText.php
new file mode 100644
index 0000000000000..f7b5da2a4578b
--- /dev/null
+++ b/tests/phpunit/tests/formatting/normalizeEscapedHtmlText.php
@@ -0,0 +1,58 @@
+<?php
+
+/**
+ * @group formatting
+ *
+ * @covers ::wp_normalize_escaped_html_text
+ */
+class Tests_Formatting_NormalizeEscapedHtmlText extends WP_UnitTestCase {
+	/**
+	 * Ensures that HTML text is properly normalized.
+	 *
+	 * @dataProvider data_example_datasets
+	 *
+	 * @param string $context
+	 * @param string $text
+	 * @param string $expected
+	 */
+	public function test_example_datasets( $context, $text, $expected ) {
+		$this->assertEquals(
+			$expected,
+			wp_normalize_escaped_html_text( $context, $text )
+		);
+	}
+
+	public static function data_example_datasets() {
+		return array(
+			array( 'attribute', 'test', 'test' ),
+			array( 'attribute', 'test & done', 'test &amp; done' ),
+			array( 'attribute', '&#XFe; is not iron', '&#xfe; is not iron' ),
+			array( 'attribute', 'spec > guess', 'spec &gt; guess' ),
+			array( 'attribute', 'art & copy', 'art &amp; copy' ),
+			array( 'attribute', '&#x1F170', '&#x1f170;' ),
+			array( 'attribute', '&#x1F170 ', '&#x1f170; ' ),
+
+			array( 'data', 'test', 'test' ),
+			array( 'data', 'test & done', 'test &amp; done' ),
+			array( 'data', '&#XFe; is not iron', '&#xfe; is not iron' ),
+			array( 'data', 'spec > guess', 'spec &gt; guess' ),
+			array( 'data', 'art & copy', 'art &amp; copy' ),
+			array( 'data', '&#x1F170', '&#x1f170;' ),
+			array( 'data', '&#x1F170 ', '&#x1f170; ' ),
+
+			// The “ambiguous ampersand” has different rules in the attribute value and data states.
+			array( 'attribute', '&notmyproblem', '&amp;notmyproblem' ),
+			array( 'data', '&notmyproblem', '&not;myproblem' ),
+
+			// Certain characters should remain plaintext.
+			array( 'attribute', 'eat &#x000033; apples', 'eat 3 apples' ),
+			array( 'data', 'eat &#x000033; apples', 'eat 3 apples' ),
+			array( 'data', '<&#x00073;cr&#0105pt&gt;', '&lt;script&gt;' ),
+			array( 'attribute', '&#x6a;avascript&#58alert&#40;&#x0000007b"test&quot;&#125;&#41;', 'javascript:alert({&quot;test&quot;})' ),
+
+			// Syntax characters should be represented uniformly.
+			array( 'attribute', '&#X3CIMG&#00062', '&lt;IMG&gt;' ),
+			array( 'data', '&#X3CIMG&#00062', '&lt;IMG&gt;' ),
+		);
+	}
+}

From 6c2541f442182762c7d928eb49902c2df431e0e3 Mon Sep 17 00:00:00 2001
From: Dennis Snell <dennis.snell@automattic.com>
Date: Thu, 26 Feb 2026 22:40:03 -0600
Subject: [PATCH 2/3] Feedback on normalization; add a filter for unhiding.

---
 src/wp-includes/formatting.php                | 54 ++++++++++++++++---
 .../formatting/normalizeEscapedHtmlText.php   | 12 ++---
 2 files changed, 53 insertions(+), 13 deletions(-)

diff --git a/src/wp-includes/formatting.php b/src/wp-includes/formatting.php
index 5a58fefcbadfe..349834f48e13c 100644
--- a/src/wp-includes/formatting.php
+++ b/src/wp-includes/formatting.php
@@ -1026,8 +1026,43 @@ function wp_normalize_escaped_html_text( string $context, string $text ): string
 			continue;
 		}
 
-		// Some characters are best left visible to the human mind.
-		$should_unhide = 1 === strspn( $character_reference, ',%()0123456789:[]ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz{}' );
+		// Some characters are best left visible to the human mind (as well as to downstream parsing code).
+		$default_to_unhide = '0123456789:ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz';
+
+		/**
+		 * Selects which US-ASCII characters to enforce rendering as the byte itself
+		 * rather than as any HTML character reference.
+		 *
+		 * Must be single-byte US-ASCII characters only. The default value unhides
+		 * digits, letters, and the colon character. Set to empty string to prevent
+		 * unhiding altogether.
+		 *
+		 * If non-US-ASCII characters are in the results or the result isn’t a string
+		 * then the default set of characters will be unhidden instead.
+		 *
+		 * @since {WP_VERSION}
+		 *
+		 * @param string $unhidden_ascii These characters will be represented in HTML as themselves, not as
+		 *                               any character references. E.g. 'a' as 'a' and not as '&#x61;'.
+		 */
+		$unhidden_ascii = apply_filters( 'always_raw_escaped_html_ascii', $default_to_unhide );
+		if ( $unhidden_ascii !== $default_to_unhide ) {
+			$all_ascii_chars = (
+				"\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f" .
+				"\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f" .
+				" !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\x7f"
+			);
+
+			$is_all_ascii = (
+				is_string( $unhidden_ascii ) &&
+				strlen( $unhidden_ascii ) === strspn( $unhidden_ascii, $all_ascii_chars )
+			);
+
+			if ( ! $is_all_ascii ) {
+				$unhidden_ascii = $default_to_unhide;
+			}
+		}
+		$should_unhide = 1 === strspn( $character_reference, $unhidden_ascii );
 		if ( $should_unhide ) {
 			$normalized[] = substr( $text, $was_at, $next_character_reference_at - $was_at ) . $character_reference;
 			$at           = $next_character_reference_at + $token_length;
@@ -1076,7 +1111,9 @@ function wp_normalize_escaped_html_text( string $context, string $text ): string
 		 * numeric character references may contain upper or lowercase hex values and may
 		 * contain unlimited preceding zeros.
 		 */
-		$is_hex        = 'x' === $text[ $next_character_reference_at + 2 ] || 'X' === $text[ $next_character_reference_at + 2 ];
+		$is_small_hex  = 'x' === $text[ $next_character_reference_at + 2 ];
+		$is_big_hex    = 'X' === $text[ $next_character_reference_at + 2 ];
+		$is_hex        = $is_small_hex || $is_big_hex;
 		$digits_at     = $next_character_reference_at + ( $is_hex ? 3 : 2 );
 		$leading_zeros = '0' === $text[ $digits_at ] ? strspn( $text, '0', $digits_at ) : 0;
 
@@ -1086,21 +1123,24 @@ function wp_normalize_escaped_html_text( string $context, string $text ): string
 			continue;
 		}
 
-		$digits = substr( $text, $digits_at + $leading_zeros, $next_character_reference_at + $token_length - $digits_at - $leading_zeros - ( $needs_semicolon ? 0 : 1 ) );
+		$nonzero_at   = $digits_at + $leading_zeros;
+		$digit_length = $next_character_reference_at + $token_length - $digits_at - $leading_zeros - ( $needs_semicolon ? 0 : 1 );
 		if ( $is_hex ) {
-			$lower_digits = strtolower( $digits );
+			$all_uppercase = strspn( $text, '0123456789ABCDEF', $nonzero_at, $digit_length ) === $digit_length;
 
 			// Nothing to do for already-normalized hexadecimal numeric character references.
-			if ( $lower_digits === $digits && ! $needs_semicolon && 0 === $leading_zeros ) {
+			if ( $is_small_hex && $all_uppercase && ! $needs_semicolon && 0 === $leading_zeros ) {
 				$at = $next_character_reference_at + $token_length;
 				continue;
 			}
 
-			$normalized[] = substr( $text, $was_at, $next_character_reference_at - $was_at ) . "&#x{$lower_digits};";
+			$digits       = strtoupper( substr( $text, $nonzero_at, $digit_length ) );
+			$normalized[] = substr( $text, $was_at, $next_character_reference_at - $was_at ) . "&#x{$digits};";
 			$at           = $next_character_reference_at + $token_length;
 			$was_at       = $at;
 			continue;
 		} else {
+			$digits       = substr( $text, $nonzero_at, $digit_length );
 			$normalized[] = substr( $text, $was_at, $next_character_reference_at - $was_at ) . "&#{$digits};";
 			$at           = $next_character_reference_at + $token_length;
 			$was_at       = $at;
diff --git a/tests/phpunit/tests/formatting/normalizeEscapedHtmlText.php b/tests/phpunit/tests/formatting/normalizeEscapedHtmlText.php
index f7b5da2a4578b..8a73f3f9e723a 100644
--- a/tests/phpunit/tests/formatting/normalizeEscapedHtmlText.php
+++ b/tests/phpunit/tests/formatting/normalizeEscapedHtmlText.php
@@ -26,18 +26,18 @@ public static function data_example_datasets() {
 		return array(
 			array( 'attribute', 'test', 'test' ),
 			array( 'attribute', 'test & done', 'test &amp; done' ),
-			array( 'attribute', '&#XFe; is not iron', '&#xfe; is not iron' ),
+			array( 'attribute', '&#XFe; is not iron', '&#xFE; is not iron' ),
 			array( 'attribute', 'spec > guess', 'spec &gt; guess' ),
 			array( 'attribute', 'art & copy', 'art &amp; copy' ),
-			array( 'attribute', '&#x1F170', '&#x1f170;' ),
-			array( 'attribute', '&#x1F170 ', '&#x1f170; ' ),
+			array( 'attribute', '&#x1F170', '&#x1F170;' ),
+			array( 'attribute', '&#x1F170 ', '&#x1F170; ' ),
 
 			array( 'data', 'test', 'test' ),
 			array( 'data', 'test & done', 'test &amp; done' ),
-			array( 'data', '&#XFe; is not iron', '&#xfe; is not iron' ),
+			array( 'data', '&#XFe; is not iron', '&#xFE; is not iron' ),
 			array( 'data', 'spec > guess', 'spec &gt; guess' ),
 			array( 'data', 'art & copy', 'art &amp; copy' ),
-			array( 'data', '&#x1F170', '&#x1f170;' ),
+			array( 'data', '&#x1F170', '&#x1F170;' ),
 			array( 'data', '&#x1F170 ', '&#x1f170; ' ),
 
 			// The “ambiguous ampersand” has different rules in the attribute value and data states.
@@ -48,7 +48,7 @@ public static function data_example_datasets() {
 			array( 'attribute', 'eat &#x000033; apples', 'eat 3 apples' ),
 			array( 'data', 'eat &#x000033; apples', 'eat 3 apples' ),
 			array( 'data', '<&#x00073;cr&#0105pt&gt;', '&lt;script&gt;' ),
-			array( 'attribute', '&#x6a;avascript&#58alert&#40;&#x0000007b"test&quot;&#125;&#41;', 'javascript:alert({&quot;test&quot;})' ),
+			array( 'attribute', '&#x6a;avascript&#58alert&#40;&#x0000007b"test&quot;&#125;&#41;', 'javascript:alert&#40;&#x7b;&quot;test&quot;&#125;&#41;' ),
 
 			// Syntax characters should be represented uniformly.
 			array( 'attribute', '&#X3CIMG&#00062', '&lt;IMG&gt;' ),

From 3a731e2f16572a594d4b1674066db9fa1aea521c Mon Sep 17 00:00:00 2001
From: Dennis Snell <dennis.snell@automattic.com>
Date: Fri, 27 Feb 2026 10:47:06 -0600
Subject: [PATCH 3/3] Update those tests that I missed

---
 tests/phpunit/tests/formatting/normalizeEscapedHtmlText.php | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/phpunit/tests/formatting/normalizeEscapedHtmlText.php b/tests/phpunit/tests/formatting/normalizeEscapedHtmlText.php
index 8a73f3f9e723a..7fa32d69300e2 100644
--- a/tests/phpunit/tests/formatting/normalizeEscapedHtmlText.php
+++ b/tests/phpunit/tests/formatting/normalizeEscapedHtmlText.php
@@ -38,7 +38,7 @@ public static function data_example_datasets() {
 			array( 'data', 'spec > guess', 'spec &gt; guess' ),
 			array( 'data', 'art & copy', 'art &amp; copy' ),
 			array( 'data', '&#x1F170', '&#x1F170;' ),
-			array( 'data', '&#x1F170 ', '&#x1f170; ' ),
+			array( 'data', '&#x1F170 ', '&#x1F170; ' ),
 
 			// The “ambiguous ampersand” has different rules in the attribute value and data states.
 			array( 'attribute', '&notmyproblem', '&amp;notmyproblem' ),
@@ -48,7 +48,7 @@ public static function data_example_datasets() {
 			array( 'attribute', 'eat &#x000033; apples', 'eat 3 apples' ),
 			array( 'data', 'eat &#x000033; apples', 'eat 3 apples' ),
 			array( 'data', '<&#x00073;cr&#0105pt&gt;', '&lt;script&gt;' ),
-			array( 'attribute', '&#x6a;avascript&#58alert&#40;&#x0000007b"test&quot;&#125;&#41;', 'javascript:alert&#40;&#x7b;&quot;test&quot;&#125;&#41;' ),
+			array( 'attribute', '&#x6a;avascript&#58alert&#40;&#x0000007b"test&quot;&#125;&#41;', 'javascript:alert&#40;&#x7B;&quot;test&quot;&#125;&#41;' ),
 
 			// Syntax characters should be represented uniformly.
 			array( 'attribute', '&#X3CIMG&#00062', '&lt;IMG&gt;' ),