@@ -1026,8 +1026,43 @@ function wp_normalize_escaped_html_text( string $context, string $text ): string
10261026 continue ;
10271027 }
10281028
1029- // Some characters are best left visible to the human mind.
1030- $ should_unhide = 1 === strspn ( $ character_reference , ',%()0123456789:[]ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz{} ' );
1029+ // Some characters are best left visible to the human mind (as well as to downstream parsing code).
1030+ $ default_to_unhide = '0123456789:ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz ' ;
1031+
1032+ /**
1033+ * Selects which US-ASCII characters to enforce rendering as the byte itself
1034+ * rather than as any HTML character reference.
1035+ *
1036+ * Must be single-byte US-ASCII characters only. The default value unhides
1037+ * digits, letters, and the colon character. Set to empty string to prevent
1038+ * unhiding altogether.
1039+ *
1040+ * If non-US-ASCII characters are in the results or the result isn’t a string
1041+ * then the default set of characters will be unhidden instead.
1042+ *
1043+ * @since {WP_VERSION}
1044+ *
1045+ * @param string $unhidden_ascii These characters will be represented in HTML as themselves, not as
1046+ * any character references. E.g. 'a' as 'a' and not as '&#x61;'.
1047+ */
1048+ $ unhidden_ascii = apply_filters ( 'always_raw_escaped_html_ascii ' , $ default_to_unhide );
1049+ if ( $ unhidden_ascii !== $ default_to_unhide ) {
1050+ $ all_ascii_chars = (
1051+ "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f" .
1052+ "\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f" .
1053+ " ! \"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[ \\]^_`abcdefghijklmnopqrstuvwxyz{|}~ \x7f"
1054+ );
1055+
1056+ $ is_all_ascii = (
1057+ is_string ( $ unhidden_ascii ) &&
1058+ strlen ( $ unhidden_ascii ) === strspn ( $ unhidden_ascii , $ all_ascii_chars )
1059+ );
1060+
1061+ if ( ! $ is_all_ascii ) {
1062+ $ unhidden_ascii = $ default_to_unhide ;
1063+ }
1064+ }
1065+ $ should_unhide = 1 === strspn ( $ character_reference , $ unhidden_ascii );
10311066 if ( $ should_unhide ) {
10321067 $ normalized [] = substr ( $ text , $ was_at , $ next_character_reference_at - $ was_at ) . $ character_reference ;
10331068 $ at = $ next_character_reference_at + $ token_length ;
@@ -1076,7 +1111,9 @@ function wp_normalize_escaped_html_text( string $context, string $text ): string
10761111 * numeric character references may contain upper or lowercase hex values and may
10771112 * contain unlimited preceding zeros.
10781113 */
1079- $ is_hex = 'x ' === $ text [ $ next_character_reference_at + 2 ] || 'X ' === $ text [ $ next_character_reference_at + 2 ];
1114+ $ is_small_hex = 'x ' === $ text [ $ next_character_reference_at + 2 ];
1115+ $ is_big_hex = 'X ' === $ text [ $ next_character_reference_at + 2 ];
1116+ $ is_hex = $ is_small_hex || $ is_big_hex ;
10801117 $ digits_at = $ next_character_reference_at + ( $ is_hex ? 3 : 2 );
10811118 $ leading_zeros = '0 ' === $ text [ $ digits_at ] ? strspn ( $ text , '0 ' , $ digits_at ) : 0 ;
10821119
@@ -1086,21 +1123,24 @@ function wp_normalize_escaped_html_text( string $context, string $text ): string
10861123 continue ;
10871124 }
10881125
1089- $ digits = substr ( $ text , $ digits_at + $ leading_zeros , $ next_character_reference_at + $ token_length - $ digits_at - $ leading_zeros - ( $ needs_semicolon ? 0 : 1 ) );
1126+ $ nonzero_at = $ digits_at + $ leading_zeros ;
1127+ $ digit_length = $ next_character_reference_at + $ token_length - $ digits_at - $ leading_zeros - ( $ needs_semicolon ? 0 : 1 );
10901128 if ( $ is_hex ) {
1091- $ lower_digits = strtolower ( $ digits );
1129+ $ all_uppercase = $ digit_length !== strspn ( $ text , ' 0123456789ABCDEF ' , $ nonzero_at , $ digit_length );
10921130
10931131 // Nothing to do for already-normalized hexadecimal numeric character references.
1094- if ( $ lower_digits === $ digits && ! $ needs_semicolon && 0 === $ leading_zeros ) {
1132+ if ( $ is_small_hex && $ all_uppercase && ! $ needs_semicolon && 0 === $ leading_zeros ) {
10951133 $ at = $ next_character_reference_at + $ token_length ;
10961134 continue ;
10971135 }
10981136
1099- $ normalized [] = substr ( $ text , $ was_at , $ next_character_reference_at - $ was_at ) . "&#x {$ lower_digits }; " ;
1137+ $ digits = strtoupper ( substr ( $ text , $ nonzero_at , $ digit_length ) );
1138+ $ normalized [] = substr ( $ text , $ was_at , $ next_character_reference_at - $ was_at ) . "&#x {$ digits }; " ;
11001139 $ at = $ next_character_reference_at + $ token_length ;
11011140 $ was_at = $ at ;
11021141 continue ;
11031142 } else {
1143+ $ digits = substr ( $ text , $ nonzero_at , $ digit_length );
11041144 $ normalized [] = substr ( $ text , $ was_at , $ next_character_reference_at - $ was_at ) . "&# {$ digits }; " ;
11051145 $ at = $ next_character_reference_at + $ token_length ;
11061146 $ was_at = $ at ;
0 commit comments