Skip to content

Commit cd64252

Browse files
committed
Feedback on normalization; add a filter for unhiding.
1 parent 46c2a66 commit cd64252

2 files changed

Lines changed: 53 additions & 13 deletions

File tree

src/wp-includes/formatting.php

Lines changed: 47 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1026,8 +1026,43 @@ function wp_normalize_escaped_html_text( string $context, string $text ): string
10261026
continue;
10271027
}
10281028

1029-
// Some characters are best left visible to the human mind.
1030-
$should_unhide = 1 === strspn( $character_reference, ',%()0123456789:[]ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz{}' );
1029+
// Some characters are best left visible to the human mind (as well as to downstream parsing code).
1030+
$default_to_unhide = '0123456789:ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz';
1031+
1032+
/**
1033+
* Selects which US-ASCII characters to enforce rendering as the byte itself
1034+
* rather than as any HTML character reference.
1035+
*
1036+
* Must be single-byte US-ASCII characters only. The default value unhides
1037+
* digits, letters, and the colon character. Set to empty string to prevent
1038+
* unhiding altogether.
1039+
*
1040+
* If non-US-ASCII characters are in the result or the result isn’t a string
1041+
* then the default set of characters will be unhidden instead.
1042+
*
1043+
* @since {WP_VERSION}
1044+
*
1045+
* @param string $unhidden_ascii These characters will be represented in HTML as themselves, not as
1046+
* any character references. E.g. 'a' as 'a' and not as '&#x61;'.
1047+
*/
1048+
$unhidden_ascii = apply_filters( 'always_raw_escaped_html_ascii', $default_to_unhide );
1049+
if ( $unhidden_ascii !== $default_to_unhide ) {
1050+
$all_ascii_chars = (
1051+
"\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f" .
1052+
"\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f" .
1053+
" !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\x7f"
1054+
);
1055+
1056+
$is_all_ascii = (
1057+
is_string( $unhidden_ascii ) &&
1058+
strlen( $unhidden_ascii ) === strspn( $unhidden_ascii, $all_ascii_chars )
1059+
);
1060+
1061+
if ( ! $is_all_ascii ) {
1062+
$unhidden_ascii = $default_to_unhide;
1063+
}
1064+
}
1065+
$should_unhide = 1 === strspn( $character_reference, $unhidden_ascii );
10311066
if ( $should_unhide ) {
10321067
$normalized[] = substr( $text, $was_at, $next_character_reference_at - $was_at ) . $character_reference;
10331068
$at = $next_character_reference_at + $token_length;
@@ -1076,7 +1111,9 @@ function wp_normalize_escaped_html_text( string $context, string $text ): string
10761111
* numeric character references may contain upper or lowercase hex values and may
10771112
* contain unlimited preceding zeros.
10781113
*/
1079-
$is_hex = 'x' === $text[ $next_character_reference_at + 2 ] || 'X' === $text[ $next_character_reference_at + 2 ];
1114+
$is_small_hex = 'x' === $text[ $next_character_reference_at + 2 ];
1115+
$is_big_hex = 'X' === $text[ $next_character_reference_at + 2 ];
1116+
$is_hex = $is_small_hex || $is_big_hex;
10801117
$digits_at = $next_character_reference_at + ( $is_hex ? 3 : 2 );
10811118
$leading_zeros = '0' === $text[ $digits_at ] ? strspn( $text, '0', $digits_at ) : 0;
10821119

@@ -1086,21 +1123,24 @@ function wp_normalize_escaped_html_text( string $context, string $text ): string
10861123
continue;
10871124
}
10881125

1089-
$digits = substr( $text, $digits_at + $leading_zeros, $next_character_reference_at + $token_length - $digits_at - $leading_zeros - ( $needs_semicolon ? 0 : 1 ) );
1126+
$nonzero_at = $digits_at + $leading_zeros;
1127+
$digit_length = $next_character_reference_at + $token_length - $digits_at - $leading_zeros - ( $needs_semicolon ? 0 : 1 );
10901128
if ( $is_hex ) {
1091-
$lower_digits = strtolower( $digits );
1129+
$all_uppercase = $digit_length !== strspn( $text, '0123456789ABCDEF', $nonzero_at, $digit_length );
10921130

10931131
// Nothing to do for already-normalized hexadecimal numeric character references.
1094-
if ( $lower_digits === $digits && ! $needs_semicolon && 0 === $leading_zeros ) {
1132+
if ( $is_small_hex && $all_uppercase && ! $needs_semicolon && 0 === $leading_zeros ) {
10951133
$at = $next_character_reference_at + $token_length;
10961134
continue;
10971135
}
10981136

1099-
$normalized[] = substr( $text, $was_at, $next_character_reference_at - $was_at ) . "&#x{$lower_digits};";
1137+
$digits = strtoupper( substr( $text, $nonzero_at, $digit_length ) );
1138+
$normalized[] = substr( $text, $was_at, $next_character_reference_at - $was_at ) . "&#x{$digits};";
11001139
$at = $next_character_reference_at + $token_length;
11011140
$was_at = $at;
11021141
continue;
11031142
} else {
1143+
$digits = substr( $text, $nonzero_at, $digit_length );
11041144
$normalized[] = substr( $text, $was_at, $next_character_reference_at - $was_at ) . "&#{$digits};";
11051145
$at = $next_character_reference_at + $token_length;
11061146
$was_at = $at;

tests/phpunit/tests/formatting/normalizeEscapedHtmlText.php

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -26,18 +26,18 @@ public static function data_example_datasets() {
2626
return array(
2727
array( 'attribute', 'test', 'test' ),
2828
array( 'attribute', 'test & done', 'test &amp; done' ),
29-
array( 'attribute', '&#XFe; is not iron', '&#xfe; is not iron' ),
29+
array( 'attribute', '&#XFe; is not iron', '&#xFE; is not iron' ),
3030
array( 'attribute', 'spec > guess', 'spec &gt; guess' ),
3131
array( 'attribute', 'art & copy', 'art &amp; copy' ),
32-
array( 'attribute', '&#x1F170', '&#x1f170;' ),
33-
array( 'attribute', '&#x1F170 ', '&#x1f170; ' ),
32+
array( 'attribute', '&#x1F170', '&#x1F170;' ),
33+
array( 'attribute', '&#x1F170 ', '&#x1F170; ' ),
3434

3535
array( 'data', 'test', 'test' ),
3636
array( 'data', 'test & done', 'test &amp; done' ),
37-
array( 'data', '&#XFe; is not iron', '&#xfe; is not iron' ),
37+
array( 'data', '&#XFe; is not iron', '&#xFE; is not iron' ),
3838
array( 'data', 'spec > guess', 'spec &gt; guess' ),
3939
array( 'data', 'art & copy', 'art &amp; copy' ),
40-
array( 'data', '&#x1F170', '&#x1f170;' ),
40+
array( 'data', '&#x1F170', '&#x1F170;' ),
4141
array( 'data', '&#x1F170 ', '&#x1f170; ' ),
4242

4343
// The “ambiguous ampersand” has different rules in the attribute value and data states.
@@ -48,7 +48,7 @@ public static function data_example_datasets() {
4848
array( 'attribute', 'eat &#x000033; apples', 'eat 3 apples' ),
4949
array( 'data', 'eat &#x000033; apples', 'eat 3 apples' ),
5050
array( 'data', '<&#x00073;cr&#0105pt&gt;', '&lt;script&gt;' ),
51-
array( 'attribute', '&#x6a;avascript&#58alert&#40;&#x0000007b"test&quot;&#125;&#41;', 'javascript:alert({&quot;test&quot;})' ),
51+
array( 'attribute', '&#x6a;avascript&#58alert&#40;&#x0000007b"test&quot;&#125;&#41;', 'javascript:alert&#40;&#x7b;&quot;test&quot;&#125;&#41;' ),
5252

5353
// Syntax characters should be represented uniformly.
5454
array( 'attribute', '&#X3CIMG&#00062', '&lt;IMG&gt;' ),

0 commit comments

Comments
 (0)