Skip to content

Commit 21ca4b2

Browse files
committed
General: Add support for most unicode letters in WP backend login names
This accepts user names that contain a single script, but not mixed-script names, such as ones that mix Latin and Cyrillic. That seemed to be closest to the code's existing philosophy. Since PHP 7.2 and 7.3 don't offer mb_str_split, this change leaves sites on those versions of PHP with ASCII-only user names: existing user names should continue to work unchanged, but adding new (non-ASCII) user names will not work. Some mistyped user names that were silently corrected in the past no longer are (on PHP 7.2 and 7.3).
1 parent eadef0b commit 21ca4b2

3 files changed

Lines changed: 191 additions & 5 deletions

File tree

src/wp-includes/formatting.php

Lines changed: 126 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2111,6 +2111,124 @@ function sanitize_file_name( $filename ) {
21112111
return apply_filters( 'sanitize_file_name', $filename, $filename_raw );
21122112
}
21132113

2114+
/**
 * Checks whether all alphabetic characters in a string belong to one Unicode script.
 *
 * Only characters that IntlChar::isalpha() reports as alphabetic are
 * considered; digits, punctuation and whitespace never cause a mismatch.
 * An empty string, or one without alphabetic characters, yields true.
 *
 * IntlChar does not support returning the script property defined by
 * https://www.unicode.org/reports/tr24/, so this implementation uses
 * Unicode blocks as a stand-in. Some of the old scripts have several
 * blocks, so the known extension blocks ("Latin Extended-A" etc.) are
 * mapped to one representative block for their script, and the string is
 * then required to use only a single block. A script that is spread over
 * several blocks that are not listed below is reported as mixed.
 *
 * When mb_str_split() (PHP 7.4+) or the IntlChar class (intl extension)
 * is not available, the check degrades to "contains nothing but the
 * ASCII user name characters", so non-ASCII input is reported as mixed.
 *
 * @param string $input The string to inspect. Expected to be UTF-8.
 * @return bool True if the string uses at most one script, false otherwise.
 */
function uses_single_unicode_script( $input ) {
	if ( ! function_exists( 'mb_str_split' ) || ! class_exists( 'IntlChar' ) ) {
		/*
		 * Neither per-character splitting nor block lookup is possible
		 * without these, so accept ASCII only. `*` (rather than `+`)
		 * keeps the documented "empty string is true" contract, and the
		 * strict comparison turns preg_match()'s int|false into a bool.
		 */
		return 1 === preg_match( '/^[a-zA-Z0-9 _.\-@]*$/', $input );
	}

	// Representative block of the first alphabetic character seen so far.
	$block = null;

	// phpcs:ignore PHPCompatibility.FunctionUse.NewFunctions.mb_str_splitFound -- availability is checked above
	foreach ( mb_str_split( $input, 1, 'UTF-8' ) as $cp ) {
		if ( ! IntlChar::isalpha( $cp ) ) {
			continue;
		}

		$b = IntlChar::getBlockCode( $cp );

		// Fold the known extension blocks into one block per script.
		switch ( $b ) {
			case IntlChar::BLOCK_CODE_LATIN_1_SUPPLEMENT:
			case IntlChar::BLOCK_CODE_LATIN_EXTENDED_A:
			case IntlChar::BLOCK_CODE_LATIN_EXTENDED_B:
			case IntlChar::BLOCK_CODE_LATIN_EXTENDED_C:
			case IntlChar::BLOCK_CODE_LATIN_EXTENDED_D:
			case IntlChar::BLOCK_CODE_IPA_EXTENSIONS: // Used in Ghana etc.
			case IntlChar::BLOCK_CODE_LATIN_EXTENDED_ADDITIONAL:
				$b = IntlChar::BLOCK_CODE_BASIC_LATIN;
				break;

			case IntlChar::BLOCK_CODE_GREEK_EXTENDED:
			case IntlChar::BLOCK_CODE_COPTIC:
			case IntlChar::BLOCK_CODE_COPTIC_EPACT_NUMBERS:
				/*
				 * Greek and Coptic overlap. Coptic looks like Greek
				 * upper case, so readers of Greek can read Coptic, but
				 * readers of Coptic can't necessarily read Greek. This
				 * led to an unfortunate situation in Unicode, where the
				 * two can't be properly distinguished by block.
				 * However, because of the overlap, this isn't really a
				 * problem.
				 */
				$b = IntlChar::BLOCK_CODE_GREEK;
				break;

			case IntlChar::BLOCK_CODE_ETHIOPIC_EXTENDED:
			case IntlChar::BLOCK_CODE_ETHIOPIC_EXTENDED_A:
			case IntlChar::BLOCK_CODE_ETHIOPIC_SUPPLEMENT:
				$b = IntlChar::BLOCK_CODE_ETHIOPIC;
				break;

			case IntlChar::BLOCK_CODE_ARABIC_EXTENDED_A:
			case IntlChar::BLOCK_CODE_ARABIC_SUPPLEMENT:
			case IntlChar::BLOCK_CODE_ARABIC_PRESENTATION_FORMS_A:
			case IntlChar::BLOCK_CODE_ARABIC_PRESENTATION_FORMS_B:
				$b = IntlChar::BLOCK_CODE_ARABIC;
				break;

			case IntlChar::BLOCK_CODE_CYRILLIC_EXTENDED_A:
			case IntlChar::BLOCK_CODE_CYRILLIC_EXTENDED_B:
				$b = IntlChar::BLOCK_CODE_CYRILLIC;
				break;

			case IntlChar::BLOCK_CODE_BOPOMOFO_EXTENDED:
				$b = IntlChar::BLOCK_CODE_BOPOMOFO;
				break;

			case IntlChar::BLOCK_CODE_UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS_EXTENDED:
				$b = IntlChar::BLOCK_CODE_UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS;
				break;

			case IntlChar::BLOCK_CODE_DEVANAGARI_EXTENDED:
				$b = IntlChar::BLOCK_CODE_DEVANAGARI;
				break;

			case IntlChar::BLOCK_CODE_HANGUL_JAMO:
			case IntlChar::BLOCK_CODE_HANGUL_JAMO_EXTENDED_A:
			case IntlChar::BLOCK_CODE_HANGUL_JAMO_EXTENDED_B:
				// There is no plain "HANGUL" block; precomposed Hangul
				// lives in the Hangul Syllables block, so fold jamo into it.
				$b = IntlChar::BLOCK_CODE_HANGUL_SYLLABLES;
				break;

			case IntlChar::BLOCK_CODE_MYANMAR_EXTENDED_A:
			case IntlChar::BLOCK_CODE_MYANMAR_EXTENDED_B:
				$b = IntlChar::BLOCK_CODE_MYANMAR;
				break;

			case IntlChar::BLOCK_CODE_CJK_STROKES:
			case IntlChar::BLOCK_CODE_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A:
			case IntlChar::BLOCK_CODE_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B:
			case IntlChar::BLOCK_CODE_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_C:
			case IntlChar::BLOCK_CODE_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_D:
			case IntlChar::BLOCK_CODE_CJK_COMPATIBILITY_IDEOGRAPHS:
			case IntlChar::BLOCK_CODE_CJK_RADICALS_SUPPLEMENT:
			case IntlChar::BLOCK_CODE_ENCLOSED_CJK_LETTERS_AND_MONTHS:
			case IntlChar::BLOCK_CODE_CJK_COMPATIBILITY_FORMS:
			case IntlChar::BLOCK_CODE_CJK_COMPATIBILITY_IDEOGRAPHS_SUPPLEMENT:
				$b = IntlChar::BLOCK_CODE_CJK_UNIFIED_IDEOGRAPHS;
				break;
		}

		if ( null === $block ) {
			// First alphabetic character: it defines the expected script.
			$block = $b;
		} elseif ( $block !== $b ) {
			return false;
		}
	}

	return true;
}
2231+
21142232
/**
21152233
* Sanitizes a username, stripping out unsafe characters.
21162234
*
@@ -2129,17 +2247,21 @@ function sanitize_file_name( $filename ) {
21292247
function sanitize_user( $username, $strict = false ) {
21302248
$raw_username = $username;
21312249
$username = wp_strip_all_tags( $username );
2132-
$username = remove_accents( $username );
21332250
// Decode percent-encoded characters.
2134-
$username = preg_replace( '|%([a-fA-F0-9][a-fA-F0-9])|', '', $username );
2251+
$username = urldecode( $username );
21352252
// Remove HTML entities.
21362253
$username = preg_replace( '/&.+?;/', '', $username );
21372254

2138-
// If strict, reduce to ASCII for max portability.
2139-
if ( $strict ) {
2255+
// If mixing different scripts, remove all but ASCII.
2256+
if ( ! uses_single_unicode_script( $username ) ) {
21402257
$username = preg_replace( '|[^a-z0-9 _.\-@]|i', '', $username );
21412258
}
21422259

2260+
// If strict, reduce to letters, numbers, spaces, and the characters _ . - @
2261+
if ( $strict ) {
2262+
$username = preg_replace( '|[^a-z0-9 _.\-@\p{L}\p{N}]|iu', '', $username );
2263+
}
2264+
21432265
$username = trim( $username );
21442266
// Consolidate contiguous whitespace.
21452267
$username = preg_replace( '|\s+|', ' ', $username );

tests/phpunit/tests/formatting/sanitizeUser.php

Lines changed: 33 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,9 @@ public function test_strips_encoded_ampersand() {
2424
}
2525

2626
public function test_strips_encoded_ampersand_when_followed_by_semicolon() {
27+
if ( !function_exists( 'mb_str_split' ) ) {
28+
$this->markTestSkipped( 'PHP 7.2/3 lacks mb_str_split' );
29+
}
2730
$expected = 'ATT Test;';
2831

2932
// Multisite forces user logins to lowercase.
@@ -35,10 +38,39 @@ public function test_strips_encoded_ampersand_when_followed_by_semicolon() {
3538
}
3639

3740
// Feeds sanitize_user() a name containing the percent-encoded octets %c3%a7.
// The removed expectation ('Franois') had those octets stripped; the added
// expectation ('François') has them decoded to the letter they encode.
// The lowercase variant is expected when is_multisite() is true.
public function test_strips_percent_encoded_octets() {
38-
$expected = is_multisite() ? 'franois' : 'Franois';
41+
// Skipped where mb_str_split() is unavailable (PHP 7.2/7.3).
if ( !function_exists( 'mb_str_split' ) ) {
42+
$this->markTestSkipped( 'PHP 7.2/3 lacks mb_str_split' );
43+
}
44+
$expected = is_multisite() ? 'françois' : 'François';
3945
$this->assertSame( $expected, sanitize_user( 'Fran%c3%a7ois' ) );
4046
}
4147
/**
 * In strict mode, a name decorated with punctuation and accent-like
 * characters should come back as plain 'abc'.
 */
public function test_optional_strict_mode_reduces_to_safe_ascii_subset() {
	$decorated = '()~ab~ˆcˆ!';
	$sanitized = sanitize_user( $decorated, true );

	$this->assertSame( 'abc', $sanitized );
}
50+
51+
/**
 * An all-Arabic user name should survive sanitize_user() unchanged,
 * whether it is supplied literally or percent-encoded.
 */
public function test_accepts_all_arabic() {
	if ( ! function_exists( 'mb_str_split' ) ) {
		$this->markTestSkipped( 'PHP 7.2/3 lacks mb_str_split' );
	}

	$name       = 'آرنت';
	$candidates = array( $name, '%D8%A2%D8%B1%D9%86%D8%AA' );

	// Literal form first, then the percent-encoded form.
	foreach ( $candidates as $candidate ) {
		$this->assertSame( $name, sanitize_user( $candidate ) );
	}
}
61+
62+
/**
 * A Latin user name that uses the letter 'ɔ' should survive
 * sanitize_user() unchanged, whether it is supplied literally or
 * percent-encoded.
 */
public function test_accepts_west_african_latin() {
	if ( ! function_exists( 'mb_str_split' ) ) {
		$this->markTestSkipped( 'PHP 7.2/3 lacks mb_str_split' );
	}

	$name       = 'tɔnatɔn';
	$candidates = array( $name, 't%C9%94nat%C9%94n' );

	// Literal form first, then the percent-encoded form.
	foreach ( $candidates as $candidate ) {
		$this->assertSame( $name, sanitize_user( $candidate ) );
	}
}
72+
73+
/**
 * A name mixing Latin letters with a percent-encoded Cyrillic letter
 * (%D1%82) should be reduced to its ASCII part.
 */
public function test_blocks_latin_cyrillic_mixed_name() {
	$mixed_script_name = 'arn%D1%82';
	$sanitized         = sanitize_user( $mixed_script_name );

	$this->assertSame( 'arn', $sanitized );
}
4476
}

tests/phpunit/tests/user.php

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -848,6 +848,38 @@ public function test_validate_username_invalid() {
848848
$this->assertFalse( validate_username( '@#&99sd' ) );
849849
}
850850

851+
/**
 * Checks that validate_username() accepts single-script user names and
 * rejects mixed-script names, names with unusual combining marks, and
 * emoji.
 *
 * validate_username() is expected to sanitize strictly, so some of the
 * rejected names below are merely hard to read or easy to confuse
 * rather than outright mixed-script.
 */
public function test_validate_utf8_usernames() {
	if ( ! function_exists( 'mb_str_split' ) ) {
		$this->markTestSkipped( 'PHP 7.2/3 lacks mb_str_split' );
	}

	// Each entry is array( user name, whether it should validate ).
	$cases = array(
		// Norwegian "grå" (grey): a single-script Latin name.
		array( 'grå', true ),
		// Latin I, Cyrillic В (which looks like Latin B), Latin M.
		array( 'IВM', false ),
		// Three Cyrillic letters.
		array( 'ІВМ', true ),
		// A "metal umlaut": n with a combining diaeresis is not familiar in any language.
		array( 'spın̈altap', false ),
		// Emoji with skin-tone modifiers: user names should be easy to tell apart.
		array( '👱🏼', false ),
		array( '👱🏾', false ),
		// Many plain emoji are barely distinguishable either.
		array( '😊', false ),
		array( '☺️', false ),
	);

	foreach ( $cases as $case ) {
		list( $username, $should_validate ) = $case;

		if ( $should_validate ) {
			$this->assertTrue( validate_username( $username ) );
		} else {
			$this->assertFalse( validate_username( $username ) );
		}
	}
}
882+
851883
/**
852884
* @ticket 29880
853885
*/

0 commit comments

Comments
 (0)