Skip to content

Commit 088aced

Browse files
committed
General: Add support for most unicode letters in WP backend login names
This accepts user names that contain a single script, but not mixed-script names, such as ones that mix Latin and Cyrillic. That seemed to be closest to the code's existing philosophy. Since PHP 7.2 and 7.3 don't offer mb_str_split, this change leaves sites on those versions of PHP with ASCII-only user names: existing user names should continue to work unchanged, but adding new non-ASCII user names will not work there.
1 parent e3e134a commit 088aced

3 files changed

Lines changed: 175 additions & 5 deletions

File tree

src/wp-includes/formatting.php

Lines changed: 125 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2111,6 +2111,123 @@ function sanitize_file_name( $filename ) {
21112111
return apply_filters( 'sanitize_file_name', $filename, $filename_raw );
21122112
}
21132113

2114+
/**
 * Checks whether the letters of a string all belong to one Unicode script.
 *
 * Only alphabetic characters are considered; digits, punctuation, spaces and
 * other non-letters are ignored. A string without any letters (including the
 * empty string) is therefore reported as single-script.
 *
 * IntlChar does not expose the script property defined by
 * https://www.unicode.org/reports/tr24/, so this implementation uses the
 * Unicode block of each letter as a stand-in. The known extension blocks
 * ("Latin Extended-A" etc.) are mapped to one representative block for their
 * script, and the string passes if all of its letters end up in the same
 * block. Scripts that occupy a single block need no entry in the mapping.
 *
 * When either `mb_str_split()` (PHP 7.4+, mbstring) or the `IntlChar` class
 * (intl extension) is unavailable, the check falls back to accepting only
 * strings made up entirely of ASCII letters, digits, spaces and `_ . - @`.
 *
 * @param string $input The string to examine, expected to be UTF-8.
 * @return bool True if at most one script is used, false otherwise.
 */
function uses_single_unicode_script( $input ) {
	// Feature-detect instead of comparing PHP versions, so that a missing
	// intl or mbstring extension degrades to ASCII-only rather than a fatal error.
	if ( ! function_exists( 'mb_str_split' ) || ! class_exists( 'IntlChar' ) ) {
		// `*` rather than `+` so that the empty string is accepted, as documented.
		return 1 === preg_match( '/^[a-zA-Z0-9 _.\-@]*$/', $input );
	}

	// Representative block of the first letter seen. null means "no letter
	// yet"; 0 cannot be used because it is IntlChar::BLOCK_CODE_NO_BLOCK.
	$script_block = null;

	// IntlChar interprets string arguments as UTF-8, so split as UTF-8
	// regardless of the configured internal encoding.
	foreach ( mb_str_split( $input, 1, 'UTF-8' ) as $char ) {
		if ( ! IntlChar::isalpha( $char ) ) {
			continue;
		}

		$block = IntlChar::getBlockCode( $char );

		// Fold extension blocks onto the representative block of their script.
		switch ( $block ) {
			case IntlChar::BLOCK_CODE_LATIN_1_SUPPLEMENT:
			case IntlChar::BLOCK_CODE_LATIN_EXTENDED_A:
			case IntlChar::BLOCK_CODE_LATIN_EXTENDED_B:
			case IntlChar::BLOCK_CODE_LATIN_EXTENDED_C:
			case IntlChar::BLOCK_CODE_LATIN_EXTENDED_D:
			case IntlChar::BLOCK_CODE_IPA_EXTENSIONS: // Used in Ghana etc.
			case IntlChar::BLOCK_CODE_LATIN_EXTENDED_ADDITIONAL:
				$block = IntlChar::BLOCK_CODE_BASIC_LATIN;
				break;

			case IntlChar::BLOCK_CODE_GREEK_EXTENDED:
			case IntlChar::BLOCK_CODE_COPTIC:
			case IntlChar::BLOCK_CODE_COPTIC_EPACT_NUMBERS:
				// Greek and Coptic overlap. Coptic looks like Greek upper
				// case, so readers of Greek can read Coptic, but readers of
				// Coptic can't necessarily read Greek. This led to an
				// unfortunate situation in Unicode, where the two can't be
				// properly distinguished by block. However, because of the
				// overlap, this isn't really a problem.
				$block = IntlChar::BLOCK_CODE_GREEK;
				break;

			case IntlChar::BLOCK_CODE_ETHIOPIC_EXTENDED:
			case IntlChar::BLOCK_CODE_ETHIOPIC_EXTENDED_A:
			case IntlChar::BLOCK_CODE_ETHIOPIC_SUPPLEMENT:
				$block = IntlChar::BLOCK_CODE_ETHIOPIC;
				break;

			case IntlChar::BLOCK_CODE_ARABIC_EXTENDED_A:
			case IntlChar::BLOCK_CODE_ARABIC_SUPPLEMENT:
			case IntlChar::BLOCK_CODE_ARABIC_PRESENTATION_FORMS_A:
			case IntlChar::BLOCK_CODE_ARABIC_PRESENTATION_FORMS_B:
				$block = IntlChar::BLOCK_CODE_ARABIC;
				break;

			case IntlChar::BLOCK_CODE_CYRILLIC_EXTENDED_A:
			case IntlChar::BLOCK_CODE_CYRILLIC_EXTENDED_B:
				$block = IntlChar::BLOCK_CODE_CYRILLIC;
				break;

			case IntlChar::BLOCK_CODE_BOPOMOFO_EXTENDED:
				$block = IntlChar::BLOCK_CODE_BOPOMOFO;
				break;

			case IntlChar::BLOCK_CODE_UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS_EXTENDED:
				$block = IntlChar::BLOCK_CODE_UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS;
				break;

			case IntlChar::BLOCK_CODE_DEVANAGARI_EXTENDED:
				$block = IntlChar::BLOCK_CODE_DEVANAGARI;
				break;

			case IntlChar::BLOCK_CODE_HANGUL_JAMO:
			case IntlChar::BLOCK_CODE_HANGUL_JAMO_EXTENDED_A:
			case IntlChar::BLOCK_CODE_HANGUL_JAMO_EXTENDED_B:
				// There is no plain "HANGUL" block constant; the precomposed
				// syllables block serves as the representative for Hangul.
				$block = IntlChar::BLOCK_CODE_HANGUL_SYLLABLES;
				break;

			case IntlChar::BLOCK_CODE_MYANMAR_EXTENDED_A:
			case IntlChar::BLOCK_CODE_MYANMAR_EXTENDED_B:
				$block = IntlChar::BLOCK_CODE_MYANMAR;
				break;

			case IntlChar::BLOCK_CODE_CJK_STROKES:
			case IntlChar::BLOCK_CODE_CJK_UNIFIED_IDEOGRAPHS:
			case IntlChar::BLOCK_CODE_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A:
			case IntlChar::BLOCK_CODE_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B:
			case IntlChar::BLOCK_CODE_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_C:
			case IntlChar::BLOCK_CODE_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_D:
			case IntlChar::BLOCK_CODE_CJK_COMPATIBILITY_IDEOGRAPHS:
			case IntlChar::BLOCK_CODE_CJK_RADICALS_SUPPLEMENT:
			case IntlChar::BLOCK_CODE_ENCLOSED_CJK_LETTERS_AND_MONTHS:
			case IntlChar::BLOCK_CODE_CJK_COMPATIBILITY_FORMS:
			case IntlChar::BLOCK_CODE_CJK_COMPATIBILITY_IDEOGRAPHS_SUPPLEMENT:
				$block = IntlChar::BLOCK_CODE_CJK_UNIFIED_IDEOGRAPHS;
				break;
		}

		if ( null === $script_block ) {
			// First letter: it defines the script every later letter must match.
			$script_block = $block;
		} elseif ( $script_block !== $block ) {
			return false;
		}
	}

	return true;
}
2230+
21142231
/**
21152232
* Sanitizes a username, stripping out unsafe characters.
21162233
*
@@ -2129,17 +2246,21 @@ function sanitize_file_name( $filename ) {
21292246
function sanitize_user( $username, $strict = false ) {
21302247
$raw_username = $username;
21312248
$username = wp_strip_all_tags( $username );
2132-
$username = remove_accents( $username );
21332249
// Remove percent-encoded characters.
2134-
$username = preg_replace( '|%([a-fA-F0-9][a-fA-F0-9])|', '', $username );
2250+
$username = urldecode( $username );
21352251
// Remove HTML entities.
21362252
$username = preg_replace( '/&.+?;/', '', $username );
21372253

2138-
// If strict, reduce to ASCII for max portability.
2139-
if ( $strict ) {
2254+
// If mixing different scripts, remove all but ASCII.
2255+
if ( ! uses_single_unicode_script( $username ) ) {
21402256
$username = preg_replace( '|[^a-z0-9 _.\-@]|i', '', $username );
21412257
}
21422258

2259+
// If strict, reduce to letters, numbers, spaces and the characters _ . - @
2260+
if ( $strict ) {
2261+
$username = preg_replace( '|[^a-z0-9 _.\-@\p{L}\p{N}]|iu', '', $username );
2262+
}
2263+
21432264
$username = trim( $username );
21442265
// Consolidate contiguous whitespace.
21452266
$username = preg_replace( '|\s+|', ' ', $username );

tests/phpunit/tests/formatting/sanitizeUser.php

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,10 +35,30 @@ public function test_strips_encoded_ampersand_when_followed_by_semicolon() {
3535
}
3636

3737
public function test_strips_percent_encoded_octets() {
38-
$expected = is_multisite() ? 'franois' : 'Franois';
38+
$expected = is_multisite() ? 'françois' : 'François';
3939
$this->assertSame( $expected, sanitize_user( 'Fran%c3%a7ois' ) );
4040
}
4141
/**
 * In strict mode, punctuation and modifier characters around the ASCII
 * letters are dropped and only 'abc' remains.
 */
public function test_optional_strict_mode_reduces_to_safe_ascii_subset() {
	$raw       = '()~ab~ˆcˆ!';
	$sanitized = sanitize_user( $raw, true );

	$this->assertSame( 'abc', $sanitized );
}
44+
45+
/**
 * A user name written entirely in Arabic letters comes back unchanged,
 * whether it is supplied as raw UTF-8 or in percent-encoded form.
 */
public function test_accepts_all_arabic() {
	$name       = 'آرنت';
	$candidates = array( $name, '%D8%A2%D8%B1%D9%86%D8%AA' );

	foreach ( $candidates as $candidate ) {
		$this->assertSame( $name, sanitize_user( $candidate ) );
	}
}
52+
53+
/**
 * A Latin-script name that uses the open-o letter comes back unchanged,
 * whether it is supplied as raw UTF-8 or in percent-encoded form.
 */
public function test_accepts_west_african_latin() {
	$name       = 'tɔnatɔn';
	$candidates = array( $name, 't%C9%94nat%C9%94n' );

	foreach ( $candidates as $candidate ) {
		$this->assertSame( $name, sanitize_user( $candidate ) );
	}
}
60+
61+
/**
 * A name mixing Latin letters with a percent-encoded Cyrillic letter is
 * cut down to its ASCII part.
 */
public function test_blocks_latin_cyrillic_mixed_name() {
	$mixed     = 'arn%D1%82';
	$sanitized = sanitize_user( $mixed );

	$this->assertSame( 'arn', $sanitized );
}
4464
}

tests/phpunit/tests/user.php

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -820,6 +820,35 @@ public function test_validate_username_invalid() {
820820
$this->assertFalse( validate_username( '@#&99sd' ) );
821821
}
822822

823+
/**
 * Checks validate_username() against non-ASCII user names.
 *
 * Single-script names are expected to validate; mixed-script names,
 * names with unusual combining marks, and emoji are expected to be
 * rejected.
 */
public function test_validate_utf8_usernames() {
	// Each entry: user name, then whether it is expected to validate.
	$cases = array(
		// WordPress approves of drab grey (grå) Norwegian weather.
		array( 'grå', true ),
		// Latin I, Cyrillic V (looks like Latin B), Latin M.
		array( 'IВM', false ),
		// Three Cyrillic letters.
		array( 'ІВМ', true ),
		// A metal umlaut: n̈ is unfamiliar in every language.
		array( 'spın̈altap', false ),
		// Emoji skin tones: user names should be easily distinguishable.
		array( '👱🏼', false ),
		array( '👱🏾', false ),
		// Many plain emoji also are barely distinguishable.
		array( '😊', false ),
		array( '☺️', false ),
	);

	foreach ( $cases as $case ) {
		$is_valid = validate_username( $case[0] );

		if ( $case[1] ) {
			$this->assertTrue( $is_valid );
		} else {
			$this->assertFalse( $is_valid );
		}
	}
}
851+
823852
/**
824853
* @ticket 29880
825854
*/

0 commit comments

Comments
 (0)