Skip to content

Commit a411260

Browse files
committed
General: Add support for most unicode letters in WP backend login names
This accepts user names that contain a single script, but not mixed-script names, such as ones that mix Latin and Cyrillic. That seemed to be closest to the code's existing philosophy.
1 parent e3e134a commit a411260

3 files changed

Lines changed: 171 additions & 5 deletions

File tree

src/wp-includes/formatting.php

Lines changed: 121 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2111,6 +2111,119 @@ function sanitize_file_name( $filename ) {
21112111
return apply_filters( 'sanitize_file_name', $filename, $filename_raw );
21122112
}
21132113

2114+
/**
 * Checks whether all letters in a string come from a single Unicode script.
 *
 * Only characters that IntlChar::isalpha() reports as alphabetic are
 * considered; digits, punctuation and whitespace are ignored. A string
 * without any letters (including the empty string) therefore returns true.
 *
 * The script of a letter is approximated by its Unicode block. Known
 * extension blocks ("Latin Extended-A" etc.) are folded into the first block
 * of the same script, and the string passes only if every letter ends up in
 * the same block. Scripts whose extension blocks are not listed below are
 * treated as separate scripts, so new extension blocks must be added here.
 *
 * If the intl extension is not loaded, scripts cannot be classified. In that
 * case the function fails closed: it returns true only for pure-ASCII input
 * and false for anything containing a non-ASCII byte.
 *
 * NOTE(review): IntlChar::getIntPropertyValue() with
 * IntlChar::PROPERTY_SCRIPT may expose the UAX #24 script property directly,
 * which would make the block mapping unnecessary. Confirm before switching.
 *
 * @param string $input Text to examine, expected to be UTF-8.
 * @return bool True if the letters use at most one script, false otherwise.
 */
function uses_single_unicode_script( $input ) {
	if ( ! class_exists( 'IntlChar', false ) ) {
		// Without intl there is no block data; only plain ASCII can be vouched for.
		return ! preg_match( '/[^\x00-\x7F]/', (string) $input );
	}

	// Block code of the first letter seen; 0 means no letter has been seen yet.
	$block = 0;

	foreach ( mb_str_split( $input ) as $cp ) {
		if ( ! IntlChar::isalpha( $cp ) ) {
			continue;
		}

		$b = IntlChar::getBlockCode( $cp );

		// Fold extension blocks into the primary block of their script.
		switch ( $b ) {
			case IntlChar::BLOCK_CODE_LATIN_1_SUPPLEMENT:
			case IntlChar::BLOCK_CODE_LATIN_EXTENDED_A:
			case IntlChar::BLOCK_CODE_LATIN_EXTENDED_B:
			case IntlChar::BLOCK_CODE_LATIN_EXTENDED_C:
			case IntlChar::BLOCK_CODE_LATIN_EXTENDED_D:
			case IntlChar::BLOCK_CODE_IPA_EXTENSIONS: // Used in Ghana etc.
			case IntlChar::BLOCK_CODE_LATIN_EXTENDED_ADDITIONAL:
				$b = IntlChar::BLOCK_CODE_BASIC_LATIN;
				break;

			case IntlChar::BLOCK_CODE_GREEK_EXTENDED:
			case IntlChar::BLOCK_CODE_COPTIC:
			case IntlChar::BLOCK_CODE_COPTIC_EPACT_NUMBERS:
				/*
				 * Greek and Coptic overlap. Coptic looks like Greek upper
				 * case, so readers of Greek can read Coptic, but readers of
				 * Coptic can't necessarily read Greek. As a result the two
				 * can't be properly distinguished by block; because of the
				 * overlap this isn't really a problem, so both are treated
				 * as Greek.
				 */
				$b = IntlChar::BLOCK_CODE_GREEK;
				break;

			case IntlChar::BLOCK_CODE_ETHIOPIC_EXTENDED:
			case IntlChar::BLOCK_CODE_ETHIOPIC_EXTENDED_A:
			case IntlChar::BLOCK_CODE_ETHIOPIC_SUPPLEMENT:
				$b = IntlChar::BLOCK_CODE_ETHIOPIC;
				break;

			case IntlChar::BLOCK_CODE_ARABIC_EXTENDED_A:
			case IntlChar::BLOCK_CODE_ARABIC_SUPPLEMENT:
			case IntlChar::BLOCK_CODE_ARABIC_PRESENTATION_FORMS_A:
			case IntlChar::BLOCK_CODE_ARABIC_PRESENTATION_FORMS_B:
				$b = IntlChar::BLOCK_CODE_ARABIC;
				break;

			case IntlChar::BLOCK_CODE_CYRILLIC_EXTENDED_A:
			case IntlChar::BLOCK_CODE_CYRILLIC_EXTENDED_B:
			case IntlChar::BLOCK_CODE_CYRILLIC_SUPPLEMENT: // U+0500-U+052F.
				$b = IntlChar::BLOCK_CODE_CYRILLIC;
				break;

			case IntlChar::BLOCK_CODE_BOPOMOFO_EXTENDED:
				$b = IntlChar::BLOCK_CODE_BOPOMOFO;
				break;

			case IntlChar::BLOCK_CODE_UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS_EXTENDED:
				$b = IntlChar::BLOCK_CODE_UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS;
				break;

			case IntlChar::BLOCK_CODE_DEVANAGARI_EXTENDED:
				$b = IntlChar::BLOCK_CODE_DEVANAGARI;
				break;

			case IntlChar::BLOCK_CODE_HANGUL_JAMO:
			case IntlChar::BLOCK_CODE_HANGUL_JAMO_EXTENDED_A:
			case IntlChar::BLOCK_CODE_HANGUL_JAMO_EXTENDED_B:
				$b = IntlChar::BLOCK_CODE_HANGUL;
				break;

			case IntlChar::BLOCK_CODE_MYANMAR_EXTENDED_A:
			case IntlChar::BLOCK_CODE_MYANMAR_EXTENDED_B:
				$b = IntlChar::BLOCK_CODE_MYANMAR;
				break;

			case IntlChar::BLOCK_CODE_CJK_STROKES:
			case IntlChar::BLOCK_CODE_CJK_UNIFIED_IDEOGRAPHS:
			case IntlChar::BLOCK_CODE_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A:
			case IntlChar::BLOCK_CODE_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B:
			case IntlChar::BLOCK_CODE_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_C:
			case IntlChar::BLOCK_CODE_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_D:
			case IntlChar::BLOCK_CODE_CJK_COMPATIBILITY_IDEOGRAPHS:
			case IntlChar::BLOCK_CODE_CJK_RADICALS_SUPPLEMENT:
			case IntlChar::BLOCK_CODE_ENCLOSED_CJK_LETTERS_AND_MONTHS:
			case IntlChar::BLOCK_CODE_CJK_COMPATIBILITY_FORMS:
			case IntlChar::BLOCK_CODE_CJK_COMPATIBILITY_IDEOGRAPHS_SUPPLEMENT:
				$b = IntlChar::BLOCK_CODE_CJK_UNIFIED_IDEOGRAPHS;
				break;
		}

		// The first letter fixes the expected block; any later mismatch means mixed scripts.
		if ( 0 === $block ) {
			$block = $b;
		}
		if ( $block !== $b ) {
			return false;
		}
	}

	return true;
}
2226+
21142227
/**
21152228
* Sanitizes a username, stripping out unsafe characters.
21162229
*
@@ -2129,17 +2242,21 @@ function sanitize_file_name( $filename ) {
21292242
function sanitize_user( $username, $strict = false ) {
21302243
$raw_username = $username;
21312244
$username = wp_strip_all_tags( $username );
2132-
$username = remove_accents( $username );
21332245
// Decode percent-encoded characters.
2134-
$username = preg_replace( '|%([a-fA-F0-9][a-fA-F0-9])|', '', $username );
2246+
$username = urldecode( $username );
21352247
// Remove HTML entities.
21362248
$username = preg_replace( '/&.+?;/', '', $username );
21372249

2138-
// If strict, reduce to ASCII for max portability.
2139-
if ( $strict ) {
2250+
// If mixing different scripts, remove all but ASCII.
2251+
if ( ! uses_single_unicode_script( $username ) ) {
21402252
$username = preg_replace( '|[^a-z0-9 _.\-@]|i', '', $username );
21412253
}
21422254

2255+
// If strict, reduce to letters, numbers, spaces and the characters _ . - @.
2256+
if ( $strict ) {
2257+
$username = preg_replace( '|[^a-z0-9 _.\-@\p{L}\p{N}]|iu', '', $username );
2258+
}
2259+
21432260
$username = trim( $username );
21442261
// Consolidate contiguous whitespace.
21452262
$username = preg_replace( '|\s+|', ' ', $username );

tests/phpunit/tests/formatting/sanitizeUser.php

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,10 +35,30 @@ public function test_strips_encoded_ampersand_when_followed_by_semicolon() {
3535
}
3636

3737
public function test_strips_percent_encoded_octets() {
38-
$expected = is_multisite() ? 'franois' : 'Franois';
38+
$expected = is_multisite() ? 'françois' : 'François';
3939
$this->assertSame( $expected, sanitize_user( 'Fran%c3%a7ois' ) );
4040
}
4141
public function test_optional_strict_mode_reduces_to_safe_ascii_subset() {
	// Strict sanitization of a name padded with punctuation and modifier characters.
	$sanitized = sanitize_user( '()~ab~ˆcˆ!', true );

	$this->assertSame( 'abc', $sanitized );
}
44+
45+
/**
 * An all-Arabic user name must survive sanitization, whether it is given
 * literally or percent-encoded.
 */
public function test_accepts_all_arabic() {
	$username = 'آرنت';

	foreach ( array( $username, '%D8%A2%D8%B1%D9%86%D8%AA' ) as $candidate ) {
		$this->assertSame( $username, sanitize_user( $candidate ) );
	}
}
52+
53+
/**
 * A Latin name using the open-o letter must survive sanitization, whether it
 * is given literally or percent-encoded.
 */
public function test_accepts_west_african_latin() {
	$username = 'tɔnatɔn';

	foreach ( array( $username, 't%C9%94nat%C9%94n' ) as $candidate ) {
		$this->assertSame( $username, sanitize_user( $candidate ) );
	}
}
60+
61+
/**
 * A name mixing Latin letters with a percent-encoded Cyrillic letter is
 * reduced to its ASCII part.
 */
public function test_blocks_latin_cyrillic_mixed_name() {
	$sanitized = sanitize_user( 'arn%D1%82' );

	$this->assertSame( 'arn', $sanitized );
}
4464
}

tests/phpunit/tests/user.php

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -820,6 +820,35 @@ public function test_validate_username_invalid() {
820820
$this->assertFalse( validate_username( '@#&99sd' ) );
821821
}
822822

823+
/**
 * Verifies that single-script user names validate, while mixed-script
 * names and really strange ones do not.
 *
 * validate_username() sanitizes strictly, so some code points are rejected
 * here because they are more confusable than readable.
 */
public function test_validate_utf8_usernames() {
	// Each entry pairs a user name with the expected validation result.
	$cases = array(
		// WordPress approves of drab grey (grå) Norwegian weather.
		array( 'grå', true ),
		// Latin I, Cyrillic V (looks like Latin B), Latin M.
		array( 'IВM', false ),
		// Three Cyrillic letters.
		array( 'ІВМ', true ),
		// A metal umlaut fails: validation is strict and n̈ is unfamiliar in every language.
		array( 'spın̈altap', false ),
		// Emoji skin tones fail because user names should be easily distinguishable.
		array( '👱🏼', false ),
		array( '👱🏾', false ),
		// Many plain emoji are also barely distinguishable.
		array( '😊', false ),
		array( '☺️', false ),
	);

	foreach ( $cases as $case ) {
		list( $username, $should_validate ) = $case;

		if ( $should_validate ) {
			$this->assertTrue( validate_username( $username ) );
		} else {
			$this->assertFalse( validate_username( $username ) );
		}
	}
}
851+
823852
/**
824853
* @ticket 29880
825854
*/

0 commit comments

Comments
 (0)