General: Add support for most unicode letters in WP backend login names

arnt · arnt · commit 088aced11276 · 2024-12-04T09:26:58.000+01:00
This accepts user names that contain a single script, but not mixed-script
names, such as ones that mix Latin and Cyrillic. That seemed to be closest
to the code's existing philosophy.

Since PHP 7.2 and 7.3 don't offer mb_str_split, this change leaves sites
on those versions of PHP with ASCII-only user names. Existing user names
should continue to work unchanged, but adding new non-ASCII user names
will not work.
diff --git a/src/wp-includes/formatting.php b/src/wp-includes/formatting.php
@@ -2111,6 +2111,123 @@ function sanitize_file_name( $filename ) {
 	return apply_filters( 'sanitize_file_name', $filename, $filename_raw );
 }
 
+/**
+ * Checks whether the alphabetic characters of a string all belong to
+ * a single Unicode script.
+ *
+ * Only alphabetic characters are considered, so a string without any
+ * letters, including the empty string, yields true.
+ *
+ * IntlChar does not expose the script property defined by
+ * https://www.unicode.org/reports/tr24/, so Unicode blocks are used
+ * as a stand-in. The known extension blocks ("Latin Extended-A" etc.)
+ * are mapped to the first block of their script, and the string must
+ * then use a single block. Some old scripts have several blocks, but
+ * recently added scripts have only one, so this keeps working for
+ * future scripts as long as each of them gets a single block.
+ *
+ * When mb_str_split() (new in PHP 7.4) or the IntlChar class (intl
+ * extension) is unavailable, only pure-ASCII strings yield true.
+ *
+ * @param string $input The string to check.
+ * @return bool True if at most one script is used, false otherwise.
+ */
+function uses_single_unicode_script( $input ) {
+	if ( ! function_exists( 'mb_str_split' ) || ! class_exists( 'IntlChar' ) ) {
+		// mb_str_split() is new in PHP 7.4 and IntlChar needs the intl
+		// extension; without both, accept pure-ASCII strings only.
+		return 1 !== preg_match( '/[^\x00-\x7F]/', $input );
+	}
+	// 0 means "no alphabetic character seen yet".
+	$block = 0;
+	foreach ( mb_str_split( $input ) as $cp ) {
+		if ( IntlChar::isalpha( $cp ) ) {
+			$b = IntlChar::getBlockCode( $cp );
+			switch ( $b ) {
+				case IntlChar::BLOCK_CODE_LATIN_1_SUPPLEMENT:
+				case IntlChar::BLOCK_CODE_LATIN_EXTENDED_A:
+				case IntlChar::BLOCK_CODE_LATIN_EXTENDED_B:
+				case IntlChar::BLOCK_CODE_LATIN_EXTENDED_C:
+				case IntlChar::BLOCK_CODE_LATIN_EXTENDED_D:
+				case IntlChar::BLOCK_CODE_IPA_EXTENSIONS: // used in Ghana etc
+				case IntlChar::BLOCK_CODE_LATIN_EXTENDED_ADDITIONAL:
+					$b = IntlChar::BLOCK_CODE_BASIC_LATIN;
+					break;
+				case IntlChar::BLOCK_CODE_GREEK_EXTENDED:
+				case IntlChar::BLOCK_CODE_COPTIC:
+				case IntlChar::BLOCK_CODE_COPTIC_EPACT_NUMBERS:
+					// Greek and coptic overlap. Coptic
+					// looks like Greek upper case, so
+					// readers of Greek can read Coptic,
+					// but readers of Coptic can't
+					// necessarily read Greek. This led to
+					// an unfortunate situation in
+					// Unicode, where the two can't be
+					// properly distinguished by
+					// block. However, because of the
+					// overlap, this isn't really a
+					// problem.
+					$b = IntlChar::BLOCK_CODE_GREEK;
+					break;
+				case IntlChar::BLOCK_CODE_ETHIOPIC_EXTENDED:
+				case IntlChar::BLOCK_CODE_ETHIOPIC_EXTENDED_A:
+				case IntlChar::BLOCK_CODE_ETHIOPIC_SUPPLEMENT:
+					$b = IntlChar::BLOCK_CODE_ETHIOPIC;
+					break;
+				case IntlChar::BLOCK_CODE_ARABIC_EXTENDED_A:
+				case IntlChar::BLOCK_CODE_ARABIC_SUPPLEMENT:
+				case IntlChar::BLOCK_CODE_ARABIC_PRESENTATION_FORMS_A:
+				case IntlChar::BLOCK_CODE_ARABIC_PRESENTATION_FORMS_B:
+					$b = IntlChar::BLOCK_CODE_ARABIC;
+					break;
+				case IntlChar::BLOCK_CODE_CYRILLIC_EXTENDED_A:
+				case IntlChar::BLOCK_CODE_CYRILLIC_EXTENDED_B:
+					$b = IntlChar::BLOCK_CODE_CYRILLIC;
+					break;
+				case IntlChar::BLOCK_CODE_BOPOMOFO_EXTENDED:
+					$b = IntlChar::BLOCK_CODE_BOPOMOFO;
+					break;
+				case IntlChar::BLOCK_CODE_UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS_EXTENDED:
+					$b = IntlChar::BLOCK_CODE_UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS;
+					break;
+				case IntlChar::BLOCK_CODE_DEVANAGARI_EXTENDED:
+					$b = IntlChar::BLOCK_CODE_DEVANAGARI;
+					break;
+				case IntlChar::BLOCK_CODE_HANGUL_JAMO:
+				case IntlChar::BLOCK_CODE_HANGUL_JAMO_EXTENDED_A:
+				case IntlChar::BLOCK_CODE_HANGUL_JAMO_EXTENDED_B:
+					$b = IntlChar::BLOCK_CODE_HANGUL;
+					break;
+				case IntlChar::BLOCK_CODE_MYANMAR_EXTENDED_A:
+				case IntlChar::BLOCK_CODE_MYANMAR_EXTENDED_B:
+					$b = IntlChar::BLOCK_CODE_MYANMAR;
+					break;
+				case IntlChar::BLOCK_CODE_CJK_STROKES:
+				case IntlChar::BLOCK_CODE_CJK_UNIFIED_IDEOGRAPHS:
+				case IntlChar::BLOCK_CODE_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A:
+				case IntlChar::BLOCK_CODE_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B:
+				case IntlChar::BLOCK_CODE_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_C:
+				case IntlChar::BLOCK_CODE_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_D:
+				case IntlChar::BLOCK_CODE_CJK_COMPATIBILITY_IDEOGRAPHS:
+				case IntlChar::BLOCK_CODE_CJK_RADICALS_SUPPLEMENT:
+				case IntlChar::BLOCK_CODE_ENCLOSED_CJK_LETTERS_AND_MONTHS:
+				case IntlChar::BLOCK_CODE_CJK_COMPATIBILITY_FORMS:
+				case IntlChar::BLOCK_CODE_CJK_COMPATIBILITY_IDEOGRAPHS_SUPPLEMENT:
+					$b = IntlChar::BLOCK_CODE_CJK_UNIFIED_IDEOGRAPHS;
+					break;
+			}
+			// The first letter seen fixes the block for the whole string.
+			if ( 0 === $block ) {
+				$block = $b;
+			}
+			if ( $block !== $b ) {
+				return false;
+			}
+		}
+	}
+	return true;
+}
+
 /**
  * Sanitizes a username, stripping out unsafe characters.
  *
@@ -2129,17 +2246,21 @@ function sanitize_file_name( $filename ) {
 function sanitize_user( $username, $strict = false ) {
 	$raw_username = $username;
 	$username     = wp_strip_all_tags( $username );
-	$username     = remove_accents( $username );
 	// Remove percent-encoded characters.
-	$username = preg_replace( '|%([a-fA-F0-9][a-fA-F0-9])|', '', $username );
+	$username = urldecode( $username );
 	// Remove HTML entities.
 	$username = preg_replace( '/&.+?;/', '', $username );
 
-	// If strict, reduce to ASCII for max portability.
-	if ( $strict ) {
+	// If mixing different scripts, remove all but ASCII.
+	if ( ! uses_single_unicode_script( $username ) ) {
 		$username = preg_replace( '|[^a-z0-9 _.\-@]|i', '', $username );
 	}
 
+	// If strict, reduce to letters, numbers, and a few safe ASCII punctuation characters.
+	if ( $strict ) {
+		$username = preg_replace( '|[^a-z0-9 _.\-@\p{L}\p{N}]|iu', '', $username );
+	}
+
 	$username = trim( $username );
 	// Consolidate contiguous whitespace.
 	$username = preg_replace( '|\s+|', ' ', $username );
diff --git a/tests/phpunit/tests/formatting/sanitizeUser.php b/tests/phpunit/tests/formatting/sanitizeUser.php
@@ -35,10 +35,30 @@ public function test_strips_encoded_ampersand_when_followed_by_semicolon() {
 	}
 
 	public function test_strips_percent_encoded_octets() {
-		$expected = is_multisite() ? 'franois' : 'Franois';
+		$expected = is_multisite() ? 'françois' : 'François';
 		$this->assertSame( $expected, sanitize_user( 'Fran%c3%a7ois' ) );
 	}
 	public function test_optional_strict_mode_reduces_to_safe_ascii_subset() {
 		$this->assertSame( 'abc', sanitize_user( '()~ab~ˆcˆ!', true ) );
 	}
+
+	public function test_accepts_all_arabic() {
+		// The same all-Arabic name, given raw and percent-encoded.
+		$name            = 'آرنت';
+		$percent_encoded = '%D8%A2%D8%B1%D9%86%D8%AA';
+		$this->assertSame( $name, sanitize_user( $name ) );
+		$this->assertSame( $name, sanitize_user( $percent_encoded ) );
+	}
+
+	public function test_accepts_west_african_latin() {
+		// A Latin-script name using an IPA Extensions letter, raw and percent-encoded.
+		$name            = 'tɔnatɔn';
+		$percent_encoded = 't%C9%94nat%C9%94n';
+		$this->assertSame( $name, sanitize_user( $name ) );
+		$this->assertSame( $name, sanitize_user( $percent_encoded ) );
+	}
+
+	public function test_blocks_latin_cyrillic_mixed_name() {
+		$this->assertSame( 'arn', sanitize_user( 'arn%D1%82' ) ); // %D1%82 is Cyrillic "т"; the expected result keeps only the ASCII letters.
+	}
 }
diff --git a/tests/phpunit/tests/user.php b/tests/phpunit/tests/user.php
@@ -820,6 +820,35 @@ public function test_validate_username_invalid() {
 		$this->assertFalse( validate_username( '@#&99sd' ) );
 	}
 
+	/**
+	 * Checks that single-script usernames are accepted, while
+	 * mixed-script and really strange ones are rejected.
+	 *
+	 * Since validate_username() uses strict sanitization, this
+	 * test also covers code points that strict mode rejects for
+	 * being hard to read or hard to tell apart.
+	 */
+	public function test_validate_utf8_usernames() {
+		/* WordPress approves of drab grey (grå) Norwegian weather */
+		$this->assertTrue( validate_username( 'grå' ) );
+		/* Latin I, Cyrillic В (looks like Latin B), Latin M */
+		$this->assertFalse( validate_username( 'IВM' ) );
+		/* Three Cyrillic letters */
+		$this->assertTrue( validate_username( 'ІВМ' ) );
+		/* A metal umlaut fails because validate_username is
+		 * strict and n̈ is unfamiliar in every language
+		 */
+		$this->assertFalse( validate_username( 'spın̈altap' ) );
+		/* Emoji skintones fail because usernames should be
+		 * easily distinguishable
+		 */
+		$this->assertFalse( validate_username( '👱🏼' ) );
+		$this->assertFalse( validate_username( '👱🏾' ) );
+		/* Many plain emoji also are barely distinguishable */
+		$this->assertFalse( validate_username( '😊' ) );
+		$this->assertFalse( validate_username( '☺️' ) );
+	}
+
 	/**
 	 * @ticket 29880
 	 */