@@ -2111,6 +2111,123 @@ function sanitize_file_name( $filename ) {
21112111 return apply_filters ( 'sanitize_file_name ' , $ filename , $ filename_raw );
21122112}
21132113
/**
 * Checks whether the alphabetic characters of a string all belong to a
 * single Unicode script.
 *
 * Only alphabetic characters are considered; digits, punctuation and
 * whitespace never cause a mismatch. An empty string, or a string with no
 * alphabetic characters at all, is reported as single-script.
 *
 * IntlChar does not expose the script property defined by
 * https://www.unicode.org/reports/tr24/, so this implementation uses the
 * Unicode block as a proxy. Some of the older scripts are spread over
 * several blocks ("Latin Extended-A" etc.); those known extension blocks
 * are mapped onto the primary block of their script, and the string is
 * accepted if every alphabetic character then lands in the same block.
 * Scripts that occupy exactly one block need no mapping.
 *
 * When the intl extension (IntlChar) or mb_str_split() (PHP 7.4+) is not
 * available, the function falls back to a conservative check that accepts
 * only strings made of ASCII letters, digits, space and "_.-@".
 *
 * @param string $input The string to inspect.
 * @return bool True if at most one script is used, false if two or more
 *              scripts are mixed (or, in the fallback, if the string
 *              contains anything outside the ASCII allow-list).
 */
function uses_single_unicode_script( $input ) {
	// Feature-detect rather than version-check: intl is an optional
	// extension, and mb_str_split() only exists from PHP 7.4 onwards.
	if ( ! class_exists( 'IntlChar' ) || ! function_exists( 'mb_str_split' ) ) {
		// "*" rather than "+" so that the empty string is accepted, as
		// documented; the comparison turns preg_match()'s int|false into
		// a real boolean.
		return 1 === preg_match( '/^[a-zA-Z0-9 _.\-@]*$/', $input );
	}

	// Block of the first alphabetic character seen. null means "none yet";
	// 0 cannot be used as the sentinel because it is a valid block code
	// (IntlChar::BLOCK_CODE_NO_BLOCK).
	$block = null;

	foreach ( mb_str_split( $input ) as $cp ) {
		if ( ! IntlChar::isalpha( $cp ) ) {
			continue;
		}

		$b = IntlChar::getBlockCode( $cp );

		// Fold known extension blocks onto the primary block of the script.
		switch ( $b ) {
			case IntlChar::BLOCK_CODE_LATIN_1_SUPPLEMENT:
			case IntlChar::BLOCK_CODE_LATIN_EXTENDED_A:
			case IntlChar::BLOCK_CODE_LATIN_EXTENDED_B:
			case IntlChar::BLOCK_CODE_LATIN_EXTENDED_C:
			case IntlChar::BLOCK_CODE_LATIN_EXTENDED_D:
			case IntlChar::BLOCK_CODE_IPA_EXTENSIONS: // Used in Ghana etc.
			case IntlChar::BLOCK_CODE_LATIN_EXTENDED_ADDITIONAL:
				$b = IntlChar::BLOCK_CODE_BASIC_LATIN;
				break;

			case IntlChar::BLOCK_CODE_GREEK_EXTENDED:
			case IntlChar::BLOCK_CODE_COPTIC:
			case IntlChar::BLOCK_CODE_COPTIC_EPACT_NUMBERS:
				// Greek and Coptic overlap. Coptic looks like Greek upper
				// case, so readers of Greek can read Coptic, but readers
				// of Coptic can't necessarily read Greek. This led to an
				// unfortunate situation in Unicode, where the two can't
				// be properly distinguished by block. However, because of
				// the overlap, this isn't really a problem.
				$b = IntlChar::BLOCK_CODE_GREEK;
				break;

			case IntlChar::BLOCK_CODE_ETHIOPIC_EXTENDED:
			case IntlChar::BLOCK_CODE_ETHIOPIC_EXTENDED_A:
			case IntlChar::BLOCK_CODE_ETHIOPIC_SUPPLEMENT:
				$b = IntlChar::BLOCK_CODE_ETHIOPIC;
				break;

			case IntlChar::BLOCK_CODE_ARABIC_EXTENDED_A:
			case IntlChar::BLOCK_CODE_ARABIC_SUPPLEMENT:
			case IntlChar::BLOCK_CODE_ARABIC_PRESENTATION_FORMS_A:
			case IntlChar::BLOCK_CODE_ARABIC_PRESENTATION_FORMS_B:
				$b = IntlChar::BLOCK_CODE_ARABIC;
				break;

			case IntlChar::BLOCK_CODE_CYRILLIC_EXTENDED_A:
			case IntlChar::BLOCK_CODE_CYRILLIC_EXTENDED_B:
				$b = IntlChar::BLOCK_CODE_CYRILLIC;
				break;

			case IntlChar::BLOCK_CODE_BOPOMOFO_EXTENDED:
				$b = IntlChar::BLOCK_CODE_BOPOMOFO;
				break;

			case IntlChar::BLOCK_CODE_UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS_EXTENDED:
				$b = IntlChar::BLOCK_CODE_UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS;
				break;

			case IntlChar::BLOCK_CODE_DEVANAGARI_EXTENDED:
				$b = IntlChar::BLOCK_CODE_DEVANAGARI;
				break;

			case IntlChar::BLOCK_CODE_HANGUL_JAMO:
			case IntlChar::BLOCK_CODE_HANGUL_JAMO_EXTENDED_A:
			case IntlChar::BLOCK_CODE_HANGUL_JAMO_EXTENDED_B:
				// The primary Hangul block is "Hangul Syllables"; there is
				// no plain BLOCK_CODE_HANGUL constant.
				$b = IntlChar::BLOCK_CODE_HANGUL_SYLLABLES;
				break;

			case IntlChar::BLOCK_CODE_MYANMAR_EXTENDED_A:
			case IntlChar::BLOCK_CODE_MYANMAR_EXTENDED_B:
				$b = IntlChar::BLOCK_CODE_MYANMAR;
				break;

			case IntlChar::BLOCK_CODE_CJK_STROKES:
			case IntlChar::BLOCK_CODE_CJK_UNIFIED_IDEOGRAPHS:
			case IntlChar::BLOCK_CODE_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A:
			case IntlChar::BLOCK_CODE_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B:
			case IntlChar::BLOCK_CODE_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_C:
			case IntlChar::BLOCK_CODE_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_D:
			case IntlChar::BLOCK_CODE_CJK_COMPATIBILITY_IDEOGRAPHS:
			case IntlChar::BLOCK_CODE_CJK_RADICALS_SUPPLEMENT:
			case IntlChar::BLOCK_CODE_ENCLOSED_CJK_LETTERS_AND_MONTHS:
			case IntlChar::BLOCK_CODE_CJK_COMPATIBILITY_FORMS:
			case IntlChar::BLOCK_CODE_CJK_COMPATIBILITY_IDEOGRAPHS_SUPPLEMENT:
				$b = IntlChar::BLOCK_CODE_CJK_UNIFIED_IDEOGRAPHS;
				break;
		}

		if ( null === $block ) {
			// First alphabetic character fixes the expected block.
			$block = $b;
		} elseif ( $block !== $b ) {
			// A second script was found.
			return false;
		}
	}

	return true;
}
2230+
21142231/**
21152232 * Sanitizes a username, stripping out unsafe characters.
21162233 *
@@ -2129,17 +2246,21 @@ function sanitize_file_name( $filename ) {
21292246function sanitize_user ( $ username , $ strict = false ) {
21302247 $ raw_username = $ username ;
21312248 $ username = wp_strip_all_tags ( $ username );
2132- $ username = remove_accents ( $ username );
21332249 // Remove percent-encoded characters.
2134- $ username = preg_replace ( ' |%([a-fA-F0-9][a-fA-F0-9])| ' , '' , $ username );
2250+ $ username = urldecode ( $ username );
21352251 // Remove HTML entities.
21362252 $ username = preg_replace ( '/&.+?;/ ' , '' , $ username );
21372253
2138- // If strict, reduce to ASCII for max portability .
2139- if ( $ strict ) {
2254+ // If mixing different scripts, remove all but ASCII .
2255+ if ( ! uses_single_unicode_script ( $ username ) ) {
21402256 $ username = preg_replace ( '|[^a-z0-9 _.\-@]|i ' , '' , $ username );
21412257 }
21422258
2259+ // If strict, reduce to letters and numbers.
2260+ if ( $ strict ) {
2261+ $ username = preg_replace ( '|[^a-z0-9 _.\-@\p{L}\p{N}]|iu ' , '' , $ username );
2262+ }
2263+
21432264 $ username = trim ( $ username );
21442265 // Consolidate contiguous whitespace.
21452266 $ username = preg_replace ( '|\s+| ' , ' ' , $ username );
0 commit comments