@@ -2111,6 +2111,112 @@ function sanitize_file_name( $filename ) {
21112111 return apply_filters ( 'sanitize_file_name ' , $ filename , $ filename_raw );
21122112}
21132113
/**
 * Checks whether the alphabetic characters of a string all belong to one script.
 *
 * Returns true if the string contains no more than one Unicode script, and
 * false if it contains two or more. Only characters for which
 * IntlChar::isalpha() is true are considered; digits, punctuation and
 * whitespace never cause a false result. An empty string (or a string with no
 * alphabetic characters) yields true.
 *
 * IntlChar does not expose the Script property defined by
 * https://www.unicode.org/reports/tr24/, so this implementation approximates
 * it using Unicode blocks: the known extension blocks ("Latin Extended-A",
 * etc.) are mapped to one representative block for their script, and the
 * string is accepted only if every alphabetic character ends up in the same
 * representative block. Blocks that are not listed in the map are compared
 * as-is, which is correct for any script that occupies a single block.
 *
 * Requires the intl and mbstring extensions.
 *
 * @param string $input The string to inspect. Treated as UTF-8.
 * @return bool True if at most one script (as approximated by blocks) is used,
 *              false otherwise.
 */
function uses_single_unicode_script( $input ) {
	// Extension block => representative block for the same script.
	static $canonical_block = array(
		// Latin.
		IntlChar::BLOCK_CODE_LATIN_1_SUPPLEMENT       => IntlChar::BLOCK_CODE_BASIC_LATIN,
		IntlChar::BLOCK_CODE_LATIN_EXTENDED_A         => IntlChar::BLOCK_CODE_BASIC_LATIN,
		IntlChar::BLOCK_CODE_LATIN_EXTENDED_B         => IntlChar::BLOCK_CODE_BASIC_LATIN,
		IntlChar::BLOCK_CODE_LATIN_EXTENDED_C         => IntlChar::BLOCK_CODE_BASIC_LATIN,
		IntlChar::BLOCK_CODE_LATIN_EXTENDED_D         => IntlChar::BLOCK_CODE_BASIC_LATIN,
		IntlChar::BLOCK_CODE_IPA_EXTENSIONS           => IntlChar::BLOCK_CODE_BASIC_LATIN, // Used in Ghana etc.
		IntlChar::BLOCK_CODE_LATIN_EXTENDED_ADDITIONAL => IntlChar::BLOCK_CODE_BASIC_LATIN,

		/*
		 * Greek and Coptic overlap. Coptic looks like Greek upper case, so
		 * readers of Greek can read Coptic, but readers of Coptic can't
		 * necessarily read Greek. The two can't be properly distinguished
		 * by block, so they are deliberately treated as one script here.
		 */
		IntlChar::BLOCK_CODE_GREEK_EXTENDED           => IntlChar::BLOCK_CODE_GREEK,
		IntlChar::BLOCK_CODE_COPTIC                   => IntlChar::BLOCK_CODE_GREEK,
		IntlChar::BLOCK_CODE_COPTIC_EPACT_NUMBERS     => IntlChar::BLOCK_CODE_GREEK,

		// Ethiopic.
		IntlChar::BLOCK_CODE_ETHIOPIC_EXTENDED        => IntlChar::BLOCK_CODE_ETHIOPIC,
		IntlChar::BLOCK_CODE_ETHIOPIC_EXTENDED_A      => IntlChar::BLOCK_CODE_ETHIOPIC,
		IntlChar::BLOCK_CODE_ETHIOPIC_SUPPLEMENT      => IntlChar::BLOCK_CODE_ETHIOPIC,

		// Arabic.
		IntlChar::BLOCK_CODE_ARABIC_EXTENDED_A           => IntlChar::BLOCK_CODE_ARABIC,
		IntlChar::BLOCK_CODE_ARABIC_SUPPLEMENT           => IntlChar::BLOCK_CODE_ARABIC,
		IntlChar::BLOCK_CODE_ARABIC_PRESENTATION_FORMS_A => IntlChar::BLOCK_CODE_ARABIC,
		IntlChar::BLOCK_CODE_ARABIC_PRESENTATION_FORMS_B => IntlChar::BLOCK_CODE_ARABIC,

		// Cyrillic.
		IntlChar::BLOCK_CODE_CYRILLIC_EXTENDED_A      => IntlChar::BLOCK_CODE_CYRILLIC,
		IntlChar::BLOCK_CODE_CYRILLIC_EXTENDED_B      => IntlChar::BLOCK_CODE_CYRILLIC,

		// Bopomofo.
		IntlChar::BLOCK_CODE_BOPOMOFO_EXTENDED        => IntlChar::BLOCK_CODE_BOPOMOFO,

		// Canadian Aboriginal syllabics.
		IntlChar::BLOCK_CODE_UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS_EXTENDED => IntlChar::BLOCK_CODE_UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS,

		// Devanagari.
		IntlChar::BLOCK_CODE_DEVANAGARI_EXTENDED      => IntlChar::BLOCK_CODE_DEVANAGARI,

		// Hangul: the Jamo blocks are folded into the precomposed syllables block.
		IntlChar::BLOCK_CODE_HANGUL_JAMO              => IntlChar::BLOCK_CODE_HANGUL_SYLLABLES,
		IntlChar::BLOCK_CODE_HANGUL_JAMO_EXTENDED_A   => IntlChar::BLOCK_CODE_HANGUL_SYLLABLES,
		IntlChar::BLOCK_CODE_HANGUL_JAMO_EXTENDED_B   => IntlChar::BLOCK_CODE_HANGUL_SYLLABLES,

		// Myanmar.
		IntlChar::BLOCK_CODE_MYANMAR_EXTENDED_A       => IntlChar::BLOCK_CODE_MYANMAR,
		IntlChar::BLOCK_CODE_MYANMAR_EXTENDED_B       => IntlChar::BLOCK_CODE_MYANMAR,

		// CJK.
		IntlChar::BLOCK_CODE_CJK_STROKES                           => IntlChar::BLOCK_CODE_CJK_UNIFIED_IDEOGRAPHS,
		IntlChar::BLOCK_CODE_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A    => IntlChar::BLOCK_CODE_CJK_UNIFIED_IDEOGRAPHS,
		IntlChar::BLOCK_CODE_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B    => IntlChar::BLOCK_CODE_CJK_UNIFIED_IDEOGRAPHS,
		IntlChar::BLOCK_CODE_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_C    => IntlChar::BLOCK_CODE_CJK_UNIFIED_IDEOGRAPHS,
		IntlChar::BLOCK_CODE_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_D    => IntlChar::BLOCK_CODE_CJK_UNIFIED_IDEOGRAPHS,
		IntlChar::BLOCK_CODE_CJK_COMPATIBILITY_IDEOGRAPHS          => IntlChar::BLOCK_CODE_CJK_UNIFIED_IDEOGRAPHS,
		IntlChar::BLOCK_CODE_CJK_RADICALS_SUPPLEMENT               => IntlChar::BLOCK_CODE_CJK_UNIFIED_IDEOGRAPHS,
		IntlChar::BLOCK_CODE_ENCLOSED_CJK_LETTERS_AND_MONTHS       => IntlChar::BLOCK_CODE_CJK_UNIFIED_IDEOGRAPHS,
		IntlChar::BLOCK_CODE_CJK_COMPATIBILITY_FORMS               => IntlChar::BLOCK_CODE_CJK_UNIFIED_IDEOGRAPHS,
		IntlChar::BLOCK_CODE_CJK_COMPATIBILITY_IDEOGRAPHS_SUPPLEMENT => IntlChar::BLOCK_CODE_CJK_UNIFIED_IDEOGRAPHS,
	);

	// Representative block of the first alphabetic character; null until one is seen.
	// null (rather than 0) is used because 0 is itself a valid block code (NO_BLOCK).
	$first_block = null;

	// IntlChar works on UTF-8, so split as UTF-8 regardless of the mbstring internal encoding.
	foreach ( mb_str_split( $input, 1, 'UTF-8' ) as $char ) {
		// Non-alphabetic characters (and invalid sequences, for which isalpha() returns null) are ignored.
		if ( ! IntlChar::isalpha( $char ) ) {
			continue;
		}

		$block = IntlChar::getBlockCode( $char );
		if ( isset( $canonical_block[ $block ] ) ) {
			$block = $canonical_block[ $block ];
		}

		if ( null === $first_block ) {
			$first_block = $block;
		} elseif ( $first_block !== $block ) {
			return false;
		}
	}

	return true;
}
2219+
21142220/**
21152221 * Sanitizes a username, stripping out unsafe characters.
21162222 *
@@ -2129,17 +2235,21 @@ function sanitize_file_name( $filename ) {
21292235function sanitize_user ( $ username , $ strict = false ) {
21302236 $ raw_username = $ username ;
21312237 $ username = wp_strip_all_tags ( $ username );
2132- $ username = remove_accents ( $ username );
21332238 // Remove percent-encoded characters.
2134- $ username = preg_replace ( ' |%([a-fA-F0-9][a-fA-F0-9])| ' , '' , $ username );
2239+ $ username = urldecode ( $ username );
21352240 // Remove HTML entities.
21362241 $ username = preg_replace ( '/&.+?;/ ' , '' , $ username );
21372242
2138- // If strict, reduce to ASCII for max portability .
2139- if ( $ strict ) {
2243+ // If mixing different scripts, remove all but ASCII .
2244+ if ( ! uses_single_unicode_script ( $ username ) ) {
21402245 $ username = preg_replace ( '|[^a-z0-9 _.\-@]|i ' , '' , $ username );
21412246 }
21422247
2248+ // If strict, reduce to letters, numbers, spaces, and the characters _ . - @.
2249+ if ( $ strict ) {
2250+ $ username = preg_replace ( '|[^a-z0-9 _.\-@\p{L}\p{N}]|iu ' , '' , $ username );
2251+ }
2252+
21432253 $ username = trim ( $ username );
21442254 // Consolidate contiguous whitespace.
21452255 $ username = preg_replace ( '|\s+| ' , ' ' , $ username );
0 commit comments