@@ -2111,6 +2111,124 @@ function sanitize_file_name( $filename ) {
21112111 return apply_filters ( 'sanitize_file_name ' , $ filename , $ filename_raw );
21122112}
21132113
/**
 * Checks whether the alphabetic characters of a string all belong to a
 * single Unicode script.
 *
 * Only alphabetic characters are considered; digits, punctuation and
 * whitespace never cause a mismatch. An empty string (or a string without
 * any alphabetic character) is reported as using a single script.
 *
 * IntlChar does not expose the script property defined by
 * https://www.unicode.org/reports/tr24/, so this implementation uses the
 * Unicode block of each character as a workaround. Some of the older
 * scripts are spread over several blocks ("Latin Extended-A" etc.); those
 * known extension blocks are mapped to the first block of their script,
 * and the string is then required to use only a single block. Scripts
 * that occupy exactly one block need no special handling.
 *
 * When the required PHP features are unavailable (`mb_str_split()`, new in
 * PHP 7.4, or the intl extension's IntlChar class), the function falls back
 * to accepting only strings made of ASCII letters, digits, space and
 * `_ . - @`.
 *
 * @param string $input The string to examine. Expected to be UTF-8.
 * @return bool True if at most one script is used, false otherwise.
 */
function uses_single_unicode_script( $input ) {
	$input = (string) $input;

	if ( ! function_exists( 'mb_str_split' ) || ! class_exists( 'IntlChar' ) ) {
		/*
		 * Without mb_str_split()/IntlChar the string cannot be inspected
		 * per code point, so accept ASCII only. The `*` quantifier keeps the
		 * documented "empty string is fine" contract, and the comparison
		 * turns preg_match()'s int|false result into a real boolean.
		 */
		return 1 === preg_match( '/^[a-z0-9 _.\-@]*$/iD', $input );
	}

	// Null means "no alphabetic character seen yet"; 0 is a valid block code (NO_BLOCK).
	$first_block = null;

	// phpcs:ignore PHPCompatibility.FunctionUse.NewFunctions.mb_str_splitFound -- availability is checked above
	foreach ( mb_str_split( $input, 1, 'UTF-8' ) as $char ) {
		// isalpha() yields null for input it cannot interpret; treat that as "not alphabetic".
		if ( true !== IntlChar::isalpha( $char ) ) {
			continue;
		}

		$block = IntlChar::getBlockCode( $char );

		// Fold the known extension blocks into the primary block of their script.
		switch ( $block ) {
			case IntlChar::BLOCK_CODE_LATIN_1_SUPPLEMENT:
			case IntlChar::BLOCK_CODE_LATIN_EXTENDED_A:
			case IntlChar::BLOCK_CODE_LATIN_EXTENDED_B:
			case IntlChar::BLOCK_CODE_LATIN_EXTENDED_C:
			case IntlChar::BLOCK_CODE_LATIN_EXTENDED_D:
			case IntlChar::BLOCK_CODE_IPA_EXTENSIONS: // Used in Ghana etc.
			case IntlChar::BLOCK_CODE_LATIN_EXTENDED_ADDITIONAL:
				$block = IntlChar::BLOCK_CODE_BASIC_LATIN;
				break;

			case IntlChar::BLOCK_CODE_GREEK_EXTENDED:
			case IntlChar::BLOCK_CODE_COPTIC:
			case IntlChar::BLOCK_CODE_COPTIC_EPACT_NUMBERS:
				/*
				 * Greek and Coptic overlap. Coptic looks like Greek upper
				 * case, so readers of Greek can read Coptic, but readers of
				 * Coptic can't necessarily read Greek. This led to an
				 * unfortunate situation in Unicode, where the two can't be
				 * properly distinguished by block. However, because of the
				 * overlap, this isn't really a problem.
				 */
				$block = IntlChar::BLOCK_CODE_GREEK;
				break;

			case IntlChar::BLOCK_CODE_ETHIOPIC_EXTENDED:
			case IntlChar::BLOCK_CODE_ETHIOPIC_EXTENDED_A:
			case IntlChar::BLOCK_CODE_ETHIOPIC_SUPPLEMENT:
				$block = IntlChar::BLOCK_CODE_ETHIOPIC;
				break;

			case IntlChar::BLOCK_CODE_ARABIC_EXTENDED_A:
			case IntlChar::BLOCK_CODE_ARABIC_SUPPLEMENT:
			case IntlChar::BLOCK_CODE_ARABIC_PRESENTATION_FORMS_A:
			case IntlChar::BLOCK_CODE_ARABIC_PRESENTATION_FORMS_B:
				$block = IntlChar::BLOCK_CODE_ARABIC;
				break;

			case IntlChar::BLOCK_CODE_CYRILLIC_EXTENDED_A:
			case IntlChar::BLOCK_CODE_CYRILLIC_EXTENDED_B:
				$block = IntlChar::BLOCK_CODE_CYRILLIC;
				break;

			case IntlChar::BLOCK_CODE_BOPOMOFO_EXTENDED:
				$block = IntlChar::BLOCK_CODE_BOPOMOFO;
				break;

			case IntlChar::BLOCK_CODE_UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS_EXTENDED:
				$block = IntlChar::BLOCK_CODE_UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS;
				break;

			case IntlChar::BLOCK_CODE_DEVANAGARI_EXTENDED:
				$block = IntlChar::BLOCK_CODE_DEVANAGARI;
				break;

			case IntlChar::BLOCK_CODE_HANGUL_JAMO:
			case IntlChar::BLOCK_CODE_HANGUL_JAMO_EXTENDED_A:
			case IntlChar::BLOCK_CODE_HANGUL_JAMO_EXTENDED_B:
				// The primary Hangul block is named "Hangul Syllables".
				$block = IntlChar::BLOCK_CODE_HANGUL_SYLLABLES;
				break;

			case IntlChar::BLOCK_CODE_MYANMAR_EXTENDED_A:
			case IntlChar::BLOCK_CODE_MYANMAR_EXTENDED_B:
				$block = IntlChar::BLOCK_CODE_MYANMAR;
				break;

			case IntlChar::BLOCK_CODE_CJK_STROKES:
			case IntlChar::BLOCK_CODE_CJK_UNIFIED_IDEOGRAPHS:
			case IntlChar::BLOCK_CODE_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A:
			case IntlChar::BLOCK_CODE_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B:
			case IntlChar::BLOCK_CODE_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_C:
			case IntlChar::BLOCK_CODE_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_D:
			case IntlChar::BLOCK_CODE_CJK_COMPATIBILITY_IDEOGRAPHS:
			case IntlChar::BLOCK_CODE_CJK_RADICALS_SUPPLEMENT:
			case IntlChar::BLOCK_CODE_ENCLOSED_CJK_LETTERS_AND_MONTHS:
			case IntlChar::BLOCK_CODE_CJK_COMPATIBILITY_FORMS:
			case IntlChar::BLOCK_CODE_CJK_COMPATIBILITY_IDEOGRAPHS_SUPPLEMENT:
				$block = IntlChar::BLOCK_CODE_CJK_UNIFIED_IDEOGRAPHS;
				break;
		}

		if ( null === $first_block ) {
			// First alphabetic character: remember its (canonical) block.
			$first_block = $block;
		} elseif ( $first_block !== $block ) {
			// A second script showed up.
			return false;
		}
	}

	return true;
}
2231+
21142232/**
21152233 * Sanitizes a username, stripping out unsafe characters.
21162234 *
@@ -2129,17 +2247,21 @@ function sanitize_file_name( $filename ) {
21292247function sanitize_user ( $ username , $ strict = false ) {
21302248 $ raw_username = $ username ;
21312249 $ username = wp_strip_all_tags ( $ username );
2132- $ username = remove_accents ( $ username );
21332250 // Remove percent-encoded characters.
2134- $ username = preg_replace ( ' |%([a-fA-F0-9][a-fA-F0-9])| ' , '' , $ username );
2251+ $ username = urldecode ( $ username );
21352252 // Remove HTML entities.
21362253 $ username = preg_replace ( '/&.+?;/ ' , '' , $ username );
21372254
2138- // If strict, reduce to ASCII for max portability .
2139- if ( $ strict ) {
2255+ // If mixing different scripts, remove all but ASCII .
2256+ if ( ! uses_single_unicode_script ( $ username ) ) {
21402257 $ username = preg_replace ( '|[^a-z0-9 _.\-@]|i ' , '' , $ username );
21412258 }
21422259
2260+ // If strict, reduce to letters, numbers and a few punctuation characters.
2261+ if ( $ strict ) {
2262+ $ username = preg_replace ( '|[^a-z0-9 _.\-@\p{L}\p{N}]|iu ' , '' , $ username );
2263+ }
2264+
21432265 $ username = trim ( $ username );
21442266 // Consolidate contiguous whitespace.
21452267 $ username = preg_replace ( '|\s+| ' , ' ' , $ username );
0 commit comments