Skip to content

Commit 05af093

Browse files
committed
General: Add support for unicode email addresses in is_email and sanitize_email
This adds support for the unicode address extensions in RFCs 6530–6533 and refactors the code so there are fewer long regexes and less duplication between sanitize_email and is_email. A new class, WP_Email_Address, provides the shared parts. Opting out of unicode support is easy: default-filters.php adds unicode support by adding filters, which can be removed. sanitize_email no longer makes major changes like removing an entire subdomain from someone's address; it only cleans up things like soft hyphens and whitespace — changes that happen when copying an email address from text. During testing, it became clear that antispambot() worked only for strings using a single-byte encoding, while this uses UTF-8. Fixed. Fixes #31992. Props SirLouen, dmsnell, tusharbharti, mukeshpanchal27, akirk.
1 parent bccb9c1 commit 05af093

8 files changed

Lines changed: 711 additions & 175 deletions

File tree

Lines changed: 246 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,246 @@
1+
<?php
2+
/**
3+
* Class 'WP_Email_Address'.
4+
*
5+
* @package WordPress
6+
* @since 7.0.0
7+
*/
8+
9+
/**
 * Represents a validated email address.
 *
 * Use the static factory method {@see WP_Email_Address::from_string()} to create instances
 * of this class rather than the constructor, which is private. Every instance has
 * therefore passed the checks in that factory.
 *
 * @since 7.0.0
 */
final class WP_Email_Address {

	/**
	 * Regex for the local part when Unicode is not enabled.
	 *
	 * Matches the character set from the WHATWG email specification:
	 * https://html.spec.whatwg.org/multipage/input.html#email-state-(type=email)
	 *
	 * The `D` modifier makes `$` match only at the very end of the subject.
	 * Without it PCRE also lets `$` match just before a trailing newline, which
	 * would allow "user\n" to pass validation.
	 *
	 * @since 7.0.0
	 * @var string
	 */
	const LOCAL_PART_ASCII_REGEX = '/^[a-zA-Z0-9.!#$%&\'*+\/=?^_`{|}~-]+$/D';

	/**
	 * Regex for the local part when Unicode is enabled.
	 *
	 * Extends the WHATWG character set to allow Unicode letters and numbers,
	 * and applies the same grapheme-cluster structure used for domain labels:
	 * each cluster must open with a non-combining character, optionally followed
	 * by combining marks.
	 *
	 * The `D` modifier anchors `$` to the true end of the subject so that a
	 * trailing newline is rejected.
	 *
	 * @since 7.0.0
	 * @var string
	 */
	const LOCAL_PART_UNICODE_REGEX = '/^([\p{L}\p{N}.!#$%&\'*+\/=?^_`{|}~-]\p{M}*)+$/Du';

	/**
	 * Pattern for a single ASCII domain label (no dot).
	 *
	 * Matches a label from the WHATWG email specification: starts and ends with
	 * a letter or digit; internal characters may include hyphens. The label is
	 * between 1 and 63 characters long.
	 *
	 * This is a pattern fragment without delimiters or anchors.
	 *
	 * @since 7.0.0
	 * @var string
	 */
	const DOMAIN_LABEL_ASCII = '[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?';

	/**
	 * Pattern for a single Unicode domain label (no dot).
	 *
	 * Extends the ASCII label pattern to allow Unicode letters and numbers,
	 * with grapheme-cluster structure: each cluster must open with a letter,
	 * digit, or (internally only) a hyphen, followed by zero or more combining
	 * marks. The label must start and end with a letter-or-digit cluster.
	 *
	 * This fragment does not limit the label length; the 63-octet limit is
	 * enforced separately in {@see WP_Email_Address::from_string()}.
	 *
	 * @since 7.0.0
	 * @var string
	 */
	const DOMAIN_LABEL_UNICODE = '[\p{L}\p{N}]\p{M}*(?:(?:[\p{L}\p{N}-]\p{M}*)*[\p{L}\p{N}]\p{M}*)?';

	/**
	 * Regex for the domain when Unicode is not enabled.
	 *
	 * Assembled from {@see self::DOMAIN_LABEL_ASCII}: one label, then zero or
	 * more dot-separated labels. The `D` modifier prevents `$` from matching
	 * before a trailing newline.
	 *
	 * @since 7.0.0
	 * @var string
	 */
	const DOMAIN_ASCII_REGEX = '/^' . self::DOMAIN_LABEL_ASCII . '(?:\.' . self::DOMAIN_LABEL_ASCII . ')*$/D';

	/**
	 * Regex for the domain when Unicode is enabled.
	 *
	 * Assembled from {@see self::DOMAIN_LABEL_UNICODE}: one label, then zero or
	 * more dot-separated labels. The `D` modifier prevents `$` from matching
	 * before a trailing newline.
	 *
	 * @since 7.0.0
	 * @var string
	 */
	const DOMAIN_UNICODE_REGEX = '/^' . self::DOMAIN_LABEL_UNICODE . '(?:\.' . self::DOMAIN_LABEL_UNICODE . ')*$/Du';

	/**
	 * The local part of the email address (the portion before the '@').
	 *
	 * @since 7.0.0
	 * @var string
	 */
	private $localpart;

	/**
	 * The domain part of the email address (the portion after the '@').
	 *
	 * When the address was parsed with Unicode enabled and the intl extension
	 * available, punycode labels are stored in their decoded UTF-8 form.
	 *
	 * @since 7.0.0
	 * @var string
	 */
	private $domain;

	/**
	 * Private constructor. Use {@see WP_Email_Address::from_string()} to create instances.
	 *
	 * @since 7.0.0
	 *
	 * @param string $localpart The local part of the email address.
	 * @param string $domain    The domain part of the email address.
	 */
	private function __construct( string $localpart, string $domain ) {
		$this->localpart = $localpart;
		$this->domain    = $domain;
	}

	/**
	 * Creates a WP_Email_Address from a string.
	 *
	 * This method is intended to accept the strings that are considered valid email
	 * addresses by the WHATWG HTML specification for the email input type:
	 *
	 * https://html.spec.whatwg.org/multipage/input.html#email-state-(type=email)
	 *
	 * and some additional addresses, while rejecting strings that are more
	 * likely to be typos, mispastes, or attacks. It may reject a few addresses
	 * that are valid according to RFC 5322. (Note that
	 * "<iframe src=...>"@example.com is valid according to the RFC.)
	 *
	 * The checks, in order:
	 *
	 *  1. Exactly one '@' must be present.
	 *  2. No domain label may exceed 63 octets.
	 *  3. With Unicode enabled and the intl extension available, labels carrying
	 *     the `xn--` ACE prefix (in any letter case) are decoded to UTF-8, and
	 *     any label that still looks like a reserved "??--" prefix is rejected.
	 *     Otherwise, any non-ASCII byte in the input is rejected.
	 *  4. Both parts must be valid UTF-8.
	 *  5. The local part must match the allowed character set, unless the
	 *     'is_email' filter overrides the failure.
	 *  6. The domain must contain a dot, unless the 'is_email' filter overrides
	 *     the failure.
	 *  7. The domain must match the allowed label structure.
	 *
	 * Neither part may end in a newline: the validation regexes are anchored
	 * to the true end of the string.
	 *
	 * @since 7.0.0
	 *
	 * @param string $input   The email address string to parse.
	 * @param bool   $unicode Whether to allow Unicode characters in the address.
	 * @return WP_Email_Address|false A WP_Email_Address instance, or false if the input is invalid.
	 */
	public static function from_string( string $input, bool $unicode ) {
		// There must be exactly one '@' sign.
		$at_pos = strpos( $input, '@' );
		if ( false === $at_pos || strrpos( $input, '@' ) !== $at_pos ) {
			return false;
		}

		$localpart = substr( $input, 0, $at_pos );
		$domain    = substr( $input, $at_pos + 1 );
		$labels    = explode( '.', $domain );

		foreach ( $labels as $label ) {
			// DNS limits each label to 63 octets.
			if ( strlen( $label ) > 63 ) {
				return false;
			}
		}

		if ( $unicode && function_exists( 'idn_to_utf8' ) ) {
			// Validate each domain label, decode any punycode to UTF-8, and
			// reassemble the decoded labels into the local $domain variable.
			$decoded_labels = array();
			foreach ( $labels as $label ) {
				// Decode punycode labels to their Unicode form for further validation.
				// Domain names are case-insensitive, so "XN--" must be treated the
				// same as "xn--"; a case-sensitive test would let the upper-case form
				// skip decoding and then be rejected by the reserved-prefix check below.
				if ( 0 === strncasecmp( $label, 'xn--', 4 ) ) {
					$label = idn_to_utf8( $label, IDNA_DEFAULT, INTL_IDNA_VARIANT_UTS46 );
					if ( false === $label ) {
						return false;
					}
				}
				// Reject labels with a reserved ACE-like prefix (two chars followed by '--').
				if ( preg_match( '/^..--/u', $label ) ) {
					return false;
				}
				$decoded_labels[] = $label;
			}
			$domain = implode( '.', $decoded_labels );
		} elseif ( preg_match( '/[\x80-\xff]/', $input ) ) {
			// Without Unicode support, reject any non-ASCII byte in either part.
			return false;
		}

		// Both parts must be valid UTF-8, regardless of whether Unicode is requested. (A valid ASCII string is also valid UTF-8.)
		if ( ! wp_is_valid_utf8( $localpart ) || ! wp_is_valid_utf8( $domain ) ) {
			return false;
		}

		// Validate the local part against the allowed character set.
		if ( ! preg_match( $unicode ? self::LOCAL_PART_UNICODE_REGEX : self::LOCAL_PART_ASCII_REGEX, $localpart ) ) {
			/** This filter is documented in wp-includes/formatting.php */
			if ( ! apply_filters( 'is_email', false, $input, 'local_invalid_chars' ) ) {
				return false;
			}
		}

		// The domain must contain at least one dot.
		if ( ! str_contains( $domain, '.' ) ) {
			/** This filter is documented in wp-includes/formatting.php */
			if ( ! apply_filters( 'is_email', false, $input, 'domain_no_periods' ) ) {
				return false;
			}
		}

		// Validate the domain against the allowed structure.
		if ( ! preg_match( $unicode ? self::DOMAIN_UNICODE_REGEX : self::DOMAIN_ASCII_REGEX, $domain ) ) {
			return false;
		}

		return new self( $localpart, $domain );
	}

	/**
	 * Returns the local part of the email address (the portion before the '@').
	 *
	 * @since 7.0.0
	 *
	 * @return string The local part of the email address.
	 */
	public function get_localpart(): string {
		return $this->localpart;
	}

	/**
	 * Returns the domain part of the email address (the portion after the '@').
	 *
	 * @since 7.0.0
	 *
	 * @return string The domain part of the email address.
	 */
	public function get_domain(): string {
		return $this->domain;
	}

	/**
	 * Returns the complete email address as a string.
	 *
	 * The result is the local part and the domain joined by '@'. It is intended
	 * to be accepted again by {@see WP_Email_Address::from_string()} when called
	 * with the same `$unicode` setting, producing an equivalent instance.
	 *
	 * @since 7.0.0
	 *
	 * @return string The complete email address.
	 */
	public function get_address(): string {
		return $this->localpart . '@' . $this->domain;
	}
}

src/wp-includes/default-filters.php

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -87,6 +87,17 @@
8787
add_filter( $filter, 'wp_filter_kses' );
8888
}
8989

90+
// Email addresses: Allow unicode if and only if the database can
91+
// store them. This affects all addresses, including those entered
92+
// into contact forms.
93+
if ( 'utf8mb4' === $wpdb->charset ) {
94+
add_filter( 'is_email', 'wp_is_unicode_email', 10, 3 );
95+
add_filter( 'sanitize_email', 'wp_sanitize_unicode_email', 10, 3 );
96+
} else {
97+
add_filter( 'is_email', 'wp_is_ascii_email', 10, 3 );
98+
add_filter( 'sanitize_email', 'wp_sanitize_ascii_email', 10, 3 );
99+
}
100+
90101
// Display URL.
91102
foreach ( array( 'user_url', 'link_url', 'link_image', 'link_rss', 'comment_url', 'post_guid' ) as $filter ) {
92103
if ( is_admin() ) {

0 commit comments

Comments
 (0)