Skip to content

Commit f7aa293

Browse files
Fix host validation: zone ID characters and NFKC percent bypass
Finding 1: IPv6 zone IDs were not validated even when validate_host=True. Any character — including CR, LF, and null bytes — could be embedded in url.host via URL.build(host='::1%<bad>'). This creates an asymmetry: regular hostnames are correctly rejected for control characters but zone IDs were passed through verbatim. Fix: add _ZONE_ID_RE regex (RFC 6874 unreserved + sub-delims) and validate the zone portion of IPv6 addresses in _encode_host() when validate_host=True. Finding 2: _check_netloc() normalizes the netloc via NFKC and checks for URL-reserved characters but '%' was missing from the checked set. U+FF05 (FULLWIDTH PERCENT SIGN) and U+FE6A (SMALL PERCENT SIGN) both normalize to '%' under NFKC and were accepted, ultimately producing a literal '%' in url.host via the stdlib IDNA fallback in _idna_encode(). Fix: add '%' to the character set checked in _check_netloc().
1 parent 2f180d1 commit f7aa293

6 files changed

Lines changed: 48 additions & 1 deletion

File tree

CHANGES/1655.bugfix.1.rst

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
Fixed :meth:`~yarl.URL.build` failing to validate characters in the zone ID
2+
portion of IPv6 addresses when ``validate_host=True``, allowing control
3+
characters such as CR and LF to pass through into ``url.host``
4+
-- by :user:`rodrigobnogueira`.

CHANGES/1655.bugfix.2.rst

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
Fixed ``_check_netloc()`` missing ``%`` from its NFKC normalization character
2+
check, which allowed Unicode characters U+FF05 (FULLWIDTH PERCENT SIGN) and
3+
U+FE6A (SMALL PERCENT SIGN) to produce a literal ``%`` in ``url.host`` via
4+
the standard library IDNA fallback
5+
-- by :user:`rodrigobnogueira`.

tests/test_url.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,8 @@
1212
_VERTICAL_COLON = "\ufe13" # normalizes to ":"
1313
_FULL_WITH_NUMBER_SIGN = "\uff03" # normalizes to "#"
1414
_ACCOUNT_OF = "\u2100" # normalizes to "a/c"
15+
_FULLWIDTH_PERCENT = "\uff05" # normalizes to "%"
16+
_SMALL_PERCENT = "\ufe6a" # normalizes to "%"
1517

1618

1719
def test_inheritance() -> None:
@@ -2465,3 +2467,16 @@ def test_url_with_invalid_unicode(disallowed_unicode: str) -> None:
24652467
ValueError, match="contains invalid characters under NFKC normalization"
24662468
):
24672469
URL(f"http://example.{disallowed_unicode}.com/frag")
2470+
2471+
2472+
@pytest.mark.parametrize(
2473+
"percent_char",
2474+
[_FULLWIDTH_PERCENT, _SMALL_PERCENT],
2475+
ids=["fullwidth-percent-U+FF05", "small-percent-U+FE6A"],
2476+
)
2477+
def test_url_with_fullwidth_percent_rejected(percent_char: str) -> None:
2478+
"""NFKC normalization of fullwidth/small percent signs must be caught."""
2479+
with pytest.raises(
2480+
ValueError, match="contains invalid characters under NFKC normalization"
2481+
):
2482+
URL(f"http://evil.com{percent_char}2e.internal/")

tests/test_url_build.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,21 @@ def test_url_ipv4_in_ipv6() -> None:
3535
assert str(u) == "http://[2001:db8:122:344::c000:221]"
3636

3737

38+
@pytest.mark.parametrize(
39+
("zone", "desc"),
40+
(
41+
("\r\nX-Injected: evil", "crlf-injection"),
42+
("\x00evil", "null-byte"),
43+
("zone with spaces", "spaces"),
44+
),
45+
ids=("crlf-injection", "null-byte", "spaces"),
46+
)
47+
def test_url_build_ipv6_zone_id_invalid_chars(zone: str, desc: str) -> None:
48+
"""Zone IDs with control characters must be rejected by validate_host."""
49+
with pytest.raises(ValueError, match="Invalid characters in IPv6 zone ID"):
50+
URL.build(scheme="http", host=f"::1%{zone}", path="/")
51+
52+
3853
def test_build_with_scheme() -> None:
3954
u = URL.build(scheme="blob", path="path")
4055
assert str(u) == "blob:path"

yarl/_parse.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -96,7 +96,7 @@ def _check_netloc(netloc: str) -> None:
9696
# Note that there are no unicode decompositions for the character '@' so
9797
# it's currently impossible to have test coverage for this branch; however, if
9898
# one should be added in the future we want to make sure it's still checked.
99-
for c in "/?#@:": # pragma: no branch
99+
for c in "/?#@:%": # pragma: no branch
100100
if c in normalized_netloc:
101101
raise ValueError(
102102
f"netloc '{netloc}' contains invalid "

yarl/_url.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -89,6 +89,10 @@
8989
re.VERBOSE,
9090
)
9191

92+
# RFC 6874 ZoneID = 1*( unreserved / pct-encoded )
93+
# In practice, sub-delimiters also appear in zone IDs (e.g. Ethernet+1), alongside plain names such as eth0.
94+
_ZONE_ID_RE = re.compile(r"^[A-Za-z0-9._~!$&'()*+,;=%-]+$")
95+
9296
_T = TypeVar("_T")
9397

9498
if sys.version_info >= (3, 11):
@@ -1574,6 +1578,10 @@ def _encode_host(host: str, validate_host: bool) -> str:
15741578
except ValueError:
15751579
pass
15761580
else:
1581+
if sep and validate_host and not _ZONE_ID_RE.match(zone):
1582+
raise ValueError(
1583+
f"Invalid characters in IPv6 zone ID: {zone!r}"
1584+
)
15771585
# These checks should not happen in the
15781586
# LRU to keep the cache size small
15791587
host = ip.compressed

0 commit comments

Comments
 (0)