Skip to content
Open
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions CHANGES/1655.bugfix.1.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
Fixed :meth:`~yarl.URL.build` failing to validate characters in the zone ID
portion of IPv6 addresses when ``validate_host=True``, allowing control
characters such as CR and LF to pass through into ``url.host``
-- by :user:`rodrigobnogueira`.
5 changes: 5 additions & 0 deletions CHANGES/1655.bugfix.2.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
Fixed ``_check_netloc()`` missing ``%`` from its NFKC normalization character
check, which allowed Unicode characters U+FF05 (FULLWIDTH PERCENT SIGN) and
U+FE6A (SMALL PERCENT SIGN) to produce a literal ``%`` in ``url.host`` via
the standard library IDNA fallback
-- by :user:`rodrigobnogueira`.
15 changes: 15 additions & 0 deletions tests/test_url.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@
_VERTICAL_COLON = "\ufe13" # normalizes to ":"
_FULL_WITH_NUMBER_SIGN = "\uff03" # normalizes to "#"
_ACCOUNT_OF = "\u2100" # normalizes to "a/c"
_FULLWIDTH_PERCENT = "\uff05" # normalizes to "%"
_SMALL_PERCENT = "\ufe6a" # normalizes to "%"


def test_inheritance() -> None:
Expand Down Expand Up @@ -2465,3 +2467,16 @@ def test_url_with_invalid_unicode(disallowed_unicode: str) -> None:
ValueError, match="contains invalid characters under NFKC normalization"
):
URL(f"http://example.{disallowed_unicode}.com/frag")


@pytest.mark.parametrize(
    "percent_char",
    [
        pytest.param(_FULLWIDTH_PERCENT, id="fullwidth-percent-U+FF05"),
        pytest.param(_SMALL_PERCENT, id="small-percent-U+FE6A"),
    ],
)
def test_url_with_fullwidth_percent_rejected(percent_char: str) -> None:
    """Reject hosts whose percent look-alikes normalize to ``%`` under NFKC.

    Both U+FF05 and U+FE6A normalize to a literal ``%``, so a netloc
    containing either one must raise instead of being accepted.
    """
    url_text = f"http://evil.com{percent_char}2e.internal/"
    expected_error = "contains invalid characters under NFKC normalization"
    with pytest.raises(ValueError, match=expected_error):
        URL(url_text)
15 changes: 15 additions & 0 deletions tests/test_url_build.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,21 @@ def test_url_ipv4_in_ipv6() -> None:
assert str(u) == "http://[2001:db8:122:344::c000:221]"


@pytest.mark.parametrize(
    "zone",
    (
        pytest.param("\r\nX-Injected: evil", id="crlf-injection"),
        pytest.param("\x00evil", id="null-byte"),
        pytest.param("zone with spaces", id="spaces"),
    ),
)
def test_url_build_ipv6_zone_id_invalid_chars(zone: str) -> None:
    """Reject IPv6 zone IDs containing control characters or whitespace.

    The zone is appended to ``::1`` after a ``%`` separator and passed to
    ``URL.build`` without an explicit ``validate_host`` argument, so this
    relies on the default host validation of ``URL.build``.
    """
    host_with_zone = f"::1%{zone}"
    with pytest.raises(ValueError, match="Invalid characters in IPv6 zone ID"):
        URL.build(scheme="http", host=host_with_zone, path="/")


def test_build_with_scheme() -> None:
u = URL.build(scheme="blob", path="path")
assert str(u) == "blob:path"
Expand Down
2 changes: 1 addition & 1 deletion yarl/_parse.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,7 @@ def _check_netloc(netloc: str) -> None:
# Note that there are no unicode decompositions for the character '@', so
# it's currently impossible to have test coverage for this branch; however, if
# one should be added in the future, we want to make sure it's still checked.
for c in "/?#@:": # pragma: no branch
for c in "/?#@:%": # pragma: no branch
if c in normalized_netloc:
raise ValueError(
f"netloc '{netloc}' contains invalid "
Expand Down
8 changes: 8 additions & 0 deletions yarl/_url.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,12 @@
re.VERBOSE,
)

# Zone IDs in URIs are defined by RFC 6874 (obsoleted by RFC 9844 for UI usage):
# ZoneID = 1*( unreserved / pct-encoded )
# https://www.rfc-editor.org/rfc/rfc6874#section-2
# In practice, zone IDs may also contain sub-delimiters (e.g. Ethernet+1),
# so those characters are accepted as well.
Comment thread
Dreamsorcerer marked this conversation as resolved.
Outdated
# Anchored with \A and \Z rather than ^ and $: ``$`` also matches just before
# a trailing newline, which would let a zone ID such as "eth0\n" slip through.
_ZONE_ID_RE = re.compile(r"\A[A-Za-z0-9._~!$&'()*+,;=%-]+\Z")

_T = TypeVar("_T")

if sys.version_info >= (3, 11):
Expand Down Expand Up @@ -1574,6 +1580,8 @@ def _encode_host(host: str, validate_host: bool) -> str:
except ValueError:
pass
else:
if sep and validate_host and not _ZONE_ID_RE.match(zone):
raise ValueError(f"Invalid characters in IPv6 zone ID: {zone!r}")
# These checks should not happen in the
# LRU to keep the cache size small
host = ip.compressed
Expand Down
Loading