Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 23 additions & 0 deletions Doc/library/urlparse.rst
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,11 @@ The :mod:`urlparse` module defines the following functions:
decomposed before parsing, or is not a Unicode string, no error will be
raised.

.. warning::

:func:`urlparse` does not perform validation. See :ref:`URL parsing
security <url-parsing-security>` for details.

.. versionchanged:: 2.5
Added attributes to return value.

Expand Down Expand Up @@ -248,6 +253,10 @@ The :mod:`urlparse` module defines the following functions:
decomposed before parsing, or is not a Unicode string, no error will be
raised.

Following some of the `WHATWG spec`_ that updates RFC 3986, leading C0
control and space characters are stripped from the URL. ``\n``,
``\r`` and tab ``\t`` characters are removed from the URL at any position.

.. versionadded:: 2.2

.. versionchanged:: 2.5
Expand All @@ -257,6 +266,9 @@ The :mod:`urlparse` module defines the following functions:
Characters that affect netloc parsing under NFKC normalization will
now raise :exc:`ValueError`.

.. versionchanged:: 2.7.17.8
Leading WHATWG C0 control and space characters are stripped from the URL.


.. function:: urlunsplit(parts)

Expand Down Expand Up @@ -378,3 +390,14 @@ The following classes provide the implementations of the parse results:

Concrete class for :func:`urlsplit` results.

.. _url-parsing-security:

URL parsing security
--------------------

The :func:`urlsplit` and :func:`urlparse` APIs do not perform **validation** of
inputs. They may not raise errors on inputs that other applications consider
invalid. They may also succeed on some inputs that might not be considered
URLs elsewhere. Their purpose is for practical functionality rather than
purity.

110 changes: 102 additions & 8 deletions Lib/test/test_urlparse.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
# -*- coding: utf-8 -*-
# Copyright (C) 2022 ActiveState Software Inc.
# test_urlparse.py is licensed under the PSFLv2 License.
# See the file LICENSE for details.
Expand All @@ -8,6 +9,23 @@
import unittest
import urlparse

# Add ability to run sub-tests
def sub_test(param_list):
"""Decorates a test case to run it as a set of subtests."""

def decorator(f):

@functools.wraps(f)
def wrapped(self):
for param in param_list:
with self.subTest(**param):
f(self, **param)

return wrapped

return decorator


RFC1808_BASE = "http://a/b/c/d;p?q#f"
RFC2396_BASE = "http://a/b/c/d;p?q"
RFC3986_BASE = 'http://a/b/c/d;p?q'
Expand Down Expand Up @@ -602,16 +620,92 @@ def test_urlsplit_remove_unsafe_bytes(self):
self.assertEqual(p.port, None)
self.assertEqual(p.geturl(), u"http://www.python.org/javascript:alert('msg')/#frag")

def test_urlsplit_strip_url(self):
noise = bytes(bytearray(range(0, 0x20 + 1)))
base_url = "http://User:Pass@www.python.org:080/doc/?query=yes#frag"

def test_attributes_bad_port(self):
"""Check handling of non-integer ports."""
p = urlparse.urlsplit("http://www.example.net:foo")
self.assertEqual(p.netloc, "www.example.net:foo")
self.assertRaises(ValueError, lambda: p.port)
url = noise.decode("utf-8") + base_url
p = urlparse.urlsplit(url)
self.assertEqual(p.scheme, "http")
self.assertEqual(p.netloc, "User:Pass@www.python.org:080")
self.assertEqual(p.path, "/doc/")
self.assertEqual(p.query, "query=yes")
self.assertEqual(p.fragment, "frag")
self.assertEqual(p.username, "User")
self.assertEqual(p.password, "Pass")
self.assertEqual(p.hostname, "www.python.org")
self.assertEqual(p.port, 80)
self.assertEqual(p.geturl(), base_url)

p = urlparse.urlparse("http://www.example.net:foo")
self.assertEqual(p.netloc, "www.example.net:foo")
self.assertRaises(ValueError, lambda: p.port)
url = noise + base_url.encode("utf-8")
p = urlparse.urlsplit(url)
self.assertEqual(p.scheme, b"http")
self.assertEqual(p.netloc, b"User:Pass@www.python.org:080")
self.assertEqual(p.path, b"/doc/")
self.assertEqual(p.query, b"query=yes")
self.assertEqual(p.fragment, b"frag")
self.assertEqual(p.username, b"User")
self.assertEqual(p.password, b"Pass")
self.assertEqual(p.hostname, b"www.python.org")
self.assertEqual(p.port, 80)
self.assertEqual(p.geturl(), base_url.encode("utf-8"))

# Test that trailing space is preserved as some applications rely on
# this within query strings.
query_spaces_url = "https://www.python.org:88/doc/?query= "
p = urlparse.urlsplit(noise.decode("utf-8") + query_spaces_url)
self.assertEqual(p.scheme, "https")
self.assertEqual(p.netloc, "www.python.org:88")
self.assertEqual(p.path, "/doc/")
self.assertEqual(p.query, "query= ")
self.assertEqual(p.port, 88)
self.assertEqual(p.geturl(), query_spaces_url)

p = urlparse.urlsplit("www.pypi.org ")
# That "hostname" gets considered a "path" due to the
# trailing space and our existing logic... YUCK...
# and re-assembles via geturl aka unurlsplit into the original.
# django.core.validators.URLValidator (at least through v3.2) relies on
# this, for better or worse, to catch it in a ValidationError via its
# regular expressions.
# Here we test the basic round trip concept of such a trailing space.
self.assertEqual(urlparse.urlunsplit(p), "www.pypi.org ")

# with scheme as cache-key
url = "//www.python.org/"
scheme = noise.decode("utf-8") + "https" + noise.decode("utf-8")
for _ in range(2):
p = urlparse.urlsplit(url, scheme=scheme)
self.assertEqual(p.scheme, "https")
self.assertEqual(p.geturl(), "https://www.python.org/")

def test_attributes_bad_port_a(self):
"""Check handling of invalid ports."""
for bytes in (False, True):
for parse in (urlparse.urlsplit, urlparse.urlparse):
# Spaces and invalid characters are stripped now, so the missing one's can't cause issues
# for port in ("foo", "1.5", "-1", "0x10", "-0", "1_1", " 1", "1 ", "६"):
for port in ("foo", "1.5", "0x10", "1_1"):
netloc = "www.example.net:" + port
url = "http://" + netloc + "/"
if bytes:
netloc = netloc.encode("ascii")
url = url.encode("ascii")
p = parse(url)
self.assertEqual(p.netloc, netloc)
with self.assertRaises(ValueError):
p.port

def test_attributes_bad_port_b(self):
"""Check handling of invalid ports."""
for parse in (urlparse.urlsplit, urlparse.urlparse):
for port in ("६"):
netloc = "www.example.net:" + port
url = "http://" + netloc + "/"
p = parse(url)
self.assertEqual(p.netloc, netloc)
with self.assertRaises(ValueError):
p.port

def test_attributes_without_netloc(self):
# This example is straight from RFC 3261. It looks like it
Expand Down
14 changes: 14 additions & 0 deletions Lib/urlparse.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,9 @@
parsing quirks from older RFCs are retained. The testcases in
test_urlparse.py provides a good indicator of parsing behavior.

The WHATWG URL Parser spec should also be considered. We are not compliant with
it either due to existing user code API behavior expectations (Hyrum's Law).
It serves as a useful guide when making changes.
"""

import re
Expand Down Expand Up @@ -66,6 +69,10 @@
'0123456789'
'+-.')

# Leading and trailing C0 control and space to be stripped per WHATWG spec.
# == "".join([chr(i) for i in range(0, 0x20 + 1)])
_WHATWG_C0_CONTROL_OR_SPACE = '\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n\x0b\x0c\r\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f '

# Unsafe bytes to be removed per WHATWG spec
_UNSAFE_URL_BYTES_TO_REMOVE = ['\t', '\r', '\n']

Expand Down Expand Up @@ -197,13 +204,20 @@ def urlsplit(url, scheme='', allow_fragments=True):
Return a 5-tuple: (scheme, netloc, path, query, fragment).
Note that we don't break the components up in smaller bits
(e.g. netloc is a single string) and we don't expand % escapes."""

allow_fragments = bool(allow_fragments)
key = url, scheme, allow_fragments, type(url), type(scheme)
cached = _parse_cache.get(key, None)
if cached:
return cached
if len(_parse_cache) >= MAX_CACHE_SIZE: # avoid runaway growth
clear_cache()

# Only lstrip url as some applications rely on preserving trailing space.
# (https://url.spec.whatwg.org/#concept-basic-url-parser would strip both)
url = url.lstrip(_WHATWG_C0_CONTROL_OR_SPACE)
scheme = scheme.strip(_WHATWG_C0_CONTROL_OR_SPACE)

netloc = query = fragment = ''
i = url.find(':')
if i > 0:
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
:func:`urllib.parse.urlsplit` now strips leading C0 control and space
characters following the specification for URLs defined by WHATWG in
response to CVE-2023-24329. Patch by Illia Volochii.