diff --git a/Doc/library/urlparse.rst b/Doc/library/urlparse.rst index 0989c88c302253..36e332c25fe00f 100644 --- a/Doc/library/urlparse.rst +++ b/Doc/library/urlparse.rst @@ -125,6 +125,11 @@ The :mod:`urlparse` module defines the following functions: decomposed before parsing, or is not a Unicode string, no error will be raised. + .. warning:: + + :func:`urlparse` does not perform validation. See :ref:`URL parsing + security ` for details. + .. versionchanged:: 2.5 Added attributes to return value. @@ -248,6 +253,10 @@ The :mod:`urlparse` module defines the following functions: decomposed before parsing, or is not a Unicode string, no error will be raised. + Following some of the `WHATWG spec`_ that updates RFC 3986, leading C0 + control and space characters are stripped from the URL. ``\n``, + ``\r`` and tab ``\t`` characters are removed from the URL at any position. + .. versionadded:: 2.2 .. versionchanged:: 2.5 @@ -257,6 +266,9 @@ The :mod:`urlparse` module defines the following functions: Characters that affect netloc parsing under NFKC normalization will now raise :exc:`ValueError`. + .. versionchanged:: 2.7.17.8 + Leading WHATWG C0 control and space characters are stripped from the URL. + .. function:: urlunsplit(parts) @@ -378,3 +390,14 @@ The following classes provide the implementations of the parse results: Concrete class for :func:`urlsplit` results. +.. _url-parsing-security: + +URL parsing security +-------------------- + +The :func:`urlsplit` and :func:`urlparse` APIs do not perform **validation** of +inputs. They may not raise errors on inputs that other applications consider +invalid. They may also succeed on some inputs that might not be considered +URLs elsewhere. Their purpose is for practical functionality rather than +purity. + diff --git a/Lib/test/test_urlparse.py b/Lib/test/test_urlparse.py index e00fb7af699668..f924289041a7a1 100644 --- a/Lib/test/test_urlparse.py +++ b/Lib/test/test_urlparse.py @@ -1,3 +1,4 @@ +# -*- coding: utf-8 -*- # Copyright (C) 2022 ActiveState Software Inc. # test_urlparse.py is licensed under the PSFLv2 License. # See the file LICENSE for details. @@ -8,6 +9,23 @@ import unittest import urlparse +# Add ability to run sub-tests +def sub_test(param_list): + """Decorates a test case to run it as a set of subtests.""" + + def decorator(f): + + @functools.wraps(f) + def wrapped(self): + for param in param_list: + with self.subTest(**param): + f(self, **param) + + return wrapped + + return decorator + + RFC1808_BASE = "http://a/b/c/d;p?q#f" RFC2396_BASE = "http://a/b/c/d;p?q" RFC3986_BASE = 'http://a/b/c/d;p?q' @@ -602,16 +620,92 @@ def test_urlsplit_remove_unsafe_bytes(self): self.assertEqual(p.port, None) self.assertEqual(p.geturl(), u"http://www.python.org/javascript:alert('msg')/#frag") + def test_urlsplit_strip_url(self): + noise = bytes(bytearray(range(0, 0x20 + 1))) + base_url = "http://User:Pass@www.python.org:080/doc/?query=yes#frag" - def test_attributes_bad_port(self): - """Check handling of non-integer ports.""" - p = urlparse.urlsplit("http://www.example.net:foo") - self.assertEqual(p.netloc, "www.example.net:foo") - self.assertRaises(ValueError, lambda: p.port) + url = noise.decode("utf-8") + base_url + p = urlparse.urlsplit(url) + self.assertEqual(p.scheme, "http") + self.assertEqual(p.netloc, "User:Pass@www.python.org:080") + self.assertEqual(p.path, "/doc/") + self.assertEqual(p.query, "query=yes") + self.assertEqual(p.fragment, "frag") + self.assertEqual(p.username, "User") + self.assertEqual(p.password, "Pass") + self.assertEqual(p.hostname, "www.python.org") + self.assertEqual(p.port, 80) + self.assertEqual(p.geturl(), base_url) - p = urlparse.urlparse("http://www.example.net:foo") - self.assertEqual(p.netloc, "www.example.net:foo") - self.assertRaises(ValueError, lambda: p.port) + url = noise + base_url.encode("utf-8") + p = urlparse.urlsplit(url) + self.assertEqual(p.scheme, b"http") + self.assertEqual(p.netloc, b"User:Pass@www.python.org:080") + self.assertEqual(p.path, b"/doc/") + self.assertEqual(p.query, b"query=yes") + self.assertEqual(p.fragment, b"frag") + self.assertEqual(p.username, b"User") + self.assertEqual(p.password, b"Pass") + self.assertEqual(p.hostname, b"www.python.org") + self.assertEqual(p.port, 80) + self.assertEqual(p.geturl(), base_url.encode("utf-8")) + + # Test that trailing space is preserved as some applications rely on + # this within query strings. + query_spaces_url = "https://www.python.org:88/doc/?query= " + p = urlparse.urlsplit(noise.decode("utf-8") + query_spaces_url) + self.assertEqual(p.scheme, "https") + self.assertEqual(p.netloc, "www.python.org:88") + self.assertEqual(p.path, "/doc/") + self.assertEqual(p.query, "query= ") + self.assertEqual(p.port, 88) + self.assertEqual(p.geturl(), query_spaces_url) + + p = urlparse.urlsplit("www.pypi.org ") + # That "hostname" gets considered a "path" due to the + # trailing space and our existing logic... YUCK... + # and re-assembles via geturl aka unurlsplit into the original. + # django.core.validators.URLValidator (at least through v3.2) relies on + # this, for better or worse, to catch it in a ValidationError via its + # regular expressions. + # Here we test the basic round trip concept of such a trailing space. + self.assertEqual(urlparse.urlunsplit(p), "www.pypi.org ") + + # with scheme as cache-key + url = "//www.python.org/" + scheme = noise.decode("utf-8") + "https" + noise.decode("utf-8") + for _ in range(2): + p = urlparse.urlsplit(url, scheme=scheme) + self.assertEqual(p.scheme, "https") + self.assertEqual(p.geturl(), "https://www.python.org/") + + def test_attributes_bad_port_a(self): + """Check handling of invalid ports.""" + for bytes in (False, True): + for parse in (urlparse.urlsplit, urlparse.urlparse): + # Spaces and invalid characters are stripped now, so the missing one's can't cause issues + # for port in ("foo", "1.5", "-1", "0x10", "-0", "1_1", " 1", "1 ", "६"): + for port in ("foo", "1.5", "0x10", "1_1"): + netloc = "www.example.net:" + port + url = "http://" + netloc + "/" + if bytes: + netloc = netloc.encode("ascii") + url = url.encode("ascii") + p = parse(url) + self.assertEqual(p.netloc, netloc) + with self.assertRaises(ValueError): + p.port + + def test_attributes_bad_port_b(self): + """Check handling of invalid ports.""" + for parse in (urlparse.urlsplit, urlparse.urlparse): + for port in ("६"): + netloc = "www.example.net:" + port + url = "http://" + netloc + "/" + p = parse(url) + self.assertEqual(p.netloc, netloc) + with self.assertRaises(ValueError): + p.port def test_attributes_without_netloc(self): # This example is straight from RFC 3261. It looks like it diff --git a/Lib/urlparse.py b/Lib/urlparse.py index 914ccd2c6942e6..0f12940c3dc0ed 100644 --- a/Lib/urlparse.py +++ b/Lib/urlparse.py @@ -30,6 +30,9 @@ parsing quirks from older RFCs are retained. The testcases in test_urlparse.py provides a good indicator of parsing behavior. +The WHATWG URL Parser spec should also be considered. We are not compliant with +it either due to existing user code API behavior expectations (Hyrum's Law). +It serves as a useful guide when making changes. """ import re @@ -66,6 +69,10 @@ '0123456789' '+-.') +# Leading and trailing C0 control and space to be stripped per WHATWG spec. +# == "".join([chr(i) for i in range(0, 0x20 + 1)]) +_WHATWG_C0_CONTROL_OR_SPACE = '\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n\x0b\x0c\r\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f ' + # Unsafe bytes to be removed per WHATWG spec _UNSAFE_URL_BYTES_TO_REMOVE = ['\t', '\r', '\n'] @@ -197,6 +204,7 @@ def urlsplit(url, scheme='', allow_fragments=True): Return a 5-tuple: (scheme, netloc, path, query, fragment). Note that we don't break the components up in smaller bits (e.g. netloc is a single string) and we don't expand % escapes.""" + allow_fragments = bool(allow_fragments) key = url, scheme, allow_fragments, type(url), type(scheme) cached = _parse_cache.get(key, None) @@ -204,6 +212,12 @@ def urlsplit(url, scheme='', allow_fragments=True): return cached if len(_parse_cache) >= MAX_CACHE_SIZE: # avoid runaway growth clear_cache() + + # Only lstrip url as some applications rely on preserving trailing space. + # (https://url.spec.whatwg.org/#concept-basic-url-parser would strip both) + url = url.lstrip(_WHATWG_C0_CONTROL_OR_SPACE) + scheme = scheme.strip(_WHATWG_C0_CONTROL_OR_SPACE) + netloc = query = fragment = '' i = url.find(':') if i > 0: diff --git a/Misc/NEWS.d/next/Security/2023-03-07-20-59-17.gh-issue-102153.14CLSZ.rst b/Misc/NEWS.d/next/Security/2023-03-07-20-59-17.gh-issue-102153.14CLSZ.rst new file mode 100644 index 00000000000000..e57ac4ed3ac5d7 --- /dev/null +++ b/Misc/NEWS.d/next/Security/2023-03-07-20-59-17.gh-issue-102153.14CLSZ.rst @@ -0,0 +1,3 @@ +:func:`urllib.parse.urlsplit` now strips leading C0 control and space +characters following the specification for URLs defined by WHATWG in +response to CVE-2023-24329. Patch by Illia Volochii.