From f5f4950d76f0f7731833f4c71770a9ee5b740fe5 Mon Sep 17 00:00:00 2001 From: Donald Stufft Date: Fri, 15 Jul 2022 22:40:10 -0400 Subject: [PATCH 01/19] Move metadata into a package --- packaging/metadata/__init__.py | 4 ++++ packaging/{metadata.py => metadata/_types.py} | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) create mode 100644 packaging/metadata/__init__.py rename packaging/{metadata.py => metadata/_types.py} (99%) diff --git a/packaging/metadata/__init__.py b/packaging/metadata/__init__.py new file mode 100644 index 000000000..e5cae33ec --- /dev/null +++ b/packaging/metadata/__init__.py @@ -0,0 +1,4 @@ +from ._types import DynamicField, Metadata + + +__all__ = ["DynamicField", "Metadata"] diff --git a/packaging/metadata.py b/packaging/metadata/_types.py similarity index 99% rename from packaging/metadata.py rename to packaging/metadata/_types.py index 81405febe..61325cb7e 100644 --- a/packaging/metadata.py +++ b/packaging/metadata/_types.py @@ -4,7 +4,7 @@ from collections.abc import Iterable from typing import Optional, Tuple -from . import ( # Alt name avoids shadowing. +from .. import ( # Alt name avoids shadowing. 
requirements, specifiers, utils, From 88ada97881a88c45b4e2a46c7ceca91dd927d0df Mon Sep 17 00:00:00 2001 From: Donald Stufft Date: Sat, 16 Jul 2022 03:58:20 -0400 Subject: [PATCH 02/19] implement parsing from a metadata format to intermediate --- packaging/metadata/__init__.py | 3 +- packaging/metadata/_parse.py | 473 +++++++++++++++++++++++++++++++++ 2 files changed, 475 insertions(+), 1 deletion(-) create mode 100644 packaging/metadata/_parse.py diff --git a/packaging/metadata/__init__.py b/packaging/metadata/__init__.py index e5cae33ec..91cb0646a 100644 --- a/packaging/metadata/__init__.py +++ b/packaging/metadata/__init__.py @@ -1,4 +1,5 @@ +from ._parse import RawMetadata, parse_email, parse_json from ._types import DynamicField, Metadata -__all__ = ["DynamicField", "Metadata"] +__all__ = ["DynamicField", "Metadata", "RawMetadata", "parse_email", "parse_json"] diff --git a/packaging/metadata/_parse.py b/packaging/metadata/_parse.py new file mode 100644 index 000000000..192e1be34 --- /dev/null +++ b/packaging/metadata/_parse.py @@ -0,0 +1,473 @@ +import json +import email.feedparser +import email.parser +import email.policy + +from typing import Any, Optional, TypedDict, cast + + +# The RawMetadata class attempts to make as few assumptions about +# the underlying serialization formats as possible, these could +# possibly serialize in an entirely different way, but the idea +# here is that as long as a serialization formats some very +# basic primitives in *some* way (strings, lists, and one map +# but that map can be easily implemented as a list of strings) +# then we can support serializing to and from that format. 
+class RawMetadata(TypedDict, total=False): + metadata_version: str + name: str + version: str + dynamic: list[str] + platforms: list[str] + supported_platforms: list[str] + summary: str + description: str + description_content_type: str + keywords: list[str] + home_page: str + download_url: str + author: str + author_email: str + maintainer: str + maintainer_email: str + license: str + classifiers: list[str] + requires_dist: list[str] + requires_python: str + requires_external: list[str] + project_urls: dict[str, str] + provides_extra: list[str] + provides_dist: list[str] + obsoletes_dist: list[str] + + +_STRING_FIELDS = { + "metadata_version", + "name", + "version", + "summary", + "home_page", + "download_url", + "author", + "author_email", + "maintainer", + "maintainer_email", + "license", + "requires_python", +} + +_LIST_STRING_FIELDS = { + "dynamic", + "platforms", + "supported_platforms", + "classifiers", + "requires_dist", + "requires_python", + "requires_external", + "provides_extra", + "provides_dist", + "obsoletes_dist", +} + +# General helper functions for parsing some string values for reusing in +# multiple parse_FORMAT functions + + +def _parse_keywords(data: str) -> list[str]: + return [k.strip() for k in data.split(",")] + + +def _parse_project_urls(data: list[str]) -> dict[str, str]: + urls = {} + for pair in data: + # Our logic is slightly tricky here as we want to try and do + # *something* reasonable with malformed data. + # + # The main thing that we have to worry about, is data that does + # not have a ',' at all to split the Key from the Value. There + # isn't a singular right answer here, and we will fail validation + # later on (if the caller is validating) so it doesn't *really* + # matter, but since the missing value has to be an empty str + # and our return value is dict[str, str], if we let the key + # be the missing value, then they'd just multiple '' values that + # overwrite each other. 
+ # + # The other potentional issue is that it's possible to have the + # same Key multiple times in the metadata, with no solid "right" + # answer with what to do in that case, we'll do the only thing + # we can, which is treat the field as unparseable and add it + # to our list of unparsed fields. + parts = [p.strip() for p in pair.split(",", 1)] + parts.extend([""] * (max(0, 2 - len(parts)))) # Ensure 2 items + + # TODO: The spec doesn't say anything about if the keys should be + # considered case sensitive or not... logically they should + # be case preserving, but case insensitive, but doing that + # would open up more cases where we might have duplicated + # entries. + label, url = parts + if label in urls: + # The label already exists in our set of urls, so this field + # is unparseable, and we can just add the whole thing to our + # unparseable data and stop processing it. + raise KeyError("duplicate keys in project urls") + urls[label] = url + + return urls + + +# The various parse_FORMAT functions here are intended to be as lenient as +# possible in their parsing, while still returning a correctly typed +# RawMetadata. +# +# To aid in this, we also generally want to do as little touching of the +# data as possible, except where there are possibly some historic holdovers +# that make valid data awkward to work with. +# +# While this is a lower level, intermediate format than our ``Metadata`` +# class, some light touch ups can make a massive different in usability. 
+ + +_EMAIL_FIELD_MAPPING = { + "Metadata-Version": "metadata_version", + "Name": "name", + "Version": "version", + "Dynamic": "dynamic", + "Platform": "platforms", + "Supported-Platform": "supported_platforms", + "Summary": "summary", + "Description": "description", + "Description-Content-Type": "description_content_type", + "Keywords": "keywords", + "Home-Page": "home_page", + "Download-URL": "download_url", + "Author": "author", + "Author-Email": "author_email", + "Maintainer": "maintainer", + "Maintainer-Email": "maintainer_email", + "License": "license", + "Classifier": "classifiers", + "Requires-Dist": "requires_dist", + "Requires-Python": "requires_python", + "Requires-External": "requires_external", + "Project-URL": "project_urls", + "Provides-Extra": "provides_extra", + "Provides-Dist": "provides_dist", + "Obsoletes-Dist": "obsoletes_dist", +} + + +def parse_email(data: bytes | str) -> tuple[RawMetadata, dict[Any, Any]]: + raw = {} + unparsed: dict[Any, Any] = {} + + if isinstance(data, str): + parsed = email.parser.Parser(policy=email.policy.compat32).parsestr(data) + else: + # In theory we could use the BytesParser from email.parser, but that has + # several problems that this method solves: + # + # 1. BytesParser (and BytesFeedParser) hard codes an assumption that the + # bytes are encoded as ascii (with a surrogateescape handler), but + # the packaging specifications explicitly have decided that our specs + # are in UTF8, not ascii. + # 2. We could work around (1) by just decoding the bytes using utf8 ourself + # and then pass it into Parser, which we *could* do, however we're + # attempting to be lenient with this method to enable someone to usee + # this class to parse as much as possible while ignoring any errors that + # do come from it. + # + # So we'll want to break our bytes up into a list of headers followed up + # by the message body. 
+ # + # Unfortunately, doing this is impossible without lightly parsing the + # RFC 822 format ourselves, which is not the most straightforward thing + # primarily because of a few concerns: + # + # 1. Conceptually RFC 822 messages is a format where you emit all of the + # headers first, one per line, then a blank line, then the body of the + # message. But it has the ability to "fold" a long header line across + # multiple lines, so to correctly do decoding on a field by field basis + # we will have to take this folding into account (but we do not need to + # actually implement the unfolding, we just want to make sure we have + # the entire logical "line" for that header). + # 2. The message body isn't part of a normal field, it's effectively a + # a blank header field, then everythig else is part of the body. + # 3. If a particular field can't be decoded using utf8, then we want to + # treat that field as unparseable, but getting the name out of that field + # requires implementing (more) of RFC 822 ourselves, though it's a pretty + # straight forward part. + # 4. RFC 822 very specifically calls out CRLF as the line endings, but the + # python stdlib email.parser supports CRLF or LF, and in practice the + # core metadata specs are emiting METADATA files using LF only. + # + # TODO: Is doing this unconditionally for `bytes` the best idea here? Another + # option is to provide a helper function that will produce a possibly + # mojibaked string, and expect people who want per field decoding + # leniency to manually decode bytes using that method instead. + parser = email.feedparser.FeedParser(policy=email.policy.compat32) + + # We don't use splitlines here, because it splits on a lot more different + # types of line endings than we want to split on. Since in practice we + # have to support just LF, we can just split on that, and do our decoding + # and let the FeedParser deal with sorting out if it should be CRLF or LF. 
+ buf = b"" + in_body = False + for line in data.split(b"\n"): + # Put our LF back onto our line that the call to .split() removed. + line = line + b"\n" + + # If we're in the body of our message, line continuation no longer matters + # and we can just buffer the entire body so we can attempt to decode it + # all at once. + if in_body: + buf += line + continue + + # Continuation lines always start with LWSP, so we'll check to if we have + # any data to parse and if so, if this is NOT a continuation line, if it's + # not then we've finished reading the previous logical line, and we need + # to decode it and pass it into the FeedParser. + if buf and line[:1] not in {b" ", b"\t"}: + try: + encoded = buf.decode("utf8", "strict") + except UnicodeDecodeError: + # If we've gotten here, then we can't actually determine what + # encoding this line is in, so we'll try to pull a header key + # out of it to give us something to put into our unparsed data. + parts = buf.split(b":", 1) + parts.extend([b""] * (max(0, 2 - len(parts)))) # Ensure 2 items + + # We're leaving this data as bytes and we're also leaving it folded, + # if the caller wants to attempt to parse something out of this + unparsed[parts[0]] = parts[1] + else: + parser.feed(encoded) + + # Either way, this logical line has been handled, so we'll reset our + # buffer and keep going. + buf = b"" + + # Check to see if this line is the "blank" line that signals the end + # of the header data and the start of the body data. + if line in {b"\n", b"\r\n"}: + parser.feed(line.decode("utf8", "strict")) + in_body = True + # More header data, add it to our buffer + else: + buf += line + + # At this point, buf should be full of the entire body (if there was one) so + # we'll attempt to decode that. 
+ try: + encoded = buf.decode("utf8", "strict") + except UnicodeDecodeError: + # Our body isn't valid UTF8, we know what the key name for the Description + # is though, so we can just use that + unparsed["Description"] = buf + + # Actually consume our data, turning it into our email Message. + parsed = parser.close() + + # We have to wrap parsed.keys() in a set, because in the case of multiple + # values for a key (a list), the key will appear multiple times in the + # list of keys, but we're avoiding that by using get_all(). + for name in set(parsed.keys()): + # We use get_all here, even for fields that aren't multiple use, because + # otherwise someone could have say, two Name fields, and we would just + # silently ignore it rather than doing something about it. + value = parsed.get_all(name) + + raw_name = _EMAIL_FIELD_MAPPING.get(name) + if raw_name is None: + # This is a bit of a weird situation, we've encountered a key that + # we don't know what it means, so we don't know whether it's meant + # to be a list or not. + # + # Since we can't really tell one way or another, we'll just leave it + # as a list, even though it may be a single item list, because that's + # what makes the most sense for email headers. + unparsed[name] = value + continue + + # If this is one of our string fields, then we'll check to see if our + # value is a list of a single item, if it is then we'll assume that + # it was emited as a single string, and unwrap the str from inside + # the list. + # + # If it's any other kind of data, then we haven't the faintest clue + # what we should parse it as, and we have to just add it to our list + # of unparsed stuff. + if raw_name in _STRING_FIELDS and len(value) == 1: + raw[raw_name] = value[0] + # If this is one our list of string fields, then we can just assign + # the value, since email *only* has strings, and our get_all() call + # above ensures that this is a list. 
+ elif raw_name in _LIST_STRING_FIELDS: + raw[raw_name] = value + # Special Case: Keywords + # The keywords field is implemented in the metadata spec as a str, + # but it conceptually is a list of strings, and is serialized using + # ", ".join(keywords), so we'll do some light data massaging to turn + # this into what it logically is. + elif raw_name == "keywords" and len(value) == 1: + raw[raw_name] = _parse_keywords(value[0]) + # Special Case: Project-URL + # The project urls is implemented in the metadata spec as a list of + # specially formatted strings that represent a key and a value, which + # is fundamentally a mapping, however the email format doesn't support + # mappings in a sane way, so it was crammed into a list of strings + # instead. + # + # We will do a little light data massaging to turn this into a map as + # it logically should be. + elif raw_name == "project_urls": + try: + raw[raw_name] = _parse_project_urls(value) + except ValueError: + unparsed[name] = value + # Nothing that we've done has managed to parse this, so it'll just + # throw it in our unparseable data and move on. + else: + unparsed[name] = value + + # We need to support getting the Description from the message payload in + # addition to getting it from the the headers, but since Description is + # conceptually a string, if it's already been set from headers then we'll + # clear it out move them both to unparsed. + # + # NOTE: For whatever reason, this will return a list of strings if the + # message is in mutlipart format, otherwise it will return a single + # string. The list format would be an unparseable error. + payload = parsed.get_payload() + if payload: + # Check to see if we've got duplicated values, if so remove the + # parsed one and move to unparsed. 
+ if "description" in raw: + unparsed["Description"] = [raw.pop("description")] + if isinstance(payload, str): + unparsed["Description"].append(payload) + else: + unparsed["Description"].extend(payload) + # If payload is a string, then we're good to go to add this to our + # RawMetadata. + elif isinstance(payload, str): + raw["description"] = payload + # Otherwise, it's unparseable, and we need to record that. + else: + unparsed["Description"] = payload + + # We need to cast our `raw` to a metadata, because a TypedDict only support + # literal key names, but we're computing our key names on purpose, but the + # way this function is implemented, our `TypedDict` can only have valid key + # names. + return cast(RawMetadata, raw), unparsed + + +# This might appear to be a mapping of the same key to itself, and in many cases +# it is. However, the algorithm in PEP 566 doesn't match 100% the keys chosen +# for RawMetadata, so we use this mapping just like with email to handle that. +_JSON_FIELD_MAPPING = { + "metadata_version": "metadata_version", + "name": "name", + "version": "version", + "dynamic": "dynamic", + "platform": "platforms", + "supported_platform": "supported_platforms", + "summary": "summary", + "description": "description", + "description_content_type": "description_content_type", + "keywords": "keywords", + "home_page": "home_page", + "download_url": "download_url", + "author": "author", + "author_email": "author_email", + "maintainer": "maintainer", + "maintainer_email": "maintainer_email", + "license": "license", + "classifier": "classifiers", + "requires_dist": "requires_dist", + "requires_python": "requires_python", + "requires_external": "requires_external", + "project_url": "project_urls", + "provides_extra": "provides_extra", + "provides_dist": "provides_dist", + "obsoletes_dist": "obsoletes_dist", +} + + +def parse_json(data: bytes | str) -> tuple[RawMetadata, dict[Any, Any]]: + raw: dict[Any, Any] = {} + unparsed: dict[Any, Any] = {} + parsed = 
json.loads(data) + + # We need to make sure that the data given to us actually implements + # a dict, if it's any other type then there is no way we can parse + # anything meaningful out of it, so we'll just give up and bail out. + if not isinstance(parsed, dict): + raise ValueError("Invalid json data, must be a mapping") + + for name, value in parsed.items(): + raw_name = _JSON_FIELD_MAPPING.get(name) + if raw_name is None: + # We don't know this key, so chuck it into our unparsed data + # and continue on. + unparsed[name] = value + continue + + # If this is one of our string fields, check to see if it's actually + # a string, if it's not then we don't have any idea how to handle it + if raw_name in _STRING_FIELDS and isinstance(value, str): + raw[raw_name] = value + # If this is one of our string fields, check to see if it's actually + # a list of strings, if it's not then we don't have any idea how to + # handle it + elif ( + raw_name in _LIST_STRING_FIELDS + and isinstance(value, list) + and all(isinstance(v, str) for v in value) + ): + raw[raw_name] = cast(list[str], value) + # Special Case: Keywords + # The keywords field is implemented in the metadata spec as a str, + # but it conceptually is a list of strings. Interestingly, the + # JSON spec as described in PEP 566 already implements this as a + # list of strings, so we don't technically have to do anything. + # + # We're still treating this as as a special case though, because + # in the metadata specification it's a single string, so it's not + # included in our list of list string fields. 
+ elif ( + raw_name == "keywords" + and isinstance(value, list) + and all(isinstance(v, str) for v in value) + ): + raw[raw_name] = value + # Special Case: Project-URL + # The project urls is implemented in the metadata spec as a list of + # specially formatted strings that represent a key and a value, which + # is fundamentally a mapping, however the email format doesn't support + # mappings in a sane way, so it was crammed into a list of strings + # instead. + # + # We will do a little light data massaging to turn this into a map as + # it logically should be. + elif ( + raw_name == "project_urls" + and isinstance(value, list) + and all(isinstance(v, str) for v in value) + ): + try: + raw[raw_name] = _parse_project_urls(value) + except ValueError: + unparsed[name] = value + # Nothing that we've done has managed to parse this, so it'll just + # throw it in our unparseable data and move on. + else: + unparsed[name] = value + + # We need to cast our `raw` to a metadata, because a TypedDict only support + # literal key names, but we're computing our key names on purpose, but the + # way this function is implemented, our `TypedDict` can only have valid key + # names. 
+ return cast(RawMetadata, raw), unparsed From 3b42e3ca6f79693324ba5dcad4104f9cff29c857 Mon Sep 17 00:00:00 2001 From: Donald Stufft Date: Sat, 16 Jul 2022 04:02:54 -0400 Subject: [PATCH 03/19] fix linting --- packaging/metadata/__init__.py | 1 - packaging/metadata/_parse.py | 5 ++--- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/packaging/metadata/__init__.py b/packaging/metadata/__init__.py index 91cb0646a..c4752576e 100644 --- a/packaging/metadata/__init__.py +++ b/packaging/metadata/__init__.py @@ -1,5 +1,4 @@ from ._parse import RawMetadata, parse_email, parse_json from ._types import DynamicField, Metadata - __all__ = ["DynamicField", "Metadata", "RawMetadata", "parse_email", "parse_json"] diff --git a/packaging/metadata/_parse.py b/packaging/metadata/_parse.py index 192e1be34..6771f4497 100644 --- a/packaging/metadata/_parse.py +++ b/packaging/metadata/_parse.py @@ -1,9 +1,8 @@ -import json import email.feedparser import email.parser import email.policy - -from typing import Any, Optional, TypedDict, cast +import json +from typing import Any, TypedDict, cast # The RawMetadata class attempts to make as few assumptions about From 9c78c5b7f3426584ba165cd80df089fe7cf17559 Mon Sep 17 00:00:00 2001 From: Donald Stufft Date: Sat, 16 Jul 2022 10:47:18 -0400 Subject: [PATCH 04/19] use older syntax for unions --- packaging/metadata/_parse.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/packaging/metadata/_parse.py b/packaging/metadata/_parse.py index 6771f4497..f7db63d52 100644 --- a/packaging/metadata/_parse.py +++ b/packaging/metadata/_parse.py @@ -2,7 +2,7 @@ import email.parser import email.policy import json -from typing import Any, TypedDict, cast +from typing import Any, TypedDict, Union, cast # The RawMetadata class attempts to make as few assumptions about @@ -156,7 +156,7 @@ def _parse_project_urls(data: list[str]) -> dict[str, str]: } -def parse_email(data: bytes | str) -> tuple[RawMetadata, dict[Any, Any]]: +def 
parse_email(data: Union[bytes, str]) -> tuple[RawMetadata, dict[Any, Any]]: raw = {} unparsed: dict[Any, Any] = {} @@ -395,7 +395,7 @@ def parse_email(data: bytes | str) -> tuple[RawMetadata, dict[Any, Any]]: } -def parse_json(data: bytes | str) -> tuple[RawMetadata, dict[Any, Any]]: +def parse_json(data: Union[bytes, str]) -> tuple[RawMetadata, dict[Any, Any]]: raw: dict[Any, Any] = {} unparsed: dict[Any, Any] = {} parsed = json.loads(data) From 8e257675a3f67682599383a377a3848d9ec5fff9 Mon Sep 17 00:00:00 2001 From: Donald Stufft Date: Sat, 16 Jul 2022 12:27:33 -0400 Subject: [PATCH 05/19] Use the method from pkg_metadata to deal with email encodinga --- packaging/metadata/_parse.py | 205 ++++++++++++++--------------------- 1 file changed, 80 insertions(+), 125 deletions(-) diff --git a/packaging/metadata/_parse.py b/packaging/metadata/_parse.py index f7db63d52..46bcbf665 100644 --- a/packaging/metadata/_parse.py +++ b/packaging/metadata/_parse.py @@ -1,4 +1,5 @@ import email.feedparser +import email.header import email.parser import email.policy import json @@ -157,116 +158,13 @@ def _parse_project_urls(data: list[str]) -> dict[str, str]: def parse_email(data: Union[bytes, str]) -> tuple[RawMetadata, dict[Any, Any]]: - raw = {} + raw: dict[str, Any] = {} unparsed: dict[Any, Any] = {} if isinstance(data, str): parsed = email.parser.Parser(policy=email.policy.compat32).parsestr(data) else: - # In theory we could use the BytesParser from email.parser, but that has - # several problems that this method solves: - # - # 1. BytesParser (and BytesFeedParser) hard codes an assumption that the - # bytes are encoded as ascii (with a surrogateescape handler), but - # the packaging specifications explicitly have decided that our specs - # are in UTF8, not ascii. - # 2. 
We could work around (1) by just decoding the bytes using utf8 ourself - # and then pass it into Parser, which we *could* do, however we're - # attempting to be lenient with this method to enable someone to usee - # this class to parse as much as possible while ignoring any errors that - # do come from it. - # - # So we'll want to break our bytes up into a list of headers followed up - # by the message body. - # - # Unfortunately, doing this is impossible without lightly parsing the - # RFC 822 format ourselves, which is not the most straightforward thing - # primarily because of a few concerns: - # - # 1. Conceptually RFC 822 messages is a format where you emit all of the - # headers first, one per line, then a blank line, then the body of the - # message. But it has the ability to "fold" a long header line across - # multiple lines, so to correctly do decoding on a field by field basis - # we will have to take this folding into account (but we do not need to - # actually implement the unfolding, we just want to make sure we have - # the entire logical "line" for that header). - # 2. The message body isn't part of a normal field, it's effectively a - # a blank header field, then everythig else is part of the body. - # 3. If a particular field can't be decoded using utf8, then we want to - # treat that field as unparseable, but getting the name out of that field - # requires implementing (more) of RFC 822 ourselves, though it's a pretty - # straight forward part. - # 4. RFC 822 very specifically calls out CRLF as the line endings, but the - # python stdlib email.parser supports CRLF or LF, and in practice the - # core metadata specs are emiting METADATA files using LF only. - # - # TODO: Is doing this unconditionally for `bytes` the best idea here? Another - # option is to provide a helper function that will produce a possibly - # mojibaked string, and expect people who want per field decoding - # leniency to manually decode bytes using that method instead. 
- parser = email.feedparser.FeedParser(policy=email.policy.compat32) - - # We don't use splitlines here, because it splits on a lot more different - # types of line endings than we want to split on. Since in practice we - # have to support just LF, we can just split on that, and do our decoding - # and let the FeedParser deal with sorting out if it should be CRLF or LF. - buf = b"" - in_body = False - for line in data.split(b"\n"): - # Put our LF back onto our line that the call to .split() removed. - line = line + b"\n" - - # If we're in the body of our message, line continuation no longer matters - # and we can just buffer the entire body so we can attempt to decode it - # all at once. - if in_body: - buf += line - continue - - # Continuation lines always start with LWSP, so we'll check to if we have - # any data to parse and if so, if this is NOT a continuation line, if it's - # not then we've finished reading the previous logical line, and we need - # to decode it and pass it into the FeedParser. - if buf and line[:1] not in {b" ", b"\t"}: - try: - encoded = buf.decode("utf8", "strict") - except UnicodeDecodeError: - # If we've gotten here, then we can't actually determine what - # encoding this line is in, so we'll try to pull a header key - # out of it to give us something to put into our unparsed data. - parts = buf.split(b":", 1) - parts.extend([b""] * (max(0, 2 - len(parts)))) # Ensure 2 items - - # We're leaving this data as bytes and we're also leaving it folded, - # if the caller wants to attempt to parse something out of this - unparsed[parts[0]] = parts[1] - else: - parser.feed(encoded) - - # Either way, this logical line has been handled, so we'll reset our - # buffer and keep going. - buf = b"" - - # Check to see if this line is the "blank" line that signals the end - # of the header data and the start of the body data. 
- if line in {b"\n", b"\r\n"}: - parser.feed(line.decode("utf8", "strict")) - in_body = True - # More header data, add it to our buffer - else: - buf += line - - # At this point, buf should be full of the entire body (if there was one) so - # we'll attempt to decode that. - try: - encoded = buf.decode("utf8", "strict") - except UnicodeDecodeError: - # Our body isn't valid UTF8, we know what the key name for the Description - # is though, so we can just use that - unparsed["Description"] = buf - - # Actually consume our data, turning it into our email Message. - parsed = parser.close() + parsed = email.parser.BytesParser(policy=email.policy.compat32).parsebytes(data) # We have to wrap parsed.keys() in a set, because in the case of multiple # values for a key (a list), the key will appear multiple times in the @@ -275,7 +173,63 @@ def parse_email(data: Union[bytes, str]) -> tuple[RawMetadata, dict[Any, Any]]: # We use get_all here, even for fields that aren't multiple use, because # otherwise someone could have say, two Name fields, and we would just # silently ignore it rather than doing something about it. - value = parsed.get_all(name) + headers = parsed.get_all(name) + + # The way the email module works when parsing bytes is that it + # unconditionally decodes the bytes as ascii, using the surrogateescape + # handler, and then when you pull that data back out (such as with get_all) + # it looks to see if the str has any surrogate escapes, and if it does + # it wraps it in a Header object instead of returning the string. + # + # So we'll look for those Header objects, and fix up the encoding + value = [] + valid_encoding = True + for h in headers: + # It's unclear if this can return more types than just a Header or + # a str, so we'll just assert here to make sure. + assert isinstance(h, (email.header.Header, str)) + + # If it's a header object, we need to do our little dance to get + # the real data out of it. 
In cases where there is invalid data + # we're going to end up with mojibake, but I don't see a good way + # around that without reimplementing parts of the Header object + # ourselves. + # + # That should be fine, since if that happens, this key is going + # into the unparsed dict anyways. + if isinstance(h, email.header.Header): + # The Heade object stores it's data as chunks, and each chunk + # can be independently encoded, so we'll need to check each + # of them. + chunks = [] + for bin, encoding in email.header.decode_header(h): + # This means it found a surrogate escape, that could be + # valid data (if the source was utf8), or invalid. + if encoding == "unknown-8bit": + try: + bin.decode("utf8", "strict") + except UnicodeDecodeError: + # Enable mojibake + encoding = "latin1" + valid_encoding = False + else: + encoding = "utf8" + chunks.append((bin, encoding)) + + # Turn our chunks back into a Header object, then let that + # Header object do the right thing to turn them into a + # string for us. + value.append(str(email.header.make_header(chunks))) + # This is already a string, so just add it + else: + value.append(h) + + # We've processed all of our values to get them into a list of str, + # but we may have mojibake data, in which case this is an unparsed + # field. + if not valid_encoding: + unparsed[name] = value + continue raw_name = _EMAIL_FIELD_MAPPING.get(name) if raw_name is None: @@ -335,26 +289,27 @@ def parse_email(data: Union[bytes, str]) -> tuple[RawMetadata, dict[Any, Any]]: # conceptually a string, if it's already been set from headers then we'll # clear it out move them both to unparsed. # - # NOTE: For whatever reason, this will return a list of strings if the - # message is in mutlipart format, otherwise it will return a single - # string. The list format would be an unparseable error. - payload = parsed.get_payload() - if payload: - # Check to see if we've got duplicated values, if so remove the - # parsed one and move to unparsed. 
- if "description" in raw: - unparsed["Description"] = [raw.pop("description")] - if isinstance(payload, str): - unparsed["Description"].append(payload) - else: - unparsed["Description"].extend(payload) - # If payload is a string, then we're good to go to add this to our - # RawMetadata. - elif isinstance(payload, str): - raw["description"] = payload - # Otherwise, it's unparseable, and we need to record that. + # It's possible that someone has messed up and given us a multipart body, + # in which case we'll move the entire body to the unparsed dictionary. + if parsed.is_multipart(): + unparsed["Description"] = parsed.get_payload(decode=True) + # We know we'll get a single bytes object out of this, so now we just need + # to deal with encodings. + else: + bpayload = parsed.get_payload(decode=True) + assert isinstance(bpayload, bytes) + + try: + payload = bpayload.decode("utf", "strict") + except UnicodeDecodeError: + unparsed["Description"] = bpayload else: - unparsed["Description"] = payload + # Check to see if we've already got a description, if so then both + # it, and this body move to unparseable. 
+ if "description" in raw: + unparsed["Description"] = [raw.pop("description"), payload] + else: + raw["description"] = payload # We need to cast our `raw` to a metadata, because a TypedDict only support # literal key names, but we're computing our key names on purpose, but the From 54d663baa461791851f9361c54052fd79fb84d06 Mon Sep 17 00:00:00 2001 From: Donald Stufft Date: Sat, 16 Jul 2022 12:40:20 -0400 Subject: [PATCH 06/19] correct casing and missing headers --- packaging/metadata/_parse.py | 57 ++++++++++++++++++++---------------- 1 file changed, 31 insertions(+), 26 deletions(-) diff --git a/packaging/metadata/_parse.py b/packaging/metadata/_parse.py index 46bcbf665..5bc42f7af 100644 --- a/packaging/metadata/_parse.py +++ b/packaging/metadata/_parse.py @@ -46,6 +46,8 @@ class RawMetadata(TypedDict, total=False): "name", "version", "summary", + "description", + "description_content_type", "home_page", "download_url", "author", @@ -62,7 +64,6 @@ class RawMetadata(TypedDict, total=False): "supported_platforms", "classifiers", "requires_dist", - "requires_python", "requires_external", "provides_extra", "provides_dist", @@ -129,31 +130,31 @@ def _parse_project_urls(data: list[str]) -> dict[str, str]: _EMAIL_FIELD_MAPPING = { - "Metadata-Version": "metadata_version", - "Name": "name", - "Version": "version", - "Dynamic": "dynamic", - "Platform": "platforms", - "Supported-Platform": "supported_platforms", - "Summary": "summary", - "Description": "description", - "Description-Content-Type": "description_content_type", - "Keywords": "keywords", - "Home-Page": "home_page", - "Download-URL": "download_url", - "Author": "author", - "Author-Email": "author_email", - "Maintainer": "maintainer", - "Maintainer-Email": "maintainer_email", - "License": "license", - "Classifier": "classifiers", - "Requires-Dist": "requires_dist", - "Requires-Python": "requires_python", - "Requires-External": "requires_external", - "Project-URL": "project_urls", - "Provides-Extra": 
"provides_extra", - "Provides-Dist": "provides_dist", - "Obsoletes-Dist": "obsoletes_dist", + "metadata-version": "metadata_version", + "name": "name", + "version": "version", + "dynamic": "dynamic", + "platform": "platforms", + "supported-platform": "supported_platforms", + "summary": "summary", + "description": "description", + "description-content-type": "description_content_type", + "keywords": "keywords", + "home-page": "home_page", + "download-url": "download_url", + "author": "author", + "author-email": "author_email", + "maintainer": "maintainer", + "maintainer-email": "maintainer_email", + "license": "license", + "classifier": "classifiers", + "requires-dist": "requires_dist", + "requires-python": "requires_python", + "requires-external": "requires_external", + "project-url": "project_urls", + "provides-extra": "provides_extra", + "provides-dist": "provides_dist", + "obsoletes-dist": "obsoletes_dist", } @@ -170,6 +171,10 @@ def parse_email(data: Union[bytes, str]) -> tuple[RawMetadata, dict[Any, Any]]: # values for a key (a list), the key will appear multiple times in the # list of keys, but we're avoiding that by using get_all(). for name in set(parsed.keys()): + # Header names in RFC are case insensitive, so we'll normalize to all + # lower case to make comparisons easier. + name = name.lower() + # We use get_all here, even for fields that aren't multiple use, because # otherwise someone could have say, two Name fields, and we would just # silently ignore it rather than doing something about it. 
From 85516bcc007952f00c609411da35c5a3b5d4ac4b Mon Sep 17 00:00:00 2001 From: Donald Stufft Date: Sat, 16 Jul 2022 13:23:57 -0400 Subject: [PATCH 07/19] Handle str vs bytes data better --- packaging/metadata/_parse.py | 51 ++++++++++++++++++++++-------------- 1 file changed, 31 insertions(+), 20 deletions(-) diff --git a/packaging/metadata/_parse.py b/packaging/metadata/_parse.py index 5bc42f7af..59d04874c 100644 --- a/packaging/metadata/_parse.py +++ b/packaging/metadata/_parse.py @@ -2,6 +2,7 @@ import email.header import email.parser import email.policy +import email.message import json from typing import Any, TypedDict, Union, cast @@ -293,28 +294,17 @@ def parse_email(data: Union[bytes, str]) -> tuple[RawMetadata, dict[Any, Any]]: # addition to getting it from the the headers, but since Description is # conceptually a string, if it's already been set from headers then we'll # clear it out move them both to unparsed. - # - # It's possible that someone has messed up and given us a multipart body, - # in which case we'll move the entire body to the unparsed dictionary. - if parsed.is_multipart(): - unparsed["Description"] = parsed.get_payload(decode=True) - # We know we'll get a single bytes object out of this, so now we just need - # to deal with encodings. + try: + payload = _get_payload(parsed, data) + except ValueError: + unparsed["Description"] = parsed.get_payload(decode=isinstance(data, bytes)) else: - bpayload = parsed.get_payload(decode=True) - assert isinstance(bpayload, bytes) - - try: - payload = bpayload.decode("utf", "strict") - except UnicodeDecodeError: - unparsed["Description"] = bpayload + # Check to see if we've already got a description, if so then both + # it, and this body move to unparseable. + if "description" in raw: + unparsed["Description"] = [raw.pop("description"), payload] else: - # Check to see if we've already got a description, if so then both - # it, and this body move to unparseable. 
- if "description" in raw: - unparsed["Description"] = [raw.pop("description"), payload] - else: - raw["description"] = payload + raw["description"] = payload # We need to cast our `raw` to a metadata, because a TypedDict only support # literal key names, but we're computing our key names on purpose, but the @@ -430,3 +420,24 @@ def parse_json(data: Union[bytes, str]) -> tuple[RawMetadata, dict[Any, Any]]: # way this function is implemented, our `TypedDict` can only have valid key # names. return cast(RawMetadata, raw), unparsed + + +def _get_payload(msg: email.message.Message, source: Union[bytes, str]) -> str: + # If our source is a str, then our caller has managed encodings for us, + # and we don't need to deal with it. + if isinstance(source, str): + payload: Union[list[str], str] = msg.get_payload() + if isinstance(payload, list): + raise ValueError("payload is a multipart") + return payload + # If our source is a bytes, then we're managing the encoding and we need + # to deal with it. + else: + bpayload: Union[list[bytes], bytes] = msg.get_payload(decode=True) + if isinstance(bpayload, list): + raise ValueError("payload is a multipart") + + try: + return bpayload.decode("utf8", "strict") + except UnicodeDecodeError: + raise ValueError("payload in an invalid encoding") From e30b28ebc6ff49a48cba377cdb9e26ad8da0f46a Mon Sep 17 00:00:00 2001 From: Donald Stufft Date: Sat, 16 Jul 2022 13:30:53 -0400 Subject: [PATCH 08/19] sort metadata fields better, add missing Metadata 1.1 fields --- packaging/metadata/_parse.py | 148 +++++++++++++++++++++-------------- 1 file changed, 91 insertions(+), 57 deletions(-) diff --git a/packaging/metadata/_parse.py b/packaging/metadata/_parse.py index 59d04874c..95c6cb81a 100644 --- a/packaging/metadata/_parse.py +++ b/packaging/metadata/_parse.py @@ -15,60 +15,88 @@ # but that map can be easily implemented as a list of strings) # then we can support serializing to and from that format. 
class RawMetadata(TypedDict, total=False): + # Metadata 1.0 - PEP 241 metadata_version: str name: str version: str - dynamic: list[str] platforms: list[str] - supported_platforms: list[str] summary: str description: str - description_content_type: str keywords: list[str] home_page: str - download_url: str author: str author_email: str - maintainer: str - maintainer_email: str license: str + + # Metadata 1.1 - PEP 314 + supported_platforms: list[str] + download_url: str classifiers: list[str] + requires: list[str] + provides: list[str] + obsoletes: list[str] + + # Metadata 1.2 - PEP 345 + maintainer: str + maintainer_email: str requires_dist: list[str] + provides_dist: list[str] + obsoletes_dist: list[str] requires_python: str requires_external: list[str] project_urls: dict[str, str] + + # Metadata 2.0 + # PEP 426 attempted to completely revamp the metadata format + # but got stuck without ever being able to build consensus on + # it and ultimately ended up withdrawn. + # + # However, a number of tools had started emiting METADATA with + # `2.0` Metadata-Version, so for historical reasons, this version + # was skipped. + + # Metadata 2.1 - PEP 566 + description_content_type: str provides_extra: list[str] - provides_dist: list[str] - obsoletes_dist: list[str] + + # Metadata 2.2 - PEP 643 + dynamic: list[str] + + # Metadata 2.3 - PEP 685 + # No new fields were added in PEP 685, just some edge case were + # tightened up to provide better interoptability. 
_STRING_FIELDS = { - "metadata_version", - "name", - "version", - "summary", + "author", + "author_email", "description", "description_content_type", - "home_page", "download_url", - "author", - "author_email", + "home_page", + "license", "maintainer", "maintainer_email", - "license", + "metadata_version", + "name", "requires_python", + "summary", + "version", } _LIST_STRING_FIELDS = { + "classifiers", "dynamic", + "obsoletes", + "obsoletes_dist", "platforms", - "supported_platforms", - "classifiers", + "provides", + "provides_dist", + "provides_extra", + "requires", "requires_dist", "requires_external", - "provides_extra", - "provides_dist", - "obsoletes_dist", + "supported_platforms", } # General helper functions for parsing some string values for reusing in @@ -131,31 +159,34 @@ def _parse_project_urls(data: list[str]) -> dict[str, str]: _EMAIL_FIELD_MAPPING = { - "metadata-version": "metadata_version", - "name": "name", - "version": "version", - "dynamic": "dynamic", - "platform": "platforms", - "supported-platform": "supported_platforms", - "summary": "summary", + "author": "author", + "author-email": "author_email", + "classifier": "classifiers", "description": "description", "description-content-type": "description_content_type", - "keywords": "keywords", - "home-page": "home_page", "download-url": "download_url", - "author": "author", - "author-email": "author_email", + "dynamic": "dynamic", + "home-page": "home_page", + "keywords": "keywords", + "license": "license", "maintainer": "maintainer", "maintainer-email": "maintainer_email", - "license": "license", - "classifier": "classifiers", - "requires-dist": "requires_dist", - "requires-python": "requires_python", - "requires-external": "requires_external", + "metadata-version": "metadata_version", + "name": "name", + "obsoletes": "obsoletes", + "obsoletes-dist": "obsoletes_dist", + "platform": "platforms", "project-url": "project_urls", - "provides-extra": "provides_extra", + "provides": "provides", 
"provides-dist": "provides_dist", - "obsoletes-dist": "obsoletes_dist", + "provides-extra": "provides_extra", + "requires": "requires", + "requires-dist": "requires_dist", + "requires-external": "requires_external", + "requires-python": "requires_python", + "summary": "summary", + "supported-platform": "supported_platforms", + "version": "version", } @@ -317,31 +348,34 @@ def parse_email(data: Union[bytes, str]) -> tuple[RawMetadata, dict[Any, Any]]: # it is. However, the algorithm in PEP 566 doesn't match 100% the keys chosen # for RawMetadata, so we use this mapping just like with email to handle that. _JSON_FIELD_MAPPING = { - "metadata_version": "metadata_version", - "name": "name", - "version": "version", - "dynamic": "dynamic", - "platform": "platforms", - "supported_platform": "supported_platforms", - "summary": "summary", + "author": "author", + "author_email": "author_email", + "classifier": "classifiers", "description": "description", "description_content_type": "description_content_type", - "keywords": "keywords", - "home_page": "home_page", "download_url": "download_url", - "author": "author", - "author_email": "author_email", + "dynamic": "dynamic", + "home_page": "home_page", + "keywords": "keywords", + "license": "license", "maintainer": "maintainer", "maintainer_email": "maintainer_email", - "license": "license", - "classifier": "classifiers", - "requires_dist": "requires_dist", - "requires_python": "requires_python", - "requires_external": "requires_external", + "metadata_version": "metadata_version", + "name": "name", + "obsoletes": "obsoletes", + "obsoletes_dist": "obsoletes_dist", + "platform": "platforms", "project_url": "project_urls", - "provides_extra": "provides_extra", + "provides": "provides", "provides_dist": "provides_dist", - "obsoletes_dist": "obsoletes_dist", + "provides_extra": "provides_extra", + "requires": "requires", + "requires_dist": "requires_dist", + "requires_external": "requires_external", + "requires_python": 
"requires_python", + "summary": "summary", + "supported_platform": "supported_platforms", + "version": "version", } From 5b7e097dc3f79ba855625b877413a3dbb6e67fc1 Mon Sep 17 00:00:00 2001 From: Donald Stufft Date: Sat, 16 Jul 2022 13:31:38 -0400 Subject: [PATCH 09/19] linting --- packaging/metadata/_parse.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packaging/metadata/_parse.py b/packaging/metadata/_parse.py index 95c6cb81a..80080ec7b 100644 --- a/packaging/metadata/_parse.py +++ b/packaging/metadata/_parse.py @@ -1,8 +1,8 @@ import email.feedparser import email.header +import email.message import email.parser import email.policy -import email.message import json from typing import Any, TypedDict, Union, cast From 2a795e9446833291f0dd3ee89e6484ab88ed7931 Mon Sep 17 00:00:00 2001 From: Donald Stufft Date: Sat, 16 Jul 2022 13:34:48 -0400 Subject: [PATCH 10/19] reorganization --- packaging/metadata/__init__.py | 2 +- packaging/metadata/{_parse.py => _raw.py} | 0 2 files changed, 1 insertion(+), 1 deletion(-) rename packaging/metadata/{_parse.py => _raw.py} (100%) diff --git a/packaging/metadata/__init__.py b/packaging/metadata/__init__.py index c4752576e..f9594b166 100644 --- a/packaging/metadata/__init__.py +++ b/packaging/metadata/__init__.py @@ -1,4 +1,4 @@ -from ._parse import RawMetadata, parse_email, parse_json +from ._raw import RawMetadata, parse_email, parse_json from ._types import DynamicField, Metadata __all__ = ["DynamicField", "Metadata", "RawMetadata", "parse_email", "parse_json"] diff --git a/packaging/metadata/_parse.py b/packaging/metadata/_raw.py similarity index 100% rename from packaging/metadata/_parse.py rename to packaging/metadata/_raw.py From e0959708170572b1d4aba736ec9752d700d579e0 Mon Sep 17 00:00:00 2001 From: Donald Stufft Date: Sat, 16 Jul 2022 13:36:29 -0400 Subject: [PATCH 11/19] Expose packaging.metadata.raw as it's own module --- packaging/metadata/__init__.py | 3 +-- packaging/metadata/{_raw.py => raw.py} | 3 
+++ 2 files changed, 4 insertions(+), 2 deletions(-) rename packaging/metadata/{_raw.py => raw.py} (99%) diff --git a/packaging/metadata/__init__.py b/packaging/metadata/__init__.py index f9594b166..d0ba767b1 100644 --- a/packaging/metadata/__init__.py +++ b/packaging/metadata/__init__.py @@ -1,4 +1,3 @@ -from ._raw import RawMetadata, parse_email, parse_json from ._types import DynamicField, Metadata -__all__ = ["DynamicField", "Metadata", "RawMetadata", "parse_email", "parse_json"] +__all__ = ["DynamicField", "Metadata"] diff --git a/packaging/metadata/_raw.py b/packaging/metadata/raw.py similarity index 99% rename from packaging/metadata/_raw.py rename to packaging/metadata/raw.py index 80080ec7b..03d88990e 100644 --- a/packaging/metadata/_raw.py +++ b/packaging/metadata/raw.py @@ -7,6 +7,9 @@ from typing import Any, TypedDict, Union, cast +__all__ = ["RawMetadata", "parse_email", "parse_json"] + + # The RawMetadata class attempts to make as few assumptions about # the underlying serialization formats as possible, these could # possibly serialize in an entirely different way, but the idea From 0a2b73376d4f3bc4e498b244791f0b75b63790e0 Mon Sep 17 00:00:00 2001 From: Donald Stufft Date: Sat, 16 Jul 2022 13:40:15 -0400 Subject: [PATCH 12/19] More compatible type hints --- packaging/metadata/raw.py | 53 +++++++++++++++++++-------------------- 1 file changed, 26 insertions(+), 27 deletions(-) diff --git a/packaging/metadata/raw.py b/packaging/metadata/raw.py index 03d88990e..7ba0c2cc0 100644 --- a/packaging/metadata/raw.py +++ b/packaging/metadata/raw.py @@ -4,8 +4,7 @@ import email.parser import email.policy import json -from typing import Any, TypedDict, Union, cast - +from typing import Any, Dict, List, Tuple, TypedDict, Union, cast __all__ = ["RawMetadata", "parse_email", "parse_json"] @@ -22,32 +21,32 @@ class RawMetadata(TypedDict, total=False): metadata_version: str name: str version: str - platforms: list[str] + platforms: List[str] summary: str description: 
str - keywords: list[str] + keywords: List[str] home_page: str author: str author_email: str license: str # Metadata 1.1 - PEP 314 - supported_platforms: list[str] + supported_platforms: List[str] download_url: str - classifiers: list[str] - requires: list[str] - provides: list[str] - obsoletes: list[str] + classifiers: List[str] + requires: List[str] + provides: List[str] + obsoletes: List[str] # Metadata 1.2 - PEP 345 maintainer: str maintainer_email: str - requires_dist: list[str] - provides_dist: list[str] - obsoletes_dist: list[str] + requires_dist: List[str] + provides_dist: List[str] + obsoletes_dist: List[str] requires_python: str - requires_external: list[str] - project_urls: dict[str, str] + requires_external: List[str] + project_urls: Dict[str, str] # Metadata 2.0 # PEP 426 attempted to completely revamp the metadata format @@ -60,10 +59,10 @@ class RawMetadata(TypedDict, total=False): # Metadata 2.1 - PEP 566 description_content_type: str - provides_extra: list[str] + provides_extra: List[str] # Metadata 2.2 - PEP 643 - dynamic: list[str] + dynamic: List[str] # Metadata 2.3 - PEP 685 # No new fields were added in PEP 685, just some edge case were @@ -106,11 +105,11 @@ class RawMetadata(TypedDict, total=False): # multiple parse_FORMAT functions -def _parse_keywords(data: str) -> list[str]: +def _parse_keywords(data: str) -> List[str]: return [k.strip() for k in data.split(",")] -def _parse_project_urls(data: list[str]) -> dict[str, str]: +def _parse_project_urls(data: List[str]) -> Dict[str, str]: urls = {} for pair in data: # Our logic is slightly tricky here as we want to try and do @@ -193,9 +192,9 @@ def _parse_project_urls(data: list[str]) -> dict[str, str]: } -def parse_email(data: Union[bytes, str]) -> tuple[RawMetadata, dict[Any, Any]]: - raw: dict[str, Any] = {} - unparsed: dict[Any, Any] = {} +def parse_email(data: Union[bytes, str]) -> Tuple[RawMetadata, Dict[Any, Any]]: + raw: Dict[str, Any] = {} + unparsed: Dict[Any, Any] = {} if 
isinstance(data, str): parsed = email.parser.Parser(policy=email.policy.compat32).parsestr(data) @@ -382,9 +381,9 @@ def parse_email(data: Union[bytes, str]) -> tuple[RawMetadata, dict[Any, Any]]: } -def parse_json(data: Union[bytes, str]) -> tuple[RawMetadata, dict[Any, Any]]: - raw: dict[Any, Any] = {} - unparsed: dict[Any, Any] = {} +def parse_json(data: Union[bytes, str]) -> Tuple[RawMetadata, Dict[Any, Any]]: + raw: Dict[Any, Any] = {} + unparsed: Dict[Any, Any] = {} parsed = json.loads(data) # We need to make sure that the data given to us actually implements @@ -413,7 +412,7 @@ def parse_json(data: Union[bytes, str]) -> tuple[RawMetadata, dict[Any, Any]]: and isinstance(value, list) and all(isinstance(v, str) for v in value) ): - raw[raw_name] = cast(list[str], value) + raw[raw_name] = cast(List[str], value) # Special Case: Keywords # The keywords field is implemented in the metadata spec as a str, # but it conceptually is a list of strings. Interestingly, the @@ -463,14 +462,14 @@ def _get_payload(msg: email.message.Message, source: Union[bytes, str]) -> str: # If our source is a str, then our caller has managed encodings for us, # and we don't need to deal with it. if isinstance(source, str): - payload: Union[list[str], str] = msg.get_payload() + payload: Union[List[str], str] = msg.get_payload() if isinstance(payload, list): raise ValueError("payload is a multipart") return payload # If our source is a bytes, then we're managing the encoding and we need # to deal with it. 
else: - bpayload: Union[list[bytes], bytes] = msg.get_payload(decode=True) + bpayload: Union[List[bytes], bytes] = msg.get_payload(decode=True) if isinstance(bpayload, list): raise ValueError("payload is a multipart") From 36ac9b71fc8e8df97d123fff01793559673df108 Mon Sep 17 00:00:00 2001 From: Donald Stufft Date: Sat, 16 Jul 2022 17:19:01 -0400 Subject: [PATCH 13/19] Enable emitting email/json from RawMetadata --- packaging/metadata/raw.py | 134 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 134 insertions(+) diff --git a/packaging/metadata/raw.py b/packaging/metadata/raw.py index 7ba0c2cc0..d25d8f37b 100644 --- a/packaging/metadata/raw.py +++ b/packaging/metadata/raw.py @@ -346,6 +346,93 @@ def parse_email(data: Union[bytes, str]) -> Tuple[RawMetadata, Dict[Any, Any]]: return cast(RawMetadata, raw), unparsed +_EMAIL_FIELD_ORDER = [ + # Always put the metadata version first, incase it ever changes how + # we parse this file. + "metadata_version", + # Put the other pieces of mandatory information next. + "name", + "version", + # We're just going to emit all of these in sorted order, except we'll + # float deprecated or "rarely used" fields to the bottom. + "author", + "author_email", + "classifiers", + # We are purposely excluding the description field, we don't want to + # write that field out as a header, so we won't include it in this list + # and it will have to be manually handled instead. 
+ # "description", + "description_content_type", + "download_url", + "dynamic", + "home_page", + "keywords", + "license", + "maintainer", + "maintainer_email", + "platforms", + "project_urls", + "provides_extra", + "requires_dist", + "requires_python", + "summary", + "supported_platforms", + # Deprecated or "rarely used" + "obsoletes", + "obsoletes_dist", + "provides", + "provides_dist", + "requires", + "requires_external", +] + + +def emit_email(raw: RawMetadata) -> bytes: + # TypedDict only allows literal keys, we know that are dynamic keys are correct + # but to satisfy the type checker we'll cast things. + data = cast(Dict[str, Any], raw) + + # Figure out our mapping to email names + field_names = dict((v, k) for (k, v) in _EMAIL_FIELD_MAPPING.items()) + + # From what I can tell, there is no way to get the email module in the stdlib + # to actually emit a ``METADATA``file in the format that we need, so instead + # we'll have to manually craft one. + lines = [] + + for field in _EMAIL_FIELD_ORDER: + field_name = field_names[field] + field_data = data.get(field) + if field_data: + # String fields get emitted as Key: Data + if field in _STRING_FIELDS and isinstance(field_data, str): + lines.append(f"{field_name}: {_rfc822_escape(field_data)}") + # List String fields get emitted as a Key: Data per entry. + elif field in _LIST_STRING_FIELDS and isinstance(field_data, list): + for item in field_data: + lines.append(f"{field_name}: {_rfc822_escape(item)}") + # Special Case: Keywords + # We need to turn our List String for Keywords back into a singular + # string for the core metadata spec. + elif field == "keywords" and isinstance(field_data, list): + lines.append(f"{field_name}: {_rfc822_escape(', '.join(field_data))}") + # Special Case: Project-URL + # We need to turn our dict[str, str] back into the list of specially + # formatted strings to match what the core metadata expects. 
+ elif field == "project_urls" and isinstance(field_data, dict): + for label, url in field_data.items(): + lines.append( + f"{field_name}: {_rfc822_escape(', '.join([label, url]))}" + ) + + msg = "\n".join(lines) + description = raw.get("description") + if description: + msg = msg + "\n\n" + description + + return msg.encode("utf8") + + # This might appear to be a mapping of the same key to itself, and in many cases # it is. However, the algorithm in PEP 566 doesn't match 100% the keys chosen # for RawMetadata, so we use this mapping just like with email to handle that. @@ -458,6 +545,43 @@ def parse_json(data: Union[bytes, str]) -> Tuple[RawMetadata, Dict[Any, Any]]: return cast(RawMetadata, raw), unparsed +def emit_json(raw: RawMetadata) -> bytes: + # TypedDict only allows literal keys, we know that are dynamic keys are correct + # but to satisfy the type checker we'll cast things. + data = cast(Dict[str, Any], raw) + + # Figure out our mapping to email names + field_names = dict((v, k) for (k, v) in _JSON_FIELD_MAPPING.items()) + + out = {} + for field in _EMAIL_FIELD_ORDER: + field_name = field_names[field] + field_data = data.get(field) + if field_data: + if (field in _STRING_FIELDS and isinstance(field_data, str)) or ( + field in _LIST_STRING_FIELDS and isinstance(field_data, list) + ): + out[field_name] = field_data + # Special Case: Keywords + # We need to turn our List String for Keywords back into a singular + # string for the core metadata spec. + elif field == "keywords" and isinstance(field_data, list): + out[field_name] = ", ".join(field_data) + # Special Case: Project-URL + # We need to turn our dict[str, str] back into the list of specially + # formatted strings to match what the core metadata expects. 
+ elif field == "project_urls" and isinstance(field_data, dict): + out[field_name] = [ + f"{label}, {url}" for (label, url) in field_data.items() + ] + + description = raw.get("description") + if description: + out["description"] = description + + return json.dumps(out).encode("utf8") + + def _get_payload(msg: email.message.Message, source: Union[bytes, str]) -> str: # If our source is a str, then our caller has managed encodings for us, # and we don't need to deal with it. @@ -477,3 +601,13 @@ def _get_payload(msg: email.message.Message, source: Union[bytes, str]) -> str: return bpayload.decode("utf8", "strict") except UnicodeDecodeError: raise ValueError("payload in an invalid encoding") + + +def _rfc822_escape(header: str) -> str: + """ + Return a version of the string escaped for inclusion in an + RFC-822 header, by ensuring there are 8 spaces space after each newline. + """ + lines = header.split("\n") + sep = "\n" + 8 * " " + return sep.join(lines) From 54d354a70e8ecf68396decd3309f49295ecf6589 Mon Sep 17 00:00:00 2001 From: Donald Stufft Date: Sat, 16 Jul 2022 17:27:16 -0400 Subject: [PATCH 14/19] pyupgrade --- packaging/metadata/raw.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/packaging/metadata/raw.py b/packaging/metadata/raw.py index d25d8f37b..4a5e595df 100644 --- a/packaging/metadata/raw.py +++ b/packaging/metadata/raw.py @@ -393,7 +393,7 @@ def emit_email(raw: RawMetadata) -> bytes: data = cast(Dict[str, Any], raw) # Figure out our mapping to email names - field_names = dict((v, k) for (k, v) in _EMAIL_FIELD_MAPPING.items()) + field_names = {v: k for (k, v) in _EMAIL_FIELD_MAPPING.items()} # From what I can tell, there is no way to get the email module in the stdlib # to actually emit a ``METADATA``file in the format that we need, so instead @@ -551,7 +551,7 @@ def emit_json(raw: RawMetadata) -> bytes: data = cast(Dict[str, Any], raw) # Figure out our mapping to email names - field_names = dict((v, k) for (k, v) in 
_JSON_FIELD_MAPPING.items()) + field_names = {v: k for (k, v) in _JSON_FIELD_MAPPING.items()} out = {} for field in _EMAIL_FIELD_ORDER: From a0f99e8bd18b5042aad6aae3984acfdde972c170 Mon Sep 17 00:00:00 2001 From: Donald Stufft Date: Sat, 16 Jul 2022 21:26:11 -0400 Subject: [PATCH 15/19] don't overwrite when payload is empty --- packaging/metadata/raw.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/packaging/metadata/raw.py b/packaging/metadata/raw.py index 4a5e595df..9d508c507 100644 --- a/packaging/metadata/raw.py +++ b/packaging/metadata/raw.py @@ -332,12 +332,13 @@ def parse_email(data: Union[bytes, str]) -> Tuple[RawMetadata, Dict[Any, Any]]: except ValueError: unparsed["Description"] = parsed.get_payload(decode=isinstance(data, bytes)) else: - # Check to see if we've already got a description, if so then both - # it, and this body move to unparseable. - if "description" in raw: - unparsed["Description"] = [raw.pop("description"), payload] - else: - raw["description"] = payload + if payload: + # Check to see if we've already got a description, if so then both + # it, and this body move to unparseable. 
+ if "description" in raw: + unparsed["Description"] = [raw.pop("description"), payload] + else: + raw["description"] = payload # We need to cast our `raw` to a metadata, because a TypedDict only support # literal key names, but we're computing our key names on purpose, but the From 52be4b5dbc1e7b912d1c9243ecc1e17698460411 Mon Sep 17 00:00:00 2001 From: Donald Stufft Date: Sun, 17 Jul 2022 11:03:59 -0400 Subject: [PATCH 16/19] Start reworking the Metadata class --- packaging/metadata/_types.py | 270 ++++++++++++------------------ packaging/metadata/_validation.py | 134 +++++++++++++++ 2 files changed, 244 insertions(+), 160 deletions(-) create mode 100644 packaging/metadata/_validation.py diff --git a/packaging/metadata/_types.py b/packaging/metadata/_types.py index 1722981be..f6068b450 100644 --- a/packaging/metadata/_types.py +++ b/packaging/metadata/_types.py @@ -1,10 +1,11 @@ +from __future__ import annotations + import enum -from typing import Iterable, List, Optional, Tuple +from typing import Optional, Tuple, TypedDict -from ..requirements import Requirement -from ..specifiers import SpecifierSet -from ..utils import NormalizedName, canonicalize_name from ..version import Version +from ._validation import RegexValidator, Required, eagerly_validate, lazy_validator +from .raw import RawMetadata, parse_email, parse_json # Type aliases. _NameAndEmail = Tuple[Optional[str], str] @@ -50,162 +51,111 @@ class DynamicField(enum.Enum): PROVIDES_EXTRA = "provides-extra" -class Metadata: - """A class representing the `Core Metadata`_ for a project. +class _ValidatedMetadata(TypedDict, total=False): + # Metadata 1.0 - PEP 241 + name: str + version: Version + # platforms: List[str] + # summary: str + # description: str + # keywords: List[str] + # home_page: str + # author: str + # author_email: str + # license: str - Every potential metadata field except for ``Metadata-Version`` is represented by a - parameter to the class' constructor. 
The required metadata can be passed in - positionally or via keyword, while all optional metadata can only be passed in via - keyword. - Every parameter has a matching attribute on instances, except for *name* (see - :attr:`display_name` and :attr:`canonical_name`). Any parameter that accepts an - :class:`~collections.abc.Iterable` is represented as a :class:`list` on the - corresponding attribute. - """ +class Metadata: - # A property named `display_name` exposes the value. - _display_name: str - # A property named `canonical_name` exposes the value. - _canonical_name: NormalizedName - version: Version - platforms: List[str] - summary: str - description: str - keywords: List[str] - home_page: str - author: str - author_emails: List[_NameAndEmail] - license: str - supported_platforms: List[str] - download_url: str - classifiers: List[str] - maintainer: str - maintainer_emails: List[_NameAndEmail] - requires_dists: List[Requirement] - requires_python: SpecifierSet - requires_externals: List[str] - project_urls: List[_LabelAndURL] - provides_dists: List[str] - obsoletes_dists: List[str] - description_content_type: str - provides_extras: List[NormalizedName] - dynamic_fields: List[DynamicField] - - def __init__( - self, - name: str, - version: Version, - *, - # 1.0 - platforms: Optional[Iterable[str]] = None, - summary: Optional[str] = None, - description: Optional[str] = None, - keywords: Optional[Iterable[str]] = None, - home_page: Optional[str] = None, - author: Optional[str] = None, - author_emails: Optional[Iterable[_NameAndEmail]] = None, - license: Optional[str] = None, - # 1.1 - supported_platforms: Optional[Iterable[str]] = None, - download_url: Optional[str] = None, - classifiers: Optional[Iterable[str]] = None, - # 1.2 - maintainer: Optional[str] = None, - maintainer_emails: Optional[Iterable[_NameAndEmail]] = None, - requires_dists: Optional[Iterable[Requirement]] = None, - requires_python: Optional[SpecifierSet] = None, - requires_externals: 
Optional[Iterable[str]] = None, - project_urls: Optional[Iterable[_LabelAndURL]] = None, - provides_dists: Optional[Iterable[str]] = None, - obsoletes_dists: Optional[Iterable[str]] = None, - # 2.1 - description_content_type: Optional[str] = None, - provides_extras: Optional[Iterable[NormalizedName]] = None, - # 2.2 - dynamic_fields: Optional[Iterable[DynamicField]] = None, - ) -> None: - """Initialize a Metadata object. - - The parameters all correspond to fields in `Core Metadata`_. - - :param name: ``Name`` - :param version: ``Version`` - :param platforms: ``Platform`` - :param summary: ``Summary`` - :param description: ``Description`` - :param keywords: ``Keywords`` - :param home_page: ``Home-Page`` - :param author: ``Author`` - :param author_emails: - ``Author-Email`` (two-item tuple represents the name and email of the - author) - :param license: ``License`` - :param supported_platforms: ``Supported-Platform`` - :param download_url: ``Download-URL`` - :param classifiers: ``Classifier`` - :param maintainer: ``Maintainer`` - :param maintainer_emails: - ``Maintainer-Email`` (two-item tuple represent the name and email of the - maintainer) - :param requires_dists: ``Requires-Dist`` - :param SpecifierSet requires_python: ``Requires-Python`` - :param requires_externals: ``Requires-External`` - :param project_urls: ``Project-URL`` - :param provides_dists: ``Provides-Dist`` - :param obsoletes_dists: ``Obsoletes-Dist`` - :param description_content_type: ``Description-Content-Type`` - :param provides_extras: ``Provides-Extra`` - :param dynamic_fields: ``Dynamic`` - """ - self.display_name = name - self.version = version - self.platforms = list(platforms or []) - self.summary = summary or "" - self.description = description or "" - self.keywords = list(keywords or []) - self.home_page = home_page or "" - self.author = author or "" - self.author_emails = list(author_emails or []) - self.license = license or "" - self.supported_platforms = list(supported_platforms or []) 
- self.download_url = download_url or "" - self.classifiers = list(classifiers or []) - self.maintainer = maintainer or "" - self.maintainer_emails = list(maintainer_emails or []) - self.requires_dists = list(requires_dists or []) - self.requires_python = requires_python or SpecifierSet() - self.requires_externals = list(requires_externals or []) - self.project_urls = list(project_urls or []) - self.provides_dists = list(provides_dists or []) - self.obsoletes_dists = list(obsoletes_dists or []) - self.description_content_type = description_content_type or "" - self.provides_extras = list(provides_extras or []) - self.dynamic_fields = list(dynamic_fields or []) - - @property - def display_name(self) -> str: - """ - The project name to be displayed to users (i.e. not normalized). Initially - set based on the `name` parameter. - - Setting this attribute will also update :attr:`canonical_name`. - """ - return self._display_name - - @display_name.setter - def display_name(self, value: str) -> None: - self._display_name = value - self._canonical_name = canonicalize_name(value) - - # Use functools.cached_property once Python 3.7 support is dropped. - # Value is set by self.display_name.setter to keep in sync with self.display_name. - @property - def canonical_name(self) -> NormalizedName: - """ - The normalized project name as per :func:`packaging.utils.canonicalize_name`. - - The attribute is read-only and automatically calculated based on the value of - :attr:`display_name`. - """ - return self._canonical_name + # We store our "actual" metadata as a RawMetadata, which + # gives is a little bit of indirection here. The RawMetadata + # class is lenient as to what it will consider valid, but this + # class is not. 
+ # + # However, we want to support validation to happen both up front + # and on the fly as you access attributes, and when using the + # on the fly validation, we don't want to validate anything else + # except for the specific piece of metadata that is being + # asked for. + # + # That means that we need to store, at least initially, the + # metadata in a form that is lenient, which is exactly the + # purpose of RawMetadata. + _raw: RawMetadata + + # Likewise, we need a place to store our honest to goodness actually + # validated metadata too, we could just store this in a dict, but + # this will give us better typing. + _validated: _ValidatedMetadata + + def __init__(self) -> None: + raise NotImplementedError + + # It's not exactly the most pythonic thing to have a bunch of getter/setters + # like this for every attribute, however this enables us to do our on the + # fly validation. + + # Name: Metadata 1.0 + name = lazy_validator( + str, + validators=[ + Required(), + RegexValidator("(?i)^([A-Z0-9]|[A-Z0-9][A-Z0-9._-]*[A-Z0-9])$"), + ], + ) + # Version: Metadata 1.0 + version = lazy_validator(Version, validators=[Required()]) + + @classmethod + def from_raw(cls, raw: RawMetadata, *, validate: bool = True) -> Metadata: + # Ok this is some kind of gross code here, but it has a specific + # purpose. + # + # We want to enable the programmatic API of the Metadata + # class to strictly validate, including required data, so + # we want something like Metadata("foo", "1.0", ...), but + # we also want from_raw to *not* require that data, so we + # treat our __init__ as our public constructor, then we bypass + # the __init__ when calling from_raw to let us setup the object + # in a completely different way, without exposing that as + # programmatic API in and of itself.
+ meta = cls.__new__(cls) + meta._raw = raw + meta._validated = _ValidatedMetadata() + + # It's not possible to use Metadata without validating, but the + # validate parameter here lets people control whether the entire + # metadata gets validated up front, or whether it gets validated + # on demand. + if validate: + eagerly_validate(meta) + + return meta + + @classmethod + def from_email(cls, data: bytes | str, *, validate: bool = True) -> Metadata: + raw, unparsed = parse_email(data) + + # Regardless of the validate attribute, we don't let unparsed data + # pass silently, if someone wants to drop unparsed data on the floor + # they can call parse_email themselves and pass it into from_raw + if unparsed: + raise ValueError( + f"Could not parse, extra keys: {', '.join(unparsed.keys())}" + ) + + return cls.from_raw(raw, validate=validate) + + @classmethod + def from_json(cls, data: bytes | str, *, validate: bool = True) -> Metadata: + raw, unparsed = parse_json(data) + + # Regardless of the validate attribute, we don't let unparsed data + # pass silently, if someone wants to drop unparsed data on the floor + # they can call parse_json themselves and pass it into from_raw + if unparsed: + raise ValueError( + f"Could not parse, extra keys: {', '.join(unparsed.keys())}" + ) + + return cls.from_raw(raw, validate=validate) diff --git a/packaging/metadata/_validation.py b/packaging/metadata/_validation.py new file mode 100644 index 000000000..806e3d887 --- /dev/null +++ b/packaging/metadata/_validation.py @@ -0,0 +1,134 @@ +from __future__ import annotations + +import re +from typing import ( + Any, + Callable, + Type, + TypeVar, + Optional, + TYPE_CHECKING, + cast, + Dict, + Union, + List, + Generic, +) + +if TYPE_CHECKING: + from ._types import Metadata + + +V = TypeVar("V") + + +Validator = Callable[[V], None] + + +class lazy_validator(Generic[V]): + + # This hack exists to work around https://github.com/python/mypy/issues/708 + _creator: Union[Callable[[Any], V],
Callable[[Any], V]] + _raw_name: str + _validators: List[Validator[Optional[V]]] + + def __init__( + self, + creator: Callable[[Any], V], + *, + raw_name: Optional[str] = None, + validators: Optional[List[Validator[Optional[V]]]] = None, + ) -> None: + self._creator = creator + if raw_name is not None: + self._raw_name = raw_name + if validators is not None: + self._validators = validators + else: + self._validators = [] + + def __set_name__(self, owner: Metadata, name: str) -> None: + self._raw_name = name + + def __get__(self, obj: Metadata, owner: Type[Metadata]) -> Optional[V]: + # TypedDict doesn't support variable key names, and Python 3.7 doesn't + # support Literal which would let us let it know that this is validated + # already to be safe, so we'll cast here to make things work. + raw = cast(Dict[str, Any], obj._raw) + validated = cast(Dict[str, Optional[V]], obj._validated) + + if self._raw_name not in validated: + value = self._validate(raw.get(self._raw_name)) + validated[self._raw_name] = value + del raw[self._raw_name] + + return validated[self._raw_name] + + def __set__(self, obj: Metadata, value: Any) -> None: + raw = cast(Dict[str, Any], obj._raw) + validated = cast(Dict[str, Optional[V]], obj._validated) + + validated_value = self._validate(value) + validated[self._raw_name] = validated_value + raw.pop(self._raw_name, None) + + def __delete__(self, obj: Metadata) -> None: + raw = cast(Dict[str, Any], obj._raw) + validated = cast(Dict[str, Optional[V]], obj._validated) + + raw.pop(self._raw_name, None) + validated.pop(self._raw_name, None) + + def _validate(self, data: Any) -> Optional[V]: + # Create our value from our raw data + value = self._creator(data) if data is not None else None + + # Loop over our validators, and ensure that our value is actually valid + for validator in self._validators: + validator(value) + + return value + + +def eagerly_validate(obj: Metadata) -> None: + for name, field in obj.__class__.__dict__.items(): + if 
isinstance(field, lazy_validator): + getattr(obj, name) + + +class Required: + + _error_msg: str + + def __init__(self, message: Optional[str] = None): + if message is None: + self._error_msg = "value is required: {value!r}" + else: + self._error_msg = message + + def __call__(self, value: V) -> None: + if value is None: + raise ValueError(self._error_msg.format(value=value)) + + +class RegexValidator: + + _regex: re.Pattern[str] + _error_msg: str + + def __init__( + self, regex: Union[str, re.Pattern[str]], *, message: Optional[str] = None + ): + if isinstance(regex, str): + self._regex = re.compile(regex) + else: + self._regex = regex + + if message is None: + self._error_msg = "invalid value: {value!r}" + else: + self._error_msg = message + + def __call__(self, value: Optional[str]) -> None: + if value is not None and self._regex.search(value) is None: + raise ValueError(self._error_msg.format(value=value)) From 67492856024627a389f380598a7dc591efe22edc Mon Sep 17 00:00:00 2001 From: Donald Stufft Date: Sun, 17 Jul 2022 11:14:06 -0400 Subject: [PATCH 17/19] linting --- packaging/metadata/_validation.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/packaging/metadata/_validation.py b/packaging/metadata/_validation.py index 806e3d887..d2944cb1a 100644 --- a/packaging/metadata/_validation.py +++ b/packaging/metadata/_validation.py @@ -2,17 +2,17 @@ import re from typing import ( + TYPE_CHECKING, Any, Callable, + Dict, + Generic, + List, + Optional, Type, TypeVar, - Optional, - TYPE_CHECKING, - cast, - Dict, Union, - List, - Generic, + cast, ) if TYPE_CHECKING: From 1d1487adb63f213abbbe875252e6c1f36d1f60f2 Mon Sep 17 00:00:00 2001 From: Donald Stufft Date: Sun, 17 Jul 2022 13:30:36 -0400 Subject: [PATCH 18/19] more metadata --- packaging/metadata/_types.py | 29 ++++++-- packaging/metadata/_utils.py | 19 ++++++ packaging/metadata/_validation.py | 110 ++++++++++++++++++++---------- 3 files changed, 118 insertions(+), 40 deletions(-) 
create mode 100644 packaging/metadata/_utils.py diff --git a/packaging/metadata/_types.py b/packaging/metadata/_types.py index f6068b450..8c1f924c3 100644 --- a/packaging/metadata/_types.py +++ b/packaging/metadata/_types.py @@ -1,11 +1,13 @@ from __future__ import annotations import enum -from typing import Optional, Tuple, TypedDict +from collections.abc import Iterable +from typing import Any, List, Optional, Tuple, TypedDict from ..version import Version from ._validation import RegexValidator, Required, eagerly_validate, lazy_validator from .raw import RawMetadata, parse_email, parse_json +from ._utils import as_str, as_list_str # Type aliases. _NameAndEmail = Tuple[Optional[str], str] @@ -51,12 +53,24 @@ class DynamicField(enum.Enum): PROVIDES_EXTRA = "provides-extra" +@enum.unique +class MetadataVersion(enum.Enum): + v1_0 = "1.0" + v1_1 = "1.1" + v1_2 = "1.2" + v2_0 = "2.0" + v2_1 = "2.1" + v2_2 = "2.2" + v2_3 = "2.3" + + class _ValidatedMetadata(TypedDict, total=False): # Metadata 1.0 - PEP 241 + metadata_version: str name: str version: Version - # platforms: List[str] - # summary: str + platforms: List[str] + summary: str # description: str # keywords: List[str] # home_page: str @@ -95,9 +109,13 @@ def __init__(self) -> None: # like this for every attribute, however this enables us to do our on the # fly validation. 
+ # Metadata-Version: Metadata 1.0 + _metadata_version = lazy_validator( + MetadataVersion, raw_name="metadata_version", validators=[Required()] + ) # Name: Metadata 1.0 name = lazy_validator( - str, + as_str, validators=[ Required(), RegexValidator("(?i)^([A-Z0-9]|[A-Z0-9][A-Z0-9._-]*[A-Z0-9])$"), @@ -105,6 +123,9 @@ def __init__(self) -> None: ) # Version: Metadata 1.0 version = lazy_validator(Version, validators=[Required()]) + # Platform: Metadata 1.0 + platforms = lazy_validator(as_list_str) + summary = lazy_validator(as_str) @classmethod def from_raw(cls, raw: RawMetadata, *, validate: bool = True) -> Metadata: diff --git a/packaging/metadata/_utils.py b/packaging/metadata/_utils.py new file mode 100644 index 000000000..827d7b48f --- /dev/null +++ b/packaging/metadata/_utils.py @@ -0,0 +1,19 @@ +from collections.abc import Iterable +from typing import Any, List + + +def as_str(inp: Any) -> str: + if not isinstance(inp, str): + raise ValueError("Must be a str") + return inp + + +def as_list_str(inp: Any) -> List[str]: + if not isinstance(inp, Iterable): + raise ValueError("Must be a list of str") + results = [] + for entry in inp: + if not isinstance(entry, str): + raise ValueError("Must a list of str") + results.append(entry) + return results diff --git a/packaging/metadata/_validation.py b/packaging/metadata/_validation.py index d2944cb1a..4b9a8ffcc 100644 --- a/packaging/metadata/_validation.py +++ b/packaging/metadata/_validation.py @@ -1,5 +1,7 @@ from __future__ import annotations +import abc +from email import message import re from typing import ( TYPE_CHECKING, @@ -19,25 +21,22 @@ from ._types import Metadata -V = TypeVar("V") - - -Validator = Callable[[V], None] +T = TypeVar("T") -class lazy_validator(Generic[V]): +class lazy_validator(Generic[T]): # This hack exists to work around https://github.com/python/mypy/issues/708 - _creator: Union[Callable[[Any], V], Callable[[Any], V]] + _creator: Union[Callable[[Any], T], Callable[[Any], T]] _raw_name: 
str - _validators: List[Validator[Optional[V]]] + _validators: List[Callable[[Any], None]] def __init__( self, - creator: Callable[[Any], V], + creator: Callable[[Any], T], *, raw_name: Optional[str] = None, - validators: Optional[List[Validator[Optional[V]]]] = None, + validators: Optional[List[Callable[[Any], None]]] = None, ) -> None: self._creator = creator if raw_name is not None: @@ -48,14 +47,15 @@ def __init__( self._validators = [] def __set_name__(self, owner: Metadata, name: str) -> None: - self._raw_name = name + if not hasattr(self, "_raw_name"): + self._raw_name = name - def __get__(self, obj: Metadata, owner: Type[Metadata]) -> Optional[V]: + def __get__(self, obj: Metadata, owner: Type[Metadata]) -> Optional[T]: # TypedDict doesn't support variable key names, and Python 3.7 doesn't # support Literal which would let us let it know that this is validated # already to be safe, so we'll cast here to make things work. raw = cast(Dict[str, Any], obj._raw) - validated = cast(Dict[str, Optional[V]], obj._validated) + validated = cast(Dict[str, Optional[T]], obj._validated) if self._raw_name not in validated: value = self._validate(raw.get(self._raw_name)) @@ -66,7 +66,7 @@ def __get__(self, obj: Metadata, owner: Type[Metadata]) -> Optional[V]: def __set__(self, obj: Metadata, value: Any) -> None: raw = cast(Dict[str, Any], obj._raw) - validated = cast(Dict[str, Optional[V]], obj._validated) + validated = cast(Dict[str, Optional[T]], obj._validated) validated_value = self._validate(value) validated[self._raw_name] = validated_value @@ -74,12 +74,12 @@ def __set__(self, obj: Metadata, value: Any) -> None: def __delete__(self, obj: Metadata) -> None: raw = cast(Dict[str, Any], obj._raw) - validated = cast(Dict[str, Optional[V]], obj._validated) + validated = cast(Dict[str, Optional[T]], obj._validated) raw.pop(self._raw_name, None) validated.pop(self._raw_name, None) - def _validate(self, data: Any) -> Optional[V]: + def _validate(self, data: Any) -> 
Optional[T]: # Create our value from our raw data value = self._creator(data) if data is not None else None @@ -96,39 +96,77 @@ def eagerly_validate(obj: Metadata) -> None: getattr(obj, name) -class Required: +V = TypeVar("V") - _error_msg: str - def __init__(self, message: Optional[str] = None): - if message is None: - self._error_msg = "value is required: {value!r}" - else: - self._error_msg = message +class ValidationError(Exception): + pass + + +class Validator(Generic[V], abc.ABC): + + message: str + + def __init__(self, *args: Any, message: Optional[str] = None, **kwargs: Any): + super().__init__(*args, **kwargs) + if message is not None: + self.message = message + + def __call__(self, value: Optional[V]) -> None: + try: + self.full_validate(value) + except Exception as exc: + raise ValidationError(self.message.format(value=value)) from exc + + def full_validate(self, value: Optional[V]) -> None: + if value is not None: + self.validate(value) + + @abc.abstractmethod + def validate(self, value: V) -> None: + ... 
- def __call__(self, value: V) -> None: + +class Required(Validator[V]): + + message: str = "value is required: {value!r}" + + def full_validate(self, value: Optional[V]) -> None: if value is None: - raise ValueError(self._error_msg.format(value=value)) + raise ValueError("required value") + def validate(self, value: V) -> None: + pass -class RegexValidator: + +class RegexValidator(Validator[V]): _regex: re.Pattern[str] - _error_msg: str + message: str = "invalid value: {value!r}" + + def __init__(self, regex: Union[str, re.Pattern[str]], *args: Any, **kwargs: Any): + super().__init__(*args, **kwargs) - def __init__( - self, regex: Union[str, re.Pattern[str]], *, message: Optional[str] = None - ): if isinstance(regex, str): self._regex = re.compile(regex) else: self._regex = regex - if message is None: - self._error_msg = "invalid value: {value!r}" - else: - self._error_msg = message + def validate(self, value: V) -> None: + if not isinstance(value, str): + raise TypeError + + if self._regex.search(value) is None: + raise ValueError(f"doesn't match: {self._regex.pattern}") + + +class SingleLine(Validator[V]): + + message: str = "must contain only one line: {value!r}" + + def validate(self, value: V) -> None: + if not isinstance(value, str): + raise TypeError - def __call__(self, value: Optional[str]) -> None: - if value is not None and self._regex.search(value) is None: - raise ValueError(self._error_msg.format(value=value)) + if "\n" in value or "\r" in value: + raise ValueError("multiline str") From d2d07e8d3d884e35e7db003ea7116fd585147e93 Mon Sep 17 00:00:00 2001 From: Donald Stufft Date: Sun, 17 Jul 2022 13:34:40 -0400 Subject: [PATCH 19/19] linting --- packaging/metadata/_types.py | 7 +++--- packaging/metadata/_validation.py | 39 +++++++++++-------------------- 2 files changed, 16 insertions(+), 30 deletions(-) diff --git a/packaging/metadata/_types.py b/packaging/metadata/_types.py index 8c1f924c3..8c3b1b36a 100644 --- a/packaging/metadata/_types.py +++ 
b/packaging/metadata/_types.py @@ -1,13 +1,12 @@ from __future__ import annotations import enum -from collections.abc import Iterable -from typing import Any, List, Optional, Tuple, TypedDict +from typing import Optional, Tuple, TypedDict from ..version import Version +from ._utils import as_list_str, as_str from ._validation import RegexValidator, Required, eagerly_validate, lazy_validator from .raw import RawMetadata, parse_email, parse_json -from ._utils import as_str, as_list_str # Type aliases. _NameAndEmail = Tuple[Optional[str], str] @@ -69,7 +68,7 @@ class _ValidatedMetadata(TypedDict, total=False): metadata_version: str name: str version: Version - platforms: List[str] + platforms: list[str] summary: str # description: str # keywords: List[str] diff --git a/packaging/metadata/_validation.py b/packaging/metadata/_validation.py index 4b9a8ffcc..d27e96924 100644 --- a/packaging/metadata/_validation.py +++ b/packaging/metadata/_validation.py @@ -1,21 +1,8 @@ from __future__ import annotations import abc -from email import message import re -from typing import ( - TYPE_CHECKING, - Any, - Callable, - Dict, - Generic, - List, - Optional, - Type, - TypeVar, - Union, - cast, -) +from typing import TYPE_CHECKING, Any, Callable, Dict, Generic, Optional, TypeVar, cast if TYPE_CHECKING: from ._types import Metadata @@ -24,19 +11,19 @@ T = TypeVar("T") -class lazy_validator(Generic[T]): +class lazy_validator(Generic[T]): # noqa: N801 # This hack exists to work around https://github.com/python/mypy/issues/708 - _creator: Union[Callable[[Any], T], Callable[[Any], T]] + _creator: Callable[[Any], T] | Callable[[Any], T] _raw_name: str - _validators: List[Callable[[Any], None]] + _validators: list[Callable[[Any], None]] def __init__( self, creator: Callable[[Any], T], *, - raw_name: Optional[str] = None, - validators: Optional[List[Callable[[Any], None]]] = None, + raw_name: str | None = None, + validators: list[Callable[[Any], None]] | None = None, ) -> None: self._creator 
= creator if raw_name is not None: @@ -50,7 +37,7 @@ def __set_name__(self, owner: Metadata, name: str) -> None: if not hasattr(self, "_raw_name"): self._raw_name = name - def __get__(self, obj: Metadata, owner: Type[Metadata]) -> Optional[T]: + def __get__(self, obj: Metadata, owner: type[Metadata]) -> T | None: # TypedDict doesn't support variable key names, and Python 3.7 doesn't # support Literal which would let us let it know that this is validated # already to be safe, so we'll cast here to make things work. @@ -79,7 +66,7 @@ def __delete__(self, obj: Metadata) -> None: raw.pop(self._raw_name, None) validated.pop(self._raw_name, None) - def _validate(self, data: Any) -> Optional[T]: + def _validate(self, data: Any) -> T | None: # Create our value from our raw data value = self._creator(data) if data is not None else None @@ -107,18 +94,18 @@ class Validator(Generic[V], abc.ABC): message: str - def __init__(self, *args: Any, message: Optional[str] = None, **kwargs: Any): + def __init__(self, *args: Any, message: str | None = None, **kwargs: Any): super().__init__(*args, **kwargs) if message is not None: self.message = message - def __call__(self, value: Optional[V]) -> None: + def __call__(self, value: V | None) -> None: try: self.full_validate(value) except Exception as exc: raise ValidationError(self.message.format(value=value)) from exc - def full_validate(self, value: Optional[V]) -> None: + def full_validate(self, value: V | None) -> None: if value is not None: self.validate(value) @@ -131,7 +118,7 @@ class Required(Validator[V]): message: str = "value is required: {value!r}" - def full_validate(self, value: Optional[V]) -> None: + def full_validate(self, value: V | None) -> None: if value is None: raise ValueError("required value") @@ -144,7 +131,7 @@ class RegexValidator(Validator[V]): _regex: re.Pattern[str] message: str = "invalid value: {value!r}" - def __init__(self, regex: Union[str, re.Pattern[str]], *args: Any, **kwargs: Any): + def 
__init__(self, regex: str | re.Pattern[str], *args: Any, **kwargs: Any): super().__init__(*args, **kwargs) if isinstance(regex, str):