From f5f4950d76f0f7731833f4c71770a9ee5b740fe5 Mon Sep 17 00:00:00 2001
From: Donald Stufft <donald@stufft.io>
Date: Fri, 15 Jul 2022 22:40:10 -0400
Subject: [PATCH 01/19] Move metadata into a package

---
 packaging/metadata/__init__.py                | 4 ++++
 packaging/{metadata.py => metadata/_types.py} | 2 +-
 2 files changed, 5 insertions(+), 1 deletion(-)
 create mode 100644 packaging/metadata/__init__.py
 rename packaging/{metadata.py => metadata/_types.py} (99%)

diff --git a/packaging/metadata/__init__.py b/packaging/metadata/__init__.py
new file mode 100644
index 000000000..e5cae33ec
--- /dev/null
+++ b/packaging/metadata/__init__.py
@@ -0,0 +1,4 @@
+from ._types import DynamicField, Metadata
+
+
+__all__ = ["DynamicField", "Metadata"]
diff --git a/packaging/metadata.py b/packaging/metadata/_types.py
similarity index 99%
rename from packaging/metadata.py
rename to packaging/metadata/_types.py
index 81405febe..61325cb7e 100644
--- a/packaging/metadata.py
+++ b/packaging/metadata/_types.py
@@ -4,7 +4,7 @@
 from collections.abc import Iterable
 from typing import Optional, Tuple
 
-from . import (  # Alt name avoids shadowing.
+from .. import (  # Alt name avoids shadowing.
     requirements,
     specifiers,
     utils,

From 88ada97881a88c45b4e2a46c7ceca91dd927d0df Mon Sep 17 00:00:00 2001
From: Donald Stufft <donald@stufft.io>
Date: Sat, 16 Jul 2022 03:58:20 -0400
Subject: [PATCH 02/19] implement parsing from a metadata format to
 intermediate

---
 packaging/metadata/__init__.py |   3 +-
 packaging/metadata/_parse.py   | 473 +++++++++++++++++++++++++++++++++
 2 files changed, 475 insertions(+), 1 deletion(-)
 create mode 100644 packaging/metadata/_parse.py

diff --git a/packaging/metadata/__init__.py b/packaging/metadata/__init__.py
index e5cae33ec..91cb0646a 100644
--- a/packaging/metadata/__init__.py
+++ b/packaging/metadata/__init__.py
@@ -1,4 +1,5 @@
+from ._parse import RawMetadata, parse_email, parse_json
 from ._types import DynamicField, Metadata
 
 
-__all__ = ["DynamicField", "Metadata"]
+__all__ = ["DynamicField", "Metadata", "RawMetadata", "parse_email", "parse_json"]
diff --git a/packaging/metadata/_parse.py b/packaging/metadata/_parse.py
new file mode 100644
index 000000000..192e1be34
--- /dev/null
+++ b/packaging/metadata/_parse.py
@@ -0,0 +1,473 @@
+import json
+import email.feedparser
+import email.parser
+import email.policy
+
+from typing import Any, Optional, TypedDict, cast
+
+
+# The RawMetadata class attempts to make as few assumptions about
+# the underlying serialization formats as possible, these could
+# possibly serialize in an entirely different way, but the idea
+# here is that as long as a serialization formats some very
+# basic primitives in *some* way (strings, lists, and one map
+# but that map can be easily implemented as a list of strings)
+# then we can support serializing to and from that format.
+class RawMetadata(TypedDict, total=False):
+    metadata_version: str
+    name: str
+    version: str
+    dynamic: list[str]
+    platforms: list[str]
+    supported_platforms: list[str]
+    summary: str
+    description: str
+    description_content_type: str
+    keywords: list[str]
+    home_page: str
+    download_url: str
+    author: str
+    author_email: str
+    maintainer: str
+    maintainer_email: str
+    license: str
+    classifiers: list[str]
+    requires_dist: list[str]
+    requires_python: str
+    requires_external: list[str]
+    project_urls: dict[str, str]
+    provides_extra: list[str]
+    provides_dist: list[str]
+    obsoletes_dist: list[str]
+
+
+_STRING_FIELDS = {
+    "metadata_version",
+    "name",
+    "version",
+    "summary",
+    "home_page",
+    "download_url",
+    "author",
+    "author_email",
+    "maintainer",
+    "maintainer_email",
+    "license",
+    "requires_python",
+}
+
+_LIST_STRING_FIELDS = {
+    "dynamic",
+    "platforms",
+    "supported_platforms",
+    "classifiers",
+    "requires_dist",
+    "requires_python",
+    "requires_external",
+    "provides_extra",
+    "provides_dist",
+    "obsoletes_dist",
+}
+
+# General helper functions for parsing some string values for reusing in
+# multiple parse_FORMAT functions
+
+
+def _parse_keywords(data: str) -> list[str]:
+    return [k.strip() for k in data.split(",")]
+
+
+def _parse_project_urls(data: list[str]) -> dict[str, str]:
+    urls = {}
+    for pair in data:
+        # Our logic is slightly tricky here as we want to try and do
+        # *something* reasonable with malformed data.
+        #
+        # The main thing that we have to worry about, is data that does
+        # not have a ',' at all to split the Key from the Value. There
+        # isn't a singular right answer here, and we will fail validation
+        # later on (if the caller is validating) so it doesn't *really*
+        # matter, but since the missing value has to be an empty str
+        # and our return value is dict[str, str], if we let the key
+        # be the missing value, then they'd just multiple '' values that
+        # overwrite each other.
+        #
+        # The other potentional issue is that it's possible to have the
+        # same Key multiple times in the metadata, with no solid "right"
+        # answer with what to do in that case, we'll do the only thing
+        # we can, which is treat the field as unparseable and add it
+        # to our list of unparsed fields.
+        parts = [p.strip() for p in pair.split(",", 1)]
+        parts.extend([""] * (max(0, 2 - len(parts))))  # Ensure 2 items
+
+        # TODO: The spec doesn't say anything about if the keys should be
+        #       considered case sensitive or not... logically they should
+        #       be case preserving, but case insensitive, but doing that
+        #       would open up more cases where we might have duplicated
+        #       entries.
+        label, url = parts
+        if label in urls:
+            # The label already exists in our set of urls, so this field
+            # is unparseable, and we can just add the whole thing to our
+            # unparseable data and stop processing it.
+            raise KeyError("duplicate keys in project urls")
+        urls[label] = url
+
+    return urls
+
+
+# The various parse_FORMAT functions here are intended to be as lenient as
+# possible in their parsing, while still returning a correctly typed
+# RawMetadata.
+#
+# To aid in this, we also generally want to do as little touching of the
+# data as possible, except where there are possibly some historic holdovers
+# that make valid data awkward to work with.
+#
+# While this is a lower level, intermediate format than our ``Metadata``
+# class, some light touch ups can make a massive different in usability.
+
+
+_EMAIL_FIELD_MAPPING = {
+    "Metadata-Version": "metadata_version",
+    "Name": "name",
+    "Version": "version",
+    "Dynamic": "dynamic",
+    "Platform": "platforms",
+    "Supported-Platform": "supported_platforms",
+    "Summary": "summary",
+    "Description": "description",
+    "Description-Content-Type": "description_content_type",
+    "Keywords": "keywords",
+    "Home-Page": "home_page",
+    "Download-URL": "download_url",
+    "Author": "author",
+    "Author-Email": "author_email",
+    "Maintainer": "maintainer",
+    "Maintainer-Email": "maintainer_email",
+    "License": "license",
+    "Classifier": "classifiers",
+    "Requires-Dist": "requires_dist",
+    "Requires-Python": "requires_python",
+    "Requires-External": "requires_external",
+    "Project-URL": "project_urls",
+    "Provides-Extra": "provides_extra",
+    "Provides-Dist": "provides_dist",
+    "Obsoletes-Dist": "obsoletes_dist",
+}
+
+
+def parse_email(data: bytes | str) -> tuple[RawMetadata, dict[Any, Any]]:
+    raw = {}
+    unparsed: dict[Any, Any] = {}
+
+    if isinstance(data, str):
+        parsed = email.parser.Parser(policy=email.policy.compat32).parsestr(data)
+    else:
+        # In theory we could use the BytesParser from email.parser, but that has
+        # several problems that this method solves:
+        #
+        # 1. BytesParser (and BytesFeedParser) hard codes an assumption that the
+        #    bytes are encoded as ascii (with a surrogateescape handler), but
+        #    the packaging specifications explicitly have decided that our specs
+        #    are in UTF8, not ascii.
+        # 2. We could work around (1) by just decoding the bytes using utf8 ourself
+        #    and then pass it into Parser, which we *could* do, however we're
+        #    attempting to be lenient with this method to enable someone to usee
+        #    this class to parse as much as possible while ignoring any errors that
+        #    do come from it.
+        #
+        #    So we'll want to break our bytes up into a list of headers followed up
+        #    by the message body.
+        #
+        #    Unfortunately, doing this is impossible without lightly parsing the
+        #    RFC 822 format ourselves, which is not the most straightforward thing
+        #    primarily because of a few concerns:
+        #
+        #    1. Conceptually RFC 822 messages is a format where you emit all of the
+        #       headers first, one per line, then a blank line, then the body of the
+        #       message. But it has the ability to "fold" a long header line across
+        #       multiple lines, so to correctly do decoding on a field by field basis
+        #       we will have to take this folding into account (but we do not need to
+        #       actually implement the unfolding, we just want to make sure we have
+        #       the entire logical "line" for that header).
+        #    2. The message body isn't part of a normal field, it's effectively a
+        #       a blank header field, then everythig else is part of the body.
+        #    3. If a particular field can't be decoded using utf8, then we want to
+        #       treat that field as unparseable, but getting the name out of that field
+        #       requires implementing (more) of RFC 822 ourselves, though it's a pretty
+        #       straight forward part.
+        #    4. RFC 822 very specifically calls out CRLF as the line endings, but the
+        #       python stdlib email.parser supports CRLF or LF, and in practice the
+        #       core metadata specs are emiting METADATA files using LF only.
+        #
+        # TODO: Is doing this unconditionally for `bytes` the best idea here? Another
+        #       option is to provide a helper function that will produce a possibly
+        #       mojibaked string, and expect people who want per field decoding
+        #       leniency to manually decode bytes using that method instead.
+        parser = email.feedparser.FeedParser(policy=email.policy.compat32)
+
+        # We don't use splitlines here, because it splits on a lot more different
+        # types of line endings than we want to split on. Since in practice we
+        # have to support just LF, we can just split on that, and do our decoding
+        # and let the FeedParser deal with sorting out if it should be CRLF or LF.
+        buf = b""
+        in_body = False
+        for line in data.split(b"\n"):
+            # Put our LF back onto our line that the call to .split() removed.
+            line = line + b"\n"
+
+            # If we're in the body of our message, line continuation no longer matters
+            # and we can just buffer the entire body so we can attempt to decode it
+            # all at once.
+            if in_body:
+                buf += line
+                continue
+
+            # Continuation lines always start with LWSP, so we'll check to if we have
+            # any data to parse and if so, if this is NOT a continuation line, if it's
+            # not then we've finished reading the previous logical line, and we need
+            # to decode it and pass it into the FeedParser.
+            if buf and line[:1] not in {b" ", b"\t"}:
+                try:
+                    encoded = buf.decode("utf8", "strict")
+                except UnicodeDecodeError:
+                    # If we've gotten here, then we can't actually determine what
+                    # encoding this line is in, so we'll try to pull a header key
+                    # out of it to give us something to put into our unparsed data.
+                    parts = buf.split(b":", 1)
+                    parts.extend([b""] * (max(0, 2 - len(parts))))  # Ensure 2 items
+
+                    # We're leaving this data as bytes and we're also leaving it folded,
+                    # if the caller wants to attempt to parse something out of this
+                    unparsed[parts[0]] = parts[1]
+                else:
+                    parser.feed(encoded)
+
+                # Either way, this logical line has been handled, so we'll reset our
+                # buffer and keep going.
+                buf = b""
+
+            # Check to see if this line is the "blank" line that signals the end
+            # of the header data and the start of the body data.
+            if line in {b"\n", b"\r\n"}:
+                parser.feed(line.decode("utf8", "strict"))
+                in_body = True
+            # More header data, add it to our buffer
+            else:
+                buf += line
+
+        # At this point, buf should be full of the entire body (if there was one) so
+        # we'll attempt to decode that.
+        try:
+            encoded = buf.decode("utf8", "strict")
+        except UnicodeDecodeError:
+            # Our body isn't valid UTF8, we know what the key name for the Description
+            # is though, so we can just use that
+            unparsed["Description"] = buf
+
+        # Actually consume our data, turning it into our email Message.
+        parsed = parser.close()
+
+    # We have to wrap parsed.keys() in a set, because in the case of multiple
+    # values for a key (a list), the key will appear multiple times in the
+    # list of keys, but we're avoiding that by using get_all().
+    for name in set(parsed.keys()):
+        # We use get_all here, even for fields that aren't multiple use, because
+        # otherwise someone could have say, two Name fields, and we would just
+        # silently ignore it rather than doing something about it.
+        value = parsed.get_all(name)
+
+        raw_name = _EMAIL_FIELD_MAPPING.get(name)
+        if raw_name is None:
+            # This is a bit of a weird situation, we've encountered a key that
+            # we don't know what it means, so we don't know whether it's meant
+            # to be a list or not.
+            #
+            # Since we can't really tell one way or another, we'll just leave it
+            # as a list, even though it may be a single item list, because that's
+            # what makes the most sense for email headers.
+            unparsed[name] = value
+            continue
+
+        # If this is one of our string fields, then we'll check to see if our
+        # value is a list of a single item, if it is then we'll assume that
+        # it was emited as a single string, and unwrap the str from inside
+        # the list.
+        #
+        # If it's any other kind of data, then we haven't the faintest clue
+        # what we should parse it as, and we have to just add it to our list
+        # of unparsed stuff.
+        if raw_name in _STRING_FIELDS and len(value) == 1:
+            raw[raw_name] = value[0]
+        # If this is one our list of string fields, then we can just assign
+        # the value, since email *only* has strings, and our get_all() call
+        # above ensures that this is a list.
+        elif raw_name in _LIST_STRING_FIELDS:
+            raw[raw_name] = value
+        # Special Case: Keywords
+        # The keywords field is implemented in the metadata spec as a str,
+        # but it conceptually is a list of strings, and is serialized using
+        # ", ".join(keywords), so we'll do some light data massaging to turn
+        # this into what it logically is.
+        elif raw_name == "keywords" and len(value) == 1:
+            raw[raw_name] = _parse_keywords(value[0])
+        # Special Case: Project-URL
+        # The project urls is implemented in the metadata spec as a list of
+        # specially formatted strings that represent a key and a value, which
+        # is fundamentally a mapping, however the email format doesn't support
+        # mappings in a sane way, so it was crammed into a list of strings
+        # instead.
+        #
+        # We will do a little light data massaging to turn this into a map as
+        # it logically should be.
+        elif raw_name == "project_urls":
+            try:
+                raw[raw_name] = _parse_project_urls(value)
+            except ValueError:
+                unparsed[name] = value
+        # Nothing that we've done has managed to parse this, so it'll just
+        # throw it in our unparseable data and move on.
+        else:
+            unparsed[name] = value
+
+    # We need to support getting the Description from the message payload in
+    # addition to getting it from the the headers, but since Description is
+    # conceptually a string, if it's already been set from headers then we'll
+    # clear it out move them both to unparsed.
+    #
+    # NOTE: For whatever reason, this will return a list of strings if the
+    #       message is in mutlipart format, otherwise it will return a single
+    #       string. The list format would be an unparseable error.
+    payload = parsed.get_payload()
+    if payload:
+        # Check to see if we've got duplicated values, if so remove the
+        # parsed one and move to unparsed.
+        if "description" in raw:
+            unparsed["Description"] = [raw.pop("description")]
+            if isinstance(payload, str):
+                unparsed["Description"].append(payload)
+            else:
+                unparsed["Description"].extend(payload)
+        # If payload is a string, then we're good to go to add this to our
+        # RawMetadata.
+        elif isinstance(payload, str):
+            raw["description"] = payload
+        # Otherwise, it's unparseable, and we need to record that.
+        else:
+            unparsed["Description"] = payload
+
+    # We need to cast our `raw` to a metadata, because a TypedDict only support
+    # literal key names, but we're computing our key names on purpose, but the
+    # way this function is implemented, our `TypedDict` can only have valid key
+    # names.
+    return cast(RawMetadata, raw), unparsed
+
+
+# This might appear to be a mapping of the same key to itself, and in many cases
+# it is. However, the algorithm in PEP 566 doesn't match 100% the keys chosen
+# for RawMetadata, so we use this mapping just like with email to handle that.
+_JSON_FIELD_MAPPING = {
+    "metadata_version": "metadata_version",
+    "name": "name",
+    "version": "version",
+    "dynamic": "dynamic",
+    "platform": "platforms",
+    "supported_platform": "supported_platforms",
+    "summary": "summary",
+    "description": "description",
+    "description_content_type": "description_content_type",
+    "keywords": "keywords",
+    "home_page": "home_page",
+    "download_url": "download_url",
+    "author": "author",
+    "author_email": "author_email",
+    "maintainer": "maintainer",
+    "maintainer_email": "maintainer_email",
+    "license": "license",
+    "classifier": "classifiers",
+    "requires_dist": "requires_dist",
+    "requires_python": "requires_python",
+    "requires_external": "requires_external",
+    "project_url": "project_urls",
+    "provides_extra": "provides_extra",
+    "provides_dist": "provides_dist",
+    "obsoletes_dist": "obsoletes_dist",
+}
+
+
+def parse_json(data: bytes | str) -> tuple[RawMetadata, dict[Any, Any]]:
+    raw: dict[Any, Any] = {}
+    unparsed: dict[Any, Any] = {}
+    parsed = json.loads(data)
+
+    # We need to make sure that the data given to us actually implements
+    # a dict, if it's any other type then there is no way we can parse
+    # anything meaningful out of it, so we'll just give up and bail out.
+    if not isinstance(parsed, dict):
+        raise ValueError("Invalid json data, must be a mapping")
+
+    for name, value in parsed.items():
+        raw_name = _JSON_FIELD_MAPPING.get(name)
+        if raw_name is None:
+            # We don't know this key, so chuck it into our unparsed data
+            # and continue on.
+            unparsed[name] = value
+            continue
+
+        # If this is one of our string fields, check to see if it's actually
+        # a string, if it's not then we don't have any idea how to handle it
+        if raw_name in _STRING_FIELDS and isinstance(value, str):
+            raw[raw_name] = value
+        # If this is one of our string fields, check to see if it's actually
+        # a list of strings, if it's not then we don't have any idea how to
+        # handle it
+        elif (
+            raw_name in _LIST_STRING_FIELDS
+            and isinstance(value, list)
+            and all(isinstance(v, str) for v in value)
+        ):
+            raw[raw_name] = cast(list[str], value)
+        # Special Case: Keywords
+        # The keywords field is implemented in the metadata spec as a str,
+        # but it conceptually is a list of strings. Interestingly, the
+        # JSON spec as described in PEP 566 already implements this as a
+        # list of strings, so we don't technically have to do anything.
+        #
+        # We're still treating this as as a special case though, because
+        # in the metadata specification it's a single string, so it's not
+        # included in our list of list string fields.
+        elif (
+            raw_name == "keywords"
+            and isinstance(value, list)
+            and all(isinstance(v, str) for v in value)
+        ):
+            raw[raw_name] = value
+        # Special Case: Project-URL
+        # The project urls is implemented in the metadata spec as a list of
+        # specially formatted strings that represent a key and a value, which
+        # is fundamentally a mapping, however the email format doesn't support
+        # mappings in a sane way, so it was crammed into a list of strings
+        # instead.
+        #
+        # We will do a little light data massaging to turn this into a map as
+        # it logically should be.
+        elif (
+            raw_name == "project_urls"
+            and isinstance(value, list)
+            and all(isinstance(v, str) for v in value)
+        ):
+            try:
+                raw[raw_name] = _parse_project_urls(value)
+            except ValueError:
+                unparsed[name] = value
+        # Nothing that we've done has managed to parse this, so it'll just
+        # throw it in our unparseable data and move on.
+        else:
+            unparsed[name] = value
+
+    # We need to cast our `raw` to a metadata, because a TypedDict only support
+    # literal key names, but we're computing our key names on purpose, but the
+    # way this function is implemented, our `TypedDict` can only have valid key
+    # names.
+    return cast(RawMetadata, raw), unparsed

From 3b42e3ca6f79693324ba5dcad4104f9cff29c857 Mon Sep 17 00:00:00 2001
From: Donald Stufft <donald@stufft.io>
Date: Sat, 16 Jul 2022 04:02:54 -0400
Subject: [PATCH 03/19] fix linting

---
 packaging/metadata/__init__.py | 1 -
 packaging/metadata/_parse.py   | 5 ++---
 2 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/packaging/metadata/__init__.py b/packaging/metadata/__init__.py
index 91cb0646a..c4752576e 100644
--- a/packaging/metadata/__init__.py
+++ b/packaging/metadata/__init__.py
@@ -1,5 +1,4 @@
 from ._parse import RawMetadata, parse_email, parse_json
 from ._types import DynamicField, Metadata
 
-
 __all__ = ["DynamicField", "Metadata", "RawMetadata", "parse_email", "parse_json"]
diff --git a/packaging/metadata/_parse.py b/packaging/metadata/_parse.py
index 192e1be34..6771f4497 100644
--- a/packaging/metadata/_parse.py
+++ b/packaging/metadata/_parse.py
@@ -1,9 +1,8 @@
-import json
 import email.feedparser
 import email.parser
 import email.policy
-
-from typing import Any, Optional, TypedDict, cast
+import json
+from typing import Any, TypedDict, cast
 
 
 # The RawMetadata class attempts to make as few assumptions about

From 9c78c5b7f3426584ba165cd80df089fe7cf17559 Mon Sep 17 00:00:00 2001
From: Donald Stufft <donald@stufft.io>
Date: Sat, 16 Jul 2022 10:47:18 -0400
Subject: [PATCH 04/19] use older syntax for unions

---
 packaging/metadata/_parse.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/packaging/metadata/_parse.py b/packaging/metadata/_parse.py
index 6771f4497..f7db63d52 100644
--- a/packaging/metadata/_parse.py
+++ b/packaging/metadata/_parse.py
@@ -2,7 +2,7 @@
 import email.parser
 import email.policy
 import json
-from typing import Any, TypedDict, cast
+from typing import Any, TypedDict, Union, cast
 
 
 # The RawMetadata class attempts to make as few assumptions about
@@ -156,7 +156,7 @@ def _parse_project_urls(data: list[str]) -> dict[str, str]:
 }
 
 
-def parse_email(data: bytes | str) -> tuple[RawMetadata, dict[Any, Any]]:
+def parse_email(data: Union[bytes, str]) -> tuple[RawMetadata, dict[Any, Any]]:
     raw = {}
     unparsed: dict[Any, Any] = {}
 
@@ -395,7 +395,7 @@ def parse_email(data: bytes | str) -> tuple[RawMetadata, dict[Any, Any]]:
 }
 
 
-def parse_json(data: bytes | str) -> tuple[RawMetadata, dict[Any, Any]]:
+def parse_json(data: Union[bytes, str]) -> tuple[RawMetadata, dict[Any, Any]]:
     raw: dict[Any, Any] = {}
     unparsed: dict[Any, Any] = {}
     parsed = json.loads(data)

From 8e257675a3f67682599383a377a3848d9ec5fff9 Mon Sep 17 00:00:00 2001
From: Donald Stufft <donald@stufft.io>
Date: Sat, 16 Jul 2022 12:27:33 -0400
Subject: [PATCH 05/19] Use the method from pkg_metadata to deal with email
 encoding

---
 packaging/metadata/_parse.py | 205 ++++++++++++++---------------------
 1 file changed, 80 insertions(+), 125 deletions(-)

diff --git a/packaging/metadata/_parse.py b/packaging/metadata/_parse.py
index f7db63d52..46bcbf665 100644
--- a/packaging/metadata/_parse.py
+++ b/packaging/metadata/_parse.py
@@ -1,4 +1,5 @@
 import email.feedparser
+import email.header
 import email.parser
 import email.policy
 import json
@@ -157,116 +158,13 @@ def _parse_project_urls(data: list[str]) -> dict[str, str]:
 
 
 def parse_email(data: Union[bytes, str]) -> tuple[RawMetadata, dict[Any, Any]]:
-    raw = {}
+    raw: dict[str, Any] = {}
     unparsed: dict[Any, Any] = {}
 
     if isinstance(data, str):
         parsed = email.parser.Parser(policy=email.policy.compat32).parsestr(data)
     else:
-        # In theory we could use the BytesParser from email.parser, but that has
-        # several problems that this method solves:
-        #
-        # 1. BytesParser (and BytesFeedParser) hard codes an assumption that the
-        #    bytes are encoded as ascii (with a surrogateescape handler), but
-        #    the packaging specifications explicitly have decided that our specs
-        #    are in UTF8, not ascii.
-        # 2. We could work around (1) by just decoding the bytes using utf8 ourself
-        #    and then pass it into Parser, which we *could* do, however we're
-        #    attempting to be lenient with this method to enable someone to usee
-        #    this class to parse as much as possible while ignoring any errors that
-        #    do come from it.
-        #
-        #    So we'll want to break our bytes up into a list of headers followed up
-        #    by the message body.
-        #
-        #    Unfortunately, doing this is impossible without lightly parsing the
-        #    RFC 822 format ourselves, which is not the most straightforward thing
-        #    primarily because of a few concerns:
-        #
-        #    1. Conceptually RFC 822 messages is a format where you emit all of the
-        #       headers first, one per line, then a blank line, then the body of the
-        #       message. But it has the ability to "fold" a long header line across
-        #       multiple lines, so to correctly do decoding on a field by field basis
-        #       we will have to take this folding into account (but we do not need to
-        #       actually implement the unfolding, we just want to make sure we have
-        #       the entire logical "line" for that header).
-        #    2. The message body isn't part of a normal field, it's effectively a
-        #       a blank header field, then everythig else is part of the body.
-        #    3. If a particular field can't be decoded using utf8, then we want to
-        #       treat that field as unparseable, but getting the name out of that field
-        #       requires implementing (more) of RFC 822 ourselves, though it's a pretty
-        #       straight forward part.
-        #    4. RFC 822 very specifically calls out CRLF as the line endings, but the
-        #       python stdlib email.parser supports CRLF or LF, and in practice the
-        #       core metadata specs are emiting METADATA files using LF only.
-        #
-        # TODO: Is doing this unconditionally for `bytes` the best idea here? Another
-        #       option is to provide a helper function that will produce a possibly
-        #       mojibaked string, and expect people who want per field decoding
-        #       leniency to manually decode bytes using that method instead.
-        parser = email.feedparser.FeedParser(policy=email.policy.compat32)
-
-        # We don't use splitlines here, because it splits on a lot more different
-        # types of line endings than we want to split on. Since in practice we
-        # have to support just LF, we can just split on that, and do our decoding
-        # and let the FeedParser deal with sorting out if it should be CRLF or LF.
-        buf = b""
-        in_body = False
-        for line in data.split(b"\n"):
-            # Put our LF back onto our line that the call to .split() removed.
-            line = line + b"\n"
-
-            # If we're in the body of our message, line continuation no longer matters
-            # and we can just buffer the entire body so we can attempt to decode it
-            # all at once.
-            if in_body:
-                buf += line
-                continue
-
-            # Continuation lines always start with LWSP, so we'll check to if we have
-            # any data to parse and if so, if this is NOT a continuation line, if it's
-            # not then we've finished reading the previous logical line, and we need
-            # to decode it and pass it into the FeedParser.
-            if buf and line[:1] not in {b" ", b"\t"}:
-                try:
-                    encoded = buf.decode("utf8", "strict")
-                except UnicodeDecodeError:
-                    # If we've gotten here, then we can't actually determine what
-                    # encoding this line is in, so we'll try to pull a header key
-                    # out of it to give us something to put into our unparsed data.
-                    parts = buf.split(b":", 1)
-                    parts.extend([b""] * (max(0, 2 - len(parts))))  # Ensure 2 items
-
-                    # We're leaving this data as bytes and we're also leaving it folded,
-                    # if the caller wants to attempt to parse something out of this
-                    unparsed[parts[0]] = parts[1]
-                else:
-                    parser.feed(encoded)
-
-                # Either way, this logical line has been handled, so we'll reset our
-                # buffer and keep going.
-                buf = b""
-
-            # Check to see if this line is the "blank" line that signals the end
-            # of the header data and the start of the body data.
-            if line in {b"\n", b"\r\n"}:
-                parser.feed(line.decode("utf8", "strict"))
-                in_body = True
-            # More header data, add it to our buffer
-            else:
-                buf += line
-
-        # At this point, buf should be full of the entire body (if there was one) so
-        # we'll attempt to decode that.
-        try:
-            encoded = buf.decode("utf8", "strict")
-        except UnicodeDecodeError:
-            # Our body isn't valid UTF8, we know what the key name for the Description
-            # is though, so we can just use that
-            unparsed["Description"] = buf
-
-        # Actually consume our data, turning it into our email Message.
-        parsed = parser.close()
+        parsed = email.parser.BytesParser(policy=email.policy.compat32).parsebytes(data)
 
     # We have to wrap parsed.keys() in a set, because in the case of multiple
     # values for a key (a list), the key will appear multiple times in the
@@ -275,7 +173,63 @@ def parse_email(data: Union[bytes, str]) -> tuple[RawMetadata, dict[Any, Any]]:
         # We use get_all here, even for fields that aren't multiple use, because
         # otherwise someone could have say, two Name fields, and we would just
         # silently ignore it rather than doing something about it.
-        value = parsed.get_all(name)
+        headers = parsed.get_all(name)
+
+        # The way the email module works when parsing bytes is that it
+        # unconditionally decodes the bytes as ascii, using the surrogateescape
+        # handler, and then when you pull that data back out (such as with get_all)
+        # it looks to see if the str has any surrogate escapes, and if it does
+        # it wraps it in a Header object instead of returning the string.
+        #
+        # So we'll look for those Header objects, and fix up the encoding
+        value = []
+        valid_encoding = True
+        for h in headers:
+            # It's unclear if this can return more types than just a Header or
+            # a str, so we'll just assert here to make sure.
+            assert isinstance(h, (email.header.Header, str))
+
+            # If it's a header object, we need to do our little dance to get
+            # the real data out of it. In cases where there is invalid data
+            # we're going to end up with mojibake, but I don't see a good way
+            # around that without reimplementing parts of the Header object
+            # ourselves.
+            #
+            # That should be fine, since if that happens, this key is going
+            # into the unparsed dict anyways.
+            if isinstance(h, email.header.Header):
+                # The Header object stores its data as chunks, and each chunk
+                # can be independently encoded, so we'll need to check each
+                # of them.
+                chunks = []
+                for bin, encoding in email.header.decode_header(h):
+                    # This means it found a surrogate escape, that could be
+                    # valid data (if the source was utf8), or invalid.
+                    if encoding == "unknown-8bit":
+                        try:
+                            bin.decode("utf8", "strict")
+                        except UnicodeDecodeError:
+                            # Enable mojibake
+                            encoding = "latin1"
+                            valid_encoding = False
+                        else:
+                            encoding = "utf8"
+                    chunks.append((bin, encoding))
+
+                # Turn our chunks back into a Header object, then let that
+                # Header object do the right thing to turn them into a
+                # string for us.
+                value.append(str(email.header.make_header(chunks)))
+            # This is already a string, so just add it
+            else:
+                value.append(h)
+
+        # We've processed all of our values to get them into a list of str,
+        # but we may have mojibake data, in which case this is an unparsed
+        # field.
+        if not valid_encoding:
+            unparsed[name] = value
+            continue
 
         raw_name = _EMAIL_FIELD_MAPPING.get(name)
         if raw_name is None:
@@ -335,26 +289,27 @@ def parse_email(data: Union[bytes, str]) -> tuple[RawMetadata, dict[Any, Any]]:
     # conceptually a string, if it's already been set from headers then we'll
     # clear it out move them both to unparsed.
     #
-    # NOTE: For whatever reason, this will return a list of strings if the
-    #       message is in mutlipart format, otherwise it will return a single
-    #       string. The list format would be an unparseable error.
-    payload = parsed.get_payload()
-    if payload:
-        # Check to see if we've got duplicated values, if so remove the
-        # parsed one and move to unparsed.
-        if "description" in raw:
-            unparsed["Description"] = [raw.pop("description")]
-            if isinstance(payload, str):
-                unparsed["Description"].append(payload)
-            else:
-                unparsed["Description"].extend(payload)
-        # If payload is a string, then we're good to go to add this to our
-        # RawMetadata.
-        elif isinstance(payload, str):
-            raw["description"] = payload
-        # Otherwise, it's unparseable, and we need to record that.
+    # It's possible that someone has messed up and given us a multipart body,
+    # in which case we'll move the entire body to the unparsed dictionary.
+    if parsed.is_multipart():
+        unparsed["Description"] = parsed.get_payload(decode=True)
+    # We know we'll get a single bytes object out of this, so now we just need
+    # to deal with encodings.
+    else:
+        bpayload = parsed.get_payload(decode=True)
+        assert isinstance(bpayload, bytes)
+
+        try:
+            payload = bpayload.decode("utf", "strict")
+        except UnicodeDecodeError:
+            unparsed["Description"] = bpayload
         else:
-            unparsed["Description"] = payload
+            # Check to see if we've already got a description, if so then both
+            # it, and this body move to unparseable.
+            if "description" in raw:
+                unparsed["Description"] = [raw.pop("description"), payload]
+            else:
+                raw["description"] = payload
 
     # We need to cast our `raw` to a metadata, because a TypedDict only support
     # literal key names, but we're computing our key names on purpose, but the

From 54d663baa461791851f9361c54052fd79fb84d06 Mon Sep 17 00:00:00 2001
From: Donald Stufft <donald@stufft.io>
Date: Sat, 16 Jul 2022 12:40:20 -0400
Subject: [PATCH 06/19] correct casing and missing headers

---
 packaging/metadata/_parse.py | 57 ++++++++++++++++++++----------------
 1 file changed, 31 insertions(+), 26 deletions(-)

diff --git a/packaging/metadata/_parse.py b/packaging/metadata/_parse.py
index 46bcbf665..5bc42f7af 100644
--- a/packaging/metadata/_parse.py
+++ b/packaging/metadata/_parse.py
@@ -46,6 +46,8 @@ class RawMetadata(TypedDict, total=False):
     "name",
     "version",
     "summary",
+    "description",
+    "description_content_type",
     "home_page",
     "download_url",
     "author",
@@ -62,7 +64,6 @@ class RawMetadata(TypedDict, total=False):
     "supported_platforms",
     "classifiers",
     "requires_dist",
-    "requires_python",
     "requires_external",
     "provides_extra",
     "provides_dist",
@@ -129,31 +130,31 @@ def _parse_project_urls(data: list[str]) -> dict[str, str]:
 
 
 _EMAIL_FIELD_MAPPING = {
-    "Metadata-Version": "metadata_version",
-    "Name": "name",
-    "Version": "version",
-    "Dynamic": "dynamic",
-    "Platform": "platforms",
-    "Supported-Platform": "supported_platforms",
-    "Summary": "summary",
-    "Description": "description",
-    "Description-Content-Type": "description_content_type",
-    "Keywords": "keywords",
-    "Home-Page": "home_page",
-    "Download-URL": "download_url",
-    "Author": "author",
-    "Author-Email": "author_email",
-    "Maintainer": "maintainer",
-    "Maintainer-Email": "maintainer_email",
-    "License": "license",
-    "Classifier": "classifiers",
-    "Requires-Dist": "requires_dist",
-    "Requires-Python": "requires_python",
-    "Requires-External": "requires_external",
-    "Project-URL": "project_urls",
-    "Provides-Extra": "provides_extra",
-    "Provides-Dist": "provides_dist",
-    "Obsoletes-Dist": "obsoletes_dist",
+    "metadata-version": "metadata_version",
+    "name": "name",
+    "version": "version",
+    "dynamic": "dynamic",
+    "platform": "platforms",
+    "supported-platform": "supported_platforms",
+    "summary": "summary",
+    "description": "description",
+    "description-content-type": "description_content_type",
+    "keywords": "keywords",
+    "home-page": "home_page",
+    "download-url": "download_url",
+    "author": "author",
+    "author-email": "author_email",
+    "maintainer": "maintainer",
+    "maintainer-email": "maintainer_email",
+    "license": "license",
+    "classifier": "classifiers",
+    "requires-dist": "requires_dist",
+    "requires-python": "requires_python",
+    "requires-external": "requires_external",
+    "project-url": "project_urls",
+    "provides-extra": "provides_extra",
+    "provides-dist": "provides_dist",
+    "obsoletes-dist": "obsoletes_dist",
 }
 
 
@@ -170,6 +171,10 @@ def parse_email(data: Union[bytes, str]) -> tuple[RawMetadata, dict[Any, Any]]:
     # values for a key (a list), the key will appear multiple times in the
     # list of keys, but we're avoiding that by using get_all().
     for name in set(parsed.keys()):
+        # Header names in RFC are case insensitive, so we'll normalize to all
+        # lower case to make comparisons easier.
+        name = name.lower()
+
         # We use get_all here, even for fields that aren't multiple use, because
         # otherwise someone could have say, two Name fields, and we would just
         # silently ignore it rather than doing something about it.

From 85516bcc007952f00c609411da35c5a3b5d4ac4b Mon Sep 17 00:00:00 2001
From: Donald Stufft <donald@stufft.io>
Date: Sat, 16 Jul 2022 13:23:57 -0400
Subject: [PATCH 07/19] Handle str vs bytes data better

---
 packaging/metadata/_parse.py | 51 ++++++++++++++++++++++--------------
 1 file changed, 31 insertions(+), 20 deletions(-)

diff --git a/packaging/metadata/_parse.py b/packaging/metadata/_parse.py
index 5bc42f7af..59d04874c 100644
--- a/packaging/metadata/_parse.py
+++ b/packaging/metadata/_parse.py
@@ -2,6 +2,7 @@
 import email.header
 import email.parser
 import email.policy
+import email.message
 import json
 from typing import Any, TypedDict, Union, cast
 
@@ -293,28 +294,17 @@ def parse_email(data: Union[bytes, str]) -> tuple[RawMetadata, dict[Any, Any]]:
     # addition to getting it from the the headers, but since Description is
     # conceptually a string, if it's already been set from headers then we'll
     # clear it out move them both to unparsed.
-    #
-    # It's possible that someone has messed up and given us a multipart body,
-    # in which case we'll move the entire body to the unparsed dictionary.
-    if parsed.is_multipart():
-        unparsed["Description"] = parsed.get_payload(decode=True)
-    # We know we'll get a single bytes object out of this, so now we just need
-    # to deal with encodings.
+    try:
+        payload = _get_payload(parsed, data)
+    except ValueError:
+        unparsed["Description"] = parsed.get_payload(decode=isinstance(data, bytes))
     else:
-        bpayload = parsed.get_payload(decode=True)
-        assert isinstance(bpayload, bytes)
-
-        try:
-            payload = bpayload.decode("utf", "strict")
-        except UnicodeDecodeError:
-            unparsed["Description"] = bpayload
+        # Check to see if we've already got a description, if so then both
+        # it, and this body move to unparseable.
+        if "description" in raw:
+            unparsed["Description"] = [raw.pop("description"), payload]
         else:
-            # Check to see if we've already got a description, if so then both
-            # it, and this body move to unparseable.
-            if "description" in raw:
-                unparsed["Description"] = [raw.pop("description"), payload]
-            else:
-                raw["description"] = payload
+            raw["description"] = payload
 
     # We need to cast our `raw` to a metadata, because a TypedDict only support
     # literal key names, but we're computing our key names on purpose, but the
@@ -430,3 +420,24 @@ def parse_json(data: Union[bytes, str]) -> tuple[RawMetadata, dict[Any, Any]]:
     # way this function is implemented, our `TypedDict` can only have valid key
     # names.
     return cast(RawMetadata, raw), unparsed
+
+
+def _get_payload(msg: email.message.Message, source: Union[bytes, str]) -> str:
+    # If our source is a str, then our caller has managed encodings for us,
+    # and we don't need to deal with it.
+    if isinstance(source, str):
+        payload: Union[list[str], str] = msg.get_payload()
+        if isinstance(payload, list):
+            raise ValueError("payload is a multipart")
+        return payload
+    # If our source is a bytes, then we're managing the encoding and we need
+    # to deal with it.
+    else:
+        bpayload: Union[list[bytes], bytes] = msg.get_payload(decode=True)
+        if isinstance(bpayload, list):
+            raise ValueError("payload is a multipart")
+
+        try:
+            return bpayload.decode("utf8", "strict")
+        except UnicodeDecodeError:
+            raise ValueError("payload in an invalid encoding")

From e30b28ebc6ff49a48cba377cdb9e26ad8da0f46a Mon Sep 17 00:00:00 2001
From: Donald Stufft <donald@stufft.io>
Date: Sat, 16 Jul 2022 13:30:53 -0400
Subject: [PATCH 08/19] sort metadata fields better, add missing Metadata 1.1
 fields

---
 packaging/metadata/_parse.py | 148 +++++++++++++++++++++--------------
 1 file changed, 91 insertions(+), 57 deletions(-)

diff --git a/packaging/metadata/_parse.py b/packaging/metadata/_parse.py
index 59d04874c..95c6cb81a 100644
--- a/packaging/metadata/_parse.py
+++ b/packaging/metadata/_parse.py
@@ -15,60 +15,88 @@
 # but that map can be easily implemented as a list of strings)
 # then we can support serializing to and from that format.
 class RawMetadata(TypedDict, total=False):
+    # Metadata 1.0 - PEP 241
     metadata_version: str
     name: str
     version: str
-    dynamic: list[str]
     platforms: list[str]
-    supported_platforms: list[str]
     summary: str
     description: str
-    description_content_type: str
     keywords: list[str]
     home_page: str
-    download_url: str
     author: str
     author_email: str
-    maintainer: str
-    maintainer_email: str
     license: str
+
+    # Metadata 1.1 - PEP 314
+    supported_platforms: list[str]
+    download_url: str
     classifiers: list[str]
+    requires: list[str]
+    provides: list[str]
+    obsoletes: list[str]
+
+    # Metadata 1.2 - PEP 345
+    maintainer: str
+    maintainer_email: str
     requires_dist: list[str]
+    provides_dist: list[str]
+    obsoletes_dist: list[str]
     requires_python: str
     requires_external: list[str]
     project_urls: dict[str, str]
+
+    # Metadata 2.0
+    # PEP 426 attempted to completely revamp the metadata format
+    # but got stuck without ever being able to build consensus on
+    # it and ultimately ended up withdrawn.
+    #
+    # However, a number of tools had started emitting METADATA with
+    # `2.0` Metadata-Version, so for historical reasons, this version
+    # was skipped.
+
+    # Metadata 2.1 - PEP 566
+    description_content_type: str
     provides_extra: list[str]
-    provides_dist: list[str]
-    obsoletes_dist: list[str]
+
+    # Metadata 2.2 - PEP 643
+    dynamic: list[str]
+
+    # Metadata 2.3 - PEP 685
+    # No new fields were added in PEP 685, just some edge case were
+    # tightened up to provide better interoperability.
 
 
 _STRING_FIELDS = {
-    "metadata_version",
-    "name",
-    "version",
-    "summary",
+    "author",
+    "author_email",
     "description",
     "description_content_type",
-    "home_page",
     "download_url",
-    "author",
-    "author_email",
+    "home_page",
+    "license",
     "maintainer",
     "maintainer_email",
-    "license",
+    "metadata_version",
+    "name",
     "requires_python",
+    "summary",
+    "version",
 }
 
 _LIST_STRING_FIELDS = {
+    "classifiers",
     "dynamic",
+    "obsoletes",
+    "obsoletes_dist",
     "platforms",
-    "supported_platforms",
-    "classifiers",
+    "provides",
+    "provides_dist",
+    "provides_extra",
+    "requires",
     "requires_dist",
     "requires_external",
-    "provides_extra",
-    "provides_dist",
-    "obsoletes_dist",
+    "supported_platforms",
 }
 
 # General helper functions for parsing some string values for reusing in
@@ -131,31 +159,34 @@ def _parse_project_urls(data: list[str]) -> dict[str, str]:
 
 
 _EMAIL_FIELD_MAPPING = {
-    "metadata-version": "metadata_version",
-    "name": "name",
-    "version": "version",
-    "dynamic": "dynamic",
-    "platform": "platforms",
-    "supported-platform": "supported_platforms",
-    "summary": "summary",
+    "author": "author",
+    "author-email": "author_email",
+    "classifier": "classifiers",
     "description": "description",
     "description-content-type": "description_content_type",
-    "keywords": "keywords",
-    "home-page": "home_page",
     "download-url": "download_url",
-    "author": "author",
-    "author-email": "author_email",
+    "dynamic": "dynamic",
+    "home-page": "home_page",
+    "keywords": "keywords",
+    "license": "license",
     "maintainer": "maintainer",
     "maintainer-email": "maintainer_email",
-    "license": "license",
-    "classifier": "classifiers",
-    "requires-dist": "requires_dist",
-    "requires-python": "requires_python",
-    "requires-external": "requires_external",
+    "metadata-version": "metadata_version",
+    "name": "name",
+    "obsoletes": "obsoletes",
+    "obsoletes-dist": "obsoletes_dist",
+    "platform": "platforms",
     "project-url": "project_urls",
-    "provides-extra": "provides_extra",
+    "provides": "provides",
     "provides-dist": "provides_dist",
-    "obsoletes-dist": "obsoletes_dist",
+    "provides-extra": "provides_extra",
+    "requires": "requires",
+    "requires-dist": "requires_dist",
+    "requires-external": "requires_external",
+    "requires-python": "requires_python",
+    "summary": "summary",
+    "supported-platform": "supported_platforms",
+    "version": "version",
 }
 
 
@@ -317,31 +348,34 @@ def parse_email(data: Union[bytes, str]) -> tuple[RawMetadata, dict[Any, Any]]:
 # it is. However, the algorithm in PEP 566 doesn't match 100% the keys chosen
 # for RawMetadata, so we use this mapping just like with email to handle that.
 _JSON_FIELD_MAPPING = {
-    "metadata_version": "metadata_version",
-    "name": "name",
-    "version": "version",
-    "dynamic": "dynamic",
-    "platform": "platforms",
-    "supported_platform": "supported_platforms",
-    "summary": "summary",
+    "author": "author",
+    "author_email": "author_email",
+    "classifier": "classifiers",
     "description": "description",
     "description_content_type": "description_content_type",
-    "keywords": "keywords",
-    "home_page": "home_page",
     "download_url": "download_url",
-    "author": "author",
-    "author_email": "author_email",
+    "dynamic": "dynamic",
+    "home_page": "home_page",
+    "keywords": "keywords",
+    "license": "license",
     "maintainer": "maintainer",
     "maintainer_email": "maintainer_email",
-    "license": "license",
-    "classifier": "classifiers",
-    "requires_dist": "requires_dist",
-    "requires_python": "requires_python",
-    "requires_external": "requires_external",
+    "metadata_version": "metadata_version",
+    "name": "name",
+    "obsoletes": "obsoletes",
+    "obsoletes_dist": "obsoletes_dist",
+    "platform": "platforms",
     "project_url": "project_urls",
-    "provides_extra": "provides_extra",
+    "provides": "provides",
     "provides_dist": "provides_dist",
-    "obsoletes_dist": "obsoletes_dist",
+    "provides_extra": "provides_extra",
+    "requires": "requires",
+    "requires_dist": "requires_dist",
+    "requires_external": "requires_external",
+    "requires_python": "requires_python",
+    "summary": "summary",
+    "supported_platform": "supported_platforms",
+    "version": "version",
 }
 
 

From 5b7e097dc3f79ba855625b877413a3dbb6e67fc1 Mon Sep 17 00:00:00 2001
From: Donald Stufft <donald@stufft.io>
Date: Sat, 16 Jul 2022 13:31:38 -0400
Subject: [PATCH 09/19] linting

---
 packaging/metadata/_parse.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/packaging/metadata/_parse.py b/packaging/metadata/_parse.py
index 95c6cb81a..80080ec7b 100644
--- a/packaging/metadata/_parse.py
+++ b/packaging/metadata/_parse.py
@@ -1,8 +1,8 @@
 import email.feedparser
 import email.header
+import email.message
 import email.parser
 import email.policy
-import email.message
 import json
 from typing import Any, TypedDict, Union, cast
 

From 2a795e9446833291f0dd3ee89e6484ab88ed7931 Mon Sep 17 00:00:00 2001
From: Donald Stufft <donald@stufft.io>
Date: Sat, 16 Jul 2022 13:34:48 -0400
Subject: [PATCH 10/19] reorganization

---
 packaging/metadata/__init__.py            | 2 +-
 packaging/metadata/{_parse.py => _raw.py} | 0
 2 files changed, 1 insertion(+), 1 deletion(-)
 rename packaging/metadata/{_parse.py => _raw.py} (100%)

diff --git a/packaging/metadata/__init__.py b/packaging/metadata/__init__.py
index c4752576e..f9594b166 100644
--- a/packaging/metadata/__init__.py
+++ b/packaging/metadata/__init__.py
@@ -1,4 +1,4 @@
-from ._parse import RawMetadata, parse_email, parse_json
+from ._raw import RawMetadata, parse_email, parse_json
 from ._types import DynamicField, Metadata
 
 __all__ = ["DynamicField", "Metadata", "RawMetadata", "parse_email", "parse_json"]
diff --git a/packaging/metadata/_parse.py b/packaging/metadata/_raw.py
similarity index 100%
rename from packaging/metadata/_parse.py
rename to packaging/metadata/_raw.py

From e0959708170572b1d4aba736ec9752d700d579e0 Mon Sep 17 00:00:00 2001
From: Donald Stufft <donald@stufft.io>
Date: Sat, 16 Jul 2022 13:36:29 -0400
Subject: [PATCH 11/19] Expose packaging.metadata.raw as its own module

---
 packaging/metadata/__init__.py         | 3 +--
 packaging/metadata/{_raw.py => raw.py} | 3 +++
 2 files changed, 4 insertions(+), 2 deletions(-)
 rename packaging/metadata/{_raw.py => raw.py} (99%)

diff --git a/packaging/metadata/__init__.py b/packaging/metadata/__init__.py
index f9594b166..d0ba767b1 100644
--- a/packaging/metadata/__init__.py
+++ b/packaging/metadata/__init__.py
@@ -1,4 +1,3 @@
-from ._raw import RawMetadata, parse_email, parse_json
 from ._types import DynamicField, Metadata
 
-__all__ = ["DynamicField", "Metadata", "RawMetadata", "parse_email", "parse_json"]
+__all__ = ["DynamicField", "Metadata"]
diff --git a/packaging/metadata/_raw.py b/packaging/metadata/raw.py
similarity index 99%
rename from packaging/metadata/_raw.py
rename to packaging/metadata/raw.py
index 80080ec7b..03d88990e 100644
--- a/packaging/metadata/_raw.py
+++ b/packaging/metadata/raw.py
@@ -7,6 +7,9 @@
 from typing import Any, TypedDict, Union, cast
 
 
+__all__ = ["RawMetadata", "parse_email", "parse_json"]
+
+
 # The RawMetadata class attempts to make as few assumptions about
 # the underlying serialization formats as possible, these could
 # possibly serialize in an entirely different way, but the idea

From 0a2b73376d4f3bc4e498b244791f0b75b63790e0 Mon Sep 17 00:00:00 2001
From: Donald Stufft <donald@stufft.io>
Date: Sat, 16 Jul 2022 13:40:15 -0400
Subject: [PATCH 12/19] More compatible type hints

---
 packaging/metadata/raw.py | 53 +++++++++++++++++++--------------------
 1 file changed, 26 insertions(+), 27 deletions(-)

diff --git a/packaging/metadata/raw.py b/packaging/metadata/raw.py
index 03d88990e..7ba0c2cc0 100644
--- a/packaging/metadata/raw.py
+++ b/packaging/metadata/raw.py
@@ -4,8 +4,7 @@
 import email.parser
 import email.policy
 import json
-from typing import Any, TypedDict, Union, cast
-
+from typing import Any, Dict, List, Tuple, TypedDict, Union, cast
 
 __all__ = ["RawMetadata", "parse_email", "parse_json"]
 
@@ -22,32 +21,32 @@ class RawMetadata(TypedDict, total=False):
     metadata_version: str
     name: str
     version: str
-    platforms: list[str]
+    platforms: List[str]
     summary: str
     description: str
-    keywords: list[str]
+    keywords: List[str]
     home_page: str
     author: str
     author_email: str
     license: str
 
     # Metadata 1.1 - PEP 314
-    supported_platforms: list[str]
+    supported_platforms: List[str]
     download_url: str
-    classifiers: list[str]
-    requires: list[str]
-    provides: list[str]
-    obsoletes: list[str]
+    classifiers: List[str]
+    requires: List[str]
+    provides: List[str]
+    obsoletes: List[str]
 
     # Metadata 1.2 - PEP 345
     maintainer: str
     maintainer_email: str
-    requires_dist: list[str]
-    provides_dist: list[str]
-    obsoletes_dist: list[str]
+    requires_dist: List[str]
+    provides_dist: List[str]
+    obsoletes_dist: List[str]
     requires_python: str
-    requires_external: list[str]
-    project_urls: dict[str, str]
+    requires_external: List[str]
+    project_urls: Dict[str, str]
 
     # Metadata 2.0
     # PEP 426 attempted to completely revamp the metadata format
@@ -60,10 +59,10 @@ class RawMetadata(TypedDict, total=False):
 
     # Metadata 2.1 - PEP 566
     description_content_type: str
-    provides_extra: list[str]
+    provides_extra: List[str]
 
     # Metadata 2.2 - PEP 643
-    dynamic: list[str]
+    dynamic: List[str]
 
     # Metadata 2.3 - PEP 685
     # No new fields were added in PEP 685, just some edge case were
@@ -106,11 +105,11 @@ class RawMetadata(TypedDict, total=False):
 # multiple parse_FORMAT functions
 
 
-def _parse_keywords(data: str) -> list[str]:
+def _parse_keywords(data: str) -> List[str]:
     return [k.strip() for k in data.split(",")]
 
 
-def _parse_project_urls(data: list[str]) -> dict[str, str]:
+def _parse_project_urls(data: List[str]) -> Dict[str, str]:
     urls = {}
     for pair in data:
         # Our logic is slightly tricky here as we want to try and do
@@ -193,9 +192,9 @@ def _parse_project_urls(data: list[str]) -> dict[str, str]:
 }
 
 
-def parse_email(data: Union[bytes, str]) -> tuple[RawMetadata, dict[Any, Any]]:
-    raw: dict[str, Any] = {}
-    unparsed: dict[Any, Any] = {}
+def parse_email(data: Union[bytes, str]) -> Tuple[RawMetadata, Dict[Any, Any]]:
+    raw: Dict[str, Any] = {}
+    unparsed: Dict[Any, Any] = {}
 
     if isinstance(data, str):
         parsed = email.parser.Parser(policy=email.policy.compat32).parsestr(data)
@@ -382,9 +381,9 @@ def parse_email(data: Union[bytes, str]) -> tuple[RawMetadata, dict[Any, Any]]:
 }
 
 
-def parse_json(data: Union[bytes, str]) -> tuple[RawMetadata, dict[Any, Any]]:
-    raw: dict[Any, Any] = {}
-    unparsed: dict[Any, Any] = {}
+def parse_json(data: Union[bytes, str]) -> Tuple[RawMetadata, Dict[Any, Any]]:
+    raw: Dict[Any, Any] = {}
+    unparsed: Dict[Any, Any] = {}
     parsed = json.loads(data)
 
     # We need to make sure that the data given to us actually implements
@@ -413,7 +412,7 @@ def parse_json(data: Union[bytes, str]) -> tuple[RawMetadata, dict[Any, Any]]:
             and isinstance(value, list)
             and all(isinstance(v, str) for v in value)
         ):
-            raw[raw_name] = cast(list[str], value)
+            raw[raw_name] = cast(List[str], value)
         # Special Case: Keywords
         # The keywords field is implemented in the metadata spec as a str,
         # but it conceptually is a list of strings. Interestingly, the
@@ -463,14 +462,14 @@ def _get_payload(msg: email.message.Message, source: Union[bytes, str]) -> str:
     # If our source is a str, then our caller has managed encodings for us,
     # and we don't need to deal with it.
     if isinstance(source, str):
-        payload: Union[list[str], str] = msg.get_payload()
+        payload: Union[List[str], str] = msg.get_payload()
         if isinstance(payload, list):
             raise ValueError("payload is a multipart")
         return payload
     # If our source is a bytes, then we're managing the encoding and we need
     # to deal with it.
     else:
-        bpayload: Union[list[bytes], bytes] = msg.get_payload(decode=True)
+        bpayload: Union[List[bytes], bytes] = msg.get_payload(decode=True)
         if isinstance(bpayload, list):
             raise ValueError("payload is a multipart")
 

From 36ac9b71fc8e8df97d123fff01793559673df108 Mon Sep 17 00:00:00 2001
From: Donald Stufft <donald@stufft.io>
Date: Sat, 16 Jul 2022 17:19:01 -0400
Subject: [PATCH 13/19] Enable emitting email/json from RawMetadata

---
 packaging/metadata/raw.py | 134 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 134 insertions(+)

diff --git a/packaging/metadata/raw.py b/packaging/metadata/raw.py
index 7ba0c2cc0..d25d8f37b 100644
--- a/packaging/metadata/raw.py
+++ b/packaging/metadata/raw.py
@@ -346,6 +346,93 @@ def parse_email(data: Union[bytes, str]) -> Tuple[RawMetadata, Dict[Any, Any]]:
     return cast(RawMetadata, raw), unparsed
 
 
+_EMAIL_FIELD_ORDER = [
+    # Always put the metadata version first, in case it ever changes how
+    # we parse this file.
+    "metadata_version",
+    # Put the other pieces of mandatory information next.
+    "name",
+    "version",
+    # We're just going to emit all of these in sorted order, except we'll
+    # float deprecated or "rarely used" fields to the bottom.
+    "author",
+    "author_email",
+    "classifiers",
+    # We are purposely excluding the description field, we don't want to
+    # write that field out as a header, so we won't include it in this list
+    # and it will have to be manually handled instead.
+    # "description",
+    "description_content_type",
+    "download_url",
+    "dynamic",
+    "home_page",
+    "keywords",
+    "license",
+    "maintainer",
+    "maintainer_email",
+    "platforms",
+    "project_urls",
+    "provides_extra",
+    "requires_dist",
+    "requires_python",
+    "summary",
+    "supported_platforms",
+    # Deprecated or "rarely used"
+    "obsoletes",
+    "obsoletes_dist",
+    "provides",
+    "provides_dist",
+    "requires",
+    "requires_external",
+]
+
+
+def emit_email(raw: RawMetadata) -> bytes:
+    # TypedDict only allows literal keys, we know that our dynamic keys are correct
+    # but to satisfy the type checker we'll cast things.
+    data = cast(Dict[str, Any], raw)
+
+    # Figure out our mapping to email names
+    field_names = dict((v, k) for (k, v) in _EMAIL_FIELD_MAPPING.items())
+
+    # From what I can tell, there is no way to get the email module in the stdlib
+    # to actually emit a ``METADATA`` file in the format that we need, so instead
+    # we'll have to manually craft one.
+    lines = []
+
+    for field in _EMAIL_FIELD_ORDER:
+        field_name = field_names[field]
+        field_data = data.get(field)
+        if field_data:
+            # String fields get emitted as Key: Data
+            if field in _STRING_FIELDS and isinstance(field_data, str):
+                lines.append(f"{field_name}: {_rfc822_escape(field_data)}")
+            # List String fields get emitted as a Key: Data per entry.
+            elif field in _LIST_STRING_FIELDS and isinstance(field_data, list):
+                for item in field_data:
+                    lines.append(f"{field_name}: {_rfc822_escape(item)}")
+            # Special Case: Keywords
+            #   We need to turn our List String for Keywords back into a singular
+            #   string for the core metadata spec.
+            elif field == "keywords" and isinstance(field_data, list):
+                lines.append(f"{field_name}: {_rfc822_escape(', '.join(field_data))}")
+            # Special Case: Project-URL
+            #   We need to turn our dict[str, str] back into the list of specially
+            #   formatted strings to match what the core metadata expects.
+            elif field == "project_urls" and isinstance(field_data, dict):
+                for label, url in field_data.items():
+                    lines.append(
+                        f"{field_name}: {_rfc822_escape(', '.join([label, url]))}"
+                    )
+
+    msg = "\n".join(lines)
+    description = raw.get("description")
+    if description:
+        msg = msg + "\n\n" + description
+
+    return msg.encode("utf8")
+
+
 # This might appear to be a mapping of the same key to itself, and in many cases
 # it is. However, the algorithm in PEP 566 doesn't match 100% the keys chosen
 # for RawMetadata, so we use this mapping just like with email to handle that.
@@ -458,6 +545,43 @@ def parse_json(data: Union[bytes, str]) -> Tuple[RawMetadata, Dict[Any, Any]]:
     return cast(RawMetadata, raw), unparsed
 
 
+def emit_json(raw: RawMetadata) -> bytes:
+    # TypedDict only allows literal keys; we know that our dynamic keys are correct,
+    # but to satisfy the type checker we'll cast things.
+    data = cast(Dict[str, Any], raw)
+
+    # Figure out our mapping to email names
+    field_names = dict((v, k) for (k, v) in _JSON_FIELD_MAPPING.items())
+
+    out = {}
+    for field in _EMAIL_FIELD_ORDER:
+        field_name = field_names[field]
+        field_data = data.get(field)
+        if field_data:
+            if (field in _STRING_FIELDS and isinstance(field_data, str)) or (
+                field in _LIST_STRING_FIELDS and isinstance(field_data, list)
+            ):
+                out[field_name] = field_data
+            # Special Case: Keywords
+            #   We need to turn our List String for Keywords back into a singular
+            #   string for the core metadata spec.
+            elif field == "keywords" and isinstance(field_data, list):
+                out[field_name] = ", ".join(field_data)
+            # Special Case: Project-URL
+            #   We need to turn our dict[str, str] back into the list of specially
+            #   formatted strings to match what the core metadata expects.
+            elif field == "project_urls" and isinstance(field_data, dict):
+                out[field_name] = [
+                    f"{label}, {url}" for (label, url) in field_data.items()
+                ]
+
+    description = raw.get("description")
+    if description:
+        out["description"] = description
+
+    return json.dumps(out).encode("utf8")
+
+
 def _get_payload(msg: email.message.Message, source: Union[bytes, str]) -> str:
     # If our source is a str, then our caller has managed encodings for us,
     # and we don't need to deal with it.
@@ -477,3 +601,13 @@ def _get_payload(msg: email.message.Message, source: Union[bytes, str]) -> str:
             return bpayload.decode("utf8", "strict")
         except UnicodeDecodeError:
             raise ValueError("payload in an invalid encoding")
+
+
+def _rfc822_escape(header: str) -> str:
+    """
+    Return a version of the string escaped for inclusion in an
+    RFC-822 header, by ensuring there are 8 spaces after each newline.
+    """
+    lines = header.split("\n")
+    sep = "\n" + 8 * " "
+    return sep.join(lines)

From 54d354a70e8ecf68396decd3309f49295ecf6589 Mon Sep 17 00:00:00 2001
From: Donald Stufft <donald@stufft.io>
Date: Sat, 16 Jul 2022 17:27:16 -0400
Subject: [PATCH 14/19] pyupgrade

---
 packaging/metadata/raw.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/packaging/metadata/raw.py b/packaging/metadata/raw.py
index d25d8f37b..4a5e595df 100644
--- a/packaging/metadata/raw.py
+++ b/packaging/metadata/raw.py
@@ -393,7 +393,7 @@ def emit_email(raw: RawMetadata) -> bytes:
     data = cast(Dict[str, Any], raw)
 
     # Figure out our mapping to email names
-    field_names = dict((v, k) for (k, v) in _EMAIL_FIELD_MAPPING.items())
+    field_names = {v: k for (k, v) in _EMAIL_FIELD_MAPPING.items()}
 
     # From what I can tell, there is no way to get the email module in the stdlib
     # to actually emit a ``METADATA``file in the format that we need, so instead
@@ -551,7 +551,7 @@ def emit_json(raw: RawMetadata) -> bytes:
     data = cast(Dict[str, Any], raw)
 
     # Figure out our mapping to email names
-    field_names = dict((v, k) for (k, v) in _JSON_FIELD_MAPPING.items())
+    field_names = {v: k for (k, v) in _JSON_FIELD_MAPPING.items()}
 
     out = {}
     for field in _EMAIL_FIELD_ORDER:

From a0f99e8bd18b5042aad6aae3984acfdde972c170 Mon Sep 17 00:00:00 2001
From: Donald Stufft <donald@stufft.io>
Date: Sat, 16 Jul 2022 21:26:11 -0400
Subject: [PATCH 15/19] don't overwrite when payload is empty

---
 packaging/metadata/raw.py | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/packaging/metadata/raw.py b/packaging/metadata/raw.py
index 4a5e595df..9d508c507 100644
--- a/packaging/metadata/raw.py
+++ b/packaging/metadata/raw.py
@@ -332,12 +332,13 @@ def parse_email(data: Union[bytes, str]) -> Tuple[RawMetadata, Dict[Any, Any]]:
     except ValueError:
         unparsed["Description"] = parsed.get_payload(decode=isinstance(data, bytes))
     else:
-        # Check to see if we've already got a description, if so then both
-        # it, and this body move to unparseable.
-        if "description" in raw:
-            unparsed["Description"] = [raw.pop("description"), payload]
-        else:
-            raw["description"] = payload
+        if payload:
+            # Check to see if we've already got a description, if so then both
+            # it, and this body move to unparseable.
+            if "description" in raw:
+                unparsed["Description"] = [raw.pop("description"), payload]
+            else:
+                raw["description"] = payload
 
     # We need to cast our `raw` to a metadata, because a TypedDict only support
     # literal key names, but we're computing our key names on purpose, but the

From 52be4b5dbc1e7b912d1c9243ecc1e17698460411 Mon Sep 17 00:00:00 2001
From: Donald Stufft <donald@stufft.io>
Date: Sun, 17 Jul 2022 11:03:59 -0400
Subject: [PATCH 16/19] Start reworking the Metadata class

---
 packaging/metadata/_types.py      | 270 ++++++++++++------------------
 packaging/metadata/_validation.py | 134 +++++++++++++++
 2 files changed, 244 insertions(+), 160 deletions(-)
 create mode 100644 packaging/metadata/_validation.py

diff --git a/packaging/metadata/_types.py b/packaging/metadata/_types.py
index 1722981be..f6068b450 100644
--- a/packaging/metadata/_types.py
+++ b/packaging/metadata/_types.py
@@ -1,10 +1,11 @@
+from __future__ import annotations
+
 import enum
-from typing import Iterable, List, Optional, Tuple
+from typing import Optional, Tuple, TypedDict
 
-from ..requirements import Requirement
-from ..specifiers import SpecifierSet
-from ..utils import NormalizedName, canonicalize_name
 from ..version import Version
+from ._validation import RegexValidator, Required, eagerly_validate, lazy_validator
+from .raw import RawMetadata, parse_email, parse_json
 
 # Type aliases.
 _NameAndEmail = Tuple[Optional[str], str]
@@ -50,162 +51,111 @@ class DynamicField(enum.Enum):
     PROVIDES_EXTRA = "provides-extra"
 
 
-class Metadata:
-    """A class representing the `Core Metadata`_ for a project.
+class _ValidatedMetadata(TypedDict, total=False):
+    # Metadata 1.0 - PEP 241
+    name: str
+    version: Version
+    # platforms: List[str]
+    # summary: str
+    # description: str
+    # keywords: List[str]
+    # home_page: str
+    # author: str
+    # author_email: str
+    # license: str
 
-    Every potential metadata field except for ``Metadata-Version`` is represented by a
-    parameter to the class' constructor. The required metadata can be passed in
-    positionally or via keyword, while all optional metadata can only be passed in via
-    keyword.
 
-    Every parameter has a matching attribute on instances, except for *name* (see
-    :attr:`display_name` and :attr:`canonical_name`). Any parameter that accepts an
-    :class:`~collections.abc.Iterable` is represented as a :class:`list` on the
-    corresponding attribute.
-    """
+class Metadata:
 
-    # A property named `display_name` exposes the value.
-    _display_name: str
-    # A property named `canonical_name` exposes the value.
-    _canonical_name: NormalizedName
-    version: Version
-    platforms: List[str]
-    summary: str
-    description: str
-    keywords: List[str]
-    home_page: str
-    author: str
-    author_emails: List[_NameAndEmail]
-    license: str
-    supported_platforms: List[str]
-    download_url: str
-    classifiers: List[str]
-    maintainer: str
-    maintainer_emails: List[_NameAndEmail]
-    requires_dists: List[Requirement]
-    requires_python: SpecifierSet
-    requires_externals: List[str]
-    project_urls: List[_LabelAndURL]
-    provides_dists: List[str]
-    obsoletes_dists: List[str]
-    description_content_type: str
-    provides_extras: List[NormalizedName]
-    dynamic_fields: List[DynamicField]
-
-    def __init__(
-        self,
-        name: str,
-        version: Version,
-        *,
-        # 1.0
-        platforms: Optional[Iterable[str]] = None,
-        summary: Optional[str] = None,
-        description: Optional[str] = None,
-        keywords: Optional[Iterable[str]] = None,
-        home_page: Optional[str] = None,
-        author: Optional[str] = None,
-        author_emails: Optional[Iterable[_NameAndEmail]] = None,
-        license: Optional[str] = None,
-        # 1.1
-        supported_platforms: Optional[Iterable[str]] = None,
-        download_url: Optional[str] = None,
-        classifiers: Optional[Iterable[str]] = None,
-        # 1.2
-        maintainer: Optional[str] = None,
-        maintainer_emails: Optional[Iterable[_NameAndEmail]] = None,
-        requires_dists: Optional[Iterable[Requirement]] = None,
-        requires_python: Optional[SpecifierSet] = None,
-        requires_externals: Optional[Iterable[str]] = None,
-        project_urls: Optional[Iterable[_LabelAndURL]] = None,
-        provides_dists: Optional[Iterable[str]] = None,
-        obsoletes_dists: Optional[Iterable[str]] = None,
-        # 2.1
-        description_content_type: Optional[str] = None,
-        provides_extras: Optional[Iterable[NormalizedName]] = None,
-        # 2.2
-        dynamic_fields: Optional[Iterable[DynamicField]] = None,
-    ) -> None:
-        """Initialize a Metadata object.
-
-        The parameters all correspond to fields in `Core Metadata`_.
-
-        :param name: ``Name``
-        :param version: ``Version``
-        :param platforms: ``Platform``
-        :param summary: ``Summary``
-        :param description: ``Description``
-        :param keywords: ``Keywords``
-        :param home_page: ``Home-Page``
-        :param author: ``Author``
-        :param author_emails:
-            ``Author-Email`` (two-item tuple represents the name and email of the
-            author)
-        :param license: ``License``
-        :param supported_platforms: ``Supported-Platform``
-        :param download_url: ``Download-URL``
-        :param classifiers: ``Classifier``
-        :param maintainer: ``Maintainer``
-        :param maintainer_emails:
-            ``Maintainer-Email`` (two-item tuple represent the name and email of the
-            maintainer)
-        :param requires_dists: ``Requires-Dist``
-        :param SpecifierSet requires_python: ``Requires-Python``
-        :param requires_externals: ``Requires-External``
-        :param project_urls: ``Project-URL``
-        :param provides_dists: ``Provides-Dist``
-        :param obsoletes_dists: ``Obsoletes-Dist``
-        :param description_content_type: ``Description-Content-Type``
-        :param provides_extras: ``Provides-Extra``
-        :param dynamic_fields: ``Dynamic``
-        """
-        self.display_name = name
-        self.version = version
-        self.platforms = list(platforms or [])
-        self.summary = summary or ""
-        self.description = description or ""
-        self.keywords = list(keywords or [])
-        self.home_page = home_page or ""
-        self.author = author or ""
-        self.author_emails = list(author_emails or [])
-        self.license = license or ""
-        self.supported_platforms = list(supported_platforms or [])
-        self.download_url = download_url or ""
-        self.classifiers = list(classifiers or [])
-        self.maintainer = maintainer or ""
-        self.maintainer_emails = list(maintainer_emails or [])
-        self.requires_dists = list(requires_dists or [])
-        self.requires_python = requires_python or SpecifierSet()
-        self.requires_externals = list(requires_externals or [])
-        self.project_urls = list(project_urls or [])
-        self.provides_dists = list(provides_dists or [])
-        self.obsoletes_dists = list(obsoletes_dists or [])
-        self.description_content_type = description_content_type or ""
-        self.provides_extras = list(provides_extras or [])
-        self.dynamic_fields = list(dynamic_fields or [])
-
-    @property
-    def display_name(self) -> str:
-        """
-        The project name to be displayed to users (i.e. not normalized). Initially
-        set based on the `name` parameter.
-
-        Setting this attribute will also update :attr:`canonical_name`.
-        """
-        return self._display_name
-
-    @display_name.setter
-    def display_name(self, value: str) -> None:
-        self._display_name = value
-        self._canonical_name = canonicalize_name(value)
-
-    # Use functools.cached_property once Python 3.7 support is dropped.
-    # Value is set by self.display_name.setter to keep in sync with self.display_name.
-    @property
-    def canonical_name(self) -> NormalizedName:
-        """
-        The normalized project name as per :func:`packaging.utils.canonicalize_name`.
-
-        The attribute is read-only and automatically calculated based on the value of
-        :attr:`display_name`.
-        """
-        return self._canonical_name
+    # We store our "actual" metadata as a RawMetadata, which
+    # gives us a little bit of indirection here. The RawMetadata
+    # class is lenient as to what it will consider valid, but this
+    # class is not.
+    #
+    # However, we want to support validation to happen both up front
+    # and on the fly as you access attributes, and when using the
+    # on the fly validation, we don't want to validate anything else
+    # except for the specific piece of metadata that is being
+    # asked for.
+    #
+    # That means that we need to store, at least initially, the
+    # metadata in a form that is lenient, which is exactly the
+    # purpose of RawMetadata.
+    _raw: RawMetadata
+
+    # Likewise, we need a place to store our honest to goodness actually
+    # validated metadata too, we could just store this in a dict, but
+    # this will give us better typing.
+    _validated: _ValidatedMetadata
+
+    def __init__(self) -> None:
+        raise NotImplementedError
+
+    # It's not exactly the most pythonic thing to have a bunch of getter/setters
+    # like this for every attribute, however this enables us to do our on the
+    # fly validation.
+
+    # Name: Metadata 1.0
+    name = lazy_validator(
+        str,
+        validators=[
+            Required(),
+            RegexValidator("(?i)^([A-Z0-9]|[A-Z0-9][A-Z0-9._-]*[A-Z0-9])$"),
+        ],
+    )
+    # Version: Metadata 1.0
+    version = lazy_validator(Version, validators=[Required()])
+
+    @classmethod
+    def from_raw(cls, raw: RawMetadata, *, validate: bool = True) -> Metadata:
+        # Ok this is some kind of gross code here, but it has a specific
+        # purpose.
+        #
+        # We want to enable the programmatic API of the Metadata
+        # class to strictly validate, including required data, so
+        # we want something like Metadata("foo", "1.0", ...), but
+        # we also want from_raw to *not* require that data, so we
+        # treat our __init__ as our public constructor, then we bypass
+        # the __init__ when calling from_raw to let us set up the object
+        # in a completely different way, without exposing that as
+        # programmatic API in and of itself.
+        meta = cls.__new__(cls)
+        meta._raw = raw
+        meta._validated = _ValidatedMetadata()
+
+        # It's not possible to use Metadata without validating, but the
+        # validate parameter here lets people control whether the entire
+        # metadata gets validated up front, or whether it gets validated
+        # on demand.
+        if validate:
+            eagerly_validate(meta)
+
+        return meta
+
+    @classmethod
+    def from_email(cls, data: bytes | str, *, validate: bool = True) -> Metadata:
+        raw, unparsed = parse_email(data)
+
+        # Regardless of the validate attribute, we don't let unparsed data
+        # pass silently, if someone wants to drop unparsed data on the floor
+        # they can call parse_email themselves and pass it into from_raw
+        if unparsed:
+            raise ValueError(
+                f"Could not parse, extra keys: {', '.join(unparsed.keys())}"
+            )
+
+        return cls.from_raw(raw, validate=validate)
+
+    @classmethod
+    def from_json(cls, data: bytes | str, *, validate: bool = True) -> Metadata:
+        raw, unparsed = parse_json(data)
+
+        # Regardless of the validate parameter, we don't let unparsed data
+        # pass silently, if someone wants to drop unparsed data on the floor
+        # they can call parse_json themselves and pass it into from_raw
+        if unparsed:
+            raise ValueError(
+                f"Could not parse, extra keys: {', '.join(unparsed.keys())}"
+            )
+
+        return cls.from_raw(raw, validate=validate)
diff --git a/packaging/metadata/_validation.py b/packaging/metadata/_validation.py
new file mode 100644
index 000000000..806e3d887
--- /dev/null
+++ b/packaging/metadata/_validation.py
@@ -0,0 +1,134 @@
+from __future__ import annotations
+
+import re
+from typing import (
+    Any,
+    Callable,
+    Type,
+    TypeVar,
+    Optional,
+    TYPE_CHECKING,
+    cast,
+    Dict,
+    Union,
+    List,
+    Generic,
+)
+
+if TYPE_CHECKING:
+    from ._types import Metadata
+
+
+V = TypeVar("V")
+
+
+Validator = Callable[[V], None]
+
+
+class lazy_validator(Generic[V]):
+
+    # This hack exists to work around https://github.com/python/mypy/issues/708
+    _creator: Union[Callable[[Any], V], Callable[[Any], V]]
+    _raw_name: str
+    _validators: List[Validator[Optional[V]]]
+
+    def __init__(
+        self,
+        creator: Callable[[Any], V],
+        *,
+        raw_name: Optional[str] = None,
+        validators: Optional[List[Validator[Optional[V]]]] = None,
+    ) -> None:
+        self._creator = creator
+        if raw_name is not None:
+            self._raw_name = raw_name
+        if validators is not None:
+            self._validators = validators
+        else:
+            self._validators = []
+
+    def __set_name__(self, owner: Metadata, name: str) -> None:
+        self._raw_name = name
+
+    def __get__(self, obj: Metadata, owner: Type[Metadata]) -> Optional[V]:
+        # TypedDict doesn't support variable key names, and Python 3.7 doesn't
+        # support Literal which would let us let it know that this is validated
+        # already to be safe, so we'll cast here to make things work.
+        raw = cast(Dict[str, Any], obj._raw)
+        validated = cast(Dict[str, Optional[V]], obj._validated)
+
+        if self._raw_name not in validated:
+            value = self._validate(raw.get(self._raw_name))
+            validated[self._raw_name] = value
+            del raw[self._raw_name]
+
+        return validated[self._raw_name]
+
+    def __set__(self, obj: Metadata, value: Any) -> None:
+        raw = cast(Dict[str, Any], obj._raw)
+        validated = cast(Dict[str, Optional[V]], obj._validated)
+
+        validated_value = self._validate(value)
+        validated[self._raw_name] = validated_value
+        raw.pop(self._raw_name, None)
+
+    def __delete__(self, obj: Metadata) -> None:
+        raw = cast(Dict[str, Any], obj._raw)
+        validated = cast(Dict[str, Optional[V]], obj._validated)
+
+        raw.pop(self._raw_name, None)
+        validated.pop(self._raw_name, None)
+
+    def _validate(self, data: Any) -> Optional[V]:
+        # Create our value from our raw data
+        value = self._creator(data) if data is not None else None
+
+        # Loop over our validators, and ensure that our value is actually valid
+        for validator in self._validators:
+            validator(value)
+
+        return value
+
+
+def eagerly_validate(obj: Metadata) -> None:
+    for name, field in obj.__class__.__dict__.items():
+        if isinstance(field, lazy_validator):
+            getattr(obj, name)
+
+
+class Required:
+
+    _error_msg: str
+
+    def __init__(self, message: Optional[str] = None):
+        if message is None:
+            self._error_msg = "value is required: {value!r}"
+        else:
+            self._error_msg = message
+
+    def __call__(self, value: V) -> None:
+        if value is None:
+            raise ValueError(self._error_msg.format(value=value))
+
+
+class RegexValidator:
+
+    _regex: re.Pattern[str]
+    _error_msg: str
+
+    def __init__(
+        self, regex: Union[str, re.Pattern[str]], *, message: Optional[str] = None
+    ):
+        if isinstance(regex, str):
+            self._regex = re.compile(regex)
+        else:
+            self._regex = regex
+
+        if message is None:
+            self._error_msg = "invalid value: {value!r}"
+        else:
+            self._error_msg = message
+
+    def __call__(self, value: Optional[str]) -> None:
+        if value is not None and self._regex.search(value) is None:
+            raise ValueError(self._error_msg.format(value=value))

From 67492856024627a389f380598a7dc591efe22edc Mon Sep 17 00:00:00 2001
From: Donald Stufft <donald@stufft.io>
Date: Sun, 17 Jul 2022 11:14:06 -0400
Subject: [PATCH 17/19] linting

---
 packaging/metadata/_validation.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/packaging/metadata/_validation.py b/packaging/metadata/_validation.py
index 806e3d887..d2944cb1a 100644
--- a/packaging/metadata/_validation.py
+++ b/packaging/metadata/_validation.py
@@ -2,17 +2,17 @@
 
 import re
 from typing import (
+    TYPE_CHECKING,
     Any,
     Callable,
+    Dict,
+    Generic,
+    List,
+    Optional,
     Type,
     TypeVar,
-    Optional,
-    TYPE_CHECKING,
-    cast,
-    Dict,
     Union,
-    List,
-    Generic,
+    cast,
 )
 
 if TYPE_CHECKING:

From 1d1487adb63f213abbbe875252e6c1f36d1f60f2 Mon Sep 17 00:00:00 2001
From: Donald Stufft <donald@stufft.io>
Date: Sun, 17 Jul 2022 13:30:36 -0400
Subject: [PATCH 18/19] more metadata

---
 packaging/metadata/_types.py      |  29 ++++++--
 packaging/metadata/_utils.py      |  19 ++++++
 packaging/metadata/_validation.py | 110 ++++++++++++++++++++----------
 3 files changed, 118 insertions(+), 40 deletions(-)
 create mode 100644 packaging/metadata/_utils.py

diff --git a/packaging/metadata/_types.py b/packaging/metadata/_types.py
index f6068b450..8c1f924c3 100644
--- a/packaging/metadata/_types.py
+++ b/packaging/metadata/_types.py
@@ -1,11 +1,13 @@
 from __future__ import annotations
 
 import enum
-from typing import Optional, Tuple, TypedDict
+from collections.abc import Iterable
+from typing import Any, List, Optional, Tuple, TypedDict
 
 from ..version import Version
 from ._validation import RegexValidator, Required, eagerly_validate, lazy_validator
 from .raw import RawMetadata, parse_email, parse_json
+from ._utils import as_str, as_list_str
 
 # Type aliases.
 _NameAndEmail = Tuple[Optional[str], str]
@@ -51,12 +53,24 @@ class DynamicField(enum.Enum):
     PROVIDES_EXTRA = "provides-extra"
 
 
+@enum.unique
+class MetadataVersion(enum.Enum):
+    v1_0 = "1.0"
+    v1_1 = "1.1"
+    v1_2 = "1.2"
+    v2_0 = "2.0"
+    v2_1 = "2.1"
+    v2_2 = "2.2"
+    v2_3 = "2.3"
+
+
 class _ValidatedMetadata(TypedDict, total=False):
     # Metadata 1.0 - PEP 241
+    metadata_version: str
     name: str
     version: Version
-    # platforms: List[str]
-    # summary: str
+    platforms: List[str]
+    summary: str
     # description: str
     # keywords: List[str]
     # home_page: str
@@ -95,9 +109,13 @@ def __init__(self) -> None:
     # like this for every attribute, however this enables us to do our on the
     # fly validation.
 
+    # Metadata-Version: Metadata 1.0
+    _metadata_version = lazy_validator(
+        MetadataVersion, raw_name="metadata_version", validators=[Required()]
+    )
     # Name: Metadata 1.0
     name = lazy_validator(
-        str,
+        as_str,
         validators=[
             Required(),
             RegexValidator("(?i)^([A-Z0-9]|[A-Z0-9][A-Z0-9._-]*[A-Z0-9])$"),
@@ -105,6 +123,9 @@ def __init__(self) -> None:
     )
     # Version: Metadata 1.0
     version = lazy_validator(Version, validators=[Required()])
+    # Platform: Metadata 1.0
+    platforms = lazy_validator(as_list_str)
+    summary = lazy_validator(as_str)
 
     @classmethod
     def from_raw(cls, raw: RawMetadata, *, validate: bool = True) -> Metadata:
diff --git a/packaging/metadata/_utils.py b/packaging/metadata/_utils.py
new file mode 100644
index 000000000..827d7b48f
--- /dev/null
+++ b/packaging/metadata/_utils.py
@@ -0,0 +1,19 @@
+from collections.abc import Iterable
+from typing import Any, List
+
+
+def as_str(inp: Any) -> str:
+    if not isinstance(inp, str):
+        raise ValueError("Must be a str")
+    return inp
+
+
+def as_list_str(inp: Any) -> List[str]:
+    if not isinstance(inp, Iterable):
+        raise ValueError("Must be a list of str")
+    results = []
+    for entry in inp:
+        if not isinstance(entry, str):
+            raise ValueError("Must a list of str")
+        results.append(entry)
+    return results
diff --git a/packaging/metadata/_validation.py b/packaging/metadata/_validation.py
index d2944cb1a..4b9a8ffcc 100644
--- a/packaging/metadata/_validation.py
+++ b/packaging/metadata/_validation.py
@@ -1,5 +1,7 @@
 from __future__ import annotations
 
+import abc
+from email import message
 import re
 from typing import (
     TYPE_CHECKING,
@@ -19,25 +21,22 @@
     from ._types import Metadata
 
 
-V = TypeVar("V")
-
-
-Validator = Callable[[V], None]
+T = TypeVar("T")
 
 
-class lazy_validator(Generic[V]):
+class lazy_validator(Generic[T]):
 
     # This hack exists to work around https://github.com/python/mypy/issues/708
-    _creator: Union[Callable[[Any], V], Callable[[Any], V]]
+    _creator: Union[Callable[[Any], T], Callable[[Any], T]]
     _raw_name: str
-    _validators: List[Validator[Optional[V]]]
+    _validators: List[Callable[[Any], None]]
 
     def __init__(
         self,
-        creator: Callable[[Any], V],
+        creator: Callable[[Any], T],
         *,
         raw_name: Optional[str] = None,
-        validators: Optional[List[Validator[Optional[V]]]] = None,
+        validators: Optional[List[Callable[[Any], None]]] = None,
     ) -> None:
         self._creator = creator
         if raw_name is not None:
@@ -48,14 +47,15 @@ def __init__(
             self._validators = []
 
     def __set_name__(self, owner: Metadata, name: str) -> None:
-        self._raw_name = name
+        if not hasattr(self, "_raw_name"):
+            self._raw_name = name
 
-    def __get__(self, obj: Metadata, owner: Type[Metadata]) -> Optional[V]:
+    def __get__(self, obj: Metadata, owner: Type[Metadata]) -> Optional[T]:
         # TypedDict doesn't support variable key names, and Python 3.7 doesn't
         # support Literal which would let us let it know that this is validated
         # already to be safe, so we'll cast here to make things work.
         raw = cast(Dict[str, Any], obj._raw)
-        validated = cast(Dict[str, Optional[V]], obj._validated)
+        validated = cast(Dict[str, Optional[T]], obj._validated)
 
         if self._raw_name not in validated:
             value = self._validate(raw.get(self._raw_name))
@@ -66,7 +66,7 @@ def __get__(self, obj: Metadata, owner: Type[Metadata]) -> Optional[V]:
 
     def __set__(self, obj: Metadata, value: Any) -> None:
         raw = cast(Dict[str, Any], obj._raw)
-        validated = cast(Dict[str, Optional[V]], obj._validated)
+        validated = cast(Dict[str, Optional[T]], obj._validated)
 
         validated_value = self._validate(value)
         validated[self._raw_name] = validated_value
@@ -74,12 +74,12 @@ def __set__(self, obj: Metadata, value: Any) -> None:
 
     def __delete__(self, obj: Metadata) -> None:
         raw = cast(Dict[str, Any], obj._raw)
-        validated = cast(Dict[str, Optional[V]], obj._validated)
+        validated = cast(Dict[str, Optional[T]], obj._validated)
 
         raw.pop(self._raw_name, None)
         validated.pop(self._raw_name, None)
 
-    def _validate(self, data: Any) -> Optional[V]:
+    def _validate(self, data: Any) -> Optional[T]:
         # Create our value from our raw data
         value = self._creator(data) if data is not None else None
 
@@ -96,39 +96,77 @@ def eagerly_validate(obj: Metadata) -> None:
             getattr(obj, name)
 
 
-class Required:
+V = TypeVar("V")
 
-    _error_msg: str
 
-    def __init__(self, message: Optional[str] = None):
-        if message is None:
-            self._error_msg = "value is required: {value!r}"
-        else:
-            self._error_msg = message
+class ValidationError(Exception):
+    pass
+
+
+class Validator(Generic[V], abc.ABC):
+
+    message: str
+
+    def __init__(self, *args: Any, message: Optional[str] = None, **kwargs: Any):
+        super().__init__(*args, **kwargs)
+        if message is not None:
+            self.message = message
+
+    def __call__(self, value: Optional[V]) -> None:
+        try:
+            self.full_validate(value)
+        except Exception as exc:
+            raise ValidationError(self.message.format(value=value)) from exc
+
+    def full_validate(self, value: Optional[V]) -> None:
+        if value is not None:
+            self.validate(value)
+
+    @abc.abstractmethod
+    def validate(self, value: V) -> None:
+        ...
 
-    def __call__(self, value: V) -> None:
+
+class Required(Validator[V]):
+
+    message: str = "value is required: {value!r}"
+
+    def full_validate(self, value: Optional[V]) -> None:
         if value is None:
-            raise ValueError(self._error_msg.format(value=value))
+            raise ValueError("required value")
 
+    def validate(self, value: V) -> None:
+        pass
 
-class RegexValidator:
+
+class RegexValidator(Validator[V]):
 
     _regex: re.Pattern[str]
-    _error_msg: str
+    message: str = "invalid value: {value!r}"
+
+    def __init__(self, regex: Union[str, re.Pattern[str]], *args: Any, **kwargs: Any):
+        super().__init__(*args, **kwargs)
 
-    def __init__(
-        self, regex: Union[str, re.Pattern[str]], *, message: Optional[str] = None
-    ):
         if isinstance(regex, str):
             self._regex = re.compile(regex)
         else:
             self._regex = regex
 
-        if message is None:
-            self._error_msg = "invalid value: {value!r}"
-        else:
-            self._error_msg = message
+    def validate(self, value: V) -> None:
+        if not isinstance(value, str):
+            raise TypeError
+
+        if self._regex.search(value) is None:
+            raise ValueError(f"doesn't match: {self._regex.pattern}")
+
+
+class SingleLine(Validator[V]):
+
+    message: str = "must contain only one line: {value!r}"
+
+    def validate(self, value: V) -> None:
+        if not isinstance(value, str):
+            raise TypeError
 
-    def __call__(self, value: Optional[str]) -> None:
-        if value is not None and self._regex.search(value) is None:
-            raise ValueError(self._error_msg.format(value=value))
+        if "\n" in value or "\r" in value:
+            raise ValueError("multiline str")

From d2d07e8d3d884e35e7db003ea7116fd585147e93 Mon Sep 17 00:00:00 2001
From: Donald Stufft <donald@stufft.io>
Date: Sun, 17 Jul 2022 13:34:40 -0400
Subject: [PATCH 19/19] linting

---
 packaging/metadata/_types.py      |  7 +++---
 packaging/metadata/_validation.py | 39 +++++++++++--------------------
 2 files changed, 16 insertions(+), 30 deletions(-)

diff --git a/packaging/metadata/_types.py b/packaging/metadata/_types.py
index 8c1f924c3..8c3b1b36a 100644
--- a/packaging/metadata/_types.py
+++ b/packaging/metadata/_types.py
@@ -1,13 +1,12 @@
 from __future__ import annotations
 
 import enum
-from collections.abc import Iterable
-from typing import Any, List, Optional, Tuple, TypedDict
+from typing import Optional, Tuple, TypedDict
 
 from ..version import Version
+from ._utils import as_list_str, as_str
 from ._validation import RegexValidator, Required, eagerly_validate, lazy_validator
 from .raw import RawMetadata, parse_email, parse_json
-from ._utils import as_str, as_list_str
 
 # Type aliases.
 _NameAndEmail = Tuple[Optional[str], str]
@@ -69,7 +68,7 @@ class _ValidatedMetadata(TypedDict, total=False):
     metadata_version: str
     name: str
     version: Version
-    platforms: List[str]
+    platforms: list[str]
     summary: str
     # description: str
     # keywords: List[str]
diff --git a/packaging/metadata/_validation.py b/packaging/metadata/_validation.py
index 4b9a8ffcc..d27e96924 100644
--- a/packaging/metadata/_validation.py
+++ b/packaging/metadata/_validation.py
@@ -1,21 +1,8 @@
 from __future__ import annotations
 
 import abc
-from email import message
 import re
-from typing import (
-    TYPE_CHECKING,
-    Any,
-    Callable,
-    Dict,
-    Generic,
-    List,
-    Optional,
-    Type,
-    TypeVar,
-    Union,
-    cast,
-)
+from typing import TYPE_CHECKING, Any, Callable, Dict, Generic, Optional, TypeVar, cast
 
 if TYPE_CHECKING:
     from ._types import Metadata
@@ -24,19 +11,19 @@
 T = TypeVar("T")
 
 
-class lazy_validator(Generic[T]):
+class lazy_validator(Generic[T]):  # noqa: N801
 
     # This hack exists to work around https://github.com/python/mypy/issues/708
-    _creator: Union[Callable[[Any], T], Callable[[Any], T]]
+    _creator: Callable[[Any], T] | Callable[[Any], T]
     _raw_name: str
-    _validators: List[Callable[[Any], None]]
+    _validators: list[Callable[[Any], None]]
 
     def __init__(
         self,
         creator: Callable[[Any], T],
         *,
-        raw_name: Optional[str] = None,
-        validators: Optional[List[Callable[[Any], None]]] = None,
+        raw_name: str | None = None,
+        validators: list[Callable[[Any], None]] | None = None,
     ) -> None:
         self._creator = creator
         if raw_name is not None:
@@ -50,7 +37,7 @@ def __set_name__(self, owner: Metadata, name: str) -> None:
         if not hasattr(self, "_raw_name"):
             self._raw_name = name
 
-    def __get__(self, obj: Metadata, owner: Type[Metadata]) -> Optional[T]:
+    def __get__(self, obj: Metadata, owner: type[Metadata]) -> T | None:
         # TypedDict doesn't support variable key names, and Python 3.7 doesn't
         # support Literal which would let us let it know that this is validated
         # already to be safe, so we'll cast here to make things work.
@@ -79,7 +66,7 @@ def __delete__(self, obj: Metadata) -> None:
         raw.pop(self._raw_name, None)
         validated.pop(self._raw_name, None)
 
-    def _validate(self, data: Any) -> Optional[T]:
+    def _validate(self, data: Any) -> T | None:
         # Create our value from our raw data
         value = self._creator(data) if data is not None else None
 
@@ -107,18 +94,18 @@ class Validator(Generic[V], abc.ABC):
 
     message: str
 
-    def __init__(self, *args: Any, message: Optional[str] = None, **kwargs: Any):
+    def __init__(self, *args: Any, message: str | None = None, **kwargs: Any):
         super().__init__(*args, **kwargs)
         if message is not None:
             self.message = message
 
-    def __call__(self, value: Optional[V]) -> None:
+    def __call__(self, value: V | None) -> None:
         try:
             self.full_validate(value)
         except Exception as exc:
             raise ValidationError(self.message.format(value=value)) from exc
 
-    def full_validate(self, value: Optional[V]) -> None:
+    def full_validate(self, value: V | None) -> None:
         if value is not None:
             self.validate(value)
 
@@ -131,7 +118,7 @@ class Required(Validator[V]):
 
     message: str = "value is required: {value!r}"
 
-    def full_validate(self, value: Optional[V]) -> None:
+    def full_validate(self, value: V | None) -> None:
         if value is None:
             raise ValueError("required value")
 
@@ -144,7 +131,7 @@ class RegexValidator(Validator[V]):
     _regex: re.Pattern[str]
     message: str = "invalid value: {value!r}"
 
-    def __init__(self, regex: Union[str, re.Pattern[str]], *args: Any, **kwargs: Any):
+    def __init__(self, regex: str | re.Pattern[str], *args: Any, **kwargs: Any):
         super().__init__(*args, **kwargs)
 
         if isinstance(regex, str):