diff --git a/packaging/metadata.py b/packaging/metadata.py deleted file mode 100644 index 4bc9c595c..000000000 --- a/packaging/metadata.py +++ /dev/null @@ -1,211 +0,0 @@ -import enum -from typing import Iterable, List, Optional, Tuple - -from .requirements import Requirement -from .specifiers import SpecifierSet -from .utils import NormalizedName, canonicalize_name -from .version import Version - -# Type aliases. -_NameAndEmail = Tuple[Optional[str], str] -_LabelAndURL = Tuple[str, str] - - -@enum.unique -class DynamicField(enum.Enum): - """ - An :class:`enum.Enum` representing fields which can be listed in the ``Dynamic`` - field of `core metadata`_. - - Every valid field is a name on this enum, upper-cased with any ``-`` replaced with - ``_``. Each value is the field name lower-cased (``-`` are kept). For example, the - ``Home-page`` field has a name of ``HOME_PAGE`` and a value of ``home-page``. - """ - - # `Name`, `Version`, and `Metadata-Version` are invalid in `Dynamic`. - # 1.0 - PLATFORM = "platform" - SUMMARY = "summary" - DESCRIPTION = "description" - KEYWORDS = "keywords" - HOME_PAGE = "home-page" - AUTHOR = "author" - AUTHOR_EMAIL = "author-email" - LICENSE = "license" - # 1.1 - SUPPORTED_PLATFORM = "supported-platform" - DOWNLOAD_URL = "download-url" - CLASSIFIER = "classifier" - # 1.2 - MAINTAINER = "maintainer" - MAINTAINER_EMAIL = "maintainer-email" - REQUIRES_DIST = "requires-dist" - REQUIRES_PYTHON = "requires-python" - REQUIRES_EXTERNAL = "requires-external" - PROJECT_URL = "project-url" - PROVIDES_DIST = "provides-dist" - OBSOLETES_DIST = "obsoletes-dist" - # 2.1 - DESCRIPTION_CONTENT_TYPE = "description-content-type" - PROVIDES_EXTRA = "provides-extra" - - -class Metadata: - """A class representing the `Core Metadata`_ for a project. - - Every potential metadata field except for ``Metadata-Version`` is represented by a - parameter to the class' constructor. 
The required metadata can be passed in - positionally or via keyword, while all optional metadata can only be passed in via - keyword. - - Every parameter has a matching attribute on instances, except for *name* (see - :attr:`display_name` and :attr:`canonical_name`). Any parameter that accepts an - :class:`~collections.abc.Iterable` is represented as a :class:`list` on the - corresponding attribute. - """ - - # A property named `display_name` exposes the value. - _display_name: str - # A property named `canonical_name` exposes the value. - _canonical_name: NormalizedName - version: Version - platforms: List[str] - summary: str - description: str - keywords: List[str] - home_page: str - author: str - author_emails: List[_NameAndEmail] - license: str - supported_platforms: List[str] - download_url: str - classifiers: List[str] - maintainer: str - maintainer_emails: List[_NameAndEmail] - requires_dists: List[Requirement] - requires_python: SpecifierSet - requires_externals: List[str] - project_urls: List[_LabelAndURL] - provides_dists: List[str] - obsoletes_dists: List[str] - description_content_type: str - provides_extras: List[NormalizedName] - dynamic_fields: List[DynamicField] - - def __init__( - self, - name: str, - version: Version, - *, - # 1.0 - platforms: Optional[Iterable[str]] = None, - summary: Optional[str] = None, - description: Optional[str] = None, - keywords: Optional[Iterable[str]] = None, - home_page: Optional[str] = None, - author: Optional[str] = None, - author_emails: Optional[Iterable[_NameAndEmail]] = None, - license: Optional[str] = None, - # 1.1 - supported_platforms: Optional[Iterable[str]] = None, - download_url: Optional[str] = None, - classifiers: Optional[Iterable[str]] = None, - # 1.2 - maintainer: Optional[str] = None, - maintainer_emails: Optional[Iterable[_NameAndEmail]] = None, - requires_dists: Optional[Iterable[Requirement]] = None, - requires_python: Optional[SpecifierSet] = None, - requires_externals: Optional[Iterable[str]] = 
None, - project_urls: Optional[Iterable[_LabelAndURL]] = None, - provides_dists: Optional[Iterable[str]] = None, - obsoletes_dists: Optional[Iterable[str]] = None, - # 2.1 - description_content_type: Optional[str] = None, - provides_extras: Optional[Iterable[NormalizedName]] = None, - # 2.2 - dynamic_fields: Optional[Iterable[DynamicField]] = None, - ) -> None: - """Initialize a Metadata object. - - The parameters all correspond to fields in `Core Metadata`_. - - :param name: ``Name`` - :param version: ``Version`` - :param platforms: ``Platform`` - :param summary: ``Summary`` - :param description: ``Description`` - :param keywords: ``Keywords`` - :param home_page: ``Home-Page`` - :param author: ``Author`` - :param author_emails: - ``Author-Email`` (two-item tuple represents the name and email of the - author) - :param license: ``License`` - :param supported_platforms: ``Supported-Platform`` - :param download_url: ``Download-URL`` - :param classifiers: ``Classifier`` - :param maintainer: ``Maintainer`` - :param maintainer_emails: - ``Maintainer-Email`` (two-item tuple represent the name and email of the - maintainer) - :param requires_dists: ``Requires-Dist`` - :param SpecifierSet requires_python: ``Requires-Python`` - :param requires_externals: ``Requires-External`` - :param project_urls: ``Project-URL`` - :param provides_dists: ``Provides-Dist`` - :param obsoletes_dists: ``Obsoletes-Dist`` - :param description_content_type: ``Description-Content-Type`` - :param provides_extras: ``Provides-Extra`` - :param dynamic_fields: ``Dynamic`` - """ - self.display_name = name - self.version = version - self.platforms = list(platforms or []) - self.summary = summary or "" - self.description = description or "" - self.keywords = list(keywords or []) - self.home_page = home_page or "" - self.author = author or "" - self.author_emails = list(author_emails or []) - self.license = license or "" - self.supported_platforms = list(supported_platforms or []) - self.download_url = 
download_url or "" - self.classifiers = list(classifiers or []) - self.maintainer = maintainer or "" - self.maintainer_emails = list(maintainer_emails or []) - self.requires_dists = list(requires_dists or []) - self.requires_python = requires_python or SpecifierSet() - self.requires_externals = list(requires_externals or []) - self.project_urls = list(project_urls or []) - self.provides_dists = list(provides_dists or []) - self.obsoletes_dists = list(obsoletes_dists or []) - self.description_content_type = description_content_type or "" - self.provides_extras = list(provides_extras or []) - self.dynamic_fields = list(dynamic_fields or []) - - @property - def display_name(self) -> str: - """ - The project name to be displayed to users (i.e. not normalized). Initially - set based on the `name` parameter. - - Setting this attribute will also update :attr:`canonical_name`. - """ - return self._display_name - - @display_name.setter - def display_name(self, value: str) -> None: - self._display_name = value - self._canonical_name = canonicalize_name(value) - - # Use functools.cached_property once Python 3.7 support is dropped. - # Value is set by self.display_name.setter to keep in sync with self.display_name. - @property - def canonical_name(self) -> NormalizedName: - """ - The normalized project name as per :func:`packaging.utils.canonicalize_name`. - - The attribute is read-only and automatically calculated based on the value of - :attr:`display_name`. 
- """ - return self._canonical_name diff --git a/packaging/metadata/__init__.py b/packaging/metadata/__init__.py new file mode 100644 index 000000000..d0ba767b1 --- /dev/null +++ b/packaging/metadata/__init__.py @@ -0,0 +1,3 @@ +from ._types import DynamicField, Metadata + +__all__ = ["DynamicField", "Metadata"] diff --git a/packaging/metadata/_types.py b/packaging/metadata/_types.py new file mode 100644 index 000000000..8c3b1b36a --- /dev/null +++ b/packaging/metadata/_types.py @@ -0,0 +1,181 @@ +from __future__ import annotations + +import enum +from typing import Optional, Tuple, TypedDict + +from ..version import Version +from ._utils import as_list_str, as_str +from ._validation import RegexValidator, Required, eagerly_validate, lazy_validator +from .raw import RawMetadata, parse_email, parse_json + +# Type aliases. +_NameAndEmail = Tuple[Optional[str], str] +_LabelAndURL = Tuple[str, str] + + +@enum.unique +class DynamicField(enum.Enum): + """ + An :class:`enum.Enum` representing fields which can be listed in the ``Dynamic`` + field of `core metadata`_. + + Every valid field is a name on this enum, upper-cased with any ``-`` replaced with + ``_``. Each value is the field name lower-cased (``-`` are kept). For example, the + ``Home-page`` field has a name of ``HOME_PAGE`` and a value of ``home-page``. + """ + + # `Name`, `Version`, and `Metadata-Version` are invalid in `Dynamic`. 
+ # 1.0 + PLATFORM = "platform" + SUMMARY = "summary" + DESCRIPTION = "description" + KEYWORDS = "keywords" + HOME_PAGE = "home-page" + AUTHOR = "author" + AUTHOR_EMAIL = "author-email" + LICENSE = "license" + # 1.1 + SUPPORTED_PLATFORM = "supported-platform" + DOWNLOAD_URL = "download-url" + CLASSIFIER = "classifier" + # 1.2 + MAINTAINER = "maintainer" + MAINTAINER_EMAIL = "maintainer-email" + REQUIRES_DIST = "requires-dist" + REQUIRES_PYTHON = "requires-python" + REQUIRES_EXTERNAL = "requires-external" + PROJECT_URL = "project-url" + PROVIDES_DIST = "provides-dist" + OBSOLETES_DIST = "obsoletes-dist" + # 2.1 + DESCRIPTION_CONTENT_TYPE = "description-content-type" + PROVIDES_EXTRA = "provides-extra" + + +@enum.unique +class MetadataVersion(enum.Enum): + v1_0 = "1.0" + v1_1 = "1.1" + v1_2 = "1.2" + v2_0 = "2.0" + v2_1 = "2.1" + v2_2 = "2.2" + v2_3 = "2.3" + + +class _ValidatedMetadata(TypedDict, total=False): + # Metadata 1.0 - PEP 241 + metadata_version: str + name: str + version: Version + platforms: list[str] + summary: str + # description: str + # keywords: List[str] + # home_page: str + # author: str + # author_email: str + # license: str + + +class Metadata: + + # We store our "actual" metadata as a RawMetadata, which + # gives is a little bit of indirection here. The RawMetadata + # class is lenient as to what it will consider valid, but this + # class is not. + # + # However, we want to support validation to happen both up front + # and on the fly as you access attributes, and when using the + # on the fly validation, we don't want to validate anything else + # except for the specific piece of metadata that is being + # asked for. + # + # That means that we need to store, at least initially, the + # metadata in a form that is lenient, which is exactly the + # purpose of RawMetadata. 
+    _raw: RawMetadata
+
+    # Likewise, we need a place to store our honest to goodness actually
+    # validated metadata too, we could just store this in a dict, but
+    # this will give us better typing.
+    _validated: _ValidatedMetadata
+
+    def __init__(self) -> None:
+        raise NotImplementedError
+
+    # It's not exactly the most pythonic thing to have a bunch of getter/setters
+    # like this for every attribute, however this enables us to do our on the
+    # fly validation.
+
+    # Metadata-Version: Metadata 1.0
+    _metadata_version = lazy_validator(
+        MetadataVersion, raw_name="metadata_version", validators=[Required()]
+    )
+    # Name: Metadata 1.0
+    name = lazy_validator(
+        as_str,
+        validators=[
+            Required(),
+            RegexValidator("(?i)^([A-Z0-9]|[A-Z0-9][A-Z0-9._-]*[A-Z0-9])$"),
+        ],
+    )
+    # Version: Metadata 1.0
+    version = lazy_validator(Version, validators=[Required()])
+    # Platform: Metadata 1.0
+    platforms = lazy_validator(as_list_str)
+    summary = lazy_validator(as_str)
+
+    @classmethod
+    def from_raw(cls, raw: RawMetadata, *, validate: bool = True) -> Metadata:
+        # Ok this is some kind of gross code here, but it has a specific
+        # purpose.
+        #
+        # We want to enable the programmatic API of the Metadata
+        # class to strictly validate, including required data, so
+        # we want something like Metadata("foo", "1.0", ...), but
+        # we also want from_raw to *not* require that data, so we
+        # treat our __init__ as our public constructor, then we bypass
+        # the __init__ when calling from_raw to let us setup the object
+        # in a completely different way, without exposing that as
+        # programmatic API in and of itself.
+        meta = cls.__new__(cls)
+        meta._raw = raw
+        meta._validated = _ValidatedMetadata()
+
+        # It's not possible to use Metadata without validating, but the
+        # validate parameter here lets people control whether the entire
+        # metadata gets validated up front, or whether it gets validated
+        # on demand.
+        if validate:
+            eagerly_validate(meta)
+
+        return meta
+
+    @classmethod
+    def from_email(cls, data: bytes | str, *, validate: bool = True) -> Metadata:
+        raw, unparsed = parse_email(data)
+
+        # Regardless of the validate attribute, we don't let unparsed data
+        # pass silently, if someone wants to drop unparsed data on the floor
+        # they can call parse_email themselves and pass it into from_raw
+        if unparsed:
+            raise ValueError(
+                f"Could not parse, extra keys: {', '.join(unparsed.keys())}"
+            )
+
+        return cls.from_raw(raw, validate=validate)
+
+    @classmethod
+    def from_json(cls, data: bytes | str, *, validate: bool = True) -> Metadata:
+        raw, unparsed = parse_json(data)
+
+        # Regardless of the validate attribute, we don't let unparsed data
+        # pass silently, if someone wants to drop unparsed data on the floor
+        # they can call parse_json themselves and pass it into from_raw
+        if unparsed:
+            raise ValueError(
+                f"Could not parse, extra keys: {', '.join(unparsed.keys())}"
+            )
+
+        return cls.from_raw(raw, validate=validate)
diff --git a/packaging/metadata/_utils.py b/packaging/metadata/_utils.py
new file mode 100644
index 000000000..827d7b48f
--- /dev/null
+++ b/packaging/metadata/_utils.py
@@ -0,0 +1,25 @@
+from collections.abc import Iterable
+from typing import Any, List
+
+
+def as_str(inp: Any) -> str:
+    """Return *inp* if it is a ``str``, otherwise raise ``ValueError``."""
+    if not isinstance(inp, str):
+        raise ValueError("Must be a str")
+    return inp
+
+
+def as_list_str(inp: Any) -> List[str]:
+    """Return the entries of *inp* as a new list, requiring each to be a ``str``.
+
+    A bare ``str`` is rejected even though it is iterable: iterating it yields
+    one-character strings, so it would otherwise be split into characters.
+    """
+    if isinstance(inp, str) or not isinstance(inp, Iterable):
+        raise ValueError("Must be a list of str")
+    results = []
+    for entry in inp:
+        if not isinstance(entry, str):
+            raise ValueError("Must be a list of str")
+        results.append(entry)
+    return results
diff --git a/packaging/metadata/_validation.py b/packaging/metadata/_validation.py
new file mode 100644
index 000000000..d27e96924
--- /dev/null
+++ b/packaging/metadata/_validation.py
@@ -0,0 +1,159 @@
+from __future__ import annotations
+
+import abc
+import re
+from typing import TYPE_CHECKING, Any, Callable,
Dict, Generic, Optional, TypeVar, cast
+
+if TYPE_CHECKING:
+    from ._types import Metadata
+
+
+T = TypeVar("T")
+
+
+class lazy_validator(Generic[T]):  # noqa: N801
+
+    # This hack exists to work around https://github.com/python/mypy/issues/708
+    _creator: Callable[[Any], T] | Callable[[Any], T]
+    _raw_name: str
+    _validators: list[Callable[[Any], None]]
+
+    def __init__(
+        self,
+        creator: Callable[[Any], T],
+        *,
+        raw_name: str | None = None,
+        validators: list[Callable[[Any], None]] | None = None,
+    ) -> None:
+        self._creator = creator
+        if raw_name is not None:
+            self._raw_name = raw_name
+        if validators is not None:
+            self._validators = validators
+        else:
+            self._validators = []
+
+    def __set_name__(self, owner: type[Metadata], name: str) -> None:
+        if not hasattr(self, "_raw_name"):
+            self._raw_name = name
+
+    def __get__(self, obj: Metadata, owner: type[Metadata]) -> T | None:
+        # TypedDict doesn't support variable key names, and Python 3.7 doesn't
+        # support Literal which would let us let it know that this is validated
+        # already to be safe, so we'll cast here to make things work.
+        raw = cast(Dict[str, Any], obj._raw)
+        validated = cast(Dict[str, Optional[T]], obj._validated)
+
+        if self._raw_name not in validated:
+            value = self._validate(raw.get(self._raw_name))
+            validated[self._raw_name] = value
+            raw.pop(self._raw_name, None)  # optional fields may be absent from raw
+
+        return validated[self._raw_name]
+
+    def __set__(self, obj: Metadata, value: Any) -> None:
+        raw = cast(Dict[str, Any], obj._raw)
+        validated = cast(Dict[str, Optional[T]], obj._validated)
+
+        validated_value = self._validate(value)
+        validated[self._raw_name] = validated_value
+        raw.pop(self._raw_name, None)
+
+    def __delete__(self, obj: Metadata) -> None:
+        raw = cast(Dict[str, Any], obj._raw)
+        validated = cast(Dict[str, Optional[T]], obj._validated)
+
+        raw.pop(self._raw_name, None)
+        validated.pop(self._raw_name, None)
+
+    def _validate(self, data: Any) -> T | None:
+        # Create our value from our raw data
+        value = self._creator(data) if data is not None else None
+
+        # Loop over our validators, and ensure that our value is actually valid
+        for validator in self._validators:
+            validator(value)
+
+        return value
+
+
+def eagerly_validate(obj: Metadata) -> None:
+    for name, field in obj.__class__.__dict__.items():
+        if isinstance(field, lazy_validator):
+            getattr(obj, name)
+
+
+V = TypeVar("V")
+
+
+class ValidationError(Exception):
+    pass
+
+
+class Validator(Generic[V], abc.ABC):
+
+    message: str
+
+    def __init__(self, *args: Any, message: str | None = None, **kwargs: Any):
+        super().__init__(*args, **kwargs)
+        if message is not None:
+            self.message = message
+
+    def __call__(self, value: V | None) -> None:
+        try:
+            self.full_validate(value)
+        except Exception as exc:
+            raise ValidationError(self.message.format(value=value)) from exc
+
+    def full_validate(self, value: V | None) -> None:
+        if value is not None:
+            self.validate(value)
+
+    @abc.abstractmethod
+    def validate(self, value: V) -> None:
+        ...
+ + +class Required(Validator[V]): + + message: str = "value is required: {value!r}" + + def full_validate(self, value: V | None) -> None: + if value is None: + raise ValueError("required value") + + def validate(self, value: V) -> None: + pass + + +class RegexValidator(Validator[V]): + + _regex: re.Pattern[str] + message: str = "invalid value: {value!r}" + + def __init__(self, regex: str | re.Pattern[str], *args: Any, **kwargs: Any): + super().__init__(*args, **kwargs) + + if isinstance(regex, str): + self._regex = re.compile(regex) + else: + self._regex = regex + + def validate(self, value: V) -> None: + if not isinstance(value, str): + raise TypeError + + if self._regex.search(value) is None: + raise ValueError(f"doesn't match: {self._regex.pattern}") + + +class SingleLine(Validator[V]): + + message: str = "must contain only one line: {value!r}" + + def validate(self, value: V) -> None: + if not isinstance(value, str): + raise TypeError + + if "\n" in value or "\r" in value: + raise ValueError("multiline str") diff --git a/packaging/metadata/raw.py b/packaging/metadata/raw.py new file mode 100644 index 000000000..9d508c507 --- /dev/null +++ b/packaging/metadata/raw.py @@ -0,0 +1,614 @@ +import email.feedparser +import email.header +import email.message +import email.parser +import email.policy +import json +from typing import Any, Dict, List, Tuple, TypedDict, Union, cast + +__all__ = ["RawMetadata", "parse_email", "parse_json"] + + +# The RawMetadata class attempts to make as few assumptions about +# the underlying serialization formats as possible, these could +# possibly serialize in an entirely different way, but the idea +# here is that as long as a serialization formats some very +# basic primitives in *some* way (strings, lists, and one map +# but that map can be easily implemented as a list of strings) +# then we can support serializing to and from that format. 
+class RawMetadata(TypedDict, total=False):
+    # Metadata 1.0 - PEP 241
+    metadata_version: str
+    name: str
+    version: str
+    platforms: List[str]
+    summary: str
+    description: str
+    keywords: List[str]
+    home_page: str
+    author: str
+    author_email: str
+    license: str
+
+    # Metadata 1.1 - PEP 314
+    supported_platforms: List[str]
+    download_url: str
+    classifiers: List[str]
+    requires: List[str]
+    provides: List[str]
+    obsoletes: List[str]
+
+    # Metadata 1.2 - PEP 345
+    maintainer: str
+    maintainer_email: str
+    requires_dist: List[str]
+    provides_dist: List[str]
+    obsoletes_dist: List[str]
+    requires_python: str
+    requires_external: List[str]
+    project_urls: Dict[str, str]
+
+    # Metadata 2.0
+    # PEP 426 attempted to completely revamp the metadata format
+    # but got stuck without ever being able to build consensus on
+    # it and ultimately ended up withdrawn.
+    #
+    # However, a number of tools had started emitting METADATA with
+    # `2.0` Metadata-Version, so for historical reasons, this version
+    # was skipped.
+
+    # Metadata 2.1 - PEP 566
+    description_content_type: str
+    provides_extra: List[str]
+
+    # Metadata 2.2 - PEP 643
+    dynamic: List[str]
+
+    # Metadata 2.3 - PEP 685
+    # No new fields were added in PEP 685, just some edge cases were
+    # tightened up to provide better interoperability.
+ + +_STRING_FIELDS = { + "author", + "author_email", + "description", + "description_content_type", + "download_url", + "home_page", + "license", + "maintainer", + "maintainer_email", + "metadata_version", + "name", + "requires_python", + "summary", + "version", +} + +_LIST_STRING_FIELDS = { + "classifiers", + "dynamic", + "obsoletes", + "obsoletes_dist", + "platforms", + "provides", + "provides_dist", + "provides_extra", + "requires", + "requires_dist", + "requires_external", + "supported_platforms", +} + +# General helper functions for parsing some string values for reusing in +# multiple parse_FORMAT functions + + +def _parse_keywords(data: str) -> List[str]: + return [k.strip() for k in data.split(",")] + + +def _parse_project_urls(data: List[str]) -> Dict[str, str]: + urls = {} + for pair in data: + # Our logic is slightly tricky here as we want to try and do + # *something* reasonable with malformed data. + # + # The main thing that we have to worry about, is data that does + # not have a ',' at all to split the Key from the Value. There + # isn't a singular right answer here, and we will fail validation + # later on (if the caller is validating) so it doesn't *really* + # matter, but since the missing value has to be an empty str + # and our return value is dict[str, str], if we let the key + # be the missing value, then they'd just multiple '' values that + # overwrite each other. + # + # The other potentional issue is that it's possible to have the + # same Key multiple times in the metadata, with no solid "right" + # answer with what to do in that case, we'll do the only thing + # we can, which is treat the field as unparseable and add it + # to our list of unparsed fields. + parts = [p.strip() for p in pair.split(",", 1)] + parts.extend([""] * (max(0, 2 - len(parts)))) # Ensure 2 items + + # TODO: The spec doesn't say anything about if the keys should be + # considered case sensitive or not... 
logically they should + # be case preserving, but case insensitive, but doing that + # would open up more cases where we might have duplicated + # entries. + label, url = parts + if label in urls: + # The label already exists in our set of urls, so this field + # is unparseable, and we can just add the whole thing to our + # unparseable data and stop processing it. + raise KeyError("duplicate keys in project urls") + urls[label] = url + + return urls + + +# The various parse_FORMAT functions here are intended to be as lenient as +# possible in their parsing, while still returning a correctly typed +# RawMetadata. +# +# To aid in this, we also generally want to do as little touching of the +# data as possible, except where there are possibly some historic holdovers +# that make valid data awkward to work with. +# +# While this is a lower level, intermediate format than our ``Metadata`` +# class, some light touch ups can make a massive different in usability. + + +_EMAIL_FIELD_MAPPING = { + "author": "author", + "author-email": "author_email", + "classifier": "classifiers", + "description": "description", + "description-content-type": "description_content_type", + "download-url": "download_url", + "dynamic": "dynamic", + "home-page": "home_page", + "keywords": "keywords", + "license": "license", + "maintainer": "maintainer", + "maintainer-email": "maintainer_email", + "metadata-version": "metadata_version", + "name": "name", + "obsoletes": "obsoletes", + "obsoletes-dist": "obsoletes_dist", + "platform": "platforms", + "project-url": "project_urls", + "provides": "provides", + "provides-dist": "provides_dist", + "provides-extra": "provides_extra", + "requires": "requires", + "requires-dist": "requires_dist", + "requires-external": "requires_external", + "requires-python": "requires_python", + "summary": "summary", + "supported-platform": "supported_platforms", + "version": "version", +} + + +def parse_email(data: Union[bytes, str]) -> Tuple[RawMetadata, Dict[Any, 
Any]]: + raw: Dict[str, Any] = {} + unparsed: Dict[Any, Any] = {} + + if isinstance(data, str): + parsed = email.parser.Parser(policy=email.policy.compat32).parsestr(data) + else: + parsed = email.parser.BytesParser(policy=email.policy.compat32).parsebytes(data) + + # We have to wrap parsed.keys() in a set, because in the case of multiple + # values for a key (a list), the key will appear multiple times in the + # list of keys, but we're avoiding that by using get_all(). + for name in set(parsed.keys()): + # Header names in RFC are case insensitive, so we'll normalize to all + # lower case to make comparisons easier. + name = name.lower() + + # We use get_all here, even for fields that aren't multiple use, because + # otherwise someone could have say, two Name fields, and we would just + # silently ignore it rather than doing something about it. + headers = parsed.get_all(name) + + # The way the email module works when parsing bytes is that it + # unconditionally decodes the bytes as ascii, using the surrogateescape + # handler, and then when you pull that data back out (such as with get_all) + # it looks to see if the str has any surrogate escapes, and if it does + # it wraps it in a Header object instead of returning the string. + # + # So we'll look for those Header objects, and fix up the encoding + value = [] + valid_encoding = True + for h in headers: + # It's unclear if this can return more types than just a Header or + # a str, so we'll just assert here to make sure. + assert isinstance(h, (email.header.Header, str)) + + # If it's a header object, we need to do our little dance to get + # the real data out of it. In cases where there is invalid data + # we're going to end up with mojibake, but I don't see a good way + # around that without reimplementing parts of the Header object + # ourselves. + # + # That should be fine, since if that happens, this key is going + # into the unparsed dict anyways. 
+ if isinstance(h, email.header.Header): + # The Heade object stores it's data as chunks, and each chunk + # can be independently encoded, so we'll need to check each + # of them. + chunks = [] + for bin, encoding in email.header.decode_header(h): + # This means it found a surrogate escape, that could be + # valid data (if the source was utf8), or invalid. + if encoding == "unknown-8bit": + try: + bin.decode("utf8", "strict") + except UnicodeDecodeError: + # Enable mojibake + encoding = "latin1" + valid_encoding = False + else: + encoding = "utf8" + chunks.append((bin, encoding)) + + # Turn our chunks back into a Header object, then let that + # Header object do the right thing to turn them into a + # string for us. + value.append(str(email.header.make_header(chunks))) + # This is already a string, so just add it + else: + value.append(h) + + # We've processed all of our values to get them into a list of str, + # but we may have mojibake data, in which case this is an unparsed + # field. + if not valid_encoding: + unparsed[name] = value + continue + + raw_name = _EMAIL_FIELD_MAPPING.get(name) + if raw_name is None: + # This is a bit of a weird situation, we've encountered a key that + # we don't know what it means, so we don't know whether it's meant + # to be a list or not. + # + # Since we can't really tell one way or another, we'll just leave it + # as a list, even though it may be a single item list, because that's + # what makes the most sense for email headers. + unparsed[name] = value + continue + + # If this is one of our string fields, then we'll check to see if our + # value is a list of a single item, if it is then we'll assume that + # it was emited as a single string, and unwrap the str from inside + # the list. + # + # If it's any other kind of data, then we haven't the faintest clue + # what we should parse it as, and we have to just add it to our list + # of unparsed stuff. 
+        if raw_name in _STRING_FIELDS and len(value) == 1:
+            raw[raw_name] = value[0]
+        # If this is one of our list of string fields, then we can just assign
+        # the value, since email *only* has strings, and our get_all() call
+        # above ensures that this is a list.
+        elif raw_name in _LIST_STRING_FIELDS:
+            raw[raw_name] = value
+        # Special Case: Keywords
+        # The keywords field is implemented in the metadata spec as a str,
+        # but it conceptually is a list of strings, and is serialized using
+        # ", ".join(keywords), so we'll do some light data massaging to turn
+        # this into what it logically is.
+        elif raw_name == "keywords" and len(value) == 1:
+            raw[raw_name] = _parse_keywords(value[0])
+        # Special Case: Project-URL
+        # The project urls is implemented in the metadata spec as a list of
+        # specially formatted strings that represent a key and a value, which
+        # is fundamentally a mapping, however the email format doesn't support
+        # mappings in a sane way, so it was crammed into a list of strings
+        # instead.
+        #
+        # We will do a little light data massaging to turn this into a map as
+        # it logically should be.
+        elif raw_name == "project_urls":
+            try:
+                raw[raw_name] = _parse_project_urls(value)
+            except KeyError:  # raised on a duplicate Project-URL label
+                unparsed[name] = value
+        # Nothing that we've done has managed to parse this, so it'll just
+        # throw it in our unparseable data and move on.
+        else:
+            unparsed[name] = value
+
+    # We need to support getting the Description from the message payload in
+    # addition to getting it from the headers, but since Description is
+    # conceptually a string, if it's already been set from headers then we'll
+    # clear it out and move them both to unparsed.
+    try:
+        payload = _get_payload(parsed, data)
+    except ValueError:
+        unparsed["Description"] = parsed.get_payload(decode=isinstance(data, bytes))
+    else:
+        if payload:
+            # Check to see if we've already got a description, if so then both
+            # it, and this body move to unparseable.
+ if "description" in raw: + unparsed["Description"] = [raw.pop("description"), payload] + else: + raw["description"] = payload + + # We need to cast our `raw` to a metadata, because a TypedDict only support + # literal key names, but we're computing our key names on purpose, but the + # way this function is implemented, our `TypedDict` can only have valid key + # names. + return cast(RawMetadata, raw), unparsed + + +_EMAIL_FIELD_ORDER = [ + # Always put the metadata version first, incase it ever changes how + # we parse this file. + "metadata_version", + # Put the other pieces of mandatory information next. + "name", + "version", + # We're just going to emit all of these in sorted order, except we'll + # float deprecated or "rarely used" fields to the bottom. + "author", + "author_email", + "classifiers", + # We are purposely excluding the description field, we don't want to + # write that field out as a header, so we won't include it in this list + # and it will have to be manually handled instead. + # "description", + "description_content_type", + "download_url", + "dynamic", + "home_page", + "keywords", + "license", + "maintainer", + "maintainer_email", + "platforms", + "project_urls", + "provides_extra", + "requires_dist", + "requires_python", + "summary", + "supported_platforms", + # Deprecated or "rarely used" + "obsoletes", + "obsoletes_dist", + "provides", + "provides_dist", + "requires", + "requires_external", +] + + +def emit_email(raw: RawMetadata) -> bytes: + # TypedDict only allows literal keys, we know that are dynamic keys are correct + # but to satisfy the type checker we'll cast things. + data = cast(Dict[str, Any], raw) + + # Figure out our mapping to email names + field_names = {v: k for (k, v) in _EMAIL_FIELD_MAPPING.items()} + + # From what I can tell, there is no way to get the email module in the stdlib + # to actually emit a ``METADATA``file in the format that we need, so instead + # we'll have to manually craft one. 
+ lines = [] + + for field in _EMAIL_FIELD_ORDER: + field_name = field_names[field] + field_data = data.get(field) + if field_data: + # String fields get emitted as Key: Data + if field in _STRING_FIELDS and isinstance(field_data, str): + lines.append(f"{field_name}: {_rfc822_escape(field_data)}") + # List String fields get emitted as a Key: Data per entry. + elif field in _LIST_STRING_FIELDS and isinstance(field_data, list): + for item in field_data: + lines.append(f"{field_name}: {_rfc822_escape(item)}") + # Special Case: Keywords + # We need to turn our List String for Keywords back into a singular + # string for the core metadata spec. + elif field == "keywords" and isinstance(field_data, list): + lines.append(f"{field_name}: {_rfc822_escape(', '.join(field_data))}") + # Special Case: Project-URL + # We need to turn our dict[str, str] back into the list of specially + # formatted strings to match what the core metadata expects. + elif field == "project_urls" and isinstance(field_data, dict): + for label, url in field_data.items(): + lines.append( + f"{field_name}: {_rfc822_escape(', '.join([label, url]))}" + ) + + msg = "\n".join(lines) + description = raw.get("description") + if description: + msg = msg + "\n\n" + description + + return msg.encode("utf8") + + +# This might appear to be a mapping of the same key to itself, and in many cases +# it is. However, the algorithm in PEP 566 doesn't match 100% the keys chosen +# for RawMetadata, so we use this mapping just like with email to handle that. 
# Direction of the mapping: JSON key -> ``RawMetadata`` key.
_JSON_FIELD_MAPPING = {
    "author": "author",
    "author_email": "author_email",
    "classifier": "classifiers",
    "description": "description",
    "description_content_type": "description_content_type",
    "download_url": "download_url",
    "dynamic": "dynamic",
    "home_page": "home_page",
    "keywords": "keywords",
    "license": "license",
    "maintainer": "maintainer",
    "maintainer_email": "maintainer_email",
    "metadata_version": "metadata_version",
    "name": "name",
    "obsoletes": "obsoletes",
    "obsoletes_dist": "obsoletes_dist",
    "platform": "platforms",
    "project_url": "project_urls",
    "provides": "provides",
    "provides_dist": "provides_dist",
    "provides_extra": "provides_extra",
    "requires": "requires",
    "requires_dist": "requires_dist",
    "requires_external": "requires_external",
    "requires_python": "requires_python",
    "summary": "summary",
    "supported_platform": "supported_platforms",
    "version": "version",
}


def parse_json(data: Union[bytes, str]) -> "Tuple[RawMetadata, Dict[Any, Any]]":
    """
    Parse JSON encoded metadata into ``(raw, unparsed)``.

    ``raw`` holds every recognised field whose value has the expected shape,
    keyed by its ``RawMetadata`` name. ``unparsed`` holds everything else
    (unknown keys, or known keys with a value of the wrong shape), keyed by the
    name used in the JSON document.

    Raises ``ValueError`` if the document is not valid JSON (raised by
    ``json.loads``) or if its top level is not an object.
    """
    raw: Dict[Any, Any] = {}
    unparsed: Dict[Any, Any] = {}
    parsed = json.loads(data)

    # We need to make sure that the data given to us actually implements
    # a dict, if it's any other type then there is no way we can parse
    # anything meaningful out of it, so we'll just give up and bail out.
    if not isinstance(parsed, dict):
        raise ValueError("Invalid json data, must be a mapping")

    for name, value in parsed.items():
        raw_name = _JSON_FIELD_MAPPING.get(name)
        if raw_name is None:
            # We don't know this key, so chuck it into our unparsed data
            # and continue on.
            unparsed[name] = value
            continue

        # Three of the branches below need "a list made only of strings", so
        # work that out once.
        is_str_list = isinstance(value, list) and all(
            isinstance(v, str) for v in value
        )

        # If this is one of our string fields, check to see if it's actually
        # a string, if it's not then we don't have any idea how to handle it.
        if raw_name in _STRING_FIELDS and isinstance(value, str):
            raw[raw_name] = value
        # If this is one of our list string fields, check to see if it's
        # actually a list of strings, if it's not then we don't have any idea
        # how to handle it.
        elif raw_name in _LIST_STRING_FIELDS and is_str_list:
            raw[raw_name] = cast(List[str], value)
        # Special Case: Keywords
        # The keywords field is implemented in the metadata spec as a str,
        # but it conceptually is a list of strings. The JSON form described
        # in PEP 566 already uses a list of strings, so there is nothing to
        # convert; it is only a special case because it is not one of our
        # list string fields.
        elif raw_name == "keywords" and is_str_list:
            raw[raw_name] = value
        # Special Case: Project-URL
        # The project urls are stored as a list of specially formatted
        # strings that each represent a key and a value. We turn that into
        # the mapping it logically is.
        elif raw_name == "project_urls" and is_str_list:
            try:
                raw[raw_name] = _parse_project_urls(value)
            except ValueError:
                unparsed[name] = value
        # Nothing that we've done has managed to parse this, so we'll just
        # throw it in our unparseable data and move on.
        else:
            unparsed[name] = value

    # We need to cast our `raw` to a metadata, because a TypedDict only
    # supports literal key names, but we're computing our key names on
    # purpose. The way this function is implemented, `raw` can only have
    # valid key names.
    return cast(RawMetadata, raw), unparsed


def emit_json(raw: "RawMetadata") -> bytes:
    """
    Serialize *raw* to UTF-8 encoded JSON.

    Fields are written in ``_EMAIL_FIELD_ORDER`` followed by the description.
    Fields that are missing, empty, or hold a value of an unexpected type are
    left out. The output uses the key names and value shapes that
    :func:`parse_json` accepts.
    """
    # TypedDict only allows literal keys, we know that our dynamic keys are
    # correct but to satisfy the type checker we'll cast things.
    data = cast(Dict[str, Any], raw)

    # Figure out our mapping to JSON names (RawMetadata key -> JSON key).
    field_names = {v: k for (k, v) in _JSON_FIELD_MAPPING.items()}

    out: Dict[str, Any] = {}
    for field in _EMAIL_FIELD_ORDER:
        field_name = field_names[field]
        field_data = data.get(field)
        if not field_data:
            continue

        if (field in _STRING_FIELDS and isinstance(field_data, str)) or (
            field in _LIST_STRING_FIELDS and isinstance(field_data, list)
        ):
            out[field_name] = field_data
        # Special Case: Keywords
        # Unlike the email format, the JSON form keeps keywords as a list of
        # strings, which is also the only shape parse_json() accepts. Joining
        # them into one string here would make the field unparseable on the
        # way back in.
        elif field == "keywords" and isinstance(field_data, list):
            out[field_name] = field_data
        # Special Case: Project-URL
        # We need to turn our dict[str, str] back into the list of specially
        # formatted strings to match what the core metadata expects.
        elif field == "project_urls" and isinstance(field_data, dict):
            out[field_name] = [
                f"{label}, {url}" for (label, url) in field_data.items()
            ]

    # The description is deliberately absent from _EMAIL_FIELD_ORDER, so it
    # has to be handled on its own.
    description = data.get("description")
    if description:
        out["description"] = description

    return json.dumps(out).encode("utf8")


def _get_payload(msg: email.message.Message, source: Union[bytes, str]) -> str:
    """
    Return the body of *msg* as a ``str``.

    *source* is the data *msg* was parsed from; its type decides who is
    responsible for decoding. Raises ``ValueError`` if the message is a
    multipart, or if a bytes payload is not valid UTF-8.
    """
    # If our source is a str, then our caller has managed encodings for us,
    # and we don't need to deal with it.
    if isinstance(source, str):
        payload: Union[List[str], str] = msg.get_payload()
        if isinstance(payload, list):
            raise ValueError("payload is a multipart")
        return payload

    # If our source is a bytes, then we're managing the encoding and we need
    # to deal with it.
    bpayload: Any = msg.get_payload(decode=True)
    # With decode=True a multipart message yields None rather than a list, so
    # test for "not bytes" instead of "is a list"; otherwise the .decode()
    # call below would fail with AttributeError instead of ValueError.
    if not isinstance(bpayload, bytes):
        raise ValueError("payload is a multipart")

    try:
        return bpayload.decode("utf8", "strict")
    except UnicodeDecodeError as exc:
        raise ValueError("payload in an invalid encoding") from exc


def _rfc822_escape(header: str) -> str:
    """
    Return a version of the string escaped for inclusion in an
    RFC-822 header, by ensuring there are 8 spaces after each newline.
    """
    return header.replace("\n", "\n" + 8 * " ")