From 0f77bc83d2cd5271b1bc6ffca00e6770c8bfcf3b Mon Sep 17 00:00:00 2001 From: Jelle Zijlstra Date: Thu, 15 Feb 2024 21:47:20 -0800 Subject: [PATCH 1/3] html5lib: Add various types I started out investigating comments in #11411 and ended up adding a few other types that were reasonably obvious from the source code. For reference: https://github.com/html5lib/html5lib-python/tree/master/html5lib cc @leonardr --- stubs/html5lib/html5lib/_inputstream.pyi | 35 ++++++++++---- stubs/html5lib/html5lib/_tokenizer.pyi | 60 ++++++++++++------------ stubs/html5lib/html5lib/html5parser.pyi | 27 ++++++----- 3 files changed, 70 insertions(+), 52 deletions(-) diff --git a/stubs/html5lib/html5lib/_inputstream.pyi b/stubs/html5lib/html5lib/_inputstream.pyi index 992aaa2f07a6..36010b5bdde7 100644 --- a/stubs/html5lib/html5lib/_inputstream.pyi +++ b/stubs/html5lib/html5lib/_inputstream.pyi @@ -1,5 +1,9 @@ -from _typeshed import Incomplete -from typing import Any +from _typeshed import Incomplete, SupportsRead +from typing import Any, TypeAlias, overload + +_UnicodeInputStream: TypeAlias = str | SupportsRead[str] +_BinaryInputStream: TypeAlias = bytes | SupportsRead[bytes] +_InputStream: TypeAlias = _UnicodeInputStream spaceCharactersBytes: Any asciiLettersBytes: Any @@ -20,14 +24,25 @@ class BufferedStream: def seek(self, pos) -> None: ... def read(self, bytes): ... -def HTMLInputStream(source, **kwargs): ... +@overload +def HTMLInputStream(source: _UnicodeInputStream) -> HTMLUnicodeInputStream: ... +@overload +def HTMLInputStream( + source: _BinaryInputStream, + override_encoding: str | bytes | None = None, + transport_encoding: str | bytes | None = None, + same_origin_parent_encoding: str | bytes | None = None, + likely_encoding: str | bytes | None = None, + default_encoding: str = "windows-1252", + useChardet: bool = True, +) -> HTMLBinaryInputStream: ... class HTMLUnicodeInputStream: reportCharacterErrors: Any newLines: Any charEncoding: Any dataStream: Any - def __init__(self, source) -> None: ... + def __init__(self, source: _UnicodeInputStream) -> None: ... chunk: str chunkSize: int chunkOffset: int @@ -56,11 +71,11 @@ class HTMLBinaryInputStream(HTMLUnicodeInputStream): charEncoding: Any def __init__( self, - source, - override_encoding: Incomplete | None = None, - transport_encoding: Incomplete | None = None, - same_origin_parent_encoding: Incomplete | None = None, - likely_encoding: Incomplete | None = None, + source: _BinaryInputStream, + override_encoding: str | bytes | None = None, + transport_encoding: str | bytes | None = None, + same_origin_parent_encoding: str | bytes | None = None, + likely_encoding: str | bytes | None = None, default_encoding: str = "windows-1252", useChardet: bool = True, ) -> None: ... @@ -108,4 +123,4 @@ class ContentAttrParser: def __init__(self, data) -> None: ... def parse(self): ... -def lookupEncoding(encoding): ... +def lookupEncoding(encoding: str | bytes | None) -> str | None: ... diff --git a/stubs/html5lib/html5lib/_tokenizer.pyi b/stubs/html5lib/html5lib/_tokenizer.pyi index f9685a0dd112..72b701c37491 100644 --- a/stubs/html5lib/html5lib/_tokenizer.pyi +++ b/stubs/html5lib/html5lib/_tokenizer.pyi @@ -1,6 +1,8 @@ from _typeshed import Incomplete from typing import Any +from ._inputstream import _InputStream + entitiesTrie: Any attributeMap = dict @@ -12,7 +14,7 @@ class HTMLTokenizer: state: Any escape: bool currentToken: Any - def __init__(self, stream, parser: Incomplete | None = None, **kwargs) -> None: ... + def __init__(self, stream: _InputStream, parser: Incomplete | None = None, **kwargs) -> None: ... tokenQueue: Any def __iter__(self): ... def consumeNumberEntity(self, isHex): ... @@ -36,23 +38,23 @@ class HTMLTokenizer: def rawtextLessThanSignState(self): ... def rawtextEndTagOpenState(self): ... def rawtextEndTagNameState(self): ... - def scriptDataLessThanSignState(self): ... - def scriptDataEndTagOpenState(self): ... - def scriptDataEndTagNameState(self): ... - def scriptDataEscapeStartState(self): ... - def scriptDataEscapeStartDashState(self): ... - def scriptDataEscapedState(self): ... - def scriptDataEscapedDashState(self): ... - def scriptDataEscapedDashDashState(self): ... - def scriptDataEscapedLessThanSignState(self): ... - def scriptDataEscapedEndTagOpenState(self): ... - def scriptDataEscapedEndTagNameState(self): ... - def scriptDataDoubleEscapeStartState(self): ... - def scriptDataDoubleEscapedState(self): ... - def scriptDataDoubleEscapedDashState(self): ... - def scriptDataDoubleEscapedDashDashState(self): ... - def scriptDataDoubleEscapedLessThanSignState(self): ... - def scriptDataDoubleEscapeEndState(self): ... + def scriptDataLessThanSignState(self) -> bool: ... + def scriptDataEndTagOpenState(self) -> bool: ... + def scriptDataEndTagNameState(self) -> bool: ... + def scriptDataEscapeStartState(self) -> bool: ... + def scriptDataEscapeStartDashState(self) -> bool: ... + def scriptDataEscapedState(self) -> bool: ... + def scriptDataEscapedDashState(self) -> bool: ... + def scriptDataEscapedDashDashState(self) -> bool: ... + def scriptDataEscapedLessThanSignState(self) -> bool: ... + def scriptDataEscapedEndTagOpenState(self) -> bool: ... + def scriptDataEscapedEndTagNameState(self) -> bool: ... + def scriptDataDoubleEscapeStartState(self) -> bool: ... + def scriptDataDoubleEscapedState(self) -> bool: ... + def scriptDataDoubleEscapedDashState(self) -> bool: ... + def scriptDataDoubleEscapedDashDashState(self) -> bool: ... + def scriptDataDoubleEscapedLessThanSignState(self) -> bool: ... + def scriptDataDoubleEscapeEndState(self) -> bool: ... def beforeAttributeNameState(self): ... def attributeNameState(self): ... def afterAttributeNameState(self): ... @@ -64,17 +66,17 @@ class HTMLTokenizer: def selfClosingStartTagState(self): ... def bogusCommentState(self): ... def markupDeclarationOpenState(self): ... - def commentStartState(self): ... - def commentStartDashState(self): ... - def commentState(self): ... - def commentEndDashState(self): ... - def commentEndState(self): ... - def commentEndBangState(self): ... - def doctypeState(self): ... - def beforeDoctypeNameState(self): ... - def doctypeNameState(self): ... - def afterDoctypeNameState(self): ... - def afterDoctypePublicKeywordState(self): ... + def commentStartState(self) -> bool: ... + def commentStartDashState(self) -> bool: ... + def commentState(self) -> bool: ... + def commentEndDashState(self) -> bool: ... + def commentEndState(self) -> bool: ... + def commentEndBangState(self) -> bool: ... + def doctypeState(self) -> bool: ... + def beforeDoctypeNameState(self) -> bool: ... + def doctypeNameState(self) -> bool: ... + def afterDoctypeNameState(self) -> bool: ... + def afterDoctypePublicKeywordState(self) -> bool: ... def beforeDoctypePublicIdentifierState(self): ... def doctypePublicIdentifierDoubleQuotedState(self): ... def doctypePublicIdentifierSingleQuotedState(self): ... diff --git a/stubs/html5lib/html5lib/html5parser.pyi b/stubs/html5lib/html5lib/html5parser.pyi index e946f7d99485..d19a6954ad15 100644 --- a/stubs/html5lib/html5lib/html5parser.pyi +++ b/stubs/html5lib/html5lib/html5parser.pyi @@ -1,25 +1,25 @@ -from _typeshed import Incomplete, SupportsRead +from _typeshed import Incomplete from typing import Any, Literal, overload from xml.etree.ElementTree import Element +from ._inputstream import _InputStream +from ._tokenizer import HTMLTokenizer + @overload def parse( - doc: str | bytes | SupportsRead[str] | SupportsRead[bytes], - treebuilder: Literal["etree"] = "etree", - namespaceHTMLElements: bool = True, - **kwargs, + doc: _InputStream, treebuilder: Literal["etree"] = "etree", namespaceHTMLElements: bool = True, **kwargs ) -> Element: ... @overload -def parse( - doc: str | bytes | SupportsRead[str] | SupportsRead[bytes], treebuilder: str, namespaceHTMLElements: bool = True, **kwargs +def parse(doc: _InputStream, treebuilder: str, namespaceHTMLElements: bool = True, **kwargs): ... +def parseFragment( + doc: _InputStream, container: str = "div", treebuilder: str = "etree", namespaceHTMLElements: bool = True, **kwargs ): ... -def parseFragment(doc, container: str = "div", treebuilder: str = "etree", namespaceHTMLElements: bool = True, **kwargs): ... def method_decorator_metaclass(function): ... class HTMLParser: - strict: Any + strict: bool tree: Any - errors: Any + errors: list[Incomplete] phases: Any def __init__( self, tree: Incomplete | None = None, strict: bool = False, namespaceHTMLElements: bool = True, debug: bool = False @@ -27,17 +27,18 @@ class HTMLParser: firstStartTag: bool log: Any compatMode: str + container: str innerHTML: Any phase: Any lastPhase: Any beforeRCDataPhase: Any framesetOK: bool - tokenizer: Any + tokenizer: HTMLTokenizer def reset(self) -> None: ... @property def documentEncoding(self) -> str | None: ... - def isHTMLIntegrationPoint(self, element) -> bool: ... - def isMathMLTextIntegrationPoint(self, element) -> bool: ... + def isHTMLIntegrationPoint(self, element: Element) -> bool: ... + def isMathMLTextIntegrationPoint(self, element: Element) -> bool: ... def mainLoop(self) -> None: ... def parse(self, stream, scripting: bool = ..., **kwargs): ... def parseFragment(self, stream, *args, **kwargs): ... From 01b729a5aca054b4b0f65d541ae8d19868373907 Mon Sep 17 00:00:00 2001 From: Jelle Zijlstra Date: Thu, 15 Feb 2024 22:04:15 -0800 Subject: [PATCH 2/3] Fixes and some more types --- stubs/html5lib/html5lib/_inputstream.pyi | 6 ++++-- stubs/html5lib/html5lib/html5parser.pyi | 4 ++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/stubs/html5lib/html5lib/_inputstream.pyi b/stubs/html5lib/html5lib/_inputstream.pyi index 36010b5bdde7..83d53c965c88 100644 --- a/stubs/html5lib/html5lib/_inputstream.pyi +++ b/stubs/html5lib/html5lib/_inputstream.pyi @@ -1,9 +1,10 @@ from _typeshed import Incomplete, SupportsRead -from typing import Any, TypeAlias, overload +from typing import Any, overload +from typing_extensions import TypeAlias _UnicodeInputStream: TypeAlias = str | SupportsRead[str] _BinaryInputStream: TypeAlias = bytes | SupportsRead[bytes] -_InputStream: TypeAlias = _UnicodeInputStream +_InputStream: TypeAlias = _UnicodeInputStream # noqa: Y7047 # used in other files spaceCharactersBytes: Any asciiLettersBytes: Any @@ -29,6 +30,7 @@ def HTMLInputStream(source: _UnicodeInputStream) -> HTMLUnicodeInputStream: ... @overload def HTMLInputStream( source: _BinaryInputStream, + *, override_encoding: str | bytes | None = None, transport_encoding: str | bytes | None = None, same_origin_parent_encoding: str | bytes | None = None, diff --git a/stubs/html5lib/html5lib/html5parser.pyi b/stubs/html5lib/html5lib/html5parser.pyi index d19a6954ad15..3f2fa19db7a1 100644 --- a/stubs/html5lib/html5lib/html5parser.pyi +++ b/stubs/html5lib/html5lib/html5parser.pyi @@ -40,8 +40,8 @@ class HTMLParser: def isHTMLIntegrationPoint(self, element: Element) -> bool: ... def isMathMLTextIntegrationPoint(self, element: Element) -> bool: ... def mainLoop(self) -> None: ... - def parse(self, stream, scripting: bool = ..., **kwargs): ... - def parseFragment(self, stream, *args, **kwargs): ... + def parse(self, stream: _InputStream, scripting: bool = ..., **kwargs): ... + def parseFragment(self, stream: _InputStream, *args, **kwargs): ... def parseError(self, errorcode: str = "XXX-undefined-error", datavars: Incomplete | None = None) -> None: ... def adjustMathMLAttributes(self, token) -> None: ... def adjustSVGAttributes(self, token) -> None: ... From 3eeca40f1f53cdd9f8758be9bde50d6390e850e1 Mon Sep 17 00:00:00 2001 From: Jelle Zijlstra Date: Thu, 15 Feb 2024 22:08:38 -0800 Subject: [PATCH 3/3] spell --- stubs/html5lib/html5lib/_inputstream.pyi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/stubs/html5lib/html5lib/_inputstream.pyi b/stubs/html5lib/html5lib/_inputstream.pyi index 83d53c965c88..02bb378e77ed 100644 --- a/stubs/html5lib/html5lib/_inputstream.pyi +++ b/stubs/html5lib/html5lib/_inputstream.pyi @@ -4,7 +4,7 @@ from typing_extensions import TypeAlias _UnicodeInputStream: TypeAlias = str | SupportsRead[str] _BinaryInputStream: TypeAlias = bytes | SupportsRead[bytes] -_InputStream: TypeAlias = _UnicodeInputStream # noqa: Y7047 # used in other files +_InputStream: TypeAlias = _UnicodeInputStream # noqa: Y047 # used in other files spaceCharactersBytes: Any asciiLettersBytes: Any