From cf47eb1be1252f7004b7324108637e815b0c2602 Mon Sep 17 00:00:00 2001
From: Jelle Zijlstra
Date: Thu, 1 Aug 2024 10:18:27 -0700
Subject: [PATCH 1/4] Fix incorrect linenos on fstring tokens with escaped
 newlines

I don't think this can affect Black itself much (maybe for formatting
ranges), but I ran into this with https://github.com/JelleZijlstra/lib2toast
---
 CHANGES.md                     |   2 +
 src/blib2to3/pgen2/tokenize.py |   2 +-
 tests/test_tokenize.py         | 117 +++++++++++++++++++++++++++++++++
 3 files changed, 120 insertions(+), 1 deletion(-)
 create mode 100644 tests/test_tokenize.py

diff --git a/CHANGES.md b/CHANGES.md
index e3e37484a59..d224701bd21 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -39,6 +39,8 @@
 
 - Fix bug with Black incorrectly parsing empty lines with a backslash (#4343)
 
+- Fix incorrect line numbers in the tokenizer for certain tokens within f-strings (#4423)
+
 ### Performance
 
diff --git a/src/blib2to3/pgen2/tokenize.py b/src/blib2to3/pgen2/tokenize.py
index 28972a9bd78..ecd017b3148 100644
--- a/src/blib2to3/pgen2/tokenize.py
+++ b/src/blib2to3/pgen2/tokenize.py
@@ -638,7 +638,7 @@ def generate_tokens(
         else:
             if is_fstring_start(token):
                 fstring_start, token = _split_fstring_start_and_middle(token)
-                fstring_start_epos = (lnum, spos[1] + len(fstring_start))
+                fstring_start_epos = (spos[0], spos[1] + len(fstring_start))
                 yield (
                     FSTRING_START,
                     fstring_start,
diff --git a/tests/test_tokenize.py b/tests/test_tokenize.py
new file mode 100644
index 00000000000..54b1e3c4693
--- /dev/null
+++ b/tests/test_tokenize.py
@@ -0,0 +1,117 @@
+"""Tests for the blib2to3 tokenizer."""
+
+import io
+import black
+import sys
+import textwrap
+from blib2to3.pgen2 import tokenize, token
+from dataclasses import dataclass
+
+
+@dataclass
+class Token:
+    type: str
+    string: str
+    start: tokenize.Coord
+    end: tokenize.Coord
+
+
+def get_tokens(text: str) -> list[Token]:
+    """Return the tokens produced by the tokenizer."""
+    readline = io.StringIO(text).readline
+    tokens: list[Token] = []
+
+    def tokeneater(
+        type: int, string: str, start: tokenize.Coord, end: tokenize.Coord, line: str
+    ) -> None:
+        tokens.append(Token(token.tok_name[type], string, start, end))
+
+    tokenize.tokenize(readline, tokeneater)
+    return tokens
+
+
+def assert_tokenizes(text: str, tokens: list[Token]) -> None:
+    """Assert that the tokenizer produces the expected tokens."""
+    actual_tokens = get_tokens(text)
+    assert actual_tokens == tokens
+
+
+def test_simple() -> None:
+    assert_tokenizes(
+        "1",
+        [Token("NUMBER", "1", (1, 0), (1, 1)), Token("ENDMARKER", "", (2, 0), (2, 0))],
+    )
+    assert_tokenizes(
+        "'a'",
+        [
+            Token("STRING", "'a'", (1, 0), (1, 3)),
+            Token("ENDMARKER", "", (2, 0), (2, 0)),
+        ],
+    )
+    assert_tokenizes(
+        "a",
+        [Token("NAME", "a", (1, 0), (1, 1)), Token("ENDMARKER", "", (2, 0), (2, 0))],
+    )
+
+
+def test_fstring() -> None:
+    assert_tokenizes(
+        'f"x"',
+        [
+            Token("FSTRING_START", 'f"', (1, 0), (1, 2)),
+            Token("FSTRING_MIDDLE", 'x', (1, 2), (1, 3)),
+            Token("FSTRING_END", '"', (1, 3), (1, 4)),
+            Token("ENDMARKER", "", (2, 0), (2, 0)),
+        ],
+    )
+    assert_tokenizes(
+        'f"{x}"',
+        [
+            Token("FSTRING_START", 'f"', (1, 0), (1, 2)),
+            Token("FSTRING_MIDDLE", '', (1, 2), (1, 2)),
+            Token("LBRACE", '{', (1, 2), (1, 3)),
+            Token("NAME", "x", (1, 3), (1, 4)),
+            Token("RBRACE", '}', (1, 4), (1, 5)),
+            Token("FSTRING_MIDDLE", '', (1, 5), (1, 5)),
+            Token("FSTRING_END", '"', (1, 5), (1, 6)),
+            Token("ENDMARKER", "", (2, 0), (2, 0)),
+        ],
+    )
+    assert_tokenizes(
+        'f"{x:y}"\n',
+        [
+            Token(type="FSTRING_START", string='f"', start=(1, 0), end=(1, 2)),
+            Token(type="FSTRING_MIDDLE", string="", start=(1, 2), end=(1, 2)),
+            Token(type="LBRACE", string="{", start=(1, 2), end=(1, 3)),
+            Token(type="NAME", string="x", start=(1, 3), end=(1, 4)),
+            Token(type="OP", string=":", start=(1, 4), end=(1, 5)),
+            Token(type="FSTRING_MIDDLE", string="y", start=(1, 5), end=(1, 6)),
+            Token(type="RBRACE", string="}", start=(1, 6), end=(1, 7)),
+            Token(type="FSTRING_MIDDLE", string="", start=(1, 7), end=(1, 7)),
+            Token(type="FSTRING_END", string='"', start=(1, 7), end=(1, 8)),
+            Token(type="NEWLINE", string="\n", start=(1, 8), end=(1, 9)),
+            Token(type="ENDMARKER", string="", start=(2, 0), end=(2, 0)),
+        ],
+    )
+    assert_tokenizes(
+        'f"x\\\n{a}"\n',
+        [
+            Token(type="FSTRING_START", string='f"', start=(1, 0), end=(1, 2)),
+            Token(type="FSTRING_MIDDLE", string="x\\\n", start=(1, 2), end=(2, 0)),
+            Token(type="LBRACE", string="{", start=(2, 0), end=(2, 1)),
+            Token(type="NAME", string="a", start=(2, 1), end=(2, 2)),
+            Token(type="RBRACE", string="}", start=(2, 2), end=(2, 3)),
+            Token(type="FSTRING_MIDDLE", string="", start=(2, 3), end=(2, 3)),
+            Token(type="FSTRING_END", string='"', start=(2, 3), end=(2, 4)),
+            Token(type="NEWLINE", string="\n", start=(2, 4), end=(2, 5)),
+            Token(type="ENDMARKER", string="", start=(3, 0), end=(3, 0)),
+        ],
+    )
+
+
+if __name__ == "__main__":
+    code = sys.stdin.read()
+    tokens = get_tokens(code)
+    text = f"assert_tokenizes({code!r}, {tokens!r})"
+    text = black.format_str(text, mode=black.FileMode())
+    print(textwrap.indent(text, "    "))
\ No newline at end of file

From 09986797dfb47f59950ac22535b8b5679f2173c0 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Thu, 1 Aug 2024 17:20:31 +0000
Subject: [PATCH 2/4] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 CHANGES.md             | 3 ++-
 tests/test_tokenize.py | 7 ++++---
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/CHANGES.md b/CHANGES.md
index d224701bd21..48fe337392d 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -39,7 +39,8 @@
 
 - Fix bug with Black incorrectly parsing empty lines with a backslash (#4343)
 
-- Fix incorrect line numbers in the tokenizer for certain tokens within f-strings (#4423)
+- Fix incorrect line numbers in the tokenizer for certain tokens within f-strings
+  (#4423)
 
 ### Performance
 
diff --git a/tests/test_tokenize.py b/tests/test_tokenize.py
index 54b1e3c4693..1794533b782 100644
--- a/tests/test_tokenize.py
+++ b/tests/test_tokenize.py
@@ -1,12 +1,13 @@
 """Tests for the blib2to3 tokenizer."""
 
 import io
-import black
 import sys
 import textwrap
-from blib2to3.pgen2 import tokenize, token
 from dataclasses import dataclass
 
+import black
+from blib2to3.pgen2 import token, tokenize
+
 
 @dataclass
 class Token:
@@ -114,4 +115,4 @@ def test_fstring() -> None:
     tokens = get_tokens(code)
     text = f"assert_tokenizes({code!r}, {tokens!r})"
     text = black.format_str(text, mode=black.FileMode())
-    print(textwrap.indent(text, "    "))
\ No newline at end of file
+    print(textwrap.indent(text, "    "))
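[Editor's note: illustration, not part of the patch series.] PATCH 1/4 above changes `fstring_start_epos` from `(lnum, ...)` to `(spos[0], ...)`. The difference only matters when the f-string's middle contains a backslash-escaped newline: by the time the combined start+middle token is split by `_split_fstring_start_and_middle`, the tokenizer's running line counter `lnum` has already advanced past the escaped newline, so FSTRING_START, a token that always lies entirely on its first line, was reported as ending on a later line (end position `(2, 2)` instead of `(1, 2)` for the example below, judging from the one-line fix). A minimal sketch of the behavior, reusing the same blib2to3 callback API the new tests exercise; the helper name `show_tokens` is ours, not the patch's:

    import io

    from blib2to3.pgen2 import token, tokenize


    def show_tokens(text: str) -> None:
        """Print every token with its (row, column) start and end coordinates."""

        def tokeneater(type, string, start, end, line):
            print(token.tok_name[type], repr(string), "start:", start, "end:", end)

        tokenize.tokenize(io.StringIO(text).readline, tokeneater)


    # An f-string whose middle spans an escaped newline. With the fix applied,
    # FSTRING_START is reported as (1, 0)-(1, 2), matching the last test case
    # added in PATCH 1/4; only the FSTRING_MIDDLE token actually crosses lines.
    show_tokens('f"x\\\n{a}"\n')
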
From d1789dd93b32d0148be3faa6fea1da92d700e74d Mon Sep 17 00:00:00 2001
From: Jelle Zijlstra
Date: Thu, 1 Aug 2024 10:21:49 -0700
Subject: [PATCH 3/4] format

---
 tests/test_tokenize.py | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/tests/test_tokenize.py b/tests/test_tokenize.py
index 1794533b782..71773069546 100644
--- a/tests/test_tokenize.py
+++ b/tests/test_tokenize.py
@@ -60,7 +60,7 @@ def test_fstring() -> None:
         'f"x"',
         [
             Token("FSTRING_START", 'f"', (1, 0), (1, 2)),
-            Token("FSTRING_MIDDLE", 'x', (1, 2), (1, 3)),
+            Token("FSTRING_MIDDLE", "x", (1, 2), (1, 3)),
             Token("FSTRING_END", '"', (1, 3), (1, 4)),
             Token("ENDMARKER", "", (2, 0), (2, 0)),
         ],
@@ -69,11 +69,11 @@ def test_fstring() -> None:
         'f"{x}"',
         [
             Token("FSTRING_START", 'f"', (1, 0), (1, 2)),
-            Token("FSTRING_MIDDLE", '', (1, 2), (1, 2)),
-            Token("LBRACE", '{', (1, 2), (1, 3)),
+            Token("FSTRING_MIDDLE", "", (1, 2), (1, 2)),
+            Token("LBRACE", "{", (1, 2), (1, 3)),
             Token("NAME", "x", (1, 3), (1, 4)),
-            Token("RBRACE", '}', (1, 4), (1, 5)),
-            Token("FSTRING_MIDDLE", '', (1, 5), (1, 5)),
+            Token("RBRACE", "}", (1, 4), (1, 5)),
+            Token("FSTRING_MIDDLE", "", (1, 5), (1, 5)),
             Token("FSTRING_END", '"', (1, 5), (1, 6)),
             Token("ENDMARKER", "", (2, 0), (2, 0)),
         ],
@@ -110,9 +110,10 @@ def test_fstring() -> None:
     )
 
 
+# Run "echo some code | python tests/test_tokenize.py" to generate test cases.
 if __name__ == "__main__":
     code = sys.stdin.read()
     tokens = get_tokens(code)
     text = f"assert_tokenizes({code!r}, {tokens!r})"
-    text = black.format_str(text, mode=black.FileMode())
+    text = black.format_str(text, mode=black.Mode())
     print(textwrap.indent(text, "    "))

From d193bd5ca655e8d9aba0296ea572663a27443b29 Mon Sep 17 00:00:00 2001
From: Jelle Zijlstra
Date: Thu, 1 Aug 2024 10:35:36 -0700
Subject: [PATCH 4/4] fix 3.8

---
 tests/test_tokenize.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/tests/test_tokenize.py b/tests/test_tokenize.py
index 71773069546..3798a9b6a92 100644
--- a/tests/test_tokenize.py
+++ b/tests/test_tokenize.py
@@ -4,6 +4,7 @@
 import sys
 import textwrap
 from dataclasses import dataclass
+from typing import List
 
 import black
 from blib2to3.pgen2 import token, tokenize
@@ -17,10 +18,10 @@ class Token:
     end: tokenize.Coord
 
 
-def get_tokens(text: str) -> list[Token]:
+def get_tokens(text: str) -> List[Token]:
     """Return the tokens produced by the tokenizer."""
     readline = io.StringIO(text).readline
-    tokens: list[Token] = []
+    tokens: List[Token] = []
 
     def tokeneater(
        type: int, string: str, start: tokenize.Coord, end: tokenize.Coord, line: str
@@ -31,7 +32,7 @@ def tokeneater(
     return tokens
 
 
-def assert_tokenizes(text: str, tokens: list[Token]) -> None:
+def assert_tokenizes(text: str, tokens: List[Token]) -> None:
     """Assert that the tokenizer produces the expected tokens."""
     actual_tokens = get_tokens(text)
     assert actual_tokens == tokens
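
[Editor's note: usage illustration, not part of the patch series.] The `__main__` block added in PATCH 1/4, and documented by the comment added in PATCH 3/4, turns tests/test_tokenize.py into a small generator for new test cases: `echo some code | python tests/test_tokenize.py` prints a ready-to-paste `assert_tokenizes(...)` call, formatted by Black itself and indented four spaces so it drops straight into a test function body. A sketch of the same pipeline run in-process; the import path `tests.test_tokenize` is an assumption that holds only when the repository root is on `sys.path`:

    import textwrap

    import black
    from tests.test_tokenize import get_tokens

    code = "x\n"
    # Build the assertion source, then let Black itself lay it out, exactly
    # as the __main__ block does after reading the code from stdin.
    text = f"assert_tokenizes({code!r}, {get_tokens(code)!r})"
    print(textwrap.indent(black.format_str(text, mode=black.Mode()), "    "))

For this input the generated call should list the NAME, NEWLINE, and ENDMARKER tokens with their (row, column) coordinates, ready to be checked by eye and pasted into a test.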