From 9e22e8877b70c1c336ed23aad0a5241e71c1d9cf Mon Sep 17 00:00:00 2001 From: Inada Naoki Date: Fri, 12 Apr 2024 17:37:50 +0900 Subject: [PATCH 1/4] site: try utf-8 and fallback to locale encoding --- Lib/site.py | 59 +++++++++++-------- ...4-04-12-17-37-11.gh-issue-77102.Mk6X_E.rst | 3 + 2 files changed, 36 insertions(+), 26 deletions(-) create mode 100644 Misc/NEWS.d/next/Library/2024-04-12-17-37-11.gh-issue-77102.Mk6X_E.rst diff --git a/Lib/site.py b/Lib/site.py index 162bbec4f8f41b..ebeaf7419ee4a5 100644 --- a/Lib/site.py +++ b/Lib/site.py @@ -179,35 +179,42 @@ def addpackage(sitedir, name, known_paths): return _trace(f"Processing .pth file: {fullname!r}") try: - # locale encoding is not ideal especially on Windows. But we have used - # it for a long time. setuptools uses the locale encoding too. - f = io.TextIOWrapper(io.open_code(fullname), encoding="locale") + with io.open_code(fullname) as f: + pth_content = f.read() except OSError: return - with f: - for n, line in enumerate(f): - if line.startswith("#"): - continue - if line.strip() == "": + + try: + pth_content = pth_content.decode() + except UnicodeDecodeError: + # Fallback to locale encoding for backward compatibility. + # We will deprecate this fallback in the future. 
+ import locale + pth_content = pth_content.decode(locale.getencoding()) + + for n, line in enumerate(pth_content.splitlines(), 1): + if line.startswith("#"): + continue + if line.strip() == "": + continue + try: + if line.startswith(("import ", "import\t")): + exec(line) continue - try: - if line.startswith(("import ", "import\t")): - exec(line) - continue - line = line.rstrip() - dir, dircase = makepath(sitedir, line) - if not dircase in known_paths and os.path.exists(dir): - sys.path.append(dir) - known_paths.add(dircase) - except Exception as exc: - print("Error processing line {:d} of {}:\n".format(n+1, fullname), - file=sys.stderr) - import traceback - for record in traceback.format_exception(exc): - for line in record.splitlines(): - print(' '+line, file=sys.stderr) - print("\nRemainder of file ignored", file=sys.stderr) - break + line = line.rstrip() + dir, dircase = makepath(sitedir, line) + if dircase not in known_paths and os.path.exists(dir): + sys.path.append(dir) + known_paths.add(dircase) + except Exception as exc: + print(f"Error processing line {n:d} of {fullname}:\n", + file=sys.stderr) + import traceback + for record in traceback.format_exception(exc): + for line in record.splitlines(): + print(' '+line, file=sys.stderr) + print("\nRemainder of file ignored", file=sys.stderr) + break if reset: known_paths = None return known_paths diff --git a/Misc/NEWS.d/next/Library/2024-04-12-17-37-11.gh-issue-77102.Mk6X_E.rst b/Misc/NEWS.d/next/Library/2024-04-12-17-37-11.gh-issue-77102.Mk6X_E.rst new file mode 100644 index 00000000000000..6f91251126dc7b --- /dev/null +++ b/Misc/NEWS.d/next/Library/2024-04-12-17-37-11.gh-issue-77102.Mk6X_E.rst @@ -0,0 +1,3 @@ +The :mod:`site` module now decodes ``.pth`` files as UTF-8 first, and falls +back to the :term:`locale encoding` if that raises ``UnicodeDecodeError``. +Previously, only the locale encoding was used. 
From 4a09f06af2ef93e0f194a1b01e1da363ae791967 Mon Sep 17 00:00:00 2001 From: Inada Naoki Date: Mon, 15 Apr 2024 22:15:17 +0900 Subject: [PATCH 2/4] add trace for fallback encoding use --- Lib/site.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Lib/site.py b/Lib/site.py index ebeaf7419ee4a5..93af9c453ac7bb 100644 --- a/Lib/site.py +++ b/Lib/site.py @@ -191,6 +191,8 @@ def addpackage(sitedir, name, known_paths): # We will deprecate this fallback in the future. import locale pth_content = pth_content.decode(locale.getencoding()) + _trace(f"Cannot read {fullname!r} as UTF-8. " + f"Using fallback encoding {locale.getencoding()!r}") for n, line in enumerate(pth_content.splitlines(), 1): if line.startswith("#"): From 72985c124bbe5009ccace90691becffeb22d0165 Mon Sep 17 00:00:00 2001 From: Inada Naoki Date: Mon, 15 Apr 2024 22:24:49 +0900 Subject: [PATCH 3/4] update docs --- Doc/library/site.rst | 4 ++++ Doc/whatsnew/3.13.rst | 7 +++++++ 2 files changed, 11 insertions(+) diff --git a/Doc/library/site.rst b/Doc/library/site.rst index 2dc9fb09d727e2..df02562876dc5e 100644 --- a/Doc/library/site.rst +++ b/Doc/library/site.rst @@ -74,6 +74,10 @@ with ``import`` (followed by space or tab) are executed. Limiting a code chunk to a single line is a deliberate measure to discourage putting anything more complex here. +.. versionchangedd:: 3.13 + The :file:`.pth` files are now decoded by UTF-8 at first and then by the + :term:`locale encoding` if it fails. + .. index:: single: package triple: path; configuration; file diff --git a/Doc/whatsnew/3.13.rst b/Doc/whatsnew/3.13.rst index 65985ddc65a86f..06e7b026e2b5af 100644 --- a/Doc/whatsnew/3.13.rst +++ b/Doc/whatsnew/3.13.rst @@ -634,6 +634,13 @@ re * Rename :exc:`!re.error` to :exc:`re.PatternError` for improved clarity. :exc:`!re.error` is kept for backward compatibility. +site +---- + +* :file:`.pth` files are now decoded by UTF-8 first, and then by the + :term:`locale encoding` if the UTF-8 decoding fails. 
+ (Contributed by Inada Naoki in :gh:`117802`.) + sqlite3 ------- From 88f857b6be7d24898ed5cf5ceb9e02ef934343df Mon Sep 17 00:00:00 2001 From: Inada Naoki Date: Mon, 15 Apr 2024 23:11:19 +0900 Subject: [PATCH 4/4] fix typo --- Doc/library/site.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Doc/library/site.rst b/Doc/library/site.rst index df02562876dc5e..e52bbd32d4d493 100644 --- a/Doc/library/site.rst +++ b/Doc/library/site.rst @@ -74,7 +74,7 @@ with ``import`` (followed by space or tab) are executed. Limiting a code chunk to a single line is a deliberate measure to discourage putting anything more complex here. -.. versionchangedd:: 3.13 +.. versionchanged:: 3.13 The :file:`.pth` files are now decoded by UTF-8 at first and then by the :term:`locale encoding` if it fails.