From b3bf0c0e58b90037bfcd380fb3bdb8c26d7cf660 Mon Sep 17 00:00:00 2001 From: Barry Scott Date: Sun, 12 Apr 2020 15:09:02 +0100 Subject: [PATCH 1/4] bpo-40260: Allow compile() to handle the module's source decoding by passing in an fp that is opened in "rb". This fixes problems seen on Windows where the source code is not decoded as utf-8 but as the code page default for the user, cp1252 for example. Add three new tests to test_modulefinder.py to excercise the decoding of source code. Always write the test source as bytes to avoid using locale.getpreferredencoding() as the encoding. test_coding_default_utf8 - designed to fail if decoded with cp1252 test_coding_explicit_utf8 - designed to fail if decoded with cp1252 test_coding_explicit_cp1252 - designed to fail if decoded with utf-8 --- Lib/modulefinder.py | 8 ++--- Lib/test/test_modulefinder.py | 65 ++++++++++++++++++++++++++++++++--- 2 files changed, 64 insertions(+), 9 deletions(-) diff --git a/Lib/modulefinder.py b/Lib/modulefinder.py index e0d29984f862cf..4c6fe900099a24 100644 --- a/Lib/modulefinder.py +++ b/Lib/modulefinder.py @@ -80,7 +80,7 @@ def _find_module(name, path=None): if isinstance(spec.loader, importlib.machinery.SourceFileLoader): kind = _PY_SOURCE - mode = "r" + mode = "rb" elif isinstance(spec.loader, importlib.machinery.ExtensionFileLoader): kind = _C_EXTENSION @@ -160,14 +160,14 @@ def msgout(self, *args): def run_script(self, pathname): self.msg(2, "run_script", pathname) - with open(pathname) as fp: + with open(pathname, "rb") as fp: stuff = ("", "r", _PY_SOURCE) self.load_module('__main__', fp, pathname, stuff) def load_file(self, pathname): dir, name = os.path.split(pathname) name, ext = os.path.splitext(name) - with open(pathname) as fp: + with open(pathname, "rb") as fp: stuff = (ext, "r", _PY_SOURCE) self.load_module(name, fp, pathname, stuff) @@ -340,7 +340,7 @@ def load_module(self, fqname, fp, pathname, file_info): self.msgout(2, "load_module ->", m) return m if type == _PY_SOURCE: - co = compile(fp.read()+'\n', pathname, 'exec') + co = compile(fp.read()+b'\n', pathname, 'exec') elif type == _PY_COMPILED: try: data = fp.read() diff --git a/Lib/test/test_modulefinder.py b/Lib/test/test_modulefinder.py index ebd96e1c8a2dd0..1aa45011df0bb4 100644 --- a/Lib/test/test_modulefinder.py +++ b/Lib/test/test_modulefinder.py @@ -40,7 +40,8 @@ from c import something b/__init__.py from sys import * -"""] +""", +] maybe_test_new = [ "a.module", @@ -245,6 +246,48 @@ def foo(): pass b/c.py """] +coding_default_utf8_test = [ + "a_utf8", + ["a_utf8", "b_utf8"], + [], [], + """\ +a_utf8.py + # use the default of utf8 + print('Unicode test A code point 2090 \u2090 that is not valid in cp1252') + import b_utf8 +b_utf8.py + # use the default of utf8 + print('Unicode test B code point 2090 \u2090 that is not valid in cp1252') +"""] + +coding_explicit_utf8_test = [ + "a_utf8", + ["a_utf8", "b_utf8"], + [], [], + """\ +a_utf8.py + # coding=utf8 + print('Unicode test A code point 2090 \u2090 that is not valid in cp1252') + import b_utf8 +b_utf8.py + # use the default of utf8 + print('Unicode test B code point 2090 \u2090 that is not valid in cp1252') +"""] + +coding_explicit_cp1252_test = [ + "a_cp1252", + ["a_cp1252", "b_utf8"], + [], [], + b"""\ +a_cp1252.py + # coding=cp1252 + # 0xe2 is not allowed in utf8 + print('CP1252 test P\xe2t\xe9') + import b_utf8 +b_utf8.py + # use the default of utf8 + print('Unicode test A code point 2090 \u2090 that is not valid in cp1252') +"""] def open_file(path): dirname = os.path.dirname(path) @@ -253,18 +296,22 @@ def open_file(path): except OSError as e: if e.errno != errno.EEXIST: raise - return open(path, "w") + return open(path, 'wb') def create_package(source): ofi = None try: for line in source.splitlines(): - if line.startswith(" ") or line.startswith("\t"): - ofi.write(line.strip() + "\n") + if type(line) != bytes: + line = line.encode('utf-8') + if line.startswith(b' ') or line.startswith(b'\t'): + ofi.write(line.strip() + b'\n') else: if ofi: ofi.close() + if type(line) == bytes: + line = line.decode('utf-8') ofi = open_file(os.path.join(TEST_DIR, line.strip())) finally: if ofi: @@ -337,7 +384,7 @@ def test_bytecode(self): source_path = base_path + importlib.machinery.SOURCE_SUFFIXES[0] bytecode_path = base_path + importlib.machinery.BYTECODE_SUFFIXES[0] with open_file(source_path) as file: - file.write('testing_modulefinder = True\n') + file.write('testing_modulefinder = True\n'.encode('utf-8')) py_compile.compile(source_path, cfile=bytecode_path) os.remove(source_path) self._do_test(bytecode_test) @@ -365,6 +412,14 @@ def test_extended_opargs(self): """ % list(range(2**16))] # 2**16 constants self._do_test(extended_opargs_test) + def test_coding_default_utf8(self): + self._do_test(coding_default_utf8_test) + + def test_coding_explicit_utf8(self): + self._do_test(coding_explicit_utf8_test) + + def test_coding_explicit_cp1252(self): + self._do_test(coding_explicit_cp1252_test) if __name__ == "__main__": unittest.main() From 7757c8a5c72ff1df16dda3182074523e551ce8bf Mon Sep 17 00:00:00 2001 From: Barry Scott Date: Mon, 13 Apr 2020 09:31:24 +0100 Subject: [PATCH 2/4] Add news. --- .../NEWS.d/next/Library/2020-04-12-21-18-56.bpo-40260.F6VWaE.rst | 1 + 1 file changed, 1 insertion(+) create mode 100644 Misc/NEWS.d/next/Library/2020-04-12-21-18-56.bpo-40260.F6VWaE.rst diff --git a/Misc/NEWS.d/next/Library/2020-04-12-21-18-56.bpo-40260.F6VWaE.rst b/Misc/NEWS.d/next/Library/2020-04-12-21-18-56.bpo-40260.F6VWaE.rst new file mode 100644 index 00000000000000..6cc323443a54a7 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2020-04-12-21-18-56.bpo-40260.F6VWaE.rst @@ -0,0 +1 @@ +fix modulefinder UnicodeDecodeError especially on Windows From 73dd742fde76ba764e981e6a431d23b7846af7e6 Mon Sep 17 00:00:00 2001 From: Barry Scott Date: Mon, 13 Apr 2020 16:46:12 +0100 Subject: [PATCH 3/4] Use io.open_code() that is intended for open python code. Remove the mode in file_info that is usused. --- Lib/modulefinder.py | 28 +++++++++++++--------------- 1 file changed, 13 insertions(+), 15 deletions(-) diff --git a/Lib/modulefinder.py b/Lib/modulefinder.py index 4c6fe900099a24..84ddbdb6ee3df2 100644 --- a/Lib/modulefinder.py +++ b/Lib/modulefinder.py @@ -5,6 +5,7 @@ import importlib.machinery import marshal import os +import io import sys import types import warnings @@ -68,35 +69,32 @@ def _find_module(name, path=None): # Some special cases: if spec.loader is importlib.machinery.BuiltinImporter: - return None, None, ("", "", _C_BUILTIN) + return None, None, ("", _C_BUILTIN) if spec.loader is importlib.machinery.FrozenImporter: - return None, None, ("", "", _PY_FROZEN) + return None, None, ("", _PY_FROZEN) file_path = spec.origin if spec.loader.is_package(name): - return None, os.path.dirname(file_path), ("", "", _PKG_DIRECTORY) + return None, os.path.dirname(file_path), ("", _PKG_DIRECTORY) if isinstance(spec.loader, importlib.machinery.SourceFileLoader): kind = _PY_SOURCE - mode = "rb" elif isinstance(spec.loader, importlib.machinery.ExtensionFileLoader): kind = _C_EXTENSION - mode = "rb" elif isinstance(spec.loader, importlib.machinery.SourcelessFileLoader): kind = _PY_COMPILED - mode = "rb" else: # Should never happen. - return None, None, ("", "", _SEARCH_ERROR) + return None, None, ("", _SEARCH_ERROR) - file = open(file_path, mode) + file = io.open_code(file_path) suffix = os.path.splitext(file_path)[-1] - return file, file_path, (suffix, mode, kind) + return file, file_path, (suffix, kind) class Module: @@ -160,15 +158,15 @@ def msgout(self, *args): def run_script(self, pathname): self.msg(2, "run_script", pathname) - with open(pathname, "rb") as fp: - stuff = ("", "r", _PY_SOURCE) + with io.open_code(pathname) as fp: + stuff = ("", _PY_SOURCE) self.load_module('__main__', fp, pathname, stuff) def load_file(self, pathname): dir, name = os.path.split(pathname) name, ext = os.path.splitext(name) - with open(pathname, "rb") as fp: - stuff = (ext, "r", _PY_SOURCE) + with io.open_code(pathname) as fp: + stuff = (ext, _PY_SOURCE) self.load_module(name, fp, pathname, stuff) def import_hook(self, name, caller=None, fromlist=None, level=-1): @@ -333,7 +331,7 @@ def import_module(self, partname, fqname, parent): return m def load_module(self, fqname, fp, pathname, file_info): - suffix, mode, type = file_info + suffix, type = file_info self.msgin(2, "load_module", fqname, fp and "fp", pathname) if type == _PKG_DIRECTORY: m = self.load_package(fqname, pathname) @@ -504,7 +502,7 @@ def find_module(self, name, path, parent=None): if path is None: if name in sys.builtin_module_names: - return (None, None, ("", "", _C_BUILTIN)) + return (None, None, ("", _C_BUILTIN)) path = self.path From db5bc2f0e730eb7028ce64d5f7a9d87fe5bb967b Mon Sep 17 00:00:00 2001 From: Steve Dower Date: Tue, 14 Apr 2020 19:33:45 +0100 Subject: [PATCH 4/4] Update NEWS --- .../next/Library/2020-04-12-21-18-56.bpo-40260.F6VWaE.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Misc/NEWS.d/next/Library/2020-04-12-21-18-56.bpo-40260.F6VWaE.rst b/Misc/NEWS.d/next/Library/2020-04-12-21-18-56.bpo-40260.F6VWaE.rst index 6cc323443a54a7..decc073bf4d611 100644 --- a/Misc/NEWS.d/next/Library/2020-04-12-21-18-56.bpo-40260.F6VWaE.rst +++ b/Misc/NEWS.d/next/Library/2020-04-12-21-18-56.bpo-40260.F6VWaE.rst @@ -1 +1 @@ -fix modulefinder UnicodeDecodeError especially on Windows +Ensure :mod:`modulefinder` uses :func:`io.open_code` and respects coding comments.