From f16cff489997535b417c1f957f99702e0bff9311 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Sun, 14 May 2017 15:20:34 +0300 Subject: [PATCH 1/2] bpo-30363: Backport warnings in the re module. Running Python with the -3 option now warns about regular expression syntax that is invalid or has different semantic in Python 3 or will change the behavior in future Python versions. --- Lib/_strptime.py | 4 +- Lib/sre_compile.py | 2 +- Lib/sre_parse.py | 34 +++++++++++++-- Lib/test/test_re.py | 100 +++++++++++++++++++++++++++++++++++++------- Misc/NEWS | 4 ++ Modules/_sre.c | 14 +++++++ 6 files changed, 137 insertions(+), 21 deletions(-) diff --git a/Lib/_strptime.py b/Lib/_strptime.py index feac05a001c6a9..8eb2718d5ca9be 100644 --- a/Lib/_strptime.py +++ b/Lib/_strptime.py @@ -254,8 +254,8 @@ def pattern(self, format): # format directives (%m, etc.). regex_chars = re_compile(r"([\\.^$*+?\(\){}\[\]|])") format = regex_chars.sub(r"\\\1", format) - whitespace_replacement = re_compile('\s+') - format = whitespace_replacement.sub('\s+', format) + whitespace_replacement = re_compile(r'\s+') + format = whitespace_replacement.sub(r'\\s+', format) while '%' in format: directive_index = format.index('%')+1 processed_format = "%s%s%s" % (processed_format, diff --git a/Lib/sre_compile.py b/Lib/sre_compile.py index c5a7e89d079695..b6689fa7a77b55 100644 --- a/Lib/sre_compile.py +++ b/Lib/sre_compile.py @@ -435,7 +435,7 @@ def _compile_info(code, pattern, flags): # this contains min/max pattern width, and an optional literal # prefix or a character map lo, hi = pattern.getwidth() - if lo == 0: + if not lo and hi: return # not worth it # look for a literal prefix prefix = [] diff --git a/Lib/sre_parse.py b/Lib/sre_parse.py index 75f488b5475093..0e4699dd782e2e 100644 --- a/Lib/sre_parse.py +++ b/Lib/sre_parse.py @@ -23,6 +23,7 @@ OCTDIGITS = set("01234567") HEXDIGITS = set("0123456789abcdefABCDEF") +ASCIILETTERS = set("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ") 
WHITESPACE = set(" \t\n\r\v\f") @@ -233,7 +234,7 @@ def isname(name): return False return True -def _class_escape(source, escape): +def _class_escape(source, escape, state): # handle escape code inside character class code = ESCAPES.get(escape) if code: @@ -260,6 +261,14 @@ def _class_escape(source, escape): elif c in DIGITS: raise error, "bogus escape: %s" % repr(escape) if len(escape) == 2: + if sys.py3kwarning and c in ASCIILETTERS: + import warnings + if c in 'Uu' and state.flags & SRE_FLAG_UNICODE: + warnings.warn('unicode escape %s' % escape, + FutureWarning, stacklevel=8) + else: + warnings.warnpy3k('bad escape %s' % escape, + DeprecationWarning, stacklevel=8) return LITERAL, ord(escape[1]) except ValueError: pass @@ -309,6 +318,14 @@ def _escape(source, escape, state): return GROUPREF, group raise ValueError if len(escape) == 2: + if sys.py3kwarning and c in ASCIILETTERS: + import warnings + if c in 'Uu' and state.flags & SRE_FLAG_UNICODE: + warnings.warn('unicode escape %s' % escape, + FutureWarning, stacklevel=8) + else: + warnings.warnpy3k('bad escape %s' % escape, + DeprecationWarning, stacklevel=8) return LITERAL, ord(escape[1]) except ValueError: pass @@ -443,7 +460,7 @@ def _parse(source, state): if this == "]" and set != start: break elif this and this[0] == "\\": - code1 = _class_escape(source, this) + code1 = _class_escape(source, this, state) elif this: code1 = LITERAL, ord(this) else: @@ -459,7 +476,7 @@ def _parse(source, state): break elif this: if this[0] == "\\": - code2 = _class_escape(source, this) + code2 = _class_escape(source, this, state) else: code2 = LITERAL, ord(this) if code1[0] != LITERAL or code2[0] != LITERAL: @@ -714,6 +731,12 @@ def parse(str, flags=0, pattern=None): pattern.str = str p = _parse_sub(source, pattern, 0) + if (sys.py3kwarning and + (p.pattern.flags & SRE_FLAG_LOCALE) and + (p.pattern.flags & SRE_FLAG_UNICODE)): + import warnings + warnings.warnpy3k("LOCALE and UNICODE flags are incompatible", + 
DeprecationWarning, stacklevel=5) tail = source.get() if tail == ")": @@ -801,7 +824,10 @@ def literal(literal, p=p, pappend=a): try: this = makechar(ESCAPES[this][1]) except KeyError: - pass + if sys.py3kwarning and c in ASCIILETTERS: + import warnings + warnings.warnpy3k('bad escape %s' % this, + DeprecationWarning, stacklevel=4) literal(this) else: literal(this) diff --git a/Lib/test/test_re.py b/Lib/test/test_re.py index 5725a99ad6d1ef..3d892100b7cce2 100644 --- a/Lib/test/test_re.py +++ b/Lib/test/test_re.py @@ -3,7 +3,7 @@ verbose, run_unittest, import_module, precisionbigmemtest, _2G, cpython_only, captured_stdout, have_unicode, requires_unicode, u, - check_warnings) + check_warnings, check_py3k_warnings) import locale import re from re import Scanner @@ -66,11 +66,13 @@ def test_basic_re_sub(self): self.assertEqual(re.sub('(?Px)', '\g\g', 'xx'), 'xxxx') self.assertEqual(re.sub('(?Px)', '\g<1>\g<1>', 'xx'), 'xxxx') - self.assertEqual(re.sub('a',r'\t\n\v\r\f\a\b\B\Z\a\A\w\W\s\S\d\D','a'), - '\t\n\v\r\f\a\b\\B\\Z\a\\A\\w\\W\\s\\S\\d\\D') - self.assertEqual(re.sub('a', '\t\n\v\r\f\a', 'a'), '\t\n\v\r\f\a') - self.assertEqual(re.sub('a', '\t\n\v\r\f\a', 'a'), - (chr(9)+chr(10)+chr(11)+chr(13)+chr(12)+chr(7))) + self.assertEqual(re.sub('a',r'\t\n\v\r\f\a\b','a'), '\t\n\v\r\f\a\b') + self.assertEqual(re.sub('a', '\t\n\v\r\f\a\b', 'a'), '\t\n\v\r\f\a\b') + self.assertEqual(re.sub('a', '\t\n\v\r\f\a\b', 'a'), + (chr(9)+chr(10)+chr(11)+chr(13)+chr(12)+chr(7)+chr(8))) + for c in 'cdehijklmopqsuwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ': + with check_py3k_warnings(): + self.assertEqual(re.sub('a', '\\' + c, 'a'), '\\' + c) self.assertEqual(re.sub('^\s*', 'X', 'test'), 'Xtest') @@ -223,11 +225,11 @@ def test_re_subn(self): def test_re_split(self): self.assertEqual(re.split(":", ":a:b::c"), ['', 'a', 'b', '', 'c']) - self.assertEqual(re.split(":*", ":a:b::c"), ['', 'a', 'b', 'c']) - self.assertEqual(re.split("(:*)", ":a:b::c"), + self.assertEqual(re.split(":+", ":a:b::c"), ['', 
'a', 'b', 'c']) + self.assertEqual(re.split("(:+)", ":a:b::c"), ['', ':', 'a', ':', 'b', '::', 'c']) - self.assertEqual(re.split("(?::*)", ":a:b::c"), ['', 'a', 'b', 'c']) - self.assertEqual(re.split("(:)*", ":a:b::c"), + self.assertEqual(re.split("(?::+)", ":a:b::c"), ['', 'a', 'b', 'c']) + self.assertEqual(re.split("(:)+", ":a:b::c"), ['', ':', 'a', ':', 'b', ':', 'c']) self.assertEqual(re.split("([b:]+)", ":a:b::c"), ['', ':', 'a', ':b::', 'c']) @@ -237,13 +239,34 @@ def test_re_split(self): self.assertEqual(re.split("(?:b)|(?::+)", ":a:b::c"), ['', 'a', '', '', 'c']) + for sep, expected in [ + (':*', ['', 'a', 'b', 'c']), + ('(?::*)', ['', 'a', 'b', 'c']), + ('(:*)', ['', ':', 'a', ':', 'b', '::', 'c']), + ('(:)*', ['', ':', 'a', ':', 'b', ':', 'c']), + ]: + with check_py3k_warnings(('', FutureWarning)): + self.assertEqual(re.split(sep, ':a:b::c'), expected) + + for sep, expected in [ + ('', [':a:b::c']), + (r'\b', [':a:b::c']), + (r'(?=:)', [':a:b::c']), + (r'(?<=:)', [':a:b::c']), + ]: + with check_py3k_warnings(): + self.assertEqual(re.split(sep, ':a:b::c'), expected) + def test_qualified_re_split(self): self.assertEqual(re.split(":", ":a:b::c", 2), ['', 'a', 'b::c']) self.assertEqual(re.split(':', 'a:b:c:d', 2), ['a', 'b', 'c:d']) self.assertEqual(re.split("(:)", ":a:b::c", 2), ['', ':', 'a', ':', 'b::c']) - self.assertEqual(re.split("(:*)", ":a:b::c", 2), + self.assertEqual(re.split("(:+)", ":a:b::c", 2), ['', ':', 'a', ':', 'b::c']) + with check_py3k_warnings(('', FutureWarning)): + self.assertEqual(re.split("(:*)", ":a:b::c", maxsplit=2), + ['', ':', 'a', ':', 'b::c']) def test_re_findall(self): self.assertEqual(re.findall(":+", "abc"), []) @@ -404,6 +427,37 @@ def test_special_escapes(self): self.assertEqual(re.search(r"\d\D\w\W\s\S", "1aa! a", re.UNICODE).group(0), "1aa! 
a") + def test_other_escapes(self): + self.assertRaises(re.error, re.compile, "\\") + self.assertEqual(re.match(r"\(", '(').group(), '(') + self.assertIsNone(re.match(r"\(", ')')) + self.assertEqual(re.match(r"\\", '\\').group(), '\\') + self.assertEqual(re.match(r"[\]]", ']').group(), ']') + self.assertIsNone(re.match(r"[\]]", '[')) + self.assertEqual(re.match(r"[a\-c]", '-').group(), '-') + self.assertIsNone(re.match(r"[a\-c]", 'b')) + self.assertEqual(re.match(r"[\^a]+", 'a^').group(), 'a^') + self.assertIsNone(re.match(r"[\^a]+", 'b')) + re.purge() # for warnings + for c in 'ceghijklmopquyzCEFGHIJKLMNOPQRTUVXY': + with check_py3k_warnings(): + self.assertEqual(re.match('\\%c$' % c, c).group(), c) + self.assertIsNone(re.match('\\%c' % c, 'a')) + if have_unicode: + warn = FutureWarning if c in 'Uu' else DeprecationWarning + with check_py3k_warnings(('', warn)): + self.assertEqual(re.match('\\%c$' % c, c, re.UNICODE).group(), c) + self.assertIsNone(re.match('\\%c' % c, 'a', re.UNICODE)) + for c in 'ceghijklmopquyzABCEFGHIJKLMNOPQRTUVXYZ': + with check_py3k_warnings(): + self.assertEqual(re.match('[\\%c]$' % c, c).group(), c) + self.assertIsNone(re.match('[\\%c]' % c, 'a')) + if have_unicode: + warn = FutureWarning if c in 'Uu' else DeprecationWarning + with check_py3k_warnings(('', warn)): + self.assertEqual(re.match('[\\%c]$' % c, c, re.UNICODE).group(), c) + self.assertIsNone(re.match('[\\%c]' % c, 'a', re.UNICODE)) + def test_string_boundaries(self): # See http://bugs.python.org/issue10713 self.assertEqual(re.search(r"\b(abc)\b", "abc").group(1), @@ -931,6 +985,19 @@ def test_inline_flags(self): self.assertTrue(re.match('(?ixu) ' + upper_char, lower_char)) self.assertTrue(re.match('(?ixu) ' + lower_char, upper_char)) + # Incompatibilities + re.purge() + with check_py3k_warnings(): + re.compile('', re.LOCALE|re.UNICODE) + with check_py3k_warnings(): + re.compile('(?L)', re.UNICODE) + with check_py3k_warnings(): + re.compile('(?u)', re.LOCALE) + with 
check_py3k_warnings(): + re.compile('(?Lu)') + with check_py3k_warnings(): + re.compile('(?uL)') + def test_dollar_matches_twice(self): "$ matches the end of string, and just before the terminating \n" pattern = re.compile('$') @@ -967,8 +1034,9 @@ def test_compile(self): def test_bug_13899(self): # Issue #13899: re pattern r"[\A]" should work like "A" but matches # nothing. Ditto B and Z. - self.assertEqual(re.findall(r'[\A\B\b\C\Z]', 'AB\bCZ'), - ['A', 'B', '\b', 'C', 'Z']) + with check_py3k_warnings(): + self.assertEqual(re.findall(r'[\A\B\b\C\Z]', 'AB\bCZ'), + ['A', 'B', '\b', 'C', 'Z']) @precisionbigmemtest(size=_2G, memuse=1) def test_large_search(self, size): @@ -1261,7 +1329,11 @@ def run_re_tests(): def test_main(): run_unittest(ReTests) - run_re_tests() + deprecations = [ + ('bad escape', DeprecationWarning), + ] + with check_py3k_warnings(*deprecations): + run_re_tests() if __name__ == "__main__": test_main() diff --git a/Misc/NEWS b/Misc/NEWS index 4c9f2d375e8911..6538ea53a90465 100644 --- a/Misc/NEWS +++ b/Misc/NEWS @@ -42,6 +42,10 @@ Extension Modules Library ------- +- bpo-30363: Running Python with the -3 option now warns about regular + expression syntax that is invalid or has different semantics in Python 3 + or will change the behavior in future Python versions. + - bpo-30342: Fix sysconfig.is_python_build() if Python is built with Visual Studio 2008 (VS 9.0). 
diff --git a/Modules/_sre.c b/Modules/_sre.c index 8e16c1d140adfa..6fd3affb09ab43 100644 --- a/Modules/_sre.c +++ b/Modules/_sre.c @@ -2267,6 +2267,20 @@ pattern_split(PatternObject* self, PyObject* args, PyObject* kw) if (!string) return NULL; + if (Py_Py3kWarningFlag && + (self->code[0] != SRE_OP_INFO || self->code[3] == 0)) + { + if (self->code[0] == SRE_OP_INFO && self->code[4] == 0) { + if (PyErr_WarnPy3k("split() requires a non-empty pattern match.", + 1) < 0) + return NULL; + } + else if (PyErr_WarnEx(PyExc_FutureWarning, + "split() requires a non-empty pattern match.", + 1) < 0) + return NULL; + } + string = state_init(&state, self, string, 0, PY_SSIZE_T_MAX); if (!string) return NULL; From 427b09d808e00eddd60e56fa53068912c471f7c5 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Thu, 18 May 2017 10:24:27 +0300 Subject: [PATCH 2/2] Address review comments. --- Lib/sre_parse.py | 16 +++++++++------- Lib/test/test_re.py | 18 +++++------------- 2 files changed, 14 insertions(+), 20 deletions(-) diff --git a/Lib/sre_parse.py b/Lib/sre_parse.py index 0e4699dd782e2e..e0d003ed85bda1 100644 --- a/Lib/sre_parse.py +++ b/Lib/sre_parse.py @@ -234,7 +234,7 @@ def isname(name): return False return True -def _class_escape(source, escape, state): +def _class_escape(source, escape): # handle escape code inside character class code = ESCAPES.get(escape) if code: @@ -263,8 +263,9 @@ def _class_escape(source, escape, state): if len(escape) == 2: if sys.py3kwarning and c in ASCIILETTERS: import warnings - if c in 'Uu' and state.flags & SRE_FLAG_UNICODE: - warnings.warn('unicode escape %s' % escape, + if c in 'Uu': + warnings.warn('bad escape %s; Unicode escapes are ' + 'supported only since Python 3.3' % escape, FutureWarning, stacklevel=8) else: warnings.warnpy3k('bad escape %s' % escape, @@ -320,8 +321,9 @@ def _escape(source, escape, state): if len(escape) == 2: if sys.py3kwarning and c in ASCIILETTERS: import warnings - if c in 'Uu' and state.flags & SRE_FLAG_UNICODE: 
- warnings.warn('unicode escape %s' % escape, + if c in 'Uu': + warnings.warn('bad escape %s; Unicode escapes are ' + 'supported only since Python 3.3' % escape, FutureWarning, stacklevel=8) else: warnings.warnpy3k('bad escape %s' % escape, @@ -460,7 +462,7 @@ def _parse(source, state): if this == "]" and set != start: break elif this and this[0] == "\\": - code1 = _class_escape(source, this, state) + code1 = _class_escape(source, this) elif this: code1 = LITERAL, ord(this) else: @@ -476,7 +478,7 @@ def _parse(source, state): break elif this: if this[0] == "\\": - code2 = _class_escape(source, this, state) + code2 = _class_escape(source, this) else: code2 = LITERAL, ord(this) if code1[0] != LITERAL or code2[0] != LITERAL: diff --git a/Lib/test/test_re.py b/Lib/test/test_re.py index 3d892100b7cce2..174c5ca462cdca 100644 --- a/Lib/test/test_re.py +++ b/Lib/test/test_re.py @@ -66,7 +66,7 @@ def test_basic_re_sub(self): self.assertEqual(re.sub('(?Px)', '\g\g', 'xx'), 'xxxx') self.assertEqual(re.sub('(?Px)', '\g<1>\g<1>', 'xx'), 'xxxx') - self.assertEqual(re.sub('a',r'\t\n\v\r\f\a\b','a'), '\t\n\v\r\f\a\b') + self.assertEqual(re.sub('a', r'\t\n\v\r\f\a\b', 'a'), '\t\n\v\r\f\a\b') self.assertEqual(re.sub('a', '\t\n\v\r\f\a\b', 'a'), '\t\n\v\r\f\a\b') self.assertEqual(re.sub('a', '\t\n\v\r\f\a\b', 'a'), (chr(9)+chr(10)+chr(11)+chr(13)+chr(12)+chr(7)+chr(8))) @@ -440,23 +440,15 @@ def test_other_escapes(self): self.assertIsNone(re.match(r"[\^a]+", 'b')) re.purge() # for warnings for c in 'ceghijklmopquyzCEFGHIJKLMNOPQRTUVXY': - with check_py3k_warnings(): + warn = FutureWarning if c in 'Uu' else DeprecationWarning + with check_py3k_warnings(('', warn)): self.assertEqual(re.match('\\%c$' % c, c).group(), c) self.assertIsNone(re.match('\\%c' % c, 'a')) - if have_unicode: - warn = FutureWarning if c in 'Uu' else DeprecationWarning - with check_py3k_warnings(('', warn)): - self.assertEqual(re.match('\\%c$' % c, c, re.UNICODE).group(), c) - self.assertIsNone(re.match('\\%c' % 
c, 'a', re.UNICODE)) for c in 'ceghijklmopquyzABCEFGHIJKLMNOPQRTUVXYZ': - with check_py3k_warnings(): + warn = FutureWarning if c in 'Uu' else DeprecationWarning + with check_py3k_warnings(('', warn)): self.assertEqual(re.match('[\\%c]$' % c, c).group(), c) self.assertIsNone(re.match('[\\%c]' % c, 'a')) - if have_unicode: - warn = FutureWarning if c in 'Uu' else DeprecationWarning - with check_py3k_warnings(('', warn)): - self.assertEqual(re.match('[\\%c]$' % c, c, re.UNICODE).group(), c) - self.assertIsNone(re.match('[\\%c]' % c, 'a', re.UNICODE)) def test_string_boundaries(self): # See http://bugs.python.org/issue10713