From f16cff489997535b417c1f957f99702e0bff9311 Mon Sep 17 00:00:00 2001
From: Serhiy Storchaka <storchaka@gmail.com>
Date: Sun, 14 May 2017 15:20:34 +0300
Subject: [PATCH 1/2] bpo-30363: Backport warnings in the re module.

Running Python with the -3 option now warns about regular expression
syntax that is invalid or has different semantics in Python 3,
or whose behavior will change in future Python versions.
---
 Lib/_strptime.py    |   4 +-
 Lib/sre_compile.py  |   2 +-
 Lib/sre_parse.py    |  34 +++++++++++++--
 Lib/test/test_re.py | 100 +++++++++++++++++++++++++++++++++++++-------
 Misc/NEWS           |   4 ++
 Modules/_sre.c      |  14 +++++++
 6 files changed, 137 insertions(+), 21 deletions(-)

diff --git a/Lib/_strptime.py b/Lib/_strptime.py
index feac05a001c6a9..8eb2718d5ca9be 100644
--- a/Lib/_strptime.py
+++ b/Lib/_strptime.py
@@ -254,8 +254,8 @@ def pattern(self, format):
         # format directives (%m, etc.).
         regex_chars = re_compile(r"([\\.^$*+?\(\){}\[\]|])")
         format = regex_chars.sub(r"\\\1", format)
-        whitespace_replacement = re_compile('\s+')
-        format = whitespace_replacement.sub('\s+', format)
+        whitespace_replacement = re_compile(r'\s+')
+        format = whitespace_replacement.sub(r'\\s+', format)
         while '%' in format:
             directive_index = format.index('%')+1
             processed_format = "%s%s%s" % (processed_format,
diff --git a/Lib/sre_compile.py b/Lib/sre_compile.py
index c5a7e89d079695..b6689fa7a77b55 100644
--- a/Lib/sre_compile.py
+++ b/Lib/sre_compile.py
@@ -435,7 +435,7 @@ def _compile_info(code, pattern, flags):
     # this contains min/max pattern width, and an optional literal
     # prefix or a character map
     lo, hi = pattern.getwidth()
-    if lo == 0:
+    if not lo and hi:
         return # not worth it
     # look for a literal prefix
     prefix = []
diff --git a/Lib/sre_parse.py b/Lib/sre_parse.py
index 75f488b5475093..0e4699dd782e2e 100644
--- a/Lib/sre_parse.py
+++ b/Lib/sre_parse.py
@@ -23,6 +23,7 @@
 
 OCTDIGITS = set("01234567")
 HEXDIGITS = set("0123456789abcdefABCDEF")
+ASCIILETTERS = set("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ")
 
 WHITESPACE = set(" \t\n\r\v\f")
 
@@ -233,7 +234,7 @@ def isname(name):
             return False
     return True
 
-def _class_escape(source, escape):
+def _class_escape(source, escape, state):
     # handle escape code inside character class
     code = ESCAPES.get(escape)
     if code:
@@ -260,6 +261,14 @@ def _class_escape(source, escape):
         elif c in DIGITS:
             raise error, "bogus escape: %s" % repr(escape)
         if len(escape) == 2:
+            if sys.py3kwarning and c in ASCIILETTERS:
+                import warnings
+                if c in 'Uu' and state.flags & SRE_FLAG_UNICODE:
+                    warnings.warn('unicode escape %s' % escape,
+                                  FutureWarning, stacklevel=8)
+                else:
+                    warnings.warnpy3k('bad escape %s' % escape,
+                                      DeprecationWarning, stacklevel=8)
             return LITERAL, ord(escape[1])
     except ValueError:
         pass
@@ -309,6 +318,14 @@ def _escape(source, escape, state):
                 return GROUPREF, group
             raise ValueError
         if len(escape) == 2:
+            if sys.py3kwarning and c in ASCIILETTERS:
+                import warnings
+                if c in 'Uu' and state.flags & SRE_FLAG_UNICODE:
+                    warnings.warn('unicode escape %s' % escape,
+                                  FutureWarning, stacklevel=8)
+                else:
+                    warnings.warnpy3k('bad escape %s' % escape,
+                                      DeprecationWarning, stacklevel=8)
             return LITERAL, ord(escape[1])
     except ValueError:
         pass
@@ -443,7 +460,7 @@ def _parse(source, state):
                 if this == "]" and set != start:
                     break
                 elif this and this[0] == "\\":
-                    code1 = _class_escape(source, this)
+                    code1 = _class_escape(source, this, state)
                 elif this:
                     code1 = LITERAL, ord(this)
                 else:
@@ -459,7 +476,7 @@ def _parse(source, state):
                         break
                     elif this:
                         if this[0] == "\\":
-                            code2 = _class_escape(source, this)
+                            code2 = _class_escape(source, this, state)
                         else:
                             code2 = LITERAL, ord(this)
                         if code1[0] != LITERAL or code2[0] != LITERAL:
@@ -714,6 +731,12 @@ def parse(str, flags=0, pattern=None):
     pattern.str = str
 
     p = _parse_sub(source, pattern, 0)
+    if (sys.py3kwarning and
+        (p.pattern.flags & SRE_FLAG_LOCALE) and
+        (p.pattern.flags & SRE_FLAG_UNICODE)):
+        import warnings
+        warnings.warnpy3k("LOCALE and UNICODE flags are incompatible",
+                          DeprecationWarning, stacklevel=5)
 
     tail = source.get()
     if tail == ")":
@@ -801,7 +824,10 @@ def literal(literal, p=p, pappend=a):
                 try:
                     this = makechar(ESCAPES[this][1])
                 except KeyError:
-                    pass
+                    if sys.py3kwarning and c in ASCIILETTERS:
+                        import warnings
+                        warnings.warnpy3k('bad escape %s' % this,
+                                          DeprecationWarning, stacklevel=4)
                 literal(this)
         else:
             literal(this)
diff --git a/Lib/test/test_re.py b/Lib/test/test_re.py
index 5725a99ad6d1ef..3d892100b7cce2 100644
--- a/Lib/test/test_re.py
+++ b/Lib/test/test_re.py
@@ -3,7 +3,7 @@
     verbose, run_unittest, import_module,
     precisionbigmemtest, _2G, cpython_only,
     captured_stdout, have_unicode, requires_unicode, u,
-    check_warnings)
+    check_warnings, check_py3k_warnings)
 import locale
 import re
 from re import Scanner
@@ -66,11 +66,13 @@ def test_basic_re_sub(self):
         self.assertEqual(re.sub('(?P<unk>x)', '\g<unk>\g<unk>', 'xx'), 'xxxx')
         self.assertEqual(re.sub('(?P<unk>x)', '\g<1>\g<1>', 'xx'), 'xxxx')
 
-        self.assertEqual(re.sub('a',r'\t\n\v\r\f\a\b\B\Z\a\A\w\W\s\S\d\D','a'),
-                         '\t\n\v\r\f\a\b\\B\\Z\a\\A\\w\\W\\s\\S\\d\\D')
-        self.assertEqual(re.sub('a', '\t\n\v\r\f\a', 'a'), '\t\n\v\r\f\a')
-        self.assertEqual(re.sub('a', '\t\n\v\r\f\a', 'a'),
-                         (chr(9)+chr(10)+chr(11)+chr(13)+chr(12)+chr(7)))
+        self.assertEqual(re.sub('a',r'\t\n\v\r\f\a\b','a'), '\t\n\v\r\f\a\b')
+        self.assertEqual(re.sub('a', '\t\n\v\r\f\a\b', 'a'), '\t\n\v\r\f\a\b')
+        self.assertEqual(re.sub('a', '\t\n\v\r\f\a\b', 'a'),
+                         (chr(9)+chr(10)+chr(11)+chr(13)+chr(12)+chr(7)+chr(8)))
+        for c in 'cdehijklmopqsuwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ':
+            with check_py3k_warnings():
+                self.assertEqual(re.sub('a', '\\' + c, 'a'), '\\' + c)
 
         self.assertEqual(re.sub('^\s*', 'X', 'test'), 'Xtest')
 
@@ -223,11 +225,11 @@ def test_re_subn(self):
 
     def test_re_split(self):
         self.assertEqual(re.split(":", ":a:b::c"), ['', 'a', 'b', '', 'c'])
-        self.assertEqual(re.split(":*", ":a:b::c"), ['', 'a', 'b', 'c'])
-        self.assertEqual(re.split("(:*)", ":a:b::c"),
+        self.assertEqual(re.split(":+", ":a:b::c"), ['', 'a', 'b', 'c'])
+        self.assertEqual(re.split("(:+)", ":a:b::c"),
                          ['', ':', 'a', ':', 'b', '::', 'c'])
-        self.assertEqual(re.split("(?::*)", ":a:b::c"), ['', 'a', 'b', 'c'])
-        self.assertEqual(re.split("(:)*", ":a:b::c"),
+        self.assertEqual(re.split("(?::+)", ":a:b::c"), ['', 'a', 'b', 'c'])
+        self.assertEqual(re.split("(:)+", ":a:b::c"),
                          ['', ':', 'a', ':', 'b', ':', 'c'])
         self.assertEqual(re.split("([b:]+)", ":a:b::c"),
                          ['', ':', 'a', ':b::', 'c'])
@@ -237,13 +239,34 @@ def test_re_split(self):
         self.assertEqual(re.split("(?:b)|(?::+)", ":a:b::c"),
                          ['', 'a', '', '', 'c'])
 
+        for sep, expected in [
+            (':*', ['', 'a', 'b', 'c']),
+            ('(?::*)', ['', 'a', 'b', 'c']),
+            ('(:*)', ['', ':', 'a', ':', 'b', '::', 'c']),
+            ('(:)*', ['', ':', 'a', ':', 'b', ':', 'c']),
+        ]:
+            with check_py3k_warnings(('', FutureWarning)):
+                self.assertEqual(re.split(sep, ':a:b::c'), expected)
+
+        for sep, expected in [
+            ('', [':a:b::c']),
+            (r'\b', [':a:b::c']),
+            (r'(?=:)', [':a:b::c']),
+            (r'(?<=:)', [':a:b::c']),
+        ]:
+            with check_py3k_warnings():
+                self.assertEqual(re.split(sep, ':a:b::c'), expected)
+
     def test_qualified_re_split(self):
         self.assertEqual(re.split(":", ":a:b::c", 2), ['', 'a', 'b::c'])
         self.assertEqual(re.split(':', 'a:b:c:d', 2), ['a', 'b', 'c:d'])
         self.assertEqual(re.split("(:)", ":a:b::c", 2),
                          ['', ':', 'a', ':', 'b::c'])
-        self.assertEqual(re.split("(:*)", ":a:b::c", 2),
+        self.assertEqual(re.split("(:+)", ":a:b::c", 2),
                          ['', ':', 'a', ':', 'b::c'])
+        with check_py3k_warnings(('', FutureWarning)):
+            self.assertEqual(re.split("(:*)", ":a:b::c", maxsplit=2),
+                             ['', ':', 'a', ':', 'b::c'])
 
     def test_re_findall(self):
         self.assertEqual(re.findall(":+", "abc"), [])
@@ -404,6 +427,37 @@ def test_special_escapes(self):
             self.assertEqual(re.search(r"\d\D\w\W\s\S",
                                        "1aa! a", re.UNICODE).group(0), "1aa! a")
 
+    def test_other_escapes(self):
+        self.assertRaises(re.error, re.compile, "\\")
+        self.assertEqual(re.match(r"\(", '(').group(), '(')
+        self.assertIsNone(re.match(r"\(", ')'))
+        self.assertEqual(re.match(r"\\", '\\').group(), '\\')
+        self.assertEqual(re.match(r"[\]]", ']').group(), ']')
+        self.assertIsNone(re.match(r"[\]]", '['))
+        self.assertEqual(re.match(r"[a\-c]", '-').group(), '-')
+        self.assertIsNone(re.match(r"[a\-c]", 'b'))
+        self.assertEqual(re.match(r"[\^a]+", 'a^').group(), 'a^')
+        self.assertIsNone(re.match(r"[\^a]+", 'b'))
+        re.purge()  # for warnings
+        for c in 'ceghijklmopquyzCEFGHIJKLMNOPQRTUVXY':
+            with check_py3k_warnings():
+                self.assertEqual(re.match('\\%c$' % c, c).group(), c)
+                self.assertIsNone(re.match('\\%c' % c, 'a'))
+            if have_unicode:
+                warn = FutureWarning if c in 'Uu' else DeprecationWarning
+                with check_py3k_warnings(('', warn)):
+                    self.assertEqual(re.match('\\%c$' % c, c, re.UNICODE).group(), c)
+                    self.assertIsNone(re.match('\\%c' % c, 'a', re.UNICODE))
+        for c in 'ceghijklmopquyzABCEFGHIJKLMNOPQRTUVXYZ':
+            with check_py3k_warnings():
+                self.assertEqual(re.match('[\\%c]$' % c, c).group(), c)
+                self.assertIsNone(re.match('[\\%c]' % c, 'a'))
+            if have_unicode:
+                warn = FutureWarning if c in 'Uu' else DeprecationWarning
+                with check_py3k_warnings(('', warn)):
+                    self.assertEqual(re.match('[\\%c]$' % c, c, re.UNICODE).group(), c)
+                    self.assertIsNone(re.match('[\\%c]' % c, 'a', re.UNICODE))
+
     def test_string_boundaries(self):
         # See http://bugs.python.org/issue10713
         self.assertEqual(re.search(r"\b(abc)\b", "abc").group(1),
@@ -931,6 +985,19 @@ def test_inline_flags(self):
         self.assertTrue(re.match('(?ixu) ' + upper_char, lower_char))
         self.assertTrue(re.match('(?ixu) ' + lower_char, upper_char))
 
+        # Incompatibilities
+        re.purge()
+        with check_py3k_warnings():
+            re.compile('', re.LOCALE|re.UNICODE)
+        with check_py3k_warnings():
+            re.compile('(?L)', re.UNICODE)
+        with check_py3k_warnings():
+            re.compile('(?u)', re.LOCALE)
+        with check_py3k_warnings():
+            re.compile('(?Lu)')
+        with check_py3k_warnings():
+            re.compile('(?uL)')
+
     def test_dollar_matches_twice(self):
         "$ matches the end of string, and just before the terminating \n"
         pattern = re.compile('$')
@@ -967,8 +1034,9 @@ def test_compile(self):
     def test_bug_13899(self):
         # Issue #13899: re pattern r"[\A]" should work like "A" but matches
         # nothing. Ditto B and Z.
-        self.assertEqual(re.findall(r'[\A\B\b\C\Z]', 'AB\bCZ'),
-                         ['A', 'B', '\b', 'C', 'Z'])
+        with check_py3k_warnings():
+            self.assertEqual(re.findall(r'[\A\B\b\C\Z]', 'AB\bCZ'),
+                             ['A', 'B', '\b', 'C', 'Z'])
 
     @precisionbigmemtest(size=_2G, memuse=1)
     def test_large_search(self, size):
@@ -1261,7 +1329,11 @@ def run_re_tests():
 
 def test_main():
     run_unittest(ReTests)
-    run_re_tests()
+    deprecations = [
+        ('bad escape', DeprecationWarning),
+    ]
+    with check_py3k_warnings(*deprecations):
+        run_re_tests()
 
 if __name__ == "__main__":
     test_main()
diff --git a/Misc/NEWS b/Misc/NEWS
index 4c9f2d375e8911..6538ea53a90465 100644
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -42,6 +42,10 @@ Extension Modules
 Library
 -------
 
+- bpo-30363: Running Python with the -3 option now warns about regular
+  expression syntax that is invalid or has different semantics in Python 3,
+  or whose behavior will change in future Python versions.
+
 - bpo-30342: Fix sysconfig.is_python_build() if Python is built with Visual
   Studio 2008 (VS 9.0).
 
diff --git a/Modules/_sre.c b/Modules/_sre.c
index 8e16c1d140adfa..6fd3affb09ab43 100644
--- a/Modules/_sre.c
+++ b/Modules/_sre.c
@@ -2267,6 +2267,20 @@ pattern_split(PatternObject* self, PyObject* args, PyObject* kw)
     if (!string)
         return NULL;
 
+    if (Py_Py3kWarningFlag &&
+        (self->code[0] != SRE_OP_INFO || self->code[3] == 0))
+    {
+        if (self->code[0] == SRE_OP_INFO && self->code[4] == 0) {
+            if (PyErr_WarnPy3k("split() requires a non-empty pattern match.",
+                               1) < 0)
+                return NULL;
+        }
+        else if (PyErr_WarnEx(PyExc_FutureWarning,
+                              "split() requires a non-empty pattern match.",
+                              1) < 0)
+            return NULL;
+    }
+
     string = state_init(&state, self, string, 0, PY_SSIZE_T_MAX);
     if (!string)
         return NULL;

From 427b09d808e00eddd60e56fa53068912c471f7c5 Mon Sep 17 00:00:00 2001
From: Serhiy Storchaka <storchaka@gmail.com>
Date: Thu, 18 May 2017 10:24:27 +0300
Subject: [PATCH 2/2] Address review comments.

---
 Lib/sre_parse.py    | 16 +++++++++-------
 Lib/test/test_re.py | 18 +++++-------------
 2 files changed, 14 insertions(+), 20 deletions(-)

diff --git a/Lib/sre_parse.py b/Lib/sre_parse.py
index 0e4699dd782e2e..e0d003ed85bda1 100644
--- a/Lib/sre_parse.py
+++ b/Lib/sre_parse.py
@@ -234,7 +234,7 @@ def isname(name):
             return False
     return True
 
-def _class_escape(source, escape, state):
+def _class_escape(source, escape):
     # handle escape code inside character class
     code = ESCAPES.get(escape)
     if code:
@@ -263,8 +263,9 @@ def _class_escape(source, escape, state):
         if len(escape) == 2:
             if sys.py3kwarning and c in ASCIILETTERS:
                 import warnings
-                if c in 'Uu' and state.flags & SRE_FLAG_UNICODE:
-                    warnings.warn('unicode escape %s' % escape,
+                if c in 'Uu':
+                    warnings.warn('bad escape %s; Unicode escapes are '
+                                  'supported only since Python 3.3' % escape,
                                   FutureWarning, stacklevel=8)
                 else:
                     warnings.warnpy3k('bad escape %s' % escape,
@@ -320,8 +321,9 @@ def _escape(source, escape, state):
         if len(escape) == 2:
             if sys.py3kwarning and c in ASCIILETTERS:
                 import warnings
-                if c in 'Uu' and state.flags & SRE_FLAG_UNICODE:
-                    warnings.warn('unicode escape %s' % escape,
+                if c in 'Uu':
+                    warnings.warn('bad escape %s; Unicode escapes are '
+                                  'supported only since Python 3.3' % escape,
                                   FutureWarning, stacklevel=8)
                 else:
                     warnings.warnpy3k('bad escape %s' % escape,
@@ -460,7 +462,7 @@ def _parse(source, state):
                 if this == "]" and set != start:
                     break
                 elif this and this[0] == "\\":
-                    code1 = _class_escape(source, this, state)
+                    code1 = _class_escape(source, this)
                 elif this:
                     code1 = LITERAL, ord(this)
                 else:
@@ -476,7 +478,7 @@ def _parse(source, state):
                         break
                     elif this:
                         if this[0] == "\\":
-                            code2 = _class_escape(source, this, state)
+                            code2 = _class_escape(source, this)
                         else:
                             code2 = LITERAL, ord(this)
                         if code1[0] != LITERAL or code2[0] != LITERAL:
diff --git a/Lib/test/test_re.py b/Lib/test/test_re.py
index 3d892100b7cce2..174c5ca462cdca 100644
--- a/Lib/test/test_re.py
+++ b/Lib/test/test_re.py
@@ -66,7 +66,7 @@ def test_basic_re_sub(self):
         self.assertEqual(re.sub('(?P<unk>x)', '\g<unk>\g<unk>', 'xx'), 'xxxx')
         self.assertEqual(re.sub('(?P<unk>x)', '\g<1>\g<1>', 'xx'), 'xxxx')
 
-        self.assertEqual(re.sub('a',r'\t\n\v\r\f\a\b','a'), '\t\n\v\r\f\a\b')
+        self.assertEqual(re.sub('a', r'\t\n\v\r\f\a\b', 'a'), '\t\n\v\r\f\a\b')
         self.assertEqual(re.sub('a', '\t\n\v\r\f\a\b', 'a'), '\t\n\v\r\f\a\b')
         self.assertEqual(re.sub('a', '\t\n\v\r\f\a\b', 'a'),
                          (chr(9)+chr(10)+chr(11)+chr(13)+chr(12)+chr(7)+chr(8)))
@@ -440,23 +440,15 @@ def test_other_escapes(self):
         self.assertIsNone(re.match(r"[\^a]+", 'b'))
         re.purge()  # for warnings
         for c in 'ceghijklmopquyzCEFGHIJKLMNOPQRTUVXY':
-            with check_py3k_warnings():
+            warn = FutureWarning if c in 'Uu' else DeprecationWarning
+            with check_py3k_warnings(('', warn)):
                 self.assertEqual(re.match('\\%c$' % c, c).group(), c)
                 self.assertIsNone(re.match('\\%c' % c, 'a'))
-            if have_unicode:
-                warn = FutureWarning if c in 'Uu' else DeprecationWarning
-                with check_py3k_warnings(('', warn)):
-                    self.assertEqual(re.match('\\%c$' % c, c, re.UNICODE).group(), c)
-                    self.assertIsNone(re.match('\\%c' % c, 'a', re.UNICODE))
         for c in 'ceghijklmopquyzABCEFGHIJKLMNOPQRTUVXYZ':
-            with check_py3k_warnings():
+            warn = FutureWarning if c in 'Uu' else DeprecationWarning
+            with check_py3k_warnings(('', warn)):
                 self.assertEqual(re.match('[\\%c]$' % c, c).group(), c)
                 self.assertIsNone(re.match('[\\%c]' % c, 'a'))
-            if have_unicode:
-                warn = FutureWarning if c in 'Uu' else DeprecationWarning
-                with check_py3k_warnings(('', warn)):
-                    self.assertEqual(re.match('[\\%c]$' % c, c, re.UNICODE).group(), c)
-                    self.assertIsNone(re.match('[\\%c]' % c, 'a', re.UNICODE))
 
     def test_string_boundaries(self):
         # See http://bugs.python.org/issue10713