From 54e0f71e1b85bbfe45c715216c5384bea45d2116 Mon Sep 17 00:00:00 2001 From: animalize Date: Fri, 1 Mar 2019 17:57:12 +0800 Subject: [PATCH 1/5] Step 1/4, test-cases Show the wrong behaviors before this fix. --- Lib/test/test_re.py | 98 +++++++++++++++++++ .../2019-03-01-18-08-40.bpo-35859.9dWTQt.rst | 2 + 2 files changed, 100 insertions(+) create mode 100644 Misc/NEWS.d/next/Library/2019-03-01-18-08-40.bpo-35859.9dWTQt.rst diff --git a/Lib/test/test_re.py b/Lib/test/test_re.py index 0a77e6fe9edc03..99035a9e890dd2 100644 --- a/Lib/test/test_re.py +++ b/Lib/test/test_re.py @@ -2100,6 +2100,104 @@ def test_bug_34294(self): {'tag': 'foo', 'text': None}, {'tag': 'foo', 'text': None}]) + def test_bug_35859(self): + # three bugs about capturing groups, these bugs exist since Python 2 + + # ===================================================== + # macro MARK_PUSH(lastmark) bug, reported in issue35859 + # ===================================================== + self.assertEqual(re.match(r'(ab|a)*?b', 'ab').groups(), ('',)) + self.assertEqual(re.match(r'(ab|a)+?b', 'ab').groups(), ('',)) + self.assertEqual(re.match(r'(ab|a){0,2}?b', 'ab').groups(), ('',)) + self.assertEqual(re.match(r'(.b|a)*?b', 'ab').groups(), ('',)) + + # ======================================================= + # JUMP_MIN_UNTIL_3 should LASTMARK_SAVE() and MARK_PUSH() + # ======================================================= + + # 1, triggered by SRE_OP_REPEAT_ONE, b? in this pattern + self.assertEqual(re.match(r'(ab?)*?b', 'ab').groups(), ('',)) + + # 2, triggered by SRE_OP_MIN_REPEAT_ONE, a*? in this pattern + s = 'axxzaz' + p = r'(?:a*?(xx)??z)*' + self.assertEqual(re.match(p, s).groups(), ('',)) + + # 3, triggered by SRE_OP_MIN_UNTIL (JUMP_MIN_UNTIL_2) + # (?:a|bc)*? in this pattern + s = 'axxzbcz' + p = r'(?:(?:a|bc)*?(xx)??z)*' + self.assertEqual(re.match(p, s).groups(), ('',)) + # test-case provided by issue9134 + s = 'xtcxyzxc' + p = r'((x|yz)+?(t)??c)*' + self.assertEqual(re.match(p, s).groups(), ('xyzxc', 'x', '')) + + # ====================================== + # JUMP_ASSERT_NOT should LASTMARK_SAVE() + # ====================================== + # reported in issue725149 + # negative assertion + self.assertEqual(re.match(r'(?!(..)c)', 'ab').groups(), ('ab',)) + # negative assertion in a repeat + self.assertEqual(re.match(r'(?:(?!(ab)c).)*', 'ab').groups(), ('b',)) + + # ============================================================= + # below asserts didn't fail before fix, just prevent regression + # ============================================================= + + # 1, why JUMP_MIN_REPEAT_ONE should LASTMARK_SAVE() + # .?? in this pattern + m = re.match(r'.??(?=(a)?)b', 'ab') + self.assertEqual(m.span(), (0, 2)) + self.assertEqual(m.groups(), (None,)) + # put in a repeat + m = re.match(r'(?:.??(?=(a)?)b)*', 'abab') + self.assertEqual(m.span(), (0, 4)) + self.assertEqual(m.groups(), (None,)) + + # 2, why JUMP_MIN_UNTIL_2 should LASTMARK_SAVE() + # (?:..)?? in this pattern + m = re.match(r'(?:..)??(?=(aa)?)bb', 'aabb') + self.assertEqual(m.span(), (0, 4)) + self.assertEqual(m.groups(), (None,)) + # put in a repeat + m = re.match(r'(?:(?:..)??(?=(aa)?)bb)*', 'aabbaabb') + self.assertEqual(m.span(), (0, 8)) + self.assertEqual(m.groups(), (None,)) + + # 3, why JUMP_REPEAT_ONE_1 should LASTMARK_SAVE() + # .* in this pattern, tail starts with a literal. + self.assertEqual(re.match(r'.*x(?=(b)?)a', 'xaxb').groups(), (None,)) + + # 4, why JUMP_REPEAT_ONE_2 should LASTMARK_SAVE() + # .* in this pattern, tail is general case + self.assertEqual(re.match(r'.*(?=(b)?)a', 'ab').groups(), (None,)) + + # 5, demonstrate that JUMP_MAX_UNTIL_3 doesn't need LASTMARK_SAVE() + # this pattern is similar to 4 + self.assertEqual(re.match(r'(.)*(?=(b)?)a', 'ab').groups(), + (None, None)) + self.assertEqual(re.match(r'(.){0}(?=(b)?)a', 'ab').groups(), + (None, None)) + + # 6, positive assertion in a repeat + # strictly speaking, this is a bug, the correct result should be + # (None,), but it's very hard to fix with the current fundamental + # implementation of sre. + # PHP 7.3.2, Java 11.0.2, Ruby 2.6.1, and the third-party module + # regex 2019.2.21, return ('a',) as well. + # Perl 5.26.1, Node.js 10.15.1, return the correct result (None,) + # Go 1.12, Rust 1.32.0, don't support lookaround yet. + self.assertEqual(re.match(r'(?:(?=(a)?).)*', 'ab').groups(), ('a',)) + + # 7, negative assertion + # PHP 7.3.2, Ruby 2.6.1, Node.js 10.15.1, regex 2019.2.21 return + # (None,) + # Java 11.0.2, Perl 5.26.1, return ('b',) + # Go 1.12, Rust 1.32.0, don't support lookaround yet. + self.assertEqual(re.match(r'a*(?!(b))', 'ab').groups(), (None,)) + class PatternReprTests(unittest.TestCase): def check(self, pattern, expected): diff --git a/Misc/NEWS.d/next/Library/2019-03-01-18-08-40.bpo-35859.9dWTQt.rst b/Misc/NEWS.d/next/Library/2019-03-01-18-08-40.bpo-35859.9dWTQt.rst new file mode 100644 index 00000000000000..f91d4944e0bdd3 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2019-03-01-18-08-40.bpo-35859.9dWTQt.rst @@ -0,0 +1,2 @@ +re module, fix wrong capturing groups in rare cases. The span of capturing +group may be lost when backtracking. Patch by Ma Lin. From ddf7ac25c1425d5932bd2301f2b6befa1d8fc6d8 Mon Sep 17 00:00:00 2001 From: animalize Date: Fri, 1 Mar 2019 18:37:27 +0800 Subject: [PATCH 2/5] Step 2/4, MARK_PUSH(lastmark) macro bug MARK_PUSH(lastmark) macro didn't protect MARK-0 if it was the only available mark. --- Lib/test/test_re.py | 8 ++++---- Modules/sre_lib.h | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/Lib/test/test_re.py b/Lib/test/test_re.py index 99035a9e890dd2..3dc092e972821d 100644 --- a/Lib/test/test_re.py +++ b/Lib/test/test_re.py @@ -2106,10 +2106,10 @@ def test_bug_35859(self): # ===================================================== # macro MARK_PUSH(lastmark) bug, reported in issue35859 # ===================================================== - self.assertEqual(re.match(r'(ab|a)*?b', 'ab').groups(), ('',)) - self.assertEqual(re.match(r'(ab|a)+?b', 'ab').groups(), ('',)) - self.assertEqual(re.match(r'(ab|a){0,2}?b', 'ab').groups(), ('',)) - self.assertEqual(re.match(r'(.b|a)*?b', 'ab').groups(), ('',)) + self.assertEqual(re.match(r'(ab|a)*?b', 'ab').groups(), ('a',)) + self.assertEqual(re.match(r'(ab|a)+?b', 'ab').groups(), ('a',)) + self.assertEqual(re.match(r'(ab|a){0,2}?b', 'ab').groups(), ('a',)) + self.assertEqual(re.match(r'(.b|a)*?b', 'ab').groups(), ('a',)) # ======================================================= # JUMP_MIN_UNTIL_3 should LASTMARK_SAVE() and MARK_PUSH() diff --git a/Modules/sre_lib.h b/Modules/sre_lib.h index 437ab43f434a62..effecb8318416d 100644 --- a/Modules/sre_lib.h +++ b/Modules/sre_lib.h @@ -478,20 +478,20 @@ do { \ DATA_STACK_LOOKUP_AT(state,t,p,pos) #define MARK_PUSH(lastmark) \ - do if (lastmark > 0) { \ + do if (lastmark >= 0) { \ i = lastmark; /* ctx->lastmark may change if reallocated */ \ DATA_STACK_PUSH(state, state->mark, (i+1)*sizeof(void*)); \ } while (0) #define MARK_POP(lastmark) \ - do if (lastmark > 0) { \ + do if (lastmark >= 0) { \ DATA_STACK_POP(state, state->mark, (lastmark+1)*sizeof(void*), 1); \ } while (0) #define MARK_POP_KEEP(lastmark) \ - do if (lastmark > 0) { \ + do if (lastmark >= 0) { \ DATA_STACK_POP(state, state->mark, (lastmark+1)*sizeof(void*), 0); \ } while (0) #define MARK_POP_DISCARD(lastmark) \ - do if (lastmark > 0) { \ + do if (lastmark >= 0) { \ DATA_STACK_POP_DISCARD(state, (lastmark+1)*sizeof(void*)); \ } while (0) From 1a8891ead65622d4ab2e821f17107b042ef9db99 Mon Sep 17 00:00:00 2001 From: animalize Date: Fri, 1 Mar 2019 19:01:47 +0800 Subject: [PATCH 3/5] Step 3/4, JUMP_MIN_UNTIL_3 needs LASTMARK_SAVE() and MARK_PUSH() --- Lib/test/test_re.py | 8 ++++---- Modules/sre_lib.h | 4 ++++ 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/Lib/test/test_re.py b/Lib/test/test_re.py index 3dc092e972821d..771b047a2428e7 100644 --- a/Lib/test/test_re.py +++ b/Lib/test/test_re.py @@ -2116,22 +2116,22 @@ def test_bug_35859(self): # ======================================================= # 1, triggered by SRE_OP_REPEAT_ONE, b? in this pattern - self.assertEqual(re.match(r'(ab?)*?b', 'ab').groups(), ('',)) + self.assertEqual(re.match(r'(ab?)*?b', 'ab').groups(), ('a',)) # 2, triggered by SRE_OP_MIN_REPEAT_ONE, a*? in this pattern s = 'axxzaz' p = r'(?:a*?(xx)??z)*' - self.assertEqual(re.match(p, s).groups(), ('',)) + self.assertEqual(re.match(p, s).groups(), ('xx',)) # 3, triggered by SRE_OP_MIN_UNTIL (JUMP_MIN_UNTIL_2) # (?:a|bc)*? in this pattern s = 'axxzbcz' p = r'(?:(?:a|bc)*?(xx)??z)*' - self.assertEqual(re.match(p, s).groups(), ('',)) + self.assertEqual(re.match(p, s).groups(), ('xx',)) # test-case provided by issue9134 s = 'xtcxyzxc' p = r'((x|yz)+?(t)??c)*' - self.assertEqual(re.match(p, s).groups(), ('xyzxc', 'x', '')) + self.assertEqual(re.match(p, s).groups(), ('xyzxc', 'x', 't')) # ====================================== # JUMP_ASSERT_NOT should LASTMARK_SAVE() diff --git a/Modules/sre_lib.h b/Modules/sre_lib.h index effecb8318416d..328a6379d7b58d 100644 --- a/Modules/sre_lib.h +++ b/Modules/sre_lib.h @@ -1126,6 +1126,7 @@ SRE(match)(SRE_STATE* state, SRE_CODE* pattern, int toplevel) RETURN_FAILURE; ctx->u.rep->count = ctx->count; + MARK_PUSH(ctx->lastmark); /* already LASTMARK_SAVE() above */ /* zero-width match protection */ DATA_PUSH(&ctx->u.rep->last_ptr); ctx->u.rep->last_ptr = state->ptr; @@ -1133,9 +1134,12 @@ SRE(match)(SRE_STATE* state, SRE_CODE* pattern, int toplevel) ctx->u.rep->pattern+3); DATA_POP(&ctx->u.rep->last_ptr); if (ret) { + MARK_POP_DISCARD(ctx->lastmark); RETURN_ON_ERROR(ret); RETURN_SUCCESS; } + MARK_POP(ctx->lastmark); + LASTMARK_RESTORE(); ctx->u.rep->count = ctx->count-1; state->ptr = ctx->ptr; RETURN_FAILURE; From 8bed8d95c0fe2e34cfe6d81bd8b43970a5f5b134 Mon Sep 17 00:00:00 2001 From: animalize Date: Fri, 1 Mar 2019 19:37:02 +0800 Subject: [PATCH 4/5] Step 4/4, JUMP_ASSERT_NOT needs LASTMARK_SAVE() --- Lib/test/test_re.py | 4 ++-- Modules/sre_lib.h | 2 ++ 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/Lib/test/test_re.py b/Lib/test/test_re.py index 771b047a2428e7..0d606688f3104d 100644 --- a/Lib/test/test_re.py +++ b/Lib/test/test_re.py @@ -2138,9 +2138,9 @@ def test_bug_35859(self): # ====================================== # reported in issue725149 # negative assertion - self.assertEqual(re.match(r'(?!(..)c)', 'ab').groups(), ('ab',)) + self.assertEqual(re.match(r'(?!(..)c)', 'ab').groups(), (None,)) # negative assertion in a repeat - self.assertEqual(re.match(r'(?:(?!(ab)c).)*', 'ab').groups(), ('b',)) + self.assertEqual(re.match(r'(?:(?!(ab)c).)*', 'ab').groups(), (None,)) # ============================================================= # below asserts didn't fail before fix, just prevent regression diff --git a/Modules/sre_lib.h b/Modules/sre_lib.h index 328a6379d7b58d..fbe49f7c67559f 100644 --- a/Modules/sre_lib.h +++ b/Modules/sre_lib.h @@ -1289,11 +1289,13 @@ SRE(match)(SRE_STATE* state, SRE_CODE* pattern, int toplevel) ctx->ptr, ctx->pattern[1])); if (ctx->ptr - (SRE_CHAR *)state->beginning >= (Py_ssize_t)ctx->pattern[1]) { state->ptr = ctx->ptr - ctx->pattern[1]; + LASTMARK_SAVE(); DO_JUMP0(JUMP_ASSERT_NOT, jump_assert_not, ctx->pattern+2); if (ret) { RETURN_ON_ERROR(ret); RETURN_FAILURE; } + LASTMARK_RESTORE(); } ctx->pattern += ctx->pattern[0]; break; From e5fb79008470e7c1df737721d0fa734c43bd1f54 Mon Sep 17 00:00:00 2001 From: animalize Date: Sat, 2 Mar 2019 06:58:19 +0800 Subject: [PATCH 5/5] Fix for Step 4, JUMP_ASSERT_NOT needs MARK_PUSH() as well before this fix, the test-case returns ('c', 'c', 'c') No need to save state->repeat in ctx->u.rep. SRE_OP_BRANCH saves state->repeat in ctx->u.rep, this is because after JUMP_BRANCH, the state->repeat may be NULL. While SRE_OP_ASSERT_NOT doesn't have this problem. --- Lib/test/test_re.py | 2 ++ Modules/sre_lib.h | 6 ++++++ 2 files changed, 8 insertions(+) diff --git a/Lib/test/test_re.py b/Lib/test/test_re.py index 0d606688f3104d..98d04610f28958 100644 --- a/Lib/test/test_re.py +++ b/Lib/test/test_re.py @@ -2141,6 +2141,8 @@ def test_bug_35859(self): self.assertEqual(re.match(r'(?!(..)c)', 'ab').groups(), (None,)) # negative assertion in a repeat self.assertEqual(re.match(r'(?:(?!(ab)c).)*', 'ab').groups(), (None,)) + self.assertEqual(re.match(r'((?!(bc)d)(.))*', 'abc').groups(), + ('c', None, 'c')) # ============================================================= # below asserts didn't fail before fix, just prevent regression diff --git a/Modules/sre_lib.h b/Modules/sre_lib.h index fbe49f7c67559f..89e8fa194330de 100644 --- a/Modules/sre_lib.h +++ b/Modules/sre_lib.h @@ -1290,11 +1290,17 @@ SRE(match)(SRE_STATE* state, SRE_CODE* pattern, int toplevel) if (ctx->ptr - (SRE_CHAR *)state->beginning >= (Py_ssize_t)ctx->pattern[1]) { state->ptr = ctx->ptr - ctx->pattern[1]; LASTMARK_SAVE(); + if (state->repeat) + MARK_PUSH(ctx->lastmark); DO_JUMP0(JUMP_ASSERT_NOT, jump_assert_not, ctx->pattern+2); if (ret) { + if (state->repeat) + MARK_POP_DISCARD(ctx->lastmark); RETURN_ON_ERROR(ret); RETURN_FAILURE; } + if (state->repeat) + MARK_POP(ctx->lastmark); LASTMARK_RESTORE(); } ctx->pattern += ctx->pattern[0];