diff --git a/Lib/test/test_re.py b/Lib/test/test_re.py index 0a77e6fe9edc03..98d04610f28958 100644 --- a/Lib/test/test_re.py +++ b/Lib/test/test_re.py @@ -2100,6 +2100,106 @@ def test_bug_34294(self): {'tag': 'foo', 'text': None}, {'tag': 'foo', 'text': None}]) + def test_bug_35859(self): + # three bugs about capturing groups, these bugs exist since Python 2 + + # ===================================================== + # macro MARK_PUSH(lastmark) bug, reported in issue35859 + # ===================================================== + self.assertEqual(re.match(r'(ab|a)*?b', 'ab').groups(), ('a',)) + self.assertEqual(re.match(r'(ab|a)+?b', 'ab').groups(), ('a',)) + self.assertEqual(re.match(r'(ab|a){0,2}?b', 'ab').groups(), ('a',)) + self.assertEqual(re.match(r'(.b|a)*?b', 'ab').groups(), ('a',)) + + # ======================================================= + # JUMP_MIN_UNTIL_3 should LASTMARK_SAVE() and MARK_PUSH() + # ======================================================= + + # 1, triggered by SRE_OP_REPEAT_ONE, b? in this pattern + self.assertEqual(re.match(r'(ab?)*?b', 'ab').groups(), ('a',)) + + # 2, triggered by SRE_OP_MIN_REPEAT_ONE, a*? in this pattern + s = 'axxzaz' + p = r'(?:a*?(xx)??z)*' + self.assertEqual(re.match(p, s).groups(), ('xx',)) + + # 3, triggered by SRE_OP_MIN_UNTIL (JUMP_MIN_UNTIL_2) + # (?:a|bc)*? in this pattern + s = 'axxzbcz' + p = r'(?:(?:a|bc)*?(xx)??z)*' + self.assertEqual(re.match(p, s).groups(), ('xx',)) + # test-case provided by issue9134 + s = 'xtcxyzxc' + p = r'((x|yz)+?(t)??c)*' + self.assertEqual(re.match(p, s).groups(), ('xyzxc', 'x', 't')) + + # ====================================== + # JUMP_ASSERT_NOT should LASTMARK_SAVE() + # ====================================== + # reported in issue725149 + # negative assertion + self.assertEqual(re.match(r'(?!(..)c)', 'ab').groups(), (None,)) + # negative assertion in a repeat + self.assertEqual(re.match(r'(?:(?!(ab)c).)*', 'ab').groups(), (None,)) + self.assertEqual(re.match(r'((?!(bc)d)(.))*', 'abc').groups(), + ('c', None, 'c')) + + # ============================================================= + # below asserts didn't fail before fix, just prevent regression + # ============================================================= + + # 1, why JUMP_MIN_REPEAT_ONE should LASTMARK_SAVE() + # .?? in this pattern + m = re.match(r'.??(?=(a)?)b', 'ab') + self.assertEqual(m.span(), (0, 2)) + self.assertEqual(m.groups(), (None,)) + # put in a repeat + m = re.match(r'(?:.??(?=(a)?)b)*', 'abab') + self.assertEqual(m.span(), (0, 4)) + self.assertEqual(m.groups(), (None,)) + + # 2, why JUMP_MIN_UNTIL_2 should LASTMARK_SAVE() + # (?:..)?? in this pattern + m = re.match(r'(?:..)??(?=(aa)?)bb', 'aabb') + self.assertEqual(m.span(), (0, 4)) + self.assertEqual(m.groups(), (None,)) + # put in a repeat + m = re.match(r'(?:(?:..)??(?=(aa)?)bb)*', 'aabbaabb') + self.assertEqual(m.span(), (0, 8)) + self.assertEqual(m.groups(), (None,)) + + # 3, why JUMP_REPEAT_ONE_1 should LASTMARK_SAVE() + # .* in this pattern, tail starts with a literal. + self.assertEqual(re.match(r'.*x(?=(b)?)a', 'xaxb').groups(), (None,)) + + # 4, why JUMP_REPEAT_ONE_2 should LASTMARK_SAVE() + # .* in this pattern, tail is general case + self.assertEqual(re.match(r'.*(?=(b)?)a', 'ab').groups(), (None,)) + + # 5, demonstrate that JUMP_MAX_UNTIL_3 doesn't need LASTMARK_SAVE() + # this pattern is similar to 4 + self.assertEqual(re.match(r'(.)*(?=(b)?)a', 'ab').groups(), + (None, None)) + self.assertEqual(re.match(r'(.){0}(?=(b)?)a', 'ab').groups(), + (None, None)) + + # 6, positive assertion in a repeat + # strictly speaking, this is a bug, the correct result should be + # (None,), but it's very hard to fix with the current fundamental + # implementation of sre. + # PHP 7.3.2, Java 11.0.2, Ruby 2.6.1, and the third-party module + # regex 2019.2.21, return ('a',) as well. + # Perl 5.26.1, Node.js 10.15.1, return the correct result (None,) + # Go 1.12, Rust 1.32.0, don't support lookaround yet. + self.assertEqual(re.match(r'(?:(?=(a)?).)*', 'ab').groups(), ('a',)) + + # 7, negative assertion + # PHP 7.3.2, Ruby 2.6.1, Node.js 10.15.1, regex 2019.2.21 return + # (None,) + # Java 11.0.2, Perl 5.26.1, return ('b',) + # Go 1.12, Rust 1.32.0, don't support lookaround yet. + self.assertEqual(re.match(r'a*(?!(b))', 'ab').groups(), (None,)) + class PatternReprTests(unittest.TestCase): def check(self, pattern, expected): diff --git a/Misc/NEWS.d/next/Library/2019-03-01-18-08-40.bpo-35859.9dWTQt.rst b/Misc/NEWS.d/next/Library/2019-03-01-18-08-40.bpo-35859.9dWTQt.rst new file mode 100644 index 00000000000000..f91d4944e0bdd3 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2019-03-01-18-08-40.bpo-35859.9dWTQt.rst @@ -0,0 +1,2 @@ +re module, fix wrong capturing groups in rare cases. The span of capturing +group may be lost when backtracking. Patch by Ma Lin. diff --git a/Modules/sre_lib.h b/Modules/sre_lib.h index 437ab43f434a62..89e8fa194330de 100644 --- a/Modules/sre_lib.h +++ b/Modules/sre_lib.h @@ -478,20 +478,20 @@ do { \ DATA_STACK_LOOKUP_AT(state,t,p,pos) #define MARK_PUSH(lastmark) \ - do if (lastmark > 0) { \ + do if (lastmark >= 0) { \ i = lastmark; /* ctx->lastmark may change if reallocated */ \ DATA_STACK_PUSH(state, state->mark, (i+1)*sizeof(void*)); \ } while (0) #define MARK_POP(lastmark) \ - do if (lastmark > 0) { \ + do if (lastmark >= 0) { \ DATA_STACK_POP(state, state->mark, (lastmark+1)*sizeof(void*), 1); \ } while (0) #define MARK_POP_KEEP(lastmark) \ - do if (lastmark > 0) { \ + do if (lastmark >= 0) { \ DATA_STACK_POP(state, state->mark, (lastmark+1)*sizeof(void*), 0); \ } while (0) #define MARK_POP_DISCARD(lastmark) \ - do if (lastmark > 0) { \ + do if (lastmark >= 0) { \ DATA_STACK_POP_DISCARD(state, (lastmark+1)*sizeof(void*)); \ } while (0) @@ -1126,6 +1126,7 @@ SRE(match)(SRE_STATE* state, SRE_CODE* pattern, int toplevel) RETURN_FAILURE; ctx->u.rep->count = ctx->count; + MARK_PUSH(ctx->lastmark); /* already LASTMARK_SAVE() above */ /* zero-width match protection */ DATA_PUSH(&ctx->u.rep->last_ptr); ctx->u.rep->last_ptr = state->ptr; @@ -1133,9 +1134,12 @@ SRE(match)(SRE_STATE* state, SRE_CODE* pattern, int toplevel) ctx->u.rep->pattern+3); DATA_POP(&ctx->u.rep->last_ptr); if (ret) { + MARK_POP_DISCARD(ctx->lastmark); RETURN_ON_ERROR(ret); RETURN_SUCCESS; } + MARK_POP(ctx->lastmark); + LASTMARK_RESTORE(); ctx->u.rep->count = ctx->count-1; state->ptr = ctx->ptr; RETURN_FAILURE; @@ -1285,11 +1289,19 @@ SRE(match)(SRE_STATE* state, SRE_CODE* pattern, int toplevel) ctx->ptr, ctx->pattern[1])); if (ctx->ptr - (SRE_CHAR *)state->beginning >= (Py_ssize_t)ctx->pattern[1]) { state->ptr = ctx->ptr - ctx->pattern[1]; + LASTMARK_SAVE(); + if (state->repeat) + MARK_PUSH(ctx->lastmark); DO_JUMP0(JUMP_ASSERT_NOT, jump_assert_not, ctx->pattern+2); if (ret) { + if (state->repeat) + MARK_POP_DISCARD(ctx->lastmark); RETURN_ON_ERROR(ret); RETURN_FAILURE; } + if (state->repeat) + MARK_POP(ctx->lastmark); + LASTMARK_RESTORE(); } ctx->pattern += ctx->pattern[0]; break;