From e77d0fe0c7d87dcc27242e0dc704bb4ffa698573 Mon Sep 17 00:00:00 2001 From: Tim Peters Date: Thu, 23 May 2024 18:22:24 -0500 Subject: [PATCH 1/7] WIP - many problems. --- Lib/difflib.py | 64 ++++++++++++++++++-------------------------------- 1 file changed, 23 insertions(+), 41 deletions(-) diff --git a/Lib/difflib.py b/Lib/difflib.py index 79b446c2afbdc6..f7ba9a0dbd2282 100644 --- a/Lib/difflib.py +++ b/Lib/difflib.py @@ -908,21 +908,16 @@ def _fancy_replace(self, a, alo, ahi, b, blo, bhi): + abcdefGhijkl ? ^ ^ ^ """ - from operator import ge, gt - # Don't synch up unless the lines have a similarity score of at - # least cutoff; best_ratio tracks the best score seen so far. - # Keep track of all index pairs achieving the best ratio and - # deal with them here. Previously only the smallest pair was - # handled here, and if there are many pairs with the best ratio, - # recursion could grow very deep, and runtime cubic. See: + # Don't synch up unless the lines have a similarity score above + # cutoff. Previously only the smallest pair was handled here, + # and if there are many pairs with the best ratio, recursion + # could grow very deep, and runtime cubic. See: # https://github.com/python/cpython/issues/119105 - best_ratio, cutoff = 0.74, 0.75 + cutoff = 0.74999 cruncher = SequenceMatcher(self.charjunk) - eqi, eqj = None, None # 1st indices of equal lines (if any) # List of index pairs achieving best_ratio. Strictly increasing # in both index positions. max_pairs = [] - maxi = -1 # `i` index of last pair in max_pairs # search for the pair that matches best without being identical # (identical lines must be junk lines, & we don't want to synch @@ -935,44 +930,31 @@ def _fancy_replace(self, a, alo, ahi, b, blo, bhi): cruncher.set_seq2(bj) # Find new best, if possible. Else search for the smallest i # (if any) > maxi that equals the best ratio - search_equal = True + best_ratio = cutoff + best_i = None for i in range(alo, ahi): ai = a[i] if ai == bj: - if eqi is None: - eqi, eqj = i, j + if best_i is None: + best_i = i continue cruncher.set_seq1(ai) # computing similarity is expensive, so use the quick # upper bounds first -- have seen this speed up messy # compares by a factor of 3. - cmp = ge if search_equal and i > maxi else gt - if (cmp(crqr(), best_ratio) - and cmp(cqr(), best_ratio) - and cmp((ratio := cr()), best_ratio)): - if ratio > best_ratio: - best_ratio = ratio - max_pairs.clear() - else: - assert best_ratio == ratio and search_equal - assert i > maxi - max_pairs.append((i, j)) - maxi = i - search_equal = False - if best_ratio < cutoff: - assert not max_pairs - # no non-identical "pretty close" pair - if eqi is None: - # no identical pair either -- treat it as a straight replace - yield from self._plain_replace(a, alo, ahi, b, blo, bhi) - return - # no close pair, but an identical pair -- synch up on that - max_pairs = [(eqi, eqj)] - else: - # there's a close pair, so forget the identical pair (if any) - assert max_pairs - eqi = None - + if (crqr() > best_ratio + and cqr() > best_ratio + and (ratio := cr()) > best_ratio): + best_ratio = ratio + best_i = i + if best_i is not None: + #assert not max_pairs or best_i > max_pairs[-1][0], (max_pairs, i, j) + if not max_pairs or best_i > max_pairs[-1][0]: + max_pairs.append((best_i, j)) + if not max_pairs: + yield from self._plain_replace(a, alo, ahi, b, blo, bhi) + + print(max_pairs) last_i, last_j = alo, blo for this_i, this_j in max_pairs: # pump out diffs from before the synch point @@ -980,7 +962,7 @@ def _fancy_replace(self, a, alo, ahi, b, blo, bhi): b, last_j, this_j) # do intraline marking on the synch pair aelt, belt = a[this_i], b[this_j] - if eqi is None: + if aelt != belt: # pump out a '-', '?', '+', '?' quad for the synched lines atags = btags = "" cruncher.set_seqs(aelt, belt) From 5eeeef589a0733f9b6729c0036471050f608f1b9 Mon Sep 17 00:00:00 2001 From: Tim Peters Date: Thu, 23 May 2024 22:50:07 -0500 Subject: [PATCH 2/7] `_fancy_replace()` is no longer recursive. Instead it uses a sliding window of 10 lines, and picks "the first" pair within the current window with a ratio > cutoff to synch on. This should make cubic-time cases impossible. It may change results in some cases, but I don't think I care - difflib is fundamentally opposed to synching on lines "far apart", and using a window enforces a notion of locality. --- Lib/difflib.py | 93 ++++++++++++++++++++++++++------------------------ 1 file changed, 48 insertions(+), 45 deletions(-) diff --git a/Lib/difflib.py b/Lib/difflib.py index f7ba9a0dbd2282..ce12220c6686c2 100644 --- a/Lib/difflib.py +++ b/Lib/difflib.py @@ -913,55 +913,59 @@ def _fancy_replace(self, a, alo, ahi, b, blo, bhi): # and if there are many pairs with the best ratio, recursion # could grow very deep, and runtime cubic. See: # https://github.com/python/cpython/issues/119105 + # + # Later, more pathological cases prompted removing recursion + # entirely. cutoff = 0.74999 cruncher = SequenceMatcher(self.charjunk) - # List of index pairs achieving best_ratio. Strictly increasing - # in both index positions. - max_pairs = [] - # search for the pair that matches best without being identical - # (identical lines must be junk lines, & we don't want to synch - # up on junk -- unless we have to) + WINDOW = 10 crqr = cruncher.real_quick_ratio cqr = cruncher.quick_ratio cr = cruncher.ratio - for j in range(blo, bhi): - bj = b[j] - cruncher.set_seq2(bj) - # Find new best, if possible. Else search for the smallest i - # (if any) > maxi that equals the best ratio + dump_i, dump_j = alo, blo + start_i, start_j = alo, blo + while start_i < ahi and start_j < bhi: + # Search for the pair in the next WINDOW i's and j's that + # scores better than `cutoff` without being identical + # (identical lines must be junk lines, & we don't want to + # synch up on junk -- unless we have to) best_ratio = cutoff - best_i = None - for i in range(alo, ahi): - ai = a[i] - if ai == bj: - if best_i is None: - best_i = i - continue - cruncher.set_seq1(ai) - # computing similarity is expensive, so use the quick - # upper bounds first -- have seen this speed up messy - # compares by a factor of 3. - if (crqr() > best_ratio - and cqr() > best_ratio - and (ratio := cr()) > best_ratio): - best_ratio = ratio - best_i = i - if best_i is not None: - #assert not max_pairs or best_i > max_pairs[-1][0], (max_pairs, i, j) - if not max_pairs or best_i > max_pairs[-1][0]: - max_pairs.append((best_i, j)) - if not max_pairs: - yield from self._plain_replace(a, alo, ahi, b, blo, bhi) - - print(max_pairs) - last_i, last_j = alo, blo - for this_i, this_j in max_pairs: - # pump out diffs from before the synch point - yield from self._fancy_helper(a, last_i, this_i, - b, last_j, this_j) + best_i = best_j = None + for j in range(start_j, min(start_j + WINDOW, bhi)): + bj = b[j] + cruncher.set_seq2(bj) + for i in range(start_i, min(start_i + WINDOW, ahi)): + ai = a[i] + if ai == bj: + if best_i is None: + best_i, best_j = i, j + continue + cruncher.set_seq1(ai) + if (crqr() > best_ratio + and cqr() > best_ratio + and (ratio := cr()) > best_ratio): + best_ratio = ratio + best_i, best_j = i, j + break + if best_ratio > cutoff: + # if best_ratio <= cutoff, we don't yet have a + # non-identical pair, so keep searching in the + # window + break + + if best_i is None: + # found nothing to synch on - move to next window + start_i += WINDOW + start_j += WINDOW + continue + + yield from self._fancy_helper(a, dump_i, best_i, + b, dump_j, best_j) + start_i, start_j = best_i + 1, best_j + 1 + dump_i, dump_j = start_i, start_j # do intraline marking on the synch pair - aelt, belt = a[this_i], b[this_j] + aelt, belt = a[best_i], b[best_j] if aelt != belt: # pump out a '-', '?', '+', '?' quad for the synched lines atags = btags = "" @@ -984,17 +988,16 @@ def _fancy_replace(self, a, alo, ahi, b, blo, bhi): else: # the synch pair is identical yield ' ' + aelt - last_i, last_j = this_i + 1, this_j + 1 # pump out diffs from after the last synch point - yield from self._fancy_helper(a, last_i, ahi, - b, last_j, bhi) + yield from self._fancy_helper(a, dump_i, ahi, + b, dump_j, bhi) def _fancy_helper(self, a, alo, ahi, b, blo, bhi): g = [] if alo < ahi: if blo < bhi: - g = self._fancy_replace(a, alo, ahi, b, blo, bhi) + g = self._plain_replace(a, alo, ahi, b, blo, bhi) else: g = self._dump('-', a, alo, ahi) elif blo < bhi: From ff389778cd120599b4acb9bcf99d6512212d0330 Mon Sep 17 00:00:00 2001 From: "blurb-it[bot]" <43283697+blurb-it[bot]@users.noreply.github.com> Date: Fri, 24 May 2024 04:05:40 +0000 Subject: [PATCH 3/7] =?UTF-8?q?=F0=9F=93=9C=F0=9F=A4=96=20Added=20by=20blu?= =?UTF-8?q?rb=5Fit.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../next/Library/2024-05-24-04-05-37.gh-issue-119105.aDSRFn.rst | 1 + 1 file changed, 1 insertion(+) create mode 100644 Misc/NEWS.d/next/Library/2024-05-24-04-05-37.gh-issue-119105.aDSRFn.rst diff --git a/Misc/NEWS.d/next/Library/2024-05-24-04-05-37.gh-issue-119105.aDSRFn.rst b/Misc/NEWS.d/next/Library/2024-05-24-04-05-37.gh-issue-119105.aDSRFn.rst new file mode 100644 index 00000000000000..cb68c1f4af5c24 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2024-05-24-04-05-37.gh-issue-119105.aDSRFn.rst @@ -0,0 +1 @@ +``difflib``'s ``DIffer.compare()`` (and so also ``ndiff``) can no longer be provoked into cubic-time behavior, or into unbounded recursion, and should generally be faster in ordinary cases too. Results may change in some cases, although that should be rare. From 1a0426d06bca312f7793b9f6d34cbf61d97d9d91 Mon Sep 17 00:00:00 2001 From: Tim Peters Date: Thu, 23 May 2024 23:49:30 -0500 Subject: [PATCH 4/7] move a line; add some comments --- Lib/difflib.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/Lib/difflib.py b/Lib/difflib.py index ce12220c6686c2..441fff8a38d6a3 100644 --- a/Lib/difflib.py +++ b/Lib/difflib.py @@ -918,11 +918,11 @@ def _fancy_replace(self, a, alo, ahi, b, blo, bhi): # entirely. cutoff = 0.74999 cruncher = SequenceMatcher(self.charjunk) - - WINDOW = 10 crqr = cruncher.real_quick_ratio cqr = cruncher.quick_ratio cr = cruncher.ratio + + WINDOW = 10 dump_i, dump_j = alo, blo start_i, start_j = alo, blo while start_i < ahi and start_j < bhi: @@ -960,6 +960,7 @@ def _fancy_replace(self, a, alo, ahi, b, blo, bhi): start_j += WINDOW continue + # pump out straight replace from before this synch pair yield from self._fancy_helper(a, dump_i, best_i, b, dump_j, best_j) start_i, start_j = best_i + 1, best_j + 1 @@ -989,7 +990,7 @@ def _fancy_replace(self, a, alo, ahi, b, blo, bhi): # the synch pair is identical yield ' ' + aelt - # pump out diffs from after the last synch point + # pump out straight replace from after the last synch pair yield from self._fancy_helper(a, dump_i, ahi, b, dump_j, bhi) From c187ce140a0bb4734ebc7244efe2c69fe8b064a1 Mon Sep 17 00:00:00 2001 From: Tim Peters Date: Fri, 24 May 2024 02:28:43 -0500 Subject: [PATCH 5/7] Fix the windowing code to be symmetric. Stop tresting junk lines specially - if we synch on junk, so be it - it almost never occurs in real life anyway. --- Lib/difflib.py | 57 ++++++++++++++++++++------------------------------ 1 file changed, 23 insertions(+), 34 deletions(-) diff --git a/Lib/difflib.py b/Lib/difflib.py index 441fff8a38d6a3..a8d769a9cc88d7 100644 --- a/Lib/difflib.py +++ b/Lib/difflib.py @@ -923,48 +923,35 @@ def _fancy_replace(self, a, alo, ahi, b, blo, bhi): cr = cruncher.ratio WINDOW = 10 - dump_i, dump_j = alo, blo - start_i, start_j = alo, blo - while start_i < ahi and start_j < bhi: - # Search for the pair in the next WINDOW i's and j's that - # scores better than `cutoff` without being identical - # (identical lines must be junk lines, & we don't want to - # synch up on junk -- unless we have to) + best_i = best_j = None + dump_i, dump_j = alo, blo # smallest indices not yet resolved + for j in range(blo, bhi): + bj = b[j] + cruncher.set_seq2(bj) + # Search the corresponding i's within WINDOW for rhe highest + # ratio greater than `cutoff`. + aequiv = alo + j - blo + arange = range(max(aequiv - WINDOW, dump_i), + min(aequiv + WINDOW + 1, ahi)) + if not arange: # likely exit if `a` is shorter than `b` + break best_ratio = cutoff - best_i = best_j = None - for j in range(start_j, min(start_j + WINDOW, bhi)): - bj = b[j] - cruncher.set_seq2(bj) - for i in range(start_i, min(start_i + WINDOW, ahi)): - ai = a[i] - if ai == bj: - if best_i is None: - best_i, best_j = i, j - continue - cruncher.set_seq1(ai) - if (crqr() > best_ratio - and cqr() > best_ratio - and (ratio := cr()) > best_ratio): - best_ratio = ratio - best_i, best_j = i, j - break - if best_ratio > cutoff: - # if best_ratio <= cutoff, we don't yet have a - # non-identical pair, so keep searching in the - # window - break + for i in arange: + ai = a[i] + cruncher.set_seq1(ai) + if (crqr() > best_ratio + and cqr() > best_ratio + and cr() > best_ratio): + best_ratio = cr() + best_i, best_j = i, j if best_i is None: - # found nothing to synch on - move to next window - start_i += WINDOW - start_j += WINDOW + # found nothing to synch on yet continue # pump out straight replace from before this synch pair yield from self._fancy_helper(a, dump_i, best_i, b, dump_j, best_j) - start_i, start_j = best_i + 1, best_j + 1 - dump_i, dump_j = start_i, start_j # do intraline marking on the synch pair aelt, belt = a[best_i], b[best_j] if aelt != belt: @@ -989,6 +976,8 @@ def _fancy_replace(self, a, alo, ahi, b, blo, bhi): else: # the synch pair is identical yield ' ' + aelt + dump_i, dump_j = best_i + 1, best_j + 1 + best_i = best_j = None # pump out straight replace from after the last synch pair yield from self._fancy_helper(a, dump_i, ahi, From 25904d62ddecd9a958437b8ffcc0d0807a07382a Mon Sep 17 00:00:00 2001 From: Tim Peters Date: Fri, 24 May 2024 13:24:19 -0500 Subject: [PATCH 6/7] Minor fiddling. --- Lib/difflib.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/Lib/difflib.py b/Lib/difflib.py index a8d769a9cc88d7..0443963b4fd697 100644 --- a/Lib/difflib.py +++ b/Lib/difflib.py @@ -926,27 +926,26 @@ def _fancy_replace(self, a, alo, ahi, b, blo, bhi): best_i = best_j = None dump_i, dump_j = alo, blo # smallest indices not yet resolved for j in range(blo, bhi): - bj = b[j] - cruncher.set_seq2(bj) + cruncher.set_seq2(b[j]) # Search the corresponding i's within WINDOW for rhe highest # ratio greater than `cutoff`. - aequiv = alo + j - blo + aequiv = alo + (j - blo) arange = range(max(aequiv - WINDOW, dump_i), min(aequiv + WINDOW + 1, ahi)) if not arange: # likely exit if `a` is shorter than `b` break best_ratio = cutoff for i in arange: - ai = a[i] - cruncher.set_seq1(ai) + cruncher.set_seq1(a[i]) + # Ordering by cheapest to most expensive ratio is very + # valuable, most often getting out early. if (crqr() > best_ratio and cqr() > best_ratio and cr() > best_ratio): - best_ratio = cr() - best_i, best_j = i, j + best_i, best_j, best_ratio = i, j, cr() if best_i is None: - # found nothing to synch on yet + # found nothing to synch on yet - move to next j continue # pump out straight replace from before this synch pair From d6a8c31b8c4f0a0d2e32356040898687c7b9d8fc Mon Sep 17 00:00:00 2001 From: Tim Peters Date: Fri, 24 May 2024 21:36:16 -0500 Subject: [PATCH 7/7] Beef up the NEWS entry. --- .../next/Library/2024-05-24-04-05-37.gh-issue-119105.aDSRFn.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Misc/NEWS.d/next/Library/2024-05-24-04-05-37.gh-issue-119105.aDSRFn.rst b/Misc/NEWS.d/next/Library/2024-05-24-04-05-37.gh-issue-119105.aDSRFn.rst index cb68c1f4af5c24..3205061a68ce7f 100644 --- a/Misc/NEWS.d/next/Library/2024-05-24-04-05-37.gh-issue-119105.aDSRFn.rst +++ b/Misc/NEWS.d/next/Library/2024-05-24-04-05-37.gh-issue-119105.aDSRFn.rst @@ -1 +1 @@ -``difflib``'s ``DIffer.compare()`` (and so also ``ndiff``) can no longer be provoked into cubic-time behavior, or into unbounded recursion, and should generally be faster in ordinary cases too. Results may change in some cases, although that should be rare. +``difflib``'s ``DIffer.compare()`` (and so also ``ndiff``) can no longer be provoked into cubic-time behavior, or into unbounded recursion, and should generally be faster in ordinary cases too. Results may change in some cases, although that should be rare. Correctness of diffs is not affected. Some similar lines far apart may be reported as deleting one and adding the other, where before they were displayed on adjacent output lines with markup showing the intraline differences.