From 5f35b0dbb9ebec63d805d230e01b4a0676e613af Mon Sep 17 00:00:00 2001
From: Tian Gao
Date: Tue, 18 Jul 2023 15:37:36 -0700
Subject: [PATCH 1/2] Improve difflib performance with a cache

---
 Lib/difflib.py | 32 +++++++++++++++++++++++---------
 1 file changed, 23 insertions(+), 9 deletions(-)

diff --git a/Lib/difflib.py b/Lib/difflib.py
index ba0b256969ebff..a89393580bd132 100644
--- a/Lib/difflib.py
+++ b/Lib/difflib.py
@@ -371,20 +371,34 @@ def find_longest_match(self, alo=0, ahi=None, blo=0, bhi=None):
         # junk-free match ending with a[i-1] and b[j]
         j2len = {}
         nothing = []
+        b2jcache = {}
         for i in range(alo, ahi):
             # look at all instances of a[i] in b; note that because
             # b2j has no junk keys, the loop is skipped if a[i] is junk
             j2lenget = j2len.get
             newj2len = {}
-            for j in b2j.get(a[i], nothing):
-                # a[i] matches b[j]
-                if j < blo:
-                    continue
-                if j >= bhi:
-                    break
-                k = newj2len[j] = j2lenget(j-1, 0) + 1
-                if k > bestsize:
-                    besti, bestj, bestsize = i-k+1, j-k+1, k
+            if a[i] not in b2jcache:
+                cache = []
+                for j in b2j.get(a[i], nothing):
+                    # a[i] matches b[j]
+                    if j < blo:
+                        continue
+                    if j >= bhi:
+                        break
+                    cache.append(j)
+                    k = newj2len[j] = j2lenget(j-1, 0) + 1
+                    if k > bestsize:
+                        besti, bestj, bestsize = i-k+1, j-k+1, k
+                b2jcache[a[i]] = cache
+            else:
+                for j in b2jcache[a[i]]:
+                    # a[i] matches b[j]
+                    if j-1 in j2len:
+                        k = newj2len[j] = j2len[j-1] + 1
+                        if k > bestsize:
+                            besti, bestj, bestsize = i-k+1, j-k+1, k
+                    else:
+                        k = 1
             j2len = newj2len
 
         # Extend the best by non-junk elements on each end.
From 0396f814eb56defa9adcc5433cc48ca08bd1f2c5 Mon Sep 17 00:00:00 2001
From: "blurb-it[bot]" <43283697+blurb-it[bot]@users.noreply.github.com>
Date: Tue, 18 Jul 2023 23:45:20 +0000
Subject: [PATCH 2/2] =?UTF-8?q?=F0=9F=93=9C=F0=9F=A4=96=20Added=20by=20blu?=
 =?UTF-8?q?rb=5Fit.?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../next/Library/2023-07-18-23-45-19.gh-issue-106865.dxO3Qr.rst | 1 +
 1 file changed, 1 insertion(+)
 create mode 100644 Misc/NEWS.d/next/Library/2023-07-18-23-45-19.gh-issue-106865.dxO3Qr.rst

diff --git a/Misc/NEWS.d/next/Library/2023-07-18-23-45-19.gh-issue-106865.dxO3Qr.rst b/Misc/NEWS.d/next/Library/2023-07-18-23-45-19.gh-issue-106865.dxO3Qr.rst
new file mode 100644
index 00000000000000..3cece781a0a7af
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2023-07-18-23-45-19.gh-issue-106865.dxO3Qr.rst
@@ -0,0 +1 @@
+Improve performance of :func:`difflib.SequenceMatcher.find_longest_match` with a cache on hot path