diff --git a/Lib/difflib.py b/Lib/difflib.py index 0b14d3c779..ba0b256969 100644 --- a/Lib/difflib.py +++ b/Lib/difflib.py @@ -62,7 +62,7 @@ class SequenceMatcher: notion, pairing up elements that appear uniquely in each sequence. That, and the method here, appear to yield more intuitive difference reports than does diff. This method appears to be the least vulnerable - to synching up on blocks of "junk lines", though (like blank lines in + to syncing up on blocks of "junk lines", though (like blank lines in ordinary text files, or maybe "

" lines in HTML files). That may be because this is the only method of the 3 that has a *concept* of "junk" . @@ -115,38 +115,6 @@ class SequenceMatcher: case. SequenceMatcher is quadratic time for the worst case and has expected-case behavior dependent in a complicated way on how many elements the sequences have in common; best case time is linear. - - Methods: - - __init__(isjunk=None, a='', b='') - Construct a SequenceMatcher. - - set_seqs(a, b) - Set the two sequences to be compared. - - set_seq1(a) - Set the first sequence to be compared. - - set_seq2(b) - Set the second sequence to be compared. - - find_longest_match(alo, ahi, blo, bhi) - Find longest matching block in a[alo:ahi] and b[blo:bhi]. - - get_matching_blocks() - Return list of triples describing matching subsequences. - - get_opcodes() - Return list of 5-tuples describing how to turn a into b. - - ratio() - Return a measure of the sequences' similarity (float in [0,1]). - - quick_ratio() - Return an upper bound on .ratio() relatively quickly. - - real_quick_ratio() - Return an upper bound on ratio() very quickly. """ def __init__(self, isjunk=None, a='', b='', autojunk=True): @@ -334,9 +302,11 @@ def __chain_b(self): for elt in popular: # ditto; as fast for 1% deletion del b2j[elt] - def find_longest_match(self, alo, ahi, blo, bhi): + def find_longest_match(self, alo=0, ahi=None, blo=0, bhi=None): """Find longest matching block in a[alo:ahi] and b[blo:bhi]. + By default it will find the longest match in the entirety of a and b. + If isjunk is not defined: Return (i,j,k) such that a[i:i+k] is equal to b[j:j+k], where @@ -391,6 +361,10 @@ def find_longest_match(self, alo, ahi, blo, bhi): # the unique 'b's and then matching the first two 'a's. a, b, b2j, isbjunk = self.a, self.b, self.b2j, self.bjunk.__contains__ + if ahi is None: + ahi = len(a) + if bhi is None: + bhi = len(b) besti, bestj, bestsize = alo, blo, 0 # find longest junk-free match # during an iteration of the loop, j2len[j] = length of longest @@ -688,6 +662,7 @@ def real_quick_ratio(self): __class_getitem__ = classmethod(GenericAlias) + def get_close_matches(word, possibilities, n=3, cutoff=0.6): """Use SequenceMatcher to return list of the best "good enough" matches. @@ -830,14 +805,6 @@ class Differ: + 4. Complicated is better than complex. ? ++++ ^ ^ + 5. Flat is better than nested. - - Methods: - - __init__(linejunk=None, charjunk=None) - Construct a text differencer, with optional filters. - - compare(a, b) - Compare two sequences of lines; generate the resulting delta. """ def __init__(self, linejunk=None, charjunk=None): @@ -870,7 +837,7 @@ def compare(self, a, b): Each sequence must contain individual single-line strings ending with newlines. Such sequences can be obtained from the `readlines()` method of file-like objects. The delta generated also consists of newline- - terminated strings, ready to be printed as-is via the writeline() + terminated strings, ready to be printed as-is via the writelines() method of a file-like object. Example: diff --git a/Lib/test/test_difflib.py b/Lib/test/test_difflib.py index 208208fd68..ed41074f7e 100644 --- a/Lib/test/test_difflib.py +++ b/Lib/test/test_difflib.py @@ -1,5 +1,5 @@ import difflib -from test.support import run_unittest, findfile +from test.support import findfile import unittest import doctest import sys @@ -241,7 +241,7 @@ def test_html_diff(self): #with open('test_difflib_expect.html','w') as fp: # fp.write(actual) - with open(findfile('test_difflib_expect.html')) as fp: + with open(findfile('test_difflib_expect.html'), encoding="utf-8") as fp: self.assertEqual(actual, fp.read()) def test_recursion_limit(self): @@ -503,12 +503,60 @@ def test_is_character_junk_false(self): for char in ['a', '#', '\n', '\f', '\r', '\v']: self.assertFalse(difflib.IS_CHARACTER_JUNK(char), repr(char)) -def test_main(): +class TestFindLongest(unittest.TestCase): + def longer_match_exists(self, a, b, n): + return any(b_part in a for b_part in + [b[i:i + n + 1] for i in range(0, len(b) - n - 1)]) + + def test_default_args(self): + a = 'foo bar' + b = 'foo baz bar' + sm = difflib.SequenceMatcher(a=a, b=b) + match = sm.find_longest_match() + self.assertEqual(match.a, 0) + self.assertEqual(match.b, 0) + self.assertEqual(match.size, 6) + self.assertEqual(a[match.a: match.a + match.size], + b[match.b: match.b + match.size]) + self.assertFalse(self.longer_match_exists(a, b, match.size)) + + match = sm.find_longest_match(alo=2, blo=4) + self.assertEqual(match.a, 3) + self.assertEqual(match.b, 7) + self.assertEqual(match.size, 4) + self.assertEqual(a[match.a: match.a + match.size], + b[match.b: match.b + match.size]) + self.assertFalse(self.longer_match_exists(a[2:], b[4:], match.size)) + + match = sm.find_longest_match(bhi=5, blo=1) + self.assertEqual(match.a, 1) + self.assertEqual(match.b, 1) + self.assertEqual(match.size, 4) + self.assertEqual(a[match.a: match.a + match.size], + b[match.b: match.b + match.size]) + self.assertFalse(self.longer_match_exists(a, b[1:5], match.size)) + + def test_longest_match_with_popular_chars(self): + a = 'dabcd' + b = 'd'*100 + 'abc' + 'd'*100 # length over 200 so popular used + sm = difflib.SequenceMatcher(a=a, b=b) + match = sm.find_longest_match(0, len(a), 0, len(b)) + self.assertEqual(match.a, 0) + self.assertEqual(match.b, 99) + self.assertEqual(match.size, 5) + self.assertEqual(a[match.a: match.a + match.size], + b[match.b: match.b + match.size]) + self.assertFalse(self.longer_match_exists(a, b, match.size)) + + +def setUpModule(): difflib.HtmlDiff._default_prefix = 0 - Doctests = doctest.DocTestSuite(difflib) - run_unittest( - TestWithAscii, TestAutojunk, TestSFpatches, TestSFbugs, - TestOutputFormat, TestBytes, TestJunkAPIs, Doctests) + + +def load_tests(loader, tests, pattern): + tests.addTest(doctest.DocTestSuite(difflib)) + return tests + if __name__ == '__main__': - test_main() + unittest.main()