#! /usr/bin/env python

"""
Module difflib -- helpers for computing deltas between objects.

Function get_close_matches(word, possibilities, n=3, cutoff=0.6):
    Use SequenceMatcher to return list of the best "good enough" matches.

Function ndiff(a, b):
    Return a delta: the difference between `a` and `b` (lists of strings).

Function restore(delta, which):
    Return one of the two sequences that generated an ndiff delta.

Class SequenceMatcher:
    A flexible class for comparing pairs of sequences of any type.

Class Differ:
    For producing human-readable deltas from sequences of lines of text.
"""

__all__ = ['get_close_matches', 'ndiff', 'restore', 'SequenceMatcher',
           'Differ']

TRACE = 0

class SequenceMatcher:

    """
    SequenceMatcher is a flexible class for comparing pairs of sequences of
    any type, so long as the sequence elements are hashable.  The basic
    algorithm predates, and is a little fancier than, an algorithm
    published in the late 1980's by Ratcliff and Obershelp under the
    hyperbolic name "gestalt pattern matching".  The basic idea is to find
    the longest contiguous matching subsequence that contains no "junk"
    elements (R-O doesn't address junk).  The same idea is then applied
    recursively to the pieces of the sequences to the left and to the right
    of the matching subsequence.  This does not yield minimal edit
    sequences, but does tend to yield matches that "look right" to people.

    SequenceMatcher tries to compute a "human-friendly diff" between two
    sequences.  Unlike e.g. UNIX(tm) diff, the fundamental notion is the
    longest *contiguous* & junk-free matching subsequence.  That's what
    catches people's eyes.  The Windows(tm) windiff has another interesting
    notion, pairing up elements that appear uniquely in each sequence.
    That, and the method here, appear to yield more intuitive difference
    reports than does diff.  This method appears to be the least vulnerable
    to synching up on blocks of "junk lines", though (like blank lines in
    ordinary text files, or maybe "<P>" lines in HTML files).  That may be
    because this is the only method of the 3 that has a *concept* of
    "junk" <wink>.

    Example, comparing two strings, and considering blanks to be "junk":

    >>> s = SequenceMatcher(lambda x: x == " ",
    ...                     "private Thread currentThread;",
    ...                     "private volatile Thread currentThread;")
    >>>

    .ratio() returns a float in [0, 1], measuring the "similarity" of the
    sequences.  As a rule of thumb, a .ratio() value over 0.6 means the
    sequences are close matches:

    >>> print round(s.ratio(), 3)
    0.866
    >>>

    If you're only interested in where the sequences match,
    .get_matching_blocks() is handy:

    >>> for block in s.get_matching_blocks():
    ...     print "a[%d] and b[%d] match for %d elements" % block
    a[0] and b[0] match for 8 elements
    a[8] and b[17] match for 6 elements
    a[14] and b[23] match for 15 elements
    a[29] and b[38] match for 0 elements

    Note that the last tuple returned by .get_matching_blocks() is always a
    dummy, (len(a), len(b), 0), and this is the only case in which the last
    tuple element (number of elements matched) is 0.

    If you want to know how to change the first sequence into the second,
    use .get_opcodes():

    >>> for opcode in s.get_opcodes():
    ...     print "%6s a[%d:%d] b[%d:%d]" % opcode
     equal a[0:8] b[0:8]
    insert a[8:8] b[8:17]
     equal a[8:14] b[17:23]
     equal a[14:29] b[23:38]

    See the Differ class for a fancy human-friendly file differencer, which
    uses SequenceMatcher both to compare sequences of lines, and to compare
    sequences of characters within similar (near-matching) lines.

    See also function get_close_matches() in this module, which shows how
    simple code building on SequenceMatcher can be used to do useful work.

    Timing:  Basic R-O is cubic time worst case and quadratic time expected
    case.  SequenceMatcher is quadratic time for the worst case and has
    expected-case behavior dependent in a complicated way on how many
    elements the sequences have in common; best case time is linear.

    Methods:

    __init__(isjunk=None, a='', b='')
        Construct a SequenceMatcher.

    set_seqs(a, b)
        Set the two sequences to be compared.

    set_seq1(a)
        Set the first sequence to be compared.

    set_seq2(b)
        Set the second sequence to be compared.

    find_longest_match(alo, ahi, blo, bhi)
        Find longest matching block in a[alo:ahi] and b[blo:bhi].

    get_matching_blocks()
        Return list of triples describing matching subsequences.

    get_opcodes()
        Return list of 5-tuples describing how to turn a into b.

    ratio()
        Return a measure of the sequences' similarity (float in [0,1]).

    quick_ratio()
        Return an upper bound on .ratio() relatively quickly.

    real_quick_ratio()
        Return an upper bound on ratio() very quickly.
    """

    def __init__(self, isjunk=None, a='', b=''):
        """Construct a SequenceMatcher.

        Optional arg isjunk is None (the default), or a one-argument
        function that takes a sequence element and returns true iff the
        element is junk.  None is equivalent to passing "lambda x: 0", i.e.
        no elements are considered to be junk.  For example, pass
            lambda x: x in " \\t"
        if you're comparing lines as sequences of characters, and don't
        want to synch up on blanks or hard tabs.

        Optional arg a is the first of two sequences to be compared.  By
        default, an empty string.  The elements of a must be hashable.  See
        also .set_seqs() and .set_seq1().

        Optional arg b is the second of two sequences to be compared.  By
        default, an empty string.  The elements of b must be hashable.  See
        also .set_seqs() and .set_seq2().
        """

        # Members:
        # a
        #      first sequence
        # b
        #      second sequence; differences are computed as "what do
        #      we need to do to 'a' to change it into 'b'?"
        # b2j
        #      for x in b, b2j[x] is a list of the indices (into b)
        #      at which x appears; junk elements do not appear
        # b2jhas
        #      b2j.has_key
        # fullbcount
        #      for x in b, fullbcount[x] == the number of times x
        #      appears in b; only materialized if really needed (used
        #      only for computing quick_ratio())
        # matching_blocks
        #      a list of (i, j, k) triples, where a[i:i+k] == b[j:j+k];
        #      ascending & non-overlapping in i and in j; terminated by
        #      a dummy (len(a), len(b), 0) sentinel
        # opcodes
        #      a list of (tag, i1, i2, j1, j2) tuples, where tag is
        #      one of
        #          'replace'   a[i1:i2] should be replaced by b[j1:j2]
        #          'delete'    a[i1:i2] should be deleted
        #          'insert'    b[j1:j2] should be inserted
        #          'equal'     a[i1:i2] == b[j1:j2]
        # isjunk
        #      a user-supplied function taking a sequence element and
        #      returning true iff the element is "junk" -- this has
        #      subtle but helpful effects on the algorithm, which I'll
        #      get around to writing up someday <0.9 wink>.
        #      DON'T USE!  Only __chain_b uses this.  Use isbjunk.
        # isbjunk
        #      for x in b, isbjunk(x) == isjunk(x) but much faster;
        #      it's really the has_key method of a hidden dict.
        #      DOES NOT WORK for x in a!

        self.isjunk = isjunk
        self.a = self.b = None
        self.set_seqs(a, b)

    def set_seqs(self, a, b):
        """Set the two sequences to be compared.

        >>> s = SequenceMatcher()
        >>> s.set_seqs("abcd", "bcde")
        >>> s.ratio()
        0.75
        """

        self.set_seq1(a)
        self.set_seq2(b)

    def set_seq1(self, a):
        """Set the first sequence to be compared.

        The second sequence to be compared is not changed.

        >>> s = SequenceMatcher(None, "abcd", "bcde")
        >>> s.ratio()
        0.75
        >>> s.set_seq1("bcde")
        >>> s.ratio()
        1.0

        SequenceMatcher computes and caches detailed information about the
        second sequence, so if you want to compare one sequence S against
        many sequences, use .set_seq2(S) once and call .set_seq1(x)
        repeatedly for each of the other sequences.

        See also set_seqs() and set_seq2().
        """

        if a is self.a:
            return
        self.a = a
        self.matching_blocks = self.opcodes = None

    def set_seq2(self, b):
        """Set the second sequence to be compared.

        The first sequence to be compared is not changed.

        >>> s = SequenceMatcher(None, "abcd", "bcde")
        >>> s.ratio()
        0.75
        >>> s.set_seq2("abcd")
        >>> s.ratio()
        1.0

        SequenceMatcher computes and caches detailed information about the
        second sequence, so if you want to compare one sequence S against
        many sequences, use .set_seq2(S) once and call .set_seq1(x)
        repeatedly for each of the other sequences.

        See also set_seqs() and set_seq1().
        """

        if b is self.b:
            return
        self.b = b
        self.matching_blocks = self.opcodes = None
        self.fullbcount = None
        self.__chain_b()

    # For each element x in b, set b2j[x] to a list of the indices in
    # b where x appears; the indices are in increasing order; note that
    # the number of times x appears in b is len(b2j[x]) ...
    # when self.isjunk is defined, junk elements don't show up in this
    # map at all, which stops the central find_longest_match method
    # from starting any matching block at a junk element ...
    # also creates the fast isbjunk function ...
    # note that this is only called when b changes; so for cross-product
    # kinds of matches, it's best to call set_seq2 once, then set_seq1
    # repeatedly

    def __chain_b(self):
        # Because isjunk is a user-defined (not C) function, and we test
        # for junk a LOT, it's important to minimize the number of calls.
        # Before the tricks described here, __chain_b was by far the most
        # time-consuming routine in the whole module!  If anyone sees
        # Jim Roskind, thank him again for profile.py -- I never would
        # have guessed that.
        # The first trick is to build b2j ignoring the possibility
        # of junk.  I.e., we don't call isjunk at all yet.  Throwing
        # out the junk later is much cheaper than building b2j "right"
        # from the start.
        b = self.b
        self.b2j = b2j = {}
        self.b2jhas = b2jhas = b2j.has_key
        for i in xrange(len(b)):
            elt = b[i]
            if b2jhas(elt):
                b2j[elt].append(i)
            else:
                b2j[elt] = [i]

        # Now b2j.keys() contains elements uniquely, and especially when
        # the sequence is a string, that's usually a good deal smaller
        # than len(string).  The difference is the number of isjunk calls
        # saved.
        isjunk, junkdict = self.isjunk, {}
        if isjunk:
            for elt in b2j.keys():
                if isjunk(elt):
                    junkdict[elt] = 1   # value irrelevant; it's a set
                    del b2j[elt]

        # Now for x in b, isjunk(x) == junkdict.has_key(x), but the
        # latter is much faster.  Note too that while there may be a
        # lot of junk in the sequence, the number of *unique* junk
        # elements is probably small.  So the memory burden of keeping
        # this dict alive is likely trivial compared to the size of b2j.
        self.isbjunk = junkdict.has_key

    def find_longest_match(self, alo, ahi, blo, bhi):
        """Find longest matching block in a[alo:ahi] and b[blo:bhi].

        If isjunk is not defined:

        Return (i,j,k) such that a[i:i+k] is equal to b[j:j+k], where
            alo <= i <= i+k <= ahi
            blo <= j <= j+k <= bhi
        and for all (i',j',k') meeting those conditions,
            k >= k'
            i <= i'
            and if i == i', j <= j'

        In other words, of all maximal matching blocks, return one that
        starts earliest in a, and of all those maximal matching blocks that
        start earliest in a, return the one that starts earliest in b.

        >>> s = SequenceMatcher(None, " abcd", "abcd abcd")
        >>> s.find_longest_match(0, 5, 0, 9)
        (0, 4, 5)

        If isjunk is defined, first the longest matching block is
        determined as above, but with the additional restriction that no
        junk element appears in the block.  Then that block is extended as
        far as possible by matching (only) junk elements on both sides.  So
        the resulting block never matches on junk except as identical junk
        happens to be adjacent to an "interesting" match.

        Here's the same example as before, but considering blanks to be
        junk.  That prevents " abcd" from matching the " abcd" at the tail
        end of the second sequence directly.  Instead only the "abcd" can
        match, and matches the leftmost "abcd" in the second sequence:

        >>> s = SequenceMatcher(lambda x: x==" ", " abcd", "abcd abcd")
        >>> s.find_longest_match(0, 5, 0, 9)
        (1, 0, 4)

        If no blocks match, return (alo, blo, 0).

        >>> s = SequenceMatcher(None, "ab", "c")
        >>> s.find_longest_match(0, 2, 0, 1)
        (0, 0, 0)
        """

        # CAUTION:  stripping common prefix or suffix would be incorrect.
        # E.g.,
        #    ab
        #    acab
        # Longest matching block is "ab", but if common prefix is
        # stripped, it's "a" (tied with "b").  UNIX(tm) diff does so
        # strip, so ends up claiming that ab is changed to acab by
        # inserting "ca" in the middle.  That's minimal but unintuitive:
        # "it's obvious" that someone inserted "ac" at the front.
        # Windiff ends up at the same place as diff, but by pairing up
        # the unique 'b's and then matching the first two 'a's.

        a, b, b2j, isbjunk = self.a, self.b, self.b2j, self.isbjunk
        besti, bestj, bestsize = alo, blo, 0
        # find longest junk-free match
        # during an iteration of the loop, j2len[j] = length of longest
        # junk-free match ending with a[i-1] and b[j]
        j2len = {}
        nothing = []
        for i in xrange(alo, ahi):
            # look at all instances of a[i] in b; note that because
            # b2j has no junk keys, the loop is skipped if a[i] is junk
            j2lenget = j2len.get
            newj2len = {}
            for j in b2j.get(a[i], nothing):
                # a[i] matches b[j]
                if j < blo:
                    continue
                if j >= bhi:
                    break
                k = newj2len[j] = j2lenget(j-1, 0) + 1
                if k > bestsize:
                    besti, bestj, bestsize = i-k+1, j-k+1, k
            j2len = newj2len

        # Now that we have a wholly interesting match (albeit possibly
        # empty!), we may as well suck up the matching junk on each
        # side of it too.  Can't think of a good reason not to, and it
        # saves the (possibly considerable) post-processing expense of
        # figuring out what to do with it.  In the case of an empty
        # interesting match, this is clearly the right thing to do,
        # because no other kind of match is possible in the regions.
        while besti > alo and bestj > blo and \
              isbjunk(b[bestj-1]) and \
              a[besti-1] == b[bestj-1]:
            besti, bestj, bestsize = besti-1, bestj-1, bestsize+1
        while besti+bestsize < ahi and bestj+bestsize < bhi and \
              isbjunk(b[bestj+bestsize]) and \
              a[besti+bestsize] == b[bestj+bestsize]:
            bestsize = bestsize + 1

        if TRACE:
            print "find_longest_match", alo, ahi, blo, bhi
            print "    returns", besti, bestj, bestsize
        return besti, bestj, bestsize

    def get_matching_blocks(self):
        """Return list of triples describing matching subsequences.

        Each triple is of the form (i, j, n), and means that
        a[i:i+n] == b[j:j+n].  The triples are monotonically increasing in
        i and in j.

        The last triple is a dummy, (len(a), len(b), 0), and is the only
        triple with n==0.

        >>> s = SequenceMatcher(None, "abxcd", "abcd")
        >>> s.get_matching_blocks()
        [(0, 0, 2), (3, 2, 2), (5, 4, 0)]
        """

        if self.matching_blocks is not None:
            return self.matching_blocks
        self.matching_blocks = []
        la, lb = len(self.a), len(self.b)
        self.__helper(0, la, 0, lb, self.matching_blocks)
        self.matching_blocks.append( (la, lb, 0) )
        if TRACE:
            print '*** matching blocks', self.matching_blocks
        return self.matching_blocks

    # builds list of matching blocks covering a[alo:ahi] and
    # b[blo:bhi], appending them in increasing order to answer

    def __helper(self, alo, ahi, blo, bhi, answer):
        i, j, k = x = self.find_longest_match(alo, ahi, blo, bhi)
        # a[alo:i] vs b[blo:j] unknown
        # a[i:i+k] same as b[j:j+k]
        # a[i+k:ahi] vs b[j+k:bhi] unknown
        if k:
            if alo < i and blo < j:
                self.__helper(alo, i, blo, j, answer)
            answer.append(x)
            if i+k < ahi and j+k < bhi:
                self.__helper(i+k, ahi, j+k, bhi, answer)

    def get_opcodes(self):
        """Return list of 5-tuples describing how to turn a into b.

        Each tuple is of the form (tag, i1, i2, j1, j2).  The first tuple
        has i1 == j1 == 0, and remaining tuples have i1 == the i2 from the
        tuple preceding it, and likewise for j1 == the previous j2.

        The tags are strings, with these meanings:

        'replace':  a[i1:i2] should be replaced by b[j1:j2]
        'delete':   a[i1:i2] should be deleted.
                    Note that j1==j2 in this case.
        'insert':   b[j1:j2] should be inserted at a[i1:i1].
                    Note that i1==i2 in this case.
        'equal':    a[i1:i2] == b[j1:j2]

        >>> a = "qabxcd"
        >>> b = "abycdf"
        >>> s = SequenceMatcher(None, a, b)
        >>> for tag, i1, i2, j1, j2 in s.get_opcodes():
        ...    print ("%7s a[%d:%d] (%s) b[%d:%d] (%s)" %
        ...           (tag, i1, i2, a[i1:i2], j1, j2, b[j1:j2]))
         delete a[0:1] (q) b[0:0] ()
          equal a[1:3] (ab) b[0:2] (ab)
        replace a[3:4] (x) b[2:3] (y)
          equal a[4:6] (cd) b[3:5] (cd)
         insert a[6:6] () b[5:6] (f)
        """

        if self.opcodes is not None:
            return self.opcodes
        i = j = 0
        self.opcodes = answer = []
        for ai, bj, size in self.get_matching_blocks():
            # invariant:  we've pumped out correct diffs to change
            # a[:i] into b[:j], and the next matching block is
            # a[ai:ai+size] == b[bj:bj+size].  So we need to pump
            # out a diff to change a[i:ai] into b[j:bj], pump out
            # the matching block, and move (i,j) beyond the match
            tag = ''
            if i < ai and j < bj:
                tag = 'replace'
            elif i < ai:
                tag = 'delete'
            elif j < bj:
                tag = 'insert'
            if tag:
                answer.append( (tag, i, ai, j, bj) )
            i, j = ai+size, bj+size
            # the list of matching blocks is terminated by a
            # sentinel with size 0
            if size:
                answer.append( ('equal', ai, i, bj, j) )
        return answer

    def ratio(self):
        """Return a measure of the sequences' similarity (float in [0,1]).

        Where T is the total number of elements in both sequences, and
        M is the number of matches, this is 2.0*M / T.
        Note that this is 1 if the sequences are identical, and 0 if
        they have nothing in common.

        .ratio() is expensive to compute if you haven't already computed
        .get_matching_blocks() or .get_opcodes(), in which case you may
        want to try .quick_ratio() or .real_quick_ratio() first to get an
        upper bound.

        >>> s = SequenceMatcher(None, "abcd", "bcde")
        >>> s.ratio()
        0.75
        >>> s.quick_ratio()
        0.75
        >>> s.real_quick_ratio()
        1.0
        """

        matches = reduce(lambda sum, triple: sum + triple[-1],
                         self.get_matching_blocks(), 0)
        return 2.0 * matches / (len(self.a) + len(self.b))

    def quick_ratio(self):
        """Return an upper bound on ratio() relatively quickly.

        This isn't defined beyond that it is an upper bound on .ratio(), and
        is faster to compute.
        """

        # viewing a and b as multisets, set matches to the cardinality
        # of their intersection; this counts the number of matches
        # without regard to order, so is clearly an upper bound
        if self.fullbcount is None:
            self.fullbcount = fullbcount = {}
            for elt in self.b:
                fullbcount[elt] = fullbcount.get(elt, 0) + 1
        fullbcount = self.fullbcount
        # avail[x] is the number of times x appears in 'b' less the
        # number of times we've seen it in 'a' so far ... kinda
        avail = {}
        availhas, matches = avail.has_key, 0
        for elt in self.a:
            if availhas(elt):
                numb = avail[elt]
            else:
                numb = fullbcount.get(elt, 0)
            avail[elt] = numb - 1
            if numb > 0:
                matches = matches + 1
        return 2.0 * matches / (len(self.a) + len(self.b))

    def real_quick_ratio(self):
        """Return an upper bound on ratio() very quickly.

        This isn't defined beyond that it is an upper bound on .ratio(), and
        is faster to compute than either .ratio() or .quick_ratio().
        """

        la, lb = len(self.a), len(self.b)
        # can't have more matches than the number of elements in the
        # shorter sequence
        return 2.0 * min(la, lb) / (la + lb)
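
# The two sketches below are illustrative additions to this copy of the
# module, not part of the difflib API; the _demo_* names are ours.
# _demo_junk_handling shows the junk machinery documented above: junk
# elements never get keys in b2j, isbjunk() is the fast junk test for
# elements of b, and find_longest_match() refuses to start a match on junk.
def _demo_junk_handling():
    s = SequenceMatcher(lambda x: x == " ", " abcd", "abcd abcd")
    print "b2j:", s.b2j                     # no ' ' key: junk was dropped
    print "isbjunk(' '):", s.isbjunk(" ")   # true, and cheap to call
    print s.find_longest_match(0, 5, 0, 9)  # (1, 0, 4): junk-free "abcd"

# _demo_replay_opcodes rebuilds b from a by replaying get_opcodes(),
# which is exactly the edit script the 5-tuples promise to describe.
def _demo_replay_opcodes(a, b):
    s = SequenceMatcher(None, a, b)
    pieces = []
    for tag, i1, i2, j1, j2 in s.get_opcodes():
        if tag == 'equal':
            pieces.append(a[i1:i2])         # common run; take it from a
        elif tag != 'delete':
            pieces.append(b[j1:j2])         # 'replace'/'insert' come from b
    return ''.join(pieces)                  # == b for string inputs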

def get_close_matches(word, possibilities, n=3, cutoff=0.6):
    """Use SequenceMatcher to return list of the best "good enough" matches.

    word is a sequence for which close matches are desired (typically a
    string).

    possibilities is a list of sequences against which to match word
    (typically a list of strings).

    Optional arg n (default 3) is the maximum number of close matches to
    return.  n must be > 0.

    Optional arg cutoff (default 0.6) is a float in [0, 1].  Possibilities
    that don't score at least that similar to word are ignored.

    The best (no more than n) matches among the possibilities are returned
    in a list, sorted by similarity score, most similar first.

    >>> get_close_matches("appel", ["ape", "apple", "peach", "puppy"])
    ['apple', 'ape']
    >>> import keyword as _keyword
    >>> get_close_matches("wheel", _keyword.kwlist)
    ['while']
    >>> get_close_matches("apple", _keyword.kwlist)
    []
    >>> get_close_matches("accept", _keyword.kwlist)
    ['except']
    """

    if not n > 0:
        raise ValueError("n must be > 0: " + `n`)
    if not 0.0 <= cutoff <= 1.0:
        raise ValueError("cutoff must be in [0.0, 1.0]: " + `cutoff`)
    result = []
    s = SequenceMatcher()
    s.set_seq2(word)
    for x in possibilities:
        s.set_seq1(x)
        if s.real_quick_ratio() >= cutoff and \
           s.quick_ratio() >= cutoff and \
           s.ratio() >= cutoff:
            result.append((s.ratio(), x))
    # Sort by score.
    result.sort()
    # Retain only the best n.
    result = result[-n:]
    # Move best scorer to head of list.
    result.reverse()
    # Strip scores.
    return [x for score, x in result]
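
# Illustrative sketch (ours, not part of the module): the
# cheap-to-expensive cascade that get_close_matches() uses above.
# real_quick_ratio() and quick_ratio() are upper bounds on ratio(), so
# most candidates can be rejected before paying for the exact ratio().
def _demo_ratio_cascade(a, b, cutoff=0.6):
    s = SequenceMatcher(None, a, b)
    if s.real_quick_ratio() < cutoff:    # O(1) bound from the lengths alone
        return 0
    if s.quick_ratio() < cutoff:         # linear-time multiset bound
        return 0
    return s.ratio() >= cutoff           # exact, potentially quadratic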

def _count_leading(line, ch):
    """
    Return number of `ch` characters at the start of `line`.

    Example:

    >>> _count_leading('   abc', ' ')
    3
    """

    i, n = 0, len(line)
    while i < n and line[i] == ch:
        i += 1
    return i

class Differ:
    r"""
    Differ is a class for comparing sequences of lines of text, and
    producing human-readable differences or deltas.  Differ uses
    SequenceMatcher both to compare sequences of lines, and to compare
    sequences of characters within similar (near-matching) lines.

    Each line of a Differ delta begins with a two-letter code:

        '- '    line unique to sequence 1
        '+ '    line unique to sequence 2
        '  '    line common to both sequences
        '? '    line not present in either input sequence

    Lines beginning with '? ' attempt to guide the eye to intraline
    differences, and were not present in either input sequence.  These lines
    can be confusing if the sequences contain tab characters.

    Note that Differ makes no claim to produce a *minimal* diff.  To the
    contrary, minimal diffs are often counter-intuitive, because they synch
    up anywhere possible, sometimes on accidental matches 100 pages apart.
    Restricting synch points to contiguous matches preserves some notion of
    locality, at the occasional cost of producing a longer diff.

    Example: Comparing two texts.

    First we set up the texts, sequences of individual single-line strings
    ending with newlines (such sequences can also be obtained from the
    `readlines()` method of file-like objects):

    >>> text1 = '''  1. Beautiful is better than ugly.
    ...   2. Explicit is better than implicit.
    ...   3. Simple is better than complex.
    ...   4. Complex is better than complicated.
    ... '''.splitlines(1)
    >>> len(text1)
    4
    >>> text1[0][-1]
    '\n'
    >>> text2 = '''  1. Beautiful is better than ugly.
    ...   3.   Simple is better than complex.
    ...   4. Complicated is better than complex.
    ...   5. Flat is better than nested.
    ... '''.splitlines(1)

    Next we instantiate a Differ object:

    >>> d = Differ()

    Note that when instantiating a Differ object we may pass functions to
    filter out line and character 'junk'.  See Differ.__init__ for details.

    Finally, we compare the two:

    >>> result = d.compare(text1, text2)

    'result' is a list of strings, so let's pretty-print it:

    >>> from pprint import pprint as _pprint
    >>> _pprint(result)
    ['    1. Beautiful is better than ugly.\n',
     '-   2. Explicit is better than implicit.\n',
     '-   3. Simple is better than complex.\n',
     '+   3.   Simple is better than complex.\n',
     '?     ++\n',
     '-   4. Complex is better than complicated.\n',
     '?            ^                     ---- ^\n',
     '+   4. Complicated is better than complex.\n',
     '?           ++++ ^                      ^\n',
     '+   5. Flat is better than nested.\n']

    As a single multi-line string it looks like this:

    >>> print ''.join(result),
        1. Beautiful is better than ugly.
    -   2. Explicit is better than implicit.
    -   3. Simple is better than complex.
    +   3.   Simple is better than complex.
    ?     ++
    -   4. Complex is better than complicated.
    ?            ^                     ---- ^
    +   4. Complicated is better than complex.
    ?           ++++ ^                      ^
    +   5. Flat is better than nested.

    Methods:

    __init__(linejunk=None, charjunk=None)
        Construct a text differencer, with optional filters.

    compare(a, b)
        Compare two sequences of lines; return the resulting delta (list).
    """

    def __init__(self, linejunk=None, charjunk=None):
        """
        Construct a text differencer, with optional filters.

        The two optional keyword parameters are for filter functions:

        - `linejunk`: A function that should accept a single string argument,
          and return true iff the string is junk.  The module-level function
          `IS_LINE_JUNK` may be used to filter out lines without visible
          characters, except for at most one splat ('#').

        - `charjunk`: A function that should accept a string of length 1.
          The module-level function `IS_CHARACTER_JUNK` may be used to
          filter out whitespace characters (a blank or tab; **note**: bad
          idea to include newline in this!).
        """

        self.linejunk = linejunk
        self.charjunk = charjunk
        self.results = []

    def compare(self, a, b):
        r"""
        Compare two sequences of lines; return the resulting delta (list).

        Each sequence must contain individual single-line strings ending
        with newlines.  Such sequences can be obtained from the
        `readlines()` method of file-like objects.  The list returned is
        also made up of newline-terminated strings, ready to be used with
        the `writelines()` method of a file-like object.

        Example:

        >>> print ''.join(Differ().compare('one\ntwo\nthree\n'.splitlines(1),
        ...                                'ore\ntree\nemu\n'.splitlines(1))),
        - one
        ?  ^
        + ore
        ?  ^
        - two
        - three
        ?  -
        + tree
        + emu
        """

        cruncher = SequenceMatcher(self.linejunk, a, b)
        for tag, alo, ahi, blo, bhi in cruncher.get_opcodes():
            if tag == 'replace':
                self._fancy_replace(a, alo, ahi, b, blo, bhi)
            elif tag == 'delete':
                self._dump('-', a, alo, ahi)
            elif tag == 'insert':
                self._dump('+', b, blo, bhi)
            elif tag == 'equal':
                self._dump(' ', a, alo, ahi)
            else:
                raise ValueError, 'unknown tag ' + `tag`
        results = self.results
        self.results = []
        return results

    def _dump(self, tag, x, lo, hi):
        """Store comparison results for a same-tagged range."""
        for i in xrange(lo, hi):
            self.results.append('%s %s' % (tag, x[i]))

    def _plain_replace(self, a, alo, ahi, b, blo, bhi):
        assert alo < ahi and blo < bhi
        # dump the shorter block first -- reduces the burden on short-term
        # memory if the blocks are of very different sizes
        if bhi - blo < ahi - alo:
            self._dump('+', b, blo, bhi)
            self._dump('-', a, alo, ahi)
        else:
            self._dump('-', a, alo, ahi)
            self._dump('+', b, blo, bhi)

    def _fancy_replace(self, a, alo, ahi, b, blo, bhi):
        r"""
        When replacing one block of lines with another, search the blocks
        for *similar* lines; the best-matching pair (if any) is used as a
        synch point, and intraline difference marking is done on the
        similar pair.  Lots of work, but often worth it.

        Example:

        >>> d = Differ()
        >>> d._fancy_replace(['abcDefghiJkl\n'], 0, 1, ['abcdefGhijkl\n'], 0, 1)
        >>> print ''.join(d.results),
        - abcDefghiJkl
        ?    ^  ^  ^
        + abcdefGhijkl
        ?    ^  ^  ^
        """

        if TRACE:
            self.results.append('*** _fancy_replace %s %s %s %s\n'
                                % (alo, ahi, blo, bhi))
            self._dump('>', a, alo, ahi)
            self._dump('<', b, blo, bhi)

        # don't synch up unless the lines have a similarity score of at
        # least cutoff; best_ratio tracks the best score seen so far
        best_ratio, cutoff = 0.74, 0.75
        cruncher = SequenceMatcher(self.charjunk)
        eqi, eqj = None, None   # 1st indices of equal lines (if any)

        # search for the pair that matches best without being identical
        # (identical lines must be junk lines, & we don't want to synch up
        # on junk -- unless we have to)
        for j in xrange(blo, bhi):
            bj = b[j]
            cruncher.set_seq2(bj)
            for i in xrange(alo, ahi):
                ai = a[i]
                if ai == bj:
                    if eqi is None:
                        eqi, eqj = i, j
                    continue
                cruncher.set_seq1(ai)
                # computing similarity is expensive, so use the quick
                # upper bounds first -- have seen this speed up messy
                # compares by a factor of 3.
                # note that ratio() is only expensive to compute the first
                # time it's called on a sequence pair; the expensive part
                # of the computation is cached by cruncher
                if cruncher.real_quick_ratio() > best_ratio and \
                   cruncher.quick_ratio() > best_ratio and \
                   cruncher.ratio() > best_ratio:
                    best_ratio, best_i, best_j = cruncher.ratio(), i, j
        if best_ratio < cutoff:
            # no non-identical "pretty close" pair
            if eqi is None:
                # no identical pair either -- treat it as a straight replace
                self._plain_replace(a, alo, ahi, b, blo, bhi)
                return
            # no close pair, but an identical pair -- synch up on that
            best_i, best_j, best_ratio = eqi, eqj, 1.0
        else:
            # there's a close pair, so forget the identical pair (if any)
            eqi = None

        # a[best_i] very similar to b[best_j]; eqi is None iff they're not
        # identical
        if TRACE:
            self.results.append('*** best_ratio %s %s %s\n'
                                % (best_ratio, best_i, best_j))
            self._dump('>', a, best_i, best_i+1)
            self._dump('<', b, best_j, best_j+1)

        # pump out diffs from before the synch point
        self._fancy_helper(a, alo, best_i, b, blo, best_j)

        # do intraline marking on the synch pair
        aelt, belt = a[best_i], b[best_j]
        if eqi is None:
            # pump out a '-', '?', '+', '?' quad for the synched lines
            atags = btags = ""
            cruncher.set_seqs(aelt, belt)
            for tag, ai1, ai2, bj1, bj2 in cruncher.get_opcodes():
                la, lb = ai2 - ai1, bj2 - bj1
                if tag == 'replace':
                    atags += '^' * la
                    btags += '^' * lb
                elif tag == 'delete':
                    atags += '-' * la
                elif tag == 'insert':
                    btags += '+' * lb
                elif tag == 'equal':
                    atags += ' ' * la
                    btags += ' ' * lb
                else:
                    raise ValueError, 'unknown tag ' + `tag`
            self._qformat(aelt, belt, atags, btags)
        else:
            # the synch pair is identical
            self.results.append('  ' + aelt)

        # pump out diffs from after the synch point
        self._fancy_helper(a, best_i+1, ahi, b, best_j+1, bhi)

    def _fancy_helper(self, a, alo, ahi, b, blo, bhi):
        if alo < ahi:
            if blo < bhi:
                self._fancy_replace(a, alo, ahi, b, blo, bhi)
            else:
                self._dump('-', a, alo, ahi)
        elif blo < bhi:
            self._dump('+', b, blo, bhi)

    def _qformat(self, aline, bline, atags, btags):
        r"""
        Format "?" output and deal with leading tabs.

        Example:

        >>> d = Differ()
        >>> d._qformat('\tabcDefghiJkl\n', '\t\tabcdefGhijkl\n',
        ...            '  ^ ^  ^      ', '+  ^ ^  ^      ')
        >>> for line in d.results: print repr(line)
        ...
        '- \tabcDefghiJkl\n'
        '? \t ^ ^  ^\n'
        '+ \t\tabcdefGhijkl\n'
        '? \t  ^ ^  ^\n'
        """

        # Can hurt, but will probably help most of the time.
        common = min(_count_leading(aline, "\t"),
                     _count_leading(bline, "\t"))
        common = min(common, _count_leading(atags[:common], " "))
        atags = atags[common:].rstrip()
        btags = btags[common:].rstrip()

        self.results.append("- " + aline)
        if atags:
            self.results.append("? %s%s\n" % ("\t" * common, atags))

        self.results.append("+ " + bline)
        if btags:
            self.results.append("? %s%s\n" % ("\t" * common, btags))
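
# Illustrative sketch (ours, not part of the module): Differ wired up
# with the module-level junk filters defined further below, which is the
# same pairing ndiff() uses.  Blank/'#'-only lines and blank or tab
# characters are then ignored when choosing synch points.
def _demo_differ_with_filters():
    import sys
    a = 'one\ntwo\nthree\n'.splitlines(1)
    b = 'ore\ntree\nemu\n'.splitlines(1)
    d = Differ(IS_LINE_JUNK, IS_CHARACTER_JUNK)
    sys.stdout.writelines(d.compare(a, b))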

# With respect to junk, an earlier version of ndiff simply refused to
# *start* a match with a junk element.  The result was cases like this:
#     before: private Thread currentThread;
#     after:  private volatile Thread currentThread;
# If you consider whitespace to be junk, the longest contiguous match
# not starting with junk is "e Thread currentThread".  So ndiff reported
# that "e volatil" was inserted between the 't' and the 'e' in "private".
# While an accurate view, to people that's absurd.  The current version
# looks for matching blocks that are entirely junk-free, then extends the
# longest one of those as far as possible but only with matching junk.
# So now "currentThread" is matched, then extended to suck up the
# preceding blank; then "private" is matched, and extended to suck up the
# following blank; then "Thread" is matched; and finally ndiff reports
# that "volatile " was inserted before "Thread".  The only quibble
# remaining is that perhaps it was really the case that " volatile"
# was inserted after "private".  I can live with that <wink>.

import re

def IS_LINE_JUNK(line, pat=re.compile(r"\s*#?\s*$").match):
    r"""
    Return 1 for ignorable line: iff `line` is blank or contains a single '#'.

    Examples:

    >>> IS_LINE_JUNK('\n')
    1
    >>> IS_LINE_JUNK('  #   \n')
    1
    >>> IS_LINE_JUNK('hello\n')
    0
    """

    return pat(line) is not None

def IS_CHARACTER_JUNK(ch, ws=" \t"):
    r"""
    Return 1 for ignorable character: iff `ch` is a space or tab.

    Examples:

    >>> IS_CHARACTER_JUNK(' ')
    1
    >>> IS_CHARACTER_JUNK('\t')
    1
    >>> IS_CHARACTER_JUNK('\n')
    0
    >>> IS_CHARACTER_JUNK('x')
    0
    """

    return ch in ws

del re

def ndiff(a, b, linejunk=IS_LINE_JUNK, charjunk=IS_CHARACTER_JUNK):
    r"""
    Compare `a` and `b` (lists of strings); return a `Differ`-style delta.

    Optional keyword parameters `linejunk` and `charjunk` are for filter
    functions (or None):

    - linejunk: A function that should accept a single string argument, and
      return true iff the string is junk.  The default is module-level
      function IS_LINE_JUNK, which filters out lines without visible
      characters, except for at most one splat ('#').

    - charjunk: A function that should accept a string of length 1.  The
      default is module-level function IS_CHARACTER_JUNK, which filters out
      whitespace characters (a blank or tab; note: bad idea to include
      newline in this!).

    Tools/scripts/ndiff.py is a command-line front-end to this function.

    Example:

    >>> diff = ndiff('one\ntwo\nthree\n'.splitlines(1),
    ...              'ore\ntree\nemu\n'.splitlines(1))
    >>> print ''.join(diff),
    - one
    ?  ^
    + ore
    ?  ^
    - two
    - three
    ?  -
    + tree
    + emu
    """

    return Differ(linejunk, charjunk).compare(a, b)
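
# Illustrative sketch (ours): the delta is made of newline-terminated
# strings, so it can be handed straight to writelines() as the docstring
# above notes.  'out' is any file-like object, e.g. sys.stdout.
def _demo_write_delta(a, b, out):
    out.writelines(ndiff(a, b))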

def restore(delta, which):
    r"""
    Return one of the two sequences that generated a delta.

    Given a `delta` produced by `Differ.compare()` or `ndiff()`, extract
    lines originating from file 1 or 2 (parameter `which`), stripping off
    line prefixes.

    Examples:

    >>> diff = ndiff('one\ntwo\nthree\n'.splitlines(1),
    ...              'ore\ntree\nemu\n'.splitlines(1))
    >>> print ''.join(restore(diff, 1)),
    one
    two
    three
    >>> print ''.join(restore(diff, 2)),
    ore
    tree
    emu
    """

    try:
        tag = {1: "- ", 2: "+ "}[int(which)]
    except KeyError:
        raise ValueError, ('unknown delta choice (must be 1 or 2): %r'
                           % which)
    prefixes = ("  ", tag)
    results = []
    for line in delta:
        if line[:2] in prefixes:
            results.append(line[2:])
    return results
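
# Illustrative round trip (ours): restore() recovers either input of an
# ndiff() delta exactly, because every input line appears in the delta
# behind a '- ', '+ ', or '  ' prefix and the '? ' guide lines are skipped.
def _demo_round_trip():
    a = 'one\ntwo\nthree\n'.splitlines(1)
    b = 'ore\ntree\nemu\n'.splitlines(1)
    delta = ndiff(a, b)               # a list here, so it can be reused
    assert restore(delta, 1) == a
    assert restore(delta, 2) == b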

def _test():
    import doctest, difflib
    return doctest.testmod(difflib)

if __name__ == "__main__":
    _test()