Lib/difflib.py

   1 #! /usr/bin/env python
   2
   3 """
   4 Module difflib -- helpers for computing deltas between objects.
   5
   6 Function get_close_matches(word, possibilities, n=3, cutoff=0.6):
   7     Use SequenceMatcher to return list of the best "good enough" matches.
   8
   9 Function context_diff(a, b):
  10     For two lists of strings, return a delta in context diff format.
  11
  12 Function ndiff(a, b):
  13     Return a delta: the difference between `a` and `b` (lists of strings).
  14
  15 Function restore(delta, which):
  16     Return one of the two sequences that generated an ndiff delta.
  17
  18 Function unified_diff(a, b):
  19     For two lists of strings, return a delta in unified diff format.
  20
  21 Class SequenceMatcher:
  22     A flexible class for comparing pairs of sequences of any type.
  23
  24 Class Differ:
  25     For producing human-readable deltas from sequences of lines of text.
  26
  27 Class HtmlDiff:
  28     For producing HTML side by side comparison with change highlights.
  29 """
  30
  31 __all__ = ['get_close_matches', 'ndiff', 'restore', 'SequenceMatcher',
  32            'Differ','IS_CHARACTER_JUNK', 'IS_LINE_JUNK', 'context_diff',
  33            'unified_diff', 'HtmlDiff']
  34
  35 import heapq
  36
  37 def _calculate_ratio(matches, length):
  38     if length:
  39         return 2.0 * matches / length
  40     return 1.0
  41
  42 class SequenceMatcher:
  43
  44     """
  45     SequenceMatcher is a flexible class for comparing pairs of sequences of
  46     any type, so long as the sequence elements are hashable.  The basic
  47     algorithm predates, and is a little fancier than, an algorithm
  48     published in the late 1980's by Ratcliff and Obershelp under the
  49     hyperbolic name "gestalt pattern matching".  The basic idea is to find
  50     the longest contiguous matching subsequence that contains no "junk"
  51     elements (R-O doesn't address junk).  The same idea is then applied
  52     recursively to the pieces of the sequences to the left and to the right
  53     of the matching subsequence.  This does not yield minimal edit
  54     sequences, but does tend to yield matches that "look right" to people.
  55
  56     SequenceMatcher tries to compute a "human-friendly diff" between two
  57     sequences.  Unlike e.g. UNIX(tm) diff, the fundamental notion is the
  58     longest *contiguous* & junk-free matching subsequence.  That's what
  59     catches peoples' eyes.  The Windows(tm) windiff has another interesting
  60     notion, pairing up elements that appear uniquely in each sequence.
  61     That, and the method here, appear to yield more intuitive difference
  62     reports than does diff.  This method appears to be the least vulnerable
  63     to synching up on blocks of "junk lines", though (like blank lines in
  64     ordinary text files, or maybe "<P>" lines in HTML files).  That may be
  65     because this is the only method of the 3 that has a *concept* of
  66     "junk" <wink>.
  67
  68     Example, comparing two strings, and considering blanks to be "junk":
  69
  70     >>> s = SequenceMatcher(lambda x: x == " ",
  71     ...                     "private Thread currentThread;",
  72     ...                     "private volatile Thread currentThread;")
  73     >>>
  74
  75     .ratio() returns a float in [0, 1], measuring the "similarity" of the
  76     sequences.  As a rule of thumb, a .ratio() value over 0.6 means the
  77     sequences are close matches:
  78
  79     >>> print round(s.ratio(), 3)
  80     0.866
  81     >>>
  82
  83     If you're only interested in where the sequences match,
  84     .get_matching_blocks() is handy:
  85
  86     >>> for block in s.get_matching_blocks():
  87     ...     print "a[%d] and b[%d] match for %d elements" % block
  88     a[0] and b[0] match for 8 elements
  89     a[8] and b[17] match for 6 elements
  90     a[14] and b[23] match for 15 elements
  91     a[29] and b[38] match for 0 elements
  92
  93     Note that the last tuple returned by .get_matching_blocks() is always a
  94     dummy, (len(a), len(b), 0), and this is the only case in which the last
  95     tuple element (number of elements matched) is 0.
  96
  97     If you want to know how to change the first sequence into the second,
  98     use .get_opcodes():
  99
 100     >>> for opcode in s.get_opcodes():
 101     ...     print "%6s a[%d:%d] b[%d:%d]" % opcode
 102      equal a[0:8] b[0:8]
 103     insert a[8:8] b[8:17]
 104      equal a[8:14] b[17:23]
 105      equal a[14:29] b[23:38]
 106
 107     See the Differ class for a fancy human-friendly file differencer, which
 108     uses SequenceMatcher both to compare sequences of lines, and to compare
 109     sequences of characters within similar (near-matching) lines.
 110
 111     See also function get_close_matches() in this module, which shows how
 112     simple code building on SequenceMatcher can be used to do useful work.
 113
 114     Timing:  Basic R-O is cubic time worst case and quadratic time expected
 115     case.  SequenceMatcher is quadratic time for the worst case and has
 116     expected-case behavior dependent in a complicated way on how many
 117     elements the sequences have in common; best case time is linear.
 118
 119     Methods:
 120
 121     __init__(isjunk=None, a='', b='')
 122         Construct a SequenceMatcher.
 123
 124     set_seqs(a, b)
 125         Set the two sequences to be compared.
 126
 127     set_seq1(a)
 128         Set the first sequence to be compared.
 129
 130     set_seq2(b)
 131         Set the second sequence to be compared.
 132
 133     find_longest_match(alo, ahi, blo, bhi)
 134         Find longest matching block in a[alo:ahi] and b[blo:bhi].
 135
 136     get_matching_blocks()
 137         Return list of triples describing matching subsequences.
 138
 139     get_opcodes()
 140         Return list of 5-tuples describing how to turn a into b.
 141
 142     ratio()
 143         Return a measure of the sequences' similarity (float in [0,1]).
 144
 145     quick_ratio()
 146         Return an upper bound on .ratio() relatively quickly.
 147
 148     real_quick_ratio()
 149         Return an upper bound on ratio() very quickly.
 150     """
 151
 152     def __init__(self, isjunk=None, a='', b=''):
 153         """Construct a SequenceMatcher.
 154
 155         Optional arg isjunk is None (the default), or a one-argument
 156         function that takes a sequence element and returns true iff the
 157         element is junk.  None is equivalent to passing "lambda x: 0", i.e.
 158         no elements are considered to be junk.  For example, pass
 159             lambda x: x in " \\t"
 160         if you're comparing lines as sequences of characters, and don't
 161         want to synch up on blanks or hard tabs.
 162
 163         Optional arg a is the first of two sequences to be compared.  By
 164         default, an empty string.  The elements of a must be hashable.  See
 165         also .set_seqs() and .set_seq1().
 166
 167         Optional arg b is the second of two sequences to be compared.  By
 168         default, an empty string.  The elements of b must be hashable. See
 169         also .set_seqs() and .set_seq2().
 170         """
 171
 172         # Members:
 173         # a
 174         #      first sequence
 175         # b
 176         #      second sequence; differences are computed as "what do
 177         #      we need to do to 'a' to change it into 'b'?"
 178         # b2j
 179         #      for x in b, b2j[x] is a list of the indices (into b)
 180         #      at which x appears; junk elements do not appear
 181         # fullbcount
 182         #      for x in b, fullbcount[x] == the number of times x
 183         #      appears in b; only materialized if really needed (used
 184         #      only for computing quick_ratio())
 185         # matching_blocks
 186         #      a list of (i, j, k) triples, where a[i:i+k] == b[j:j+k];
 187         #      ascending & non-overlapping in i and in j; terminated by
 188         #      a dummy (len(a), len(b), 0) sentinel
 189         # opcodes
 190         #      a list of (tag, i1, i2, j1, j2) tuples, where tag is
 191         #      one of
 192         #          'replace'   a[i1:i2] should be replaced by b[j1:j2]
 193         #          'delete'    a[i1:i2] should be deleted
 194         #          'insert'    b[j1:j2] should be inserted
 195         #          'equal'     a[i1:i2] == b[j1:j2]
 196         # isjunk
 197         #      a user-supplied function taking a sequence element and
 198         #      returning true iff the element is "junk" -- this has
 199         #      subtle but helpful effects on the algorithm, which I'll
 200         #      get around to writing up someday <0.9 wink>.
 201         #      DON'T USE!  Only __chain_b uses this.  Use isbjunk.
 202         # isbjunk
 203         #      for x in b, isbjunk(x) == isjunk(x) but much faster;
 204         #      it's really the has_key method of a hidden dict.
 205         #      DOES NOT WORK for x in a!
 206         # isbpopular
 207         #      for x in b, isbpopular(x) is true iff b is reasonably long
 208         #      (at least 200 elements) and x accounts for more than 1% of
 209         #      its elements.  DOES NOT WORK for x in a!
 210
 211         self.isjunk = isjunk
 212         self.a = self.b = None
 213         self.set_seqs(a, b)
 214
 215     def set_seqs(self, a, b):
 216         """Set the two sequences to be compared.
 217
 218         >>> s = SequenceMatcher()
 219         >>> s.set_seqs("abcd", "bcde")
 220         >>> s.ratio()
 221         0.75
 222         """
 223
 224         self.set_seq1(a)
 225         self.set_seq2(b)
 226
 227     def set_seq1(self, a):
 228         """Set the first sequence to be compared.
 229
 230         The second sequence to be compared is not changed.
 231
 232         >>> s = SequenceMatcher(None, "abcd", "bcde")
 233         >>> s.ratio()
 234         0.75
 235         >>> s.set_seq1("bcde")
 236         >>> s.ratio()
 237         1.0
 238         >>>
 239
 240         SequenceMatcher computes and caches detailed information about the
 241         second sequence, so if you want to compare one sequence S against
 242         many sequences, use .set_seq2(S) once and call .set_seq1(x)
 243         repeatedly for each of the other sequences.
 244
 245         See also set_seqs() and set_seq2().
 246         """
 247
 248         if a is self.a:
 249             return
 250         self.a = a
 251         self.matching_blocks = self.opcodes = None
 252
 253     def set_seq2(self, b):
 254         """Set the second sequence to be compared.
 255
 256         The first sequence to be compared is not changed.
 257
 258         >>> s = SequenceMatcher(None, "abcd", "bcde")
 259         >>> s.ratio()
 260         0.75
 261         >>> s.set_seq2("abcd")
 262         >>> s.ratio()
 263         1.0
 264         >>>
 265
 266         SequenceMatcher computes and caches detailed information about the
 267         second sequence, so if you want to compare one sequence S against
 268         many sequences, use .set_seq2(S) once and call .set_seq1(x)
 269         repeatedly for each of the other sequences.
 270
 271         See also set_seqs() and set_seq1().
 272         """
 273
 274         if b is self.b:
 275             return
 276         self.b = b
 277         self.matching_blocks = self.opcodes = None
 278         self.fullbcount = None
 279         self.__chain_b()
 280
 281     # For each element x in b, set b2j[x] to a list of the indices in
 282     # b where x appears; the indices are in increasing order; note that
 283     # the number of times x appears in b is len(b2j[x]) ...
 284     # when self.isjunk is defined, junk elements don't show up in this
 285     # map at all, which stops the central find_longest_match method
 286     # from starting any matching block at a junk element ...
 287     # also creates the fast isbjunk function ...
 288     # b2j also does not contain entries for "popular" elements, meaning
 289     # elements that account for more than 1% of the total elements, and
 290     # when the sequence is reasonably large (>= 200 elements); this can
 291     # be viewed as an adaptive notion of semi-junk, and yields an enormous
 292     # speedup when, e.g., comparing program files with hundreds of
 293     # instances of "return NULL;" ...
 294     # note that this is only called when b changes; so for cross-product
 295     # kinds of matches, it's best to call set_seq2 once, then set_seq1
 296     # repeatedly
 297
 298     def __chain_b(self):
 299         # Because isjunk is a user-defined (not C) function, and we test
 300         # for junk a LOT, it's important to minimize the number of calls.
 301         # Before the tricks described here, __chain_b was by far the most
 302         # time-consuming routine in the whole module!  If anyone sees
 303         # Jim Roskind, thank him again for profile.py -- I never would
 304         # have guessed that.
 305         # The first trick is to build b2j ignoring the possibility
 306         # of junk.  I.e., we don't call isjunk at all yet.  Throwing
 307         # out the junk later is much cheaper than building b2j "right"
 308         # from the start.
 309         b = self.b
 310         n = len(b)
 311         self.b2j = b2j = {}
 312         populardict = {}
 313         for i, elt in enumerate(b):
 314             if elt in b2j:
 315                 indices = b2j[elt]
 316                 if n >= 200 and len(indices) * 100 > n:
 317                     populardict[elt] = 1
 318                     del indices[:]
 319                 else:
 320                     indices.append(i)
 321             else:
 322                 b2j[elt] = [i]
 323
 324         # Purge leftover indices for popular elements.
 325         for elt in populardict:
 326             del b2j[elt]
 327
 328         # Now b2j.keys() contains elements uniquely, and especially when
 329         # the sequence is a string, that's usually a good deal smaller
 330         # than len(string).  The difference is the number of isjunk calls
 331         # saved.
 332         isjunk = self.isjunk
 333         junkdict = {}
 334         if isjunk:
 335             for d in populardict, b2j:
 336                 for elt in d.keys():
 337                     if isjunk(elt):
 338                         junkdict[elt] = 1
 339                         del d[elt]
 340
 341         # Now for x in b, isjunk(x) == x in junkdict, but the
 342         # latter is much faster.  Note too that while there may be a
 343         # lot of junk in the sequence, the number of *unique* junk
 344         # elements is probably small.  So the memory burden of keeping
 345         # this dict alive is likely trivial compared to the size of b2j.
 346         self.isbjunk = junkdict.has_key
 347         self.isbpopular = populardict.has_key
 348
 349     def find_longest_match(self, alo, ahi, blo, bhi):
 350         """Find longest matching block in a[alo:ahi] and b[blo:bhi].
 351
 352         If isjunk is not defined:
 353
 354         Return (i,j,k) such that a[i:i+k] is equal to b[j:j+k], where
 355             alo <= i <= i+k <= ahi
 356             blo <= j <= j+k <= bhi
 357         and for all (i',j',k') meeting those conditions,
 358             k >= k'
 359             i <= i'
 360             and if i == i', j <= j'
 361
 362         In other words, of all maximal matching blocks, return one that
 363         starts earliest in a, and of all those maximal matching blocks that
 364         start earliest in a, return the one that starts earliest in b.
 365
 366         >>> s = SequenceMatcher(None, " abcd", "abcd abcd")
 367         >>> s.find_longest_match(0, 5, 0, 9)
 368         (0, 4, 5)
 369
 370         If isjunk is defined, first the longest matching block is
 371         determined as above, but with the additional restriction that no
 372         junk element appears in the block.  Then that block is extended as
 373         far as possible by matching (only) junk elements on both sides.  So
 374         the resulting block never matches on junk except as identical junk
 375         happens to be adjacent to an "interesting" match.
 376
 377         Here's the same example as before, but considering blanks to be
 378         junk.  That prevents " abcd" from matching the " abcd" at the tail
 379         end of the second sequence directly.  Instead only the "abcd" can
 380         match, and matches the leftmost "abcd" in the second sequence:
 381
 382         >>> s = SequenceMatcher(lambda x: x==" ", " abcd", "abcd abcd")
 383         >>> s.find_longest_match(0, 5, 0, 9)
 384         (1, 0, 4)
 385
 386         If no blocks match, return (alo, blo, 0).
 387
 388         >>> s = SequenceMatcher(None, "ab", "c")
 389         >>> s.find_longest_match(0, 2, 0, 1)
 390         (0, 0, 0)
 391         """
 392
 393         # CAUTION:  stripping common prefix or suffix would be incorrect.
 394         # E.g.,
 395         #    ab
 396         #    acab
 397         # Longest matching block is "ab", but if common prefix is
 398         # stripped, it's "a" (tied with "b").  UNIX(tm) diff does so
 399         # strip, so ends up claiming that ab is changed to acab by
 400         # inserting "ca" in the middle.  That's minimal but unintuitive:
 401         # "it's obvious" that someone inserted "ac" at the front.
 402         # Windiff ends up at the same place as diff, but by pairing up
 403         # the unique 'b's and then matching the first two 'a's.
 404
 405         a, b, b2j, isbjunk = self.a, self.b, self.b2j, self.isbjunk
 406         besti, bestj, bestsize = alo, blo, 0
 407         # find longest junk-free match
 408         # during an iteration of the loop, j2len[j] = length of longest
 409         # junk-free match ending with a[i-1] and b[j]
 410         j2len = {}
 411         nothing = []
 412         for i in xrange(alo, ahi):
 413             # look at all instances of a[i] in b; note that because
 414             # b2j has no junk keys, the loop is skipped if a[i] is junk
 415             j2lenget = j2len.get
 416             newj2len = {}
 417             for j in b2j.get(a[i], nothing):
 418                 # a[i] matches b[j]
 419                 if j < blo:
 420                     continue
 421                 if j >= bhi:
 422                     break
 423                 k = newj2len[j] = j2lenget(j-1, 0) + 1
 424                 if k > bestsize:
 425                     besti, bestj, bestsize = i-k+1, j-k+1, k
 426             j2len = newj2len
 427
 428         # Extend the best by non-junk elements on each end.  In particular,
 429         # "popular" non-junk elements aren't in b2j, which greatly speeds
 430         # the inner loop above, but also means "the best" match so far
 431         # doesn't contain any junk *or* popular non-junk elements.
 432         while besti > alo and bestj > blo and \
 433               not isbjunk(b[bestj-1]) and \
 434               a[besti-1] == b[bestj-1]:
 435             besti, bestj, bestsize = besti-1, bestj-1, bestsize+1
 436         while besti+bestsize < ahi and bestj+bestsize < bhi and \
 437               not isbjunk(b[bestj+bestsize]) and \
 438               a[besti+bestsize] == b[bestj+bestsize]:
 439             bestsize += 1
 440
 441         # Now that we have a wholly interesting match (albeit possibly
 442         # empty!), we may as well suck up the matching junk on each
 443         # side of it too.  Can't think of a good reason not to, and it
 444         # saves post-processing the (possibly considerable) expense of
 445         # figuring out what to do with it.  In the case of an empty
 446         # interesting match, this is clearly the right thing to do,
 447         # because no other kind of match is possible in the regions.
 448         while besti > alo and bestj > blo and \
 449               isbjunk(b[bestj-1]) and \
 450               a[besti-1] == b[bestj-1]:
 451             besti, bestj, bestsize = besti-1, bestj-1, bestsize+1
 452         while besti+bestsize < ahi and bestj+bestsize < bhi and \
 453               isbjunk(b[bestj+bestsize]) and \
 454               a[besti+bestsize] == b[bestj+bestsize]:
 455             bestsize = bestsize + 1
 456
 457         return besti, bestj, bestsize
 458
 459     def get_matching_blocks(self):
 460         """Return list of triples describing matching subsequences.
 461
 462         Each triple is of the form (i, j, n), and means that
 463         a[i:i+n] == b[j:j+n].  The triples are monotonically increasing in
 464         i and in j.
 465
 466         The last triple is a dummy, (len(a), len(b), 0), and is the only
 467         triple with n==0.
 468
 469         >>> s = SequenceMatcher(None, "abxcd", "abcd")
 470         >>> s.get_matching_blocks()
 471         [(0, 0, 2), (3, 2, 2), (5, 4, 0)]
 472         """
 473
 474         if self.matching_blocks is not None:
 475             return self.matching_blocks
 476         la, lb = len(self.a), len(self.b)
 477
 478         indexed_blocks = []
 479         queue = [(0, la, 0, lb)]
 480         while queue:
 481             # builds list of matching blocks covering a[alo:ahi] and
 482             # b[blo:bhi], appending them in increasing order to answer
 483             alo, ahi, blo, bhi = queue.pop()
 484
 485             # a[alo:i] vs b[blo:j] unknown
 486             # a[i:i+k] same as b[j:j+k]
 487             # a[i+k:ahi] vs b[j+k:bhi] unknown
 488             i, j, k = x = self.find_longest_match(alo, ahi, blo, bhi)
 489
 490             if k:
 491                 if alo < i and blo < j:
 492                     queue.append((alo, i, blo, j))
 493                 indexed_blocks.append((i, x))
 494                 if i+k < ahi and j+k < bhi:
 495                     queue.append((i+k, ahi, j+k, bhi))
 496         indexed_blocks.sort()
 497
 498         self.matching_blocks = [elem[1] for elem in indexed_blocks]
 499         self.matching_blocks.append( (la, lb, 0) )
 500         return self.matching_blocks
 501
 502     def get_opcodes(self):
 503         """Return list of 5-tuples describing how to turn a into b.
 504
 505         Each tuple is of the form (tag, i1, i2, j1, j2).  The first tuple
 506         has i1 == j1 == 0, and remaining tuples have i1 == the i2 from the
 507         tuple preceding it, and likewise for j1 == the previous j2.
 508
 509         The tags are strings, with these meanings:
 510
 511         'replace':  a[i1:i2] should be replaced by b[j1:j2]
 512         'delete':   a[i1:i2] should be deleted.
 513                     Note that j1==j2 in this case.
 514         'insert':   b[j1:j2] should be inserted at a[i1:i1].
 515                     Note that i1==i2 in this case.
 516         'equal':    a[i1:i2] == b[j1:j2]
 517
 518         >>> a = "qabxcd"
 519         >>> b = "abycdf"
 520         >>> s = SequenceMatcher(None, a, b)
 521         >>> for tag, i1, i2, j1, j2 in s.get_opcodes():
 522         ...    print ("%7s a[%d:%d] (%s) b[%d:%d] (%s)" %
 523         ...           (tag, i1, i2, a[i1:i2], j1, j2, b[j1:j2]))
 524          delete a[0:1] (q) b[0:0] ()
 525           equal a[1:3] (ab) b[0:2] (ab)
 526         replace a[3:4] (x) b[2:3] (y)
 527           equal a[4:6] (cd) b[3:5] (cd)
 528          insert a[6:6] () b[5:6] (f)
 529         """
 530
 531         if self.opcodes is not None:
 532             return self.opcodes
 533         i = j = 0
 534         self.opcodes = answer = []
 535         for ai, bj, size in self.get_matching_blocks():
 536             # invariant:  we've pumped out correct diffs to change
 537             # a[:i] into b[:j], and the next matching block is
 538             # a[ai:ai+size] == b[bj:bj+size].  So we need to pump
 539             # out a diff to change a[i:ai] into b[j:bj], pump out
 540             # the matching block, and move (i,j) beyond the match
 541             tag = ''
 542             if i < ai and j < bj:
 543                 tag = 'replace'
 544             elif i < ai:
 545                 tag = 'delete'
 546             elif j < bj:
 547                 tag = 'insert'
 548             if tag:
 549                 answer.append( (tag, i, ai, j, bj) )
 550             i, j = ai+size, bj+size
 551             # the list of matching blocks is terminated by a
 552             # sentinel with size 0
 553             if size:
 554                 answer.append( ('equal', ai, i, bj, j) )
 555         return answer
 556
 557     def get_grouped_opcodes(self, n=3):
 558         """ Isolate change clusters by eliminating ranges with no changes.
 559
 560         Return a generator of groups with upto n lines of context.
 561         Each group is in the same format as returned by get_opcodes().
 562
 563         >>> from pprint import pprint
 564         >>> a = map(str, range(1,40))
 565         >>> b = a[:]
 566         >>> b[8:8] = ['i']     # Make an insertion
 567         >>> b[20] += 'x'       # Make a replacement
 568         >>> b[23:28] = []      # Make a deletion
 569         >>> b[30] += 'y'       # Make another replacement
 570         >>> pprint(list(SequenceMatcher(None,a,b).get_grouped_opcodes()))
 571         [[('equal', 5, 8, 5, 8), ('insert', 8, 8, 8, 9), ('equal', 8, 11, 9, 12)],
 572          [('equal', 16, 19, 17, 20),
 573           ('replace', 19, 20, 20, 21),
 574           ('equal', 20, 22, 21, 23),
 575           ('delete', 22, 27, 23, 23),
 576           ('equal', 27, 30, 23, 26)],
 577          [('equal', 31, 34, 27, 30),
 578           ('replace', 34, 35, 30, 31),
 579           ('equal', 35, 38, 31, 34)]]
 580         """
 581
 582         codes = self.get_opcodes()
 583         if not codes:
 584             codes = [("equal", 0, 1, 0, 1)]
 585         # Fixup leading and trailing groups if they show no changes.
 586         if codes[0][0] == 'equal':
 587             tag, i1, i2, j1, j2 = codes[0]
 588             codes[0] = tag, max(i1, i2-n), i2, max(j1, j2-n), j2
 589         if codes[-1][0] == 'equal':
 590             tag, i1, i2, j1, j2 = codes[-1]
 591             codes[-1] = tag, i1, min(i2, i1+n), j1, min(j2, j1+n)
 592
 593         nn = n + n
 594         group = []
 595         for tag, i1, i2, j1, j2 in codes:
 596             # End the current group and start a new one whenever
 597             # there is a large range with no changes.
 598             if tag == 'equal' and i2-i1 > nn:
 599                 group.append((tag, i1, min(i2, i1+n), j1, min(j2, j1+n)))
 600                 yield group
 601                 group = []
 602                 i1, j1 = max(i1, i2-n), max(j1, j2-n)
 603             group.append((tag, i1, i2, j1 ,j2))
 604         if group and not (len(group)==1 and group[0][0] == 'equal'):
 605             yield group
 606
 607     def ratio(self):
 608         """Return a measure of the sequences' similarity (float in [0,1]).
 609
 610         Where T is the total number of elements in both sequences, and
 611         M is the number of matches, this is 2.0*M / T.
 612         Note that this is 1 if the sequences are identical, and 0 if
 613         they have nothing in common.
 614
 615         .ratio() is expensive to compute if you haven't already computed
 616         .get_matching_blocks() or .get_opcodes(), in which case you may
 617         want to try .quick_ratio() or .real_quick_ratio() first to get an
 618         upper bound.
 619
 620         >>> s = SequenceMatcher(None, "abcd", "bcde")
 621         >>> s.ratio()
 622         0.75
 623         >>> s.quick_ratio()
 624         0.75
 625         >>> s.real_quick_ratio()
 626         1.0
 627         """
 628
 629         matches = reduce(lambda sum, triple: sum + triple[-1],
 630                          self.get_matching_blocks(), 0)
 631         return _calculate_ratio(matches, len(self.a) + len(self.b))
 632
 633     def quick_ratio(self):
 634         """Return an upper bound on ratio() relatively quickly.
 635
 636         This isn't defined beyond that it is an upper bound on .ratio(), and
 637         is faster to compute.
 638         """
 639
 640         # viewing a and b as multisets, set matches to the cardinality
 641         # of their intersection; this counts the number of matches
 642         # without regard to order, so is clearly an upper bound
 643         if self.fullbcount is None:
 644             self.fullbcount = fullbcount = {}
 645             for elt in self.b:
 646                 fullbcount[elt] = fullbcount.get(elt, 0) + 1
 647         fullbcount = self.fullbcount
 648         # avail[x] is the number of times x appears in 'b' less the
 649         # number of times we've seen it in 'a' so far ... kinda
 650         avail = {}
 651         availhas, matches = avail.has_key, 0
 652         for elt in self.a:
 653             if availhas(elt):
 654                 numb = avail[elt]
 655             else:
 656                 numb = fullbcount.get(elt, 0)
 657             avail[elt] = numb - 1
 658             if numb > 0:
 659                 matches = matches + 1
 660         return _calculate_ratio(matches, len(self.a) + len(self.b))
 661
 662     def real_quick_ratio(self):
 663         """Return an upper bound on ratio() very quickly.
 664
 665         This isn't defined beyond that it is an upper bound on .ratio(), and
 666         is faster to compute than either .ratio() or .quick_ratio().
 667         """
 668
 669         la, lb = len(self.a), len(self.b)
 670         # can't have more matches than the number of elements in the
 671         # shorter sequence
 672         return _calculate_ratio(min(la, lb), la + lb)
 673
 674 def get_close_matches(word, possibilities, n=3, cutoff=0.6):
 675     """Use SequenceMatcher to return list of the best "good enough" matches.
 676
 677     word is a sequence for which close matches are desired (typically a
 678     string).
 679
 680     possibilities is a list of sequences against which to match word
 681     (typically a list of strings).
 682
 683     Optional arg n (default 3) is the maximum number of close matches to
 684     return.  n must be > 0.
 685
 686     Optional arg cutoff (default 0.6) is a float in [0, 1].  Possibilities
 687     that don't score at least that similar to word are ignored.
 688
 689     The best (no more than n) matches among the possibilities are returned
 690     in a list, sorted by similarity score, most similar first.
 691
 692     >>> get_close_matches("appel", ["ape", "apple", "peach", "puppy"])
 693     ['apple', 'ape']
 694     >>> import keyword as _keyword
 695     >>> get_close_matches("wheel", _keyword.kwlist)
 696     ['while']
 697     >>> get_close_matches("apple", _keyword.kwlist)
 698     []
 699     >>> get_close_matches("accept", _keyword.kwlist)
 700     ['except']
 701     """
 702
 703     if not n >  0:
 704         raise ValueError("n must be > 0: %r" % (n,))
 705     if not 0.0 <= cutoff <= 1.0:
 706         raise ValueError("cutoff must be in [0.0, 1.0]: %r" % (cutoff,))
 707     result = []
 708     s = SequenceMatcher()
 709     s.set_seq2(word)
 710     for x in possibilities:
 711         s.set_seq1(x)
 712         if s.real_quick_ratio() >= cutoff and \
 713            s.quick_ratio() >= cutoff and \
 714            s.ratio() >= cutoff:
 715             result.append((s.ratio(), x))
 716
 717     # Move the best scorers to head of list
 718     result = heapq.nlargest(n, result)
 719     # Strip scores for the best n matches
 720     return [x for score, x in result]
 721
 722 def _count_leading(line, ch):
 723     """
 724     Return number of `ch` characters at the start of `line`.
 725
 726     Example:
 727
 728     >>> _count_leading('   abc', ' ')
 729     3
 730     """
 731
 732     i, n = 0, len(line)
 733     while i < n and line[i] == ch:
 734         i += 1
 735     return i
 736
 737 class Differ:
 738     r"""
 739     Differ is a class for comparing sequences of lines of text, and
 740     producing human-readable differences or deltas.  Differ uses
 741     SequenceMatcher both to compare sequences of lines, and to compare
 742     sequences of characters within similar (near-matching) lines.
 743
 744     Each line of a Differ delta begins with a two-letter code:
 745
 746         '- '    line unique to sequence 1
 747         '+ '    line unique to sequence 2
 748         '  '    line common to both sequences
 749         '? '    line not present in either input sequence
 750
 751     Lines beginning with '? ' attempt to guide the eye to intraline
 752     differences, and were not present in either input sequence.  These lines
 753     can be confusing if the sequences contain tab characters.
 754
 755     Note that Differ makes no claim to produce a *minimal* diff.  To the
 756     contrary, minimal diffs are often counter-intuitive, because they synch
 757     up anywhere possible, sometimes accidental matches 100 pages apart.
 758     Restricting synch points to contiguous matches preserves some notion of
 759     locality, at the occasional cost of producing a longer diff.
 760
 761     Example: Comparing two texts.
 762
 763     First we set up the texts, sequences of individual single-line strings
 764     ending with newlines (such sequences can also be obtained from the
 765     `readlines()` method of file-like objects):
 766
 767     >>> text1 = '''  1. Beautiful is better than ugly.
 768     ...   2. Explicit is better than implicit.
 769     ...   3. Simple is better than complex.
 770     ...   4. Complex is better than complicated.
 771     ... '''.splitlines(1)
 772     >>> len(text1)
 773     4
 774     >>> text1[0][-1]
 775     '\n'
 776     >>> text2 = '''  1. Beautiful is better than ugly.
 777     ...   3.   Simple is better than complex.
 778     ...   4. Complicated is better than complex.
 779     ...   5. Flat is better than nested.
 780     ... '''.splitlines(1)
 781
 782     Next we instantiate a Differ object:
 783
 784     >>> d = Differ()
 785
 786     Note that when instantiating a Differ object we may pass functions to
 787     filter out line and character 'junk'.  See Differ.__init__ for details.
 788
 789     Finally, we compare the two:
 790
 791     >>> result = list(d.compare(text1, text2))
 792
 793     'result' is a list of strings, so let's pretty-print it:
 794
 795     >>> from pprint import pprint as _pprint
 796     >>> _pprint(result)
 797     ['    1. Beautiful is better than ugly.\n',
 798      '-   2. Explicit is better than implicit.\n',
 799      '-   3. Simple is better than complex.\n',
 800      '+   3.   Simple is better than complex.\n',
 801      '?     ++\n',
 802      '-   4. Complex is better than complicated.\n',
 803      '?            ^                     ---- ^\n',
 804      '+   4. Complicated is better than complex.\n',
 805      '?           ++++ ^                      ^\n',
 806      '+   5. Flat is better than nested.\n']
 807
 808     As a single multi-line string it looks like this:
 809
 810     >>> print ''.join(result),
 811         1. Beautiful is better than ugly.
 812     -   2. Explicit is better than implicit.
 813     -   3. Simple is better than complex.
 814     +   3.   Simple is better than complex.
 815     ?     ++
 816     -   4. Complex is better than complicated.
 817     ?            ^                     ---- ^
 818     +   4. Complicated is better than complex.
 819     ?           ++++ ^                      ^
 820     +   5. Flat is better than nested.
 821
 822     Methods:
 823
 824     __init__(linejunk=None, charjunk=None)
 825         Construct a text differencer, with optional filters.
 826
 827     compare(a, b)
 828         Compare two sequences of lines; generate the resulting delta.
 829     """
 830
 831     def __init__(self, linejunk=None, charjunk=None):
 832         """
 833         Construct a text differencer, with optional filters.
 834
 835         The two optional keyword parameters are for filter functions:
 836
 837         - `linejunk`: A function that should accept a single string argument,
 838           and return true iff the string is junk. The module-level function
 839           `IS_LINE_JUNK` may be used to filter out lines without visible
 840           characters, except for at most one splat ('#').  It is recommended
 841           to leave linejunk None; as of Python 2.3, the underlying
 842           SequenceMatcher class has grown an adaptive notion of "noise" lines
 843           that's better than any static definition the author has ever been
 844           able to craft.
 845
 846         - `charjunk`: A function that should accept a string of length 1. The
 847           module-level function `IS_CHARACTER_JUNK` may be used to filter out
 848           whitespace characters (a blank or tab; **note**: bad idea to include
 849           newline in this!).  Use of IS_CHARACTER_JUNK is recommended.
 850         """
 851
 852         self.linejunk = linejunk
 853         self.charjunk = charjunk
 854
 855     def compare(self, a, b):
 856         r"""
 857         Compare two sequences of lines; generate the resulting delta.
 858
 859         Each sequence must contain individual single-line strings ending with
 860         newlines. Such sequences can be obtained from the `readlines()` method
 861         of file-like objects.  The delta generated also consists of newline-
 862         terminated strings, ready to be printed as-is via the writeline()
 863         method of a file-like object.
 864
 865         Example:
 866
 867         >>> print ''.join(Differ().compare('one\ntwo\nthree\n'.splitlines(1),
 868         ...                                'ore\ntree\nemu\n'.splitlines(1))),
 869         - one
 870         ?  ^
 871         + ore
 872         ?  ^
 873         - two
 874         - three
 875         ?  -
 876         + tree
 877         + emu
 878         """
 879
 880         cruncher = SequenceMatcher(self.linejunk, a, b)
 881         for tag, alo, ahi, blo, bhi in cruncher.get_opcodes():
 882             if tag == 'replace':
 883                 g = self._fancy_replace(a, alo, ahi, b, blo, bhi)
 884             elif tag == 'delete':
 885                 g = self._dump('-', a, alo, ahi)
 886             elif tag == 'insert':
 887                 g = self._dump('+', b, blo, bhi)
 888             elif tag == 'equal':
 889                 g = self._dump(' ', a, alo, ahi)
 890             else:
 891                 raise ValueError, 'unknown tag %r' % (tag,)
 892
 893             for line in g:
 894                 yield line
 895
 896     def _dump(self, tag, x, lo, hi):
 897         """Generate comparison results for a same-tagged range."""
 898         for i in xrange(lo, hi):
 899             yield '%s %s' % (tag, x[i])
 900
 901     def _plain_replace(self, a, alo, ahi, b, blo, bhi):
 902         assert alo < ahi and blo < bhi
 903         # dump the shorter block first -- reduces the burden on short-term
 904         # memory if the blocks are of very different sizes
 905         if bhi - blo < ahi - alo:
 906             first  = self._dump('+', b, blo, bhi)
 907             second = self._dump('-', a, alo, ahi)
 908         else:
 909             first  = self._dump('-', a, alo, ahi)
 910             second = self._dump('+', b, blo, bhi)
 911
 912         for g in first, second:
 913             for line in g:
 914                 yield line
 915
 916     def _fancy_replace(self, a, alo, ahi, b, blo, bhi):
 917         r"""
 918         When replacing one block of lines with another, search the blocks
 919         for *similar* lines; the best-matching pair (if any) is used as a
 920         synch point, and intraline difference marking is done on the
 921         similar pair. Lots of work, but often worth it.
 922
 923         Example:
 924
 925         >>> d = Differ()
 926         >>> results = d._fancy_replace(['abcDefghiJkl\n'], 0, 1,
 927         ...                            ['abcdefGhijkl\n'], 0, 1)
 928         >>> print ''.join(results),
 929         - abcDefghiJkl
 930         ?    ^  ^  ^
 931         + abcdefGhijkl
 932         ?    ^  ^  ^
 933         """
 934
 935         # don't synch up unless the lines have a similarity score of at
 936         # least cutoff; best_ratio tracks the best score seen so far
 937         best_ratio, cutoff = 0.74, 0.75
 938         cruncher = SequenceMatcher(self.charjunk)
 939         eqi, eqj = None, None   # 1st indices of equal lines (if any)
 940
 941         # search for the pair that matches best without being identical
 942         # (identical lines must be junk lines, & we don't want to synch up
 943         # on junk -- unless we have to)
 944         for j in xrange(blo, bhi):
 945             bj = b[j]
 946             cruncher.set_seq2(bj)
 947             for i in xrange(alo, ahi):
 948                 ai = a[i]
 949                 if ai == bj:
 950                     if eqi is None:
 951                         eqi, eqj = i, j
 952                     continue
 953                 cruncher.set_seq1(ai)
 954                 # computing similarity is expensive, so use the quick
 955                 # upper bounds first -- have seen this speed up messy
 956                 # compares by a factor of 3.
 957                 # note that ratio() is only expensive to compute the first
 958                 # time it's called on a sequence pair; the expensive part
 959                 # of the computation is cached by cruncher
 960                 if cruncher.real_quick_ratio() > best_ratio and \
 961                       cruncher.quick_ratio() > best_ratio and \
 962                       cruncher.ratio() > best_ratio:
 963                     best_ratio, best_i, best_j = cruncher.ratio(), i, j
 964         if best_ratio < cutoff:
 965             # no non-identical "pretty close" pair
 966             if eqi is None:
 967                 # no identical pair either -- treat it as a straight replace
 968                 for line in self._plain_replace(a, alo, ahi, b, blo, bhi):
 969                     yield line
 970                 return
 971             # no close pair, but an identical pair -- synch up on that
 972             best_i, best_j, best_ratio = eqi, eqj, 1.0
 973         else:
 974             # there's a close pair, so forget the identical pair (if any)
 975             eqi = None
 976
 977         # a[best_i] very similar to b[best_j]; eqi is None iff they're not
 978         # identical
 979
 980         # pump out diffs from before the synch point
 981         for line in self._fancy_helper(a, alo, best_i, b, blo, best_j):
 982             yield line
 983
 984         # do intraline marking on the synch pair
 985         aelt, belt = a[best_i], b[best_j]
 986         if eqi is None:
 987             # pump out a '-', '?', '+', '?' quad for the synched lines
 988             atags = btags = ""
 989             cruncher.set_seqs(aelt, belt)
 990             for tag, ai1, ai2, bj1, bj2 in cruncher.get_opcodes():
 991                 la, lb = ai2 - ai1, bj2 - bj1
 992                 if tag == 'replace':
 993                     atags += '^' * la
 994                     btags += '^' * lb
 995                 elif tag == 'delete':
 996                     atags += '-' * la
 997                 elif tag == 'insert':
 998                     btags += '+' * lb
 999                 elif tag == 'equal':
1000                     atags += ' ' * la
1001                     btags += ' ' * lb
1002                 else:
1003                     raise ValueError, 'unknown tag %r' % (tag,)
1004             for line in self._qformat(aelt, belt, atags, btags):
1005                 yield line
1006         else:
1007             # the synch pair is identical
1008             yield '  ' + aelt
1009
1010         # pump out diffs from after the synch point
1011         for line in self._fancy_helper(a, best_i+1, ahi, b, best_j+1, bhi):
1012             yield line
1013
1014     def _fancy_helper(self, a, alo, ahi, b, blo, bhi):
1015         g = []
1016         if alo < ahi:
1017             if blo < bhi:
1018                 g = self._fancy_replace(a, alo, ahi, b, blo, bhi)
1019             else:
1020                 g = self._dump('-', a, alo, ahi)
1021         elif blo < bhi:
1022             g = self._dump('+', b, blo, bhi)
1023
1024         for line in g:
1025             yield line
1026
1027     def _qformat(self, aline, bline, atags, btags):
1028         r"""
1029         Format "?" output and deal with leading tabs.
1030
1031         Example:
1032
1033         >>> d = Differ()
1034         >>> results = d._qformat('\tabcDefghiJkl\n', '\t\tabcdefGhijkl\n',
1035         ...                      '  ^ ^  ^      ', '+  ^ ^  ^      ')
1036         >>> for line in results: print repr(line)
1037         ...
1038         '- \tabcDefghiJkl\n'
1039         '? \t ^ ^  ^\n'
1040         '+ \t\tabcdefGhijkl\n'
1041         '? \t  ^ ^  ^\n'
1042         """
1043
1044         # Can hurt, but will probably help most of the time.
1045         common = min(_count_leading(aline, "\t"),
1046                      _count_leading(bline, "\t"))
1047         common = min(common, _count_leading(atags[:common], " "))
1048         atags = atags[common:].rstrip()
1049         btags = btags[common:].rstrip()
1050
1051         yield "- " + aline
1052         if atags:
1053             yield "? %s%s\n" % ("\t" * common, atags)
1054
1055         yield "+ " + bline
1056         if btags:
1057             yield "? %s%s\n" % ("\t" * common, btags)
1058
1059 # With respect to junk, an earlier version of ndiff simply refused to
1060 # *start* a match with a junk element.  The result was cases like this:
1061 #     before: private Thread currentThread;
1062 #     after:  private volatile Thread currentThread;
1063 # If you consider whitespace to be junk, the longest contiguous match
1064 # not starting with junk is "e Thread currentThread".  So ndiff reported
1065 # that "e volatil" was inserted between the 't' and the 'e' in "private".
1066 # While an accurate view, to people that's absurd.  The current version
1067 # looks for matching blocks that are entirely junk-free, then extends the
1068 # longest one of those as far as possible but only with matching junk.
1069 # So now "currentThread" is matched, then extended to suck up the
1070 # preceding blank; then "private" is matched, and extended to suck up the
1071 # following blank; then "Thread" is matched; and finally ndiff reports
1072 # that "volatile " was inserted before "Thread".  The only quibble
1073 # remaining is that perhaps it was really the case that " volatile"
1074 # was inserted after "private".  I can live with that <wink>.
1075
1076 import re
1077
def IS_LINE_JUNK(line, pat=re.compile(r"\s*#?\s*$").match):
    r"""
    Return True for an ignorable line, False otherwise.

    A line is ignorable iff it is blank or consists only of whitespace
    around a single '#'.  `pat` is the bound match method used for the
    test; callers normally leave it at its default.

    Examples:

    >>> IS_LINE_JUNK('\n')
    True
    >>> IS_LINE_JUNK('  #   \n')
    True
    >>> IS_LINE_JUNK('hello\n')
    False
    """

    if pat(line) is None:
        return False
    return True
1093
def IS_CHARACTER_JUNK(ch, ws=" \t"):
    r"""
    Return True for an ignorable character, False otherwise.

    With the default `ws`, a character is ignorable iff it is a space or a
    tab.  The test performed is simply membership of `ch` in `ws`.

    Examples:

    >>> IS_CHARACTER_JUNK(' ')
    True
    >>> IS_CHARACTER_JUNK('\t')
    True
    >>> IS_CHARACTER_JUNK('\n')
    False
    >>> IS_CHARACTER_JUNK('x')
    False
    """

    is_junk = ch in ws
    return is_junk
1111
1112
def unified_diff(a, b, fromfile='', tofile='', fromfiledate='',
                 tofiledate='', n=3, lineterm='\n'):
    r"""
    Compare two sequences of lines; generate the delta as a unified diff.

    Unified diffs are a compact way of showing line changes and a few
    lines of context.  The number of context lines is set by 'n' which
    defaults to three.

    By default, the diff control lines (those with ---, +++, or @@) are
    created with a trailing newline.  This is helpful so that inputs
    created from file.readlines() result in diffs that are suitable for
    file.writelines() since both the inputs and outputs have trailing
    newlines.

    For inputs that do not have trailing newlines, set the lineterm
    argument to "" so that the output will be uniformly newline free.

    The unidiff format normally has a header for filenames and modification
    times.  Any or all of these may be specified using strings for
    'fromfile', 'tofile', 'fromfiledate', and 'tofiledate'.  The modification
    times are normally expressed in the format returned by time.ctime().

    Each hunk header has the form "@@ -start,count +start,count @@".  A
    range that contains no lines (a pure insertion or a pure deletion with
    no surrounding context) is reported as starting at the line just
    before it, so a hunk that adds lines to an empty sequence reads
    "-0,0".

    Nothing at all is generated when the two sequences are equal.

    Example:

    >>> for line in unified_diff('one two three four'.split(),
    ...             'zero one tree four'.split(), 'Original', 'Current',
    ...             'Sat Jan 26 23:30:50 1991', 'Fri Jun 06 10:20:52 2003',
    ...             lineterm=''):
    ...     print line
    --- Original Sat Jan 26 23:30:50 1991
    +++ Current Fri Jun 06 10:20:52 2003
    @@ -1,4 +1,4 @@
    +zero
     one
    -two
    -three
    +tree
     four
    """

    started = False
    for group in SequenceMatcher(None,a,b).get_grouped_opcodes(n):
        if not started:
            # the file header is emitted once, and only if there is a hunk
            yield '--- %s %s%s' % (fromfile, fromfiledate, lineterm)
            yield '+++ %s %s%s' % (tofile, tofiledate, lineterm)
            started = True
        i1, i2, j1, j2 = group[0][1], group[-1][2], group[0][3], group[-1][4]
        a_count, b_count = i2 - i1, j2 - j1
        # Line numbers are 1-based, but an empty range is considered to
        # begin at the line just before it, which is the 0-based index.
        a_start, b_start = i1 + 1, j1 + 1
        if not a_count:
            a_start = i1
        if not b_count:
            b_start = j1
        yield "@@ -%d,%d +%d,%d @@%s" % (a_start, a_count,
                                         b_start, b_count, lineterm)
        for tag, i1, i2, j1, j2 in group:
            if tag == 'equal':
                for line in a[i1:i2]:
                    yield ' ' + line
                continue
            # a 'replace' contributes to both the '-' and the '+' lines
            if tag == 'replace' or tag == 'delete':
                for line in a[i1:i2]:
                    yield '-' + line
            if tag == 'replace' or tag == 'insert':
                for line in b[j1:j2]:
                    yield '+' + line
1173
1174 # See http://www.unix.org/single_unix_specification/
def context_diff(a, b, fromfile='', tofile='',
                 fromfiledate='', tofiledate='', n=3, lineterm='\n'):
    r"""
    Compare two sequences of lines; generate the delta as a context diff.

    A context diff shows each group of changes together with up to 'n'
    (default three) unchanged lines on either side.  For every group the
    affected lines of `a` are listed first, under a "*** range ****"
    header, followed by the affected lines of `b` under a
    "--- range ----" header.  A side's lines are listed only when the
    group changes that side.

    The control lines (those with *** or ---) end with `lineterm`, which
    defaults to a newline.  That suits inputs created by file.readlines(),
    since the result can then be handed straight to file.writelines().
    For inputs whose lines carry no trailing newline, pass lineterm=""
    so that the output is uniformly newline free.

    'fromfile', 'tofile', 'fromfiledate' and 'tofiledate' are strings
    placed in the two header lines that precede the first group; the
    modification times are normally expressed in the format returned by
    time.ctime().  All four default to empty strings.

    Example:

    >>> print ''.join(context_diff('one\ntwo\nthree\nfour\n'.splitlines(1),
    ...       'zero\none\ntree\nfour\n'.splitlines(1), 'Original', 'Current',
    ...       'Sat Jan 26 23:30:50 1991', 'Fri Jun 06 10:22:46 2003')),
    *** Original Sat Jan 26 23:30:50 1991
    --- Current Fri Jun 06 10:22:46 2003
    ***************
    *** 1,4 ****
      one
    ! two
    ! three
      four
    --- 1,4 ----
    + zero
      one
    ! tree
      four
    """

    markers = {'insert':'+ ', 'delete':'- ', 'replace':'! ', 'equal':'  '}
    header_pending = True
    for hunk in SequenceMatcher(None,a,b).get_grouped_opcodes(n):
        if header_pending:
            # the file header appears once, ahead of the first hunk
            header_pending = False
            yield '*** %s %s%s' % (fromfile, fromfiledate, lineterm)
            yield '--- %s %s%s' % (tofile, tofiledate, lineterm)

        a_lo, a_hi = hunk[0][1], hunk[-1][2]
        b_lo, b_hi = hunk[0][3], hunk[-1][4]
        tags = [opcode[0] for opcode in hunk]

        yield '***************%s' % (lineterm,)

        # "from" side: a range of fewer than two lines is written as a
        # single number
        if a_hi - a_lo < 2:
            yield '*** %d ****%s' % (a_hi, lineterm)
        else:
            yield '*** %d,%d ****%s' % (a_lo+1, a_hi, lineterm)
        # list the "from" lines only if something was replaced or deleted
        if 'replace' in tags or 'delete' in tags:
            for tag, lo, hi, _, _ in hunk:
                if tag == 'insert':
                    continue
                for line in a[lo:hi]:
                    yield markers[tag] + line

        # "to" side, same conventions
        if b_hi - b_lo < 2:
            yield '--- %d ----%s' % (b_hi, lineterm)
        else:
            yield '--- %d,%d ----%s' % (b_lo+1, b_hi, lineterm)
        # list the "to" lines only if something was replaced or inserted
        if 'replace' in tags or 'insert' in tags:
            for tag, _, _, lo, hi in hunk:
                if tag == 'delete':
                    continue
                for line in b[lo:hi]:
                    yield markers[tag] + line
1249
def ndiff(a, b, linejunk=None, charjunk=IS_CHARACTER_JUNK):
    r"""
    Compare `a` and `b` (lists of strings); return a `Differ`-style delta.

    This is a convenience wrapper: it builds a `Differ` from the two
    filter arguments and returns the result of its compare(a, b).

    Optional keyword parameters `linejunk` and `charjunk` are for filter
    functions (or None):

    - linejunk: A function that should accept a single string argument, and
      return true iff the string is junk.  The default is None, and is
      recommended; as of Python 2.3, an adaptive notion of "noise" lines is
      used that does a good job on its own.

    - charjunk: A function that should accept a string of length 1. The
      default is module-level function IS_CHARACTER_JUNK, which filters out
      whitespace characters (a blank or tab; note: bad idea to include newline
      in this!).

    Tools/scripts/ndiff.py is a command-line front-end to this function.

    Example:

    >>> diff = ndiff('one\ntwo\nthree\n'.splitlines(1),
    ...              'ore\ntree\nemu\n'.splitlines(1))
    >>> print ''.join(diff),
    - one
    ?  ^
    + ore
    ?  ^
    - two
    - three
    ?  -
    + tree
    + emu
    """
    differ = Differ(linejunk, charjunk)
    delta = differ.compare(a, b)
    return delta
1285
1286 def _mdiff(fromlines, tolines, context=None, linejunk=None,
1287            charjunk=IS_CHARACTER_JUNK):
1288     """Returns generator yielding marked up from/to side by side differences.
1289
1290     Arguments:
1291     fromlines -- list of text lines to compared to tolines
1292     tolines -- list of text lines to be compared to fromlines
1293     context -- number of context lines to display on each side of difference,
1294                if None, all from/to text lines will be generated.
1295     linejunk -- passed on to ndiff (see ndiff documentation)
1296     charjunk -- passed on to ndiff (see ndiff documentation)
1297
1298     This function returns an interator which returns a tuple:
1299     (from line tuple, to line tuple, boolean flag)
1300
1301     from/to line tuple -- (line num, line text)
1302         line num -- integer or None (to indicate a context seperation)
1303         line text -- original line text with following markers inserted:
1304             '\0+' -- marks start of added text
1305             '\0-' -- marks start of deleted text
1306             '\0^' -- marks start of changed text
1307             '\1' -- marks end of added/deleted/changed text
1308
1309     boolean flag -- None indicates context separation, True indicates
1310         either "from" or "to" line contains a change, otherwise False.
1311
1312     This function/iterator was originally developed to generate side by side
1313     file difference for making HTML pages (see HtmlDiff class for example
1314     usage).
1315
1316     Note, this function utilizes the ndiff function to generate the side by
1317     side difference markup.  Optional ndiff arguments may be passed to this
1318     function and they in turn will be passed to ndiff.
1319     """
1320     import re
1321
1322     # regular expression for finding intraline change indices
1323     change_re = re.compile('(\++|\-+|\^+)')
1324
1325     # create the difference iterator to generate the differences
1326     diff_lines_iterator = ndiff(fromlines,tolines,linejunk,charjunk)
1327
    def _make_line(lines, format_key, side, num_lines=[0,0]):
        """Returns a (line number, marked-up text) tuple for one side.

        lines -- list of lines from the ndiff generator to produce a line of
                 text from.  When producing the line of text to return, the
                 lines used are removed from the front of this list.
        format_key -- '+' return first line in list with "add" markup around
                          the entire line.
                      '-' return first line in list with "delete" markup around
                          the entire line.
                      '?' return first line in list with add/delete/change
                          intraline markup (indices obtained from second line);
                          both lines are removed from the list.
                      None return first line in list with no markup
        side -- index into the num_lines list (0=from,1=to)
        num_lines -- from/to current line number.  This is NOT intended to be a
                     passed parameter.  It is present as a keyword argument to
                     maintain memory of the current line numbers between calls
                     of this function.

        In every case the two-character ndiff prefix is stripped from the
        returned text, and the line counter for `side` is advanced by one.

        Note, this function is purposefully not defined at the module scope so
        that data it needs from its parent function (within whose context it
        is defined) does not need to be of module scope.
        """
        # The default list is deliberately shared between calls: it is the
        # running [from, to] line counter.
        num_lines[side] += 1
        # Handle case where no markup is to be added: just return the line
        # number together with the text minus its two-character prefix.
        if format_key is None:
            return (num_lines[side],lines.pop(0)[2:])
        # Handle case of intraline changes
        if format_key == '?':
            # first line is the text, second is the '?' guide line under it
            text, markers = lines.pop(0), lines.pop(0)
            # find intraline changes (store change type and indices in tuples)
            sub_info = []
            def record_sub_info(match_object,sub_info=sub_info):
                # record [marker character, (start, end)] for each run of
                # markers; returning the match leaves the string unchanged
                sub_info.append([match_object.group(1)[0],match_object.span()])
                return match_object.group(1)
            # sub() is used only to visit every match; its result is unused
            change_re.sub(record_sub_info,markers)
            # process each tuple inserting our special marks that won't be
            # noticed by an xml/html escaper.  Working from the last span to
            # the first keeps the earlier indices valid as text grows.
            for key,(begin,end) in sub_info[::-1]:
                text = text[0:begin]+'\0'+key+text[begin:end]+'\1'+text[end:]
            # the spans were relative to the prefixed line, so the prefix is
            # stripped only after the marks are in place
            text = text[2:]
        # Handle case of add/delete entire line
        else:
            text = lines.pop(0)[2:]
            # if nothing at all follows the prefix, insert a space so there
            # is something for the user to highlight and see.
            if not text:
                text = ' '
            # insert marks that won't be noticed by an xml/html escaper.
            text = '\0' + format_key + text + '\1'
        # Return the line number for this side together with the marked-up
        # text; the marks are left in place for the caller to interpret.
        return (num_lines[side],text)
1383
    def _line_iterator():
        """Yields from/to lines of text with a change indication.

        This function is an iterator.  It itself pulls lines from a
        differencing iterator, processes them and yields them.  When it can
        it yields both a "from" and a "to" line, otherwise it will yield one
        or the other.  In addition to yielding the lines of from/to text, a
        boolean flag is yielded to indicate if the text line(s) have
        differences in them.

        Each item yielded is a (from line, to line, flag) tuple.  A line is
        either a (line number, text) tuple as returned by _make_line, the
        placeholder ('', '\\n') used for padding, or None when that side has
        nothing to show for this item.

        Note, this function is purposefully not defined at the module scope so
        that data it needs from its parent function (within whose context it
        is defined) does not need to be of module scope.
        """
        lines = []
        # num_blanks_pending is the running imbalance between one-sided
        # yields: +1 for every add line, -1 for every delete line.
        # num_blanks_to_yield is the amount of that imbalance to pay off with
        # placeholder lines before the next yield at the bottom of the loop.
        num_blanks_pending, num_blanks_to_yield = 0, 0
        while True:
            # Load up next 4 lines so we can look ahead, create strings which
            # are a concatenation of the first character of each of the 4 lines
            # so we can do some very readable comparisons.
            while len(lines) < 4:
                try:
                    lines.append(diff_lines_iterator.next())
                except StopIteration:
                    # 'X' marks "no more diff lines" in the lookahead window
                    lines.append('X')
            s = ''.join([line[0] for line in lines])
            # NOTE: the order of the tests below matters -- longer, more
            # specific prefixes are tried before the shorter ones they contain.
            if s.startswith('X'):
                # When no more lines, pump out any remaining blank lines so the
                # corresponding add/delete lines get a matching blank line so
                # all line pairs get yielded at the next level.
                num_blanks_to_yield = num_blanks_pending
            elif s.startswith('-?+?'):
                # simple intraline change
                yield _make_line(lines,'?',0), _make_line(lines,'?',1), True
                continue
            elif s.startswith('--++'):
                # in delete block, add block coming: we do NOT want to get
                # caught up on blank lines yet, just process the delete line
                num_blanks_pending -= 1
                yield _make_line(lines,'-',0), None, True
                continue
            elif s.startswith('--?+') or s.startswith('--+') or \
                 s.startswith('- '):
                # in delete block and see a intraline change or unchanged line
                # coming: yield the delete line and then blanks
                from_line,to_line = _make_line(lines,'-',0), None
                num_blanks_to_yield,num_blanks_pending = num_blanks_pending-1,0
            elif s.startswith('-+?'):
                # intraline change (only the "to" line has a guide line)
                yield _make_line(lines,None,0), _make_line(lines,'?',1), True
                continue
            elif s.startswith('-?+'):
                # intraline change (only the "from" line has a guide line)
                yield _make_line(lines,'?',0), _make_line(lines,None,1), True
                continue
            elif s.startswith('-'):
                # delete FROM line
                num_blanks_pending -= 1
                yield _make_line(lines,'-',0), None, True
                continue
            elif s.startswith('+--'):
                # in add block, delete block coming: we do NOT want to get
                # caught up on blank lines yet, just process the add line
                num_blanks_pending += 1
                yield None, _make_line(lines,'+',1), True
                continue
            elif s.startswith('+ ') or s.startswith('+-'):
                # will be leaving an add block: yield blanks then add line
                from_line, to_line = None, _make_line(lines,'+',1)
                num_blanks_to_yield,num_blanks_pending = num_blanks_pending+1,0
            elif s.startswith('+'):
                # inside an add block, yield the add line
                num_blanks_pending += 1
                yield None, _make_line(lines,'+',1), True
                continue
            elif s.startswith(' '):
                # unchanged text, yield it to both sides; the first call is
                # given a copy of the list so that only the second call
                # actually consumes the line
                yield _make_line(lines[:],None,0),_make_line(lines,None,1),False
                continue
            # Catch up on the blank lines so when we yield the next from/to
            # pair, they are lined up.
            while(num_blanks_to_yield < 0):
                num_blanks_to_yield += 1
                yield None,('','\n'),True
            while(num_blanks_to_yield > 0):
                num_blanks_to_yield -= 1
                yield ('','\n'),None,True
            if s.startswith('X'):
                # NOTE(review): raising StopIteration inside a generator and
                # the .next() method call above are Python 2 idioms; under
                # PEP 479 (Python 3.7+) this raise surfaces as RuntimeError,
                # where a bare return would end the generator cleanly.
                raise StopIteration
            else:
                yield from_line,to_line,True
1475
1476     def _line_pair_iterator():
1477         """Yields from/to lines of text with a change indication.
1478
1479         This function is an iterator.  It itself pulls lines from the line
1480         iterator.  Its difference from that iterator is that this function
1481         always yields a pair of from/to text lines (with the change
1482         indication).  If necessary it will collect single from/to lines
1483         until it has a matching pair from/to pair to yield.
1484
1485         Note, this function is purposefully not defined at the module scope so
1486         that data it needs from its parent function (within whose context it
1487         is defined) does not need to be of module scope.
1488         """
1489         line_iterator = _line_iterator()
1490         fromlines,tolines=[],[]
1491         while True:
1492             # Collecting lines of text until we have a from/to pair
1493             while (len(fromlines)==0 or len(tolines)==0):
1494                 from_line, to_line, found_diff =line_iterator.next()
1495                 if from_line is not None:
1496                     fromlines.append((from_line,found_diff))
1497                 if to_line is not None:
1498                     tolines.append((to_line,found_diff))
1499             # Once we have a pair, remove them from the collection and yield it
1500             from_line, fromDiff = fromlines.pop(0)
1501             to_line, to_diff = tolines.pop(0)
1502             yield (from_line,to_line,fromDiff or to_diff)
1503
1504     # Handle case where user does not want context differencing, just yield
1505     # them up without doing anything else with them.
1506     line_pair_iterator = _line_pair_iterator()
1507     if context is None:
1508         while True:
1509             yield line_pair_iterator.next()
1510     # Handle case where user wants context differencing.  We must do some
1511     # storage of lines until we know for sure that they are to be yielded.
1512     else:
1513         context += 1
1514         lines_to_write = 0
1515         while True:
1516             # Store lines up until we find a difference, note use of a
1517             # circular queue because we only need to keep around what
1518             # we need for context.
1519             index, contextLines = 0, [None]*(context)
1520             found_diff = False
1521             while(found_diff is False):
1522                 from_line, to_line, found_diff = line_pair_iterator.next()
1523                 i = index % context
1524                 contextLines[i] = (from_line, to_line, found_diff)
1525                 index += 1
1526             # Yield lines that we have collected so far, but first yield
1527             # the user's separator.
1528             if index > context:
1529                 yield None, None, None
1530                 lines_to_write = context
1531             else:
1532                 lines_to_write = index
1533                 index = 0
1534             while(lines_to_write):
1535                 i = index % context
1536                 index += 1
1537                 yield contextLines[i]
1538                 lines_to_write -= 1
1539             # Now yield the context lines after the change
1540             lines_to_write = context-1
1541             while(lines_to_write):
1542                 from_line, to_line, found_diff = line_pair_iterator.next()
1543                 # If another change within the context, extend the context
1544                 if found_diff:
1545                     lines_to_write = context-1
1546                 else:
1547                     lines_to_write -= 1
1548                 yield from_line, to_line, found_diff
1549
1550
# Skeleton of the complete HTML document returned by HtmlDiff.make_file().
# It is filled in with the % operator using the mapping keys "styles",
# "table" and "legend".
_file_template = """
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
          "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">

<html>

<head>
    <meta http-equiv="Content-Type"
          content="text/html; charset=ISO-8859-1" />
    <title></title>
    <style type="text/css">%(styles)s
    </style>
</head>

<body>
    %(table)s%(legend)s
</body>

</html>"""

# CSS rules substituted for %(styles)s in _file_template.  The class names
# used here (diff, diff_header, diff_next, diff_add, diff_chg, diff_sub) are
# the ones appearing in _table_template, _legend and in the markup emitted
# by the HtmlDiff methods.
_styles = """
        table.diff {font-family:Courier; border:medium;}
        .diff_header {background-color:#e0e0e0}
        td.diff_header {text-align:right}
        .diff_next {background-color:#c0c0c0}
        .diff_add {background-color:#aaffaa}
        .diff_chg {background-color:#ffff77}
        .diff_sub {background-color:#ffaaaa}"""

# Skeleton of the side by side table built by HtmlDiff.make_table().  It is
# filled in with the % operator using the mapping keys "prefix" (makes the
# table id unique), "header_row" and "data_rows".
_table_template = """
    <table class="diff" id="difflib_chg_%(prefix)s_top"
           cellspacing="0" cellpadding="0" rules="groups" >
        <colgroup></colgroup> <colgroup></colgroup> <colgroup></colgroup>
        <colgroup></colgroup> <colgroup></colgroup> <colgroup></colgroup>
        %(header_row)s
        <tbody>
%(data_rows)s        </tbody>
    </table>"""

# Static legend table substituted for %(legend)s in _file_template; it
# explains the highlight colors and the (f)irst / (n)ext / (t)op links.
_legend = """
    <table class="diff" summary="Legends">
        <tr> <th colspan="2"> Legends </th> </tr>
        <tr> <td> <table border="" summary="Colors">
                      <tr><th> Colors </th> </tr>
                      <tr><td class="diff_add">&nbsp;Added&nbsp;</td></tr>
                      <tr><td class="diff_chg">Changed</td> </tr>
                      <tr><td class="diff_sub">Deleted</td> </tr>
                  </table></td>
             <td> <table border="" summary="Links">
                      <tr><th colspan="2"> Links </th> </tr>
                      <tr><td>(f)irst change</td> </tr>
                      <tr><td>(n)ext change</td> </tr>
                      <tr><td>(t)op</td> </tr>
                  </table></td> </tr>
    </table>"""
1606
class HtmlDiff(object):
    """For producing HTML side by side comparison with change highlights.

    This class can be used to create an HTML table (or a complete HTML file
    containing the table) showing a side by side, line by line comparison
    of text with inter-line and intra-line change highlights.  The table can
    be generated in either full or contextual difference mode.

    The following methods are provided for HTML generation:

    make_table -- generates HTML for a single side by side table
    make_file -- generates complete HTML file with a single side by side table

    See tools/scripts/diff.py for an example usage of this class.
    """

    # Templates are exposed as class attributes so that they can be
    # overridden per subclass or per instance.
    _file_template = _file_template
    _styles = _styles
    _table_template = _table_template
    _legend = _legend
    # Counter shared by all instances; each make_table() call consumes one
    # value to build anchor prefixes that are unique within the process.
    _default_prefix = 0

    def __init__(self,tabsize=8,wrapcolumn=None,linejunk=None,
                 charjunk=IS_CHARACTER_JUNK):
        """HtmlDiff instance initializer

        Arguments:
        tabsize -- tab stop spacing, defaults to 8.
        wrapcolumn -- column number where lines are broken and wrapped,
            defaults to None where lines are not wrapped.
        linejunk,charjunk -- keyword arguments passed into ndiff() (used by
            HtmlDiff() to generate the side by side HTML differences).  See
            ndiff() documentation for argument default values and descriptions.
        """
        self._tabsize = tabsize
        self._wrapcolumn = wrapcolumn
        self._linejunk = linejunk
        self._charjunk = charjunk

    def make_file(self,fromlines,tolines,fromdesc='',todesc='',context=False,
                  numlines=5):
        """Returns HTML file of side by side comparison with change highlights

        Arguments:
        fromlines -- list of "from" lines
        tolines -- list of "to" lines
        fromdesc -- "from" file column header string
        todesc -- "to" file column header string
        context -- set to True for contextual differences (defaults to False
            which shows full differences).
        numlines -- number of context lines.  When context is set True,
            controls number of lines displayed before and after the change.
            When context is False, controls the number of lines to place
            the "next" link anchors before the next change (so click of
            "next" link jumps to just before the change).
        """

        table = self.make_table(fromlines,tolines,fromdesc,todesc,
                                context=context,numlines=numlines)
        return self._file_template % dict(
            styles = self._styles,
            legend = self._legend,
            table = table)

    def _tab_newline_replace(self,fromlines,tolines):
        """Returns from/to line lists with tabs expanded and newlines removed.

        Instead of tab characters being replaced by the number of spaces
        needed to fill in to the next tab stop, this function will fill
        the space with tab characters.  This is done so that the difference
        algorithms can identify changes in a file when tabs are replaced by
        spaces and vice versa.  At the end of the HTML generation, the tab
        characters will be replaced with a nonbreakable space.
        """
        def expand_tabs(line):
            # hide real spaces
            line = line.replace(' ','\0')
            # expand tabs into spaces
            line = line.expandtabs(self._tabsize)
            # replace spaces from expanded tabs back into tab characters
            # (we'll replace them with markup after we do differencing)
            line = line.replace(' ','\t')
            # restore the real spaces and drop the trailing newline(s)
            return line.replace('\0',' ').rstrip('\n')
        fromlines = [expand_tabs(line) for line in fromlines]
        tolines = [expand_tabs(line) for line in tolines]
        return fromlines,tolines

    def _split_line(self,data_list,line_num,text):
        """Builds list of text lines by splitting text lines at wrap point

        Appends one (line_num, text) tuple to data_list for every wrapped
        segment of text.  The first segment keeps line_num; every
        continuation segment is numbered '>'.  Text that does not need
        wrapping is appended unchanged as a single tuple.

        Inside text, '\\0' followed by one marker character opens a
        highlighted region and '\\1' closes it; these three control
        characters per region do not count toward the wrap column.

        The segments are produced in a loop rather than by recursion so
        that a very long line combined with a small wrap column cannot
        exceed the interpreter's recursion limit.
        """
        wrap_at = self._wrapcolumn

        # If blank line or context separator, just add it to the output
        # list.  The same is done for a missing or non-positive wrap column:
        # such a column could never consume a character, so wrapping would
        # never terminate.
        if not line_num or not wrap_at or wrap_at <= 0:
            data_list.append((line_num,text))
            return

        while True:
            # if the (remaining) text doesn't need wrapping, add it and stop
            size = len(text)
            if (size <= wrap_at) or ((size-(text.count('\0')*3)) <= wrap_at):
                data_list.append((line_num,text))
                return

            # scan text looking for the wrap point, keeping track if the
            # wrap point is inside markers
            i = 0       # index into text, counts control characters too
            n = 0       # number of displayed characters consumed so far
            mark = ''   # marker character of the region we are inside, if any
            while n < wrap_at and i < size:
                if text[i] == '\0':
                    i += 1
                    mark = text[i]
                    i += 1
                elif text[i] == '\1':
                    i += 1
                    mark = ''
                else:
                    i += 1
                    n += 1

            # wrap point is inside text, break it up into separate lines
            line1 = text[:i]
            line2 = text[i:]

            # if wrap point is inside markers, place end marker at end of
            # first line and start marker at beginning of second line
            # because each line will have its own table tag markup around it.
            if mark:
                line1 = line1 + '\1'
                line2 = '\0' + mark + line2

            # tack on first line onto the output list
            data_list.append((line_num,line1))

            # go around again to wrap the remaining text as a continuation
            line_num, text = '>', line2

    def _line_wrapper(self,diffs):
        """Returns iterator that splits (wraps) mdiff text lines

        Each (fromdata, todata, flag) record pulled from diffs is expanded
        into one record per wrapped segment.  When one side wraps into more
        segments than the other, the shorter side is padded with blank
        ('', ' ') entries.  Records whose flag is None (context separators)
        are passed through untouched.
        """
        blank = ('',' ')

        # pull from/to data and flags from mdiff iterator
        for fromdata,todata,flag in diffs:
            # check for context separators and pass them through
            if flag is None:
                yield fromdata,todata,flag
                continue
            (fromline,fromtext),(toline,totext) = fromdata,todata
            # for each from/to line split it at the wrap column to form
            # list of text lines.
            fromlist,tolist = [],[]
            self._split_line(fromlist,fromline,fromtext)
            self._split_line(tolist,toline,totext)
            # pad the shorter side with blank lines so both sides have the
            # same number of segments, then yield them in pairs
            rows = max(len(fromlist),len(tolist))
            fromlist.extend([blank]*(rows-len(fromlist)))
            tolist.extend([blank]*(rows-len(tolist)))
            for fromdata,todata in zip(fromlist,tolist):
                yield fromdata,todata,flag

    def _collect_lines(self,diffs):
        """Collects mdiff output into separate lists

        Before storing the mdiff from/to data into a list, it is converted
        into a single line of text with HTML markup.  Context separator
        records (flag is None) are stored as None in both line lists.

        Returns the three parallel lists fromlist, tolist and flaglist.
        """

        fromlist,tolist,flaglist = [],[],[]
        # pull from/to data and flags from mdiff style iterator
        for fromdata,todata,flag in diffs:
            if flag is None:
                # context separator: there is no line data to format
                fromlist.append(None)
                tolist.append(None)
            else:
                # store HTML markup of the lines into the lists
                fromlist.append(self._format_line(0,flag,*fromdata))
                tolist.append(self._format_line(1,flag,*todata))
            flaglist.append(flag)
        return fromlist,tolist,flaglist

    def _format_line(self,side,flag,linenum,text):
        """Returns HTML markup of "from" / "to" text lines

        side -- 0 or 1 indicating "from" or "to" text
        flag -- indicates if difference on line
        linenum -- line number (used for line number column)
        text -- line text to be marked up
        """
        try:
            linenum = '%d' % linenum
            id_attr = ' id="%s%s"' % (self._prefix[side],linenum)
        except TypeError:
            # handle blank lines where linenum is '>' or ''
            id_attr = ''
        # replace those things that would get confused with HTML symbols
        text=text.replace("&","&amp;").replace(">","&gt;").replace("<","&lt;")

        # make space non-breakable so they don't get compressed or line wrapped
        text = text.replace(' ','&nbsp;').rstrip()

        return '<td class="diff_header"%s>%s</td><td nowrap="nowrap">%s</td>' \
               % (id_attr,linenum,text)

    def _make_prefix(self):
        """Create unique anchor prefixes"""

        # Generate a unique anchor prefix so multiple tables
        # can exist on the same HTML page without conflicts.
        fromprefix = "from%d_" % HtmlDiff._default_prefix
        toprefix = "to%d_" % HtmlDiff._default_prefix
        HtmlDiff._default_prefix += 1
        # store prefixes so line format method has access
        self._prefix = [fromprefix,toprefix]

    def _convert_flags(self,fromlist,tolist,flaglist,context,numlines):
        """Makes list of "next" links

        Returns (fromlist, tolist, flaglist, next_href, next_id) where
        next_href and next_id are lists parallel to flaglist holding the
        link markup and the anchor id attribute for each row ('' where a
        row has none).  When flaglist is empty, single-row placeholder
        lists are returned instead so that the caller always has one row.
        """

        # all anchor names will be generated using the unique "to" prefix
        toprefix = self._prefix[1]

        # process change flags, generating middle column of next anchors/links
        next_id = ['']*len(flaglist)
        next_href = ['']*len(flaglist)
        num_chg, in_change = 0, False
        last = 0
        for i,flag in enumerate(flaglist):
            if flag:
                if not in_change:
                    in_change = True
                    last = i
                    # at the beginning of a change, drop an anchor a few lines
                    # (the context lines) before the change for the previous
                    # link
                    anchor_row = max([0,i-numlines])
                    next_id[anchor_row] = ' id="difflib_chg_%s_%d"' % (
                        toprefix,num_chg)
                    # at the beginning of a change, drop a link to the next
                    # change
                    num_chg += 1
                    next_href[last] = '<a href="#difflib_chg_%s_%d">n</a>' % (
                         toprefix,num_chg)
            else:
                in_change = False
        # check for cases where there is no content to avoid exceptions
        if not flaglist:
            flaglist = [False]
            next_id = ['']
            next_href = ['']
            last = 0
            if context:
                fromlist = ['<td></td><td>&nbsp;No Differences Found&nbsp;</td>']
                tolist = fromlist
            else:
                fromlist = tolist = ['<td></td><td>&nbsp;Empty File&nbsp;</td>']
        # if not a change on first line, drop a link
        if not flaglist[0]:
            next_href[0] = '<a href="#difflib_chg_%s_0">f</a>' % toprefix
        # redo the last link to link to the top
        next_href[last] = '<a href="#difflib_chg_%s_top">t</a>' % (toprefix)

        return fromlist,tolist,flaglist,next_href,next_id

    def make_table(self,fromlines,tolines,fromdesc='',todesc='',context=False,
                   numlines=5):
        """Returns HTML table of side by side comparison with change highlights

        Arguments:
        fromlines -- list of "from" lines
        tolines -- list of "to" lines
        fromdesc -- "from" file column header string
        todesc -- "to" file column header string
        context -- set to True for contextual differences (defaults to False
            which shows full differences).
        numlines -- number of context lines.  When context is set True,
            controls number of lines displayed before and after the change.
            When context is False, controls the number of lines to place
            the "next" link anchors before the next change (so click of
            "next" link jumps to just before the change).

        Note that fromdesc and todesc are inserted into the header row as
        given, without HTML escaping.
        """

        # make unique anchor prefixes so that multiple tables may exist
        # on the same page without conflict.
        self._make_prefix()

        # change tabs to spaces before it gets more difficult after we insert
        # markup
        fromlines,tolines = self._tab_newline_replace(fromlines,tolines)

        # create diffs iterator which generates side by side from/to data
        if context:
            context_lines = numlines
        else:
            context_lines = None
        diffs = _mdiff(fromlines,tolines,context_lines,linejunk=self._linejunk,
                      charjunk=self._charjunk)

        # set up iterator to wrap lines that exceed desired width
        if self._wrapcolumn:
            diffs = self._line_wrapper(diffs)

        # collect up from/to lines and flags into lists (also format the lines)
        fromlist,tolist,flaglist = self._collect_lines(diffs)

        # process change flags, generating middle column of next anchors/links
        fromlist,tolist,flaglist,next_href,next_id = self._convert_flags(
            fromlist,tolist,flaglist,context,numlines)

        # Accumulate the row markup in a list and join it once at the end.
        # A plain list has no restriction on the kind of string it holds,
        # so unicode rows with non-ASCII characters are handled as well.
        rows = []
        fmt = '            <tr><td class="diff_next"%s>%s</td>%s' + \
              '<td class="diff_next">%s</td>%s</tr>\n'
        for i,flag in enumerate(flaglist):
            if flag is None:
                # mdiff yields None on separator lines skip the bogus ones
                # generated for the first line
                if i > 0:
                    rows.append('        </tbody>        \n        <tbody>\n')
            else:
                rows.append(fmt % (next_id[i],next_href[i],fromlist[i],
                                   next_href[i],tolist[i]))
        if fromdesc or todesc:
            header_row = '<thead><tr>%s%s%s%s</tr></thead>' % (
                '<th class="diff_next"><br /></th>',
                '<th colspan="2" class="diff_header">%s</th>' % fromdesc,
                '<th class="diff_next"><br /></th>',
                '<th colspan="2" class="diff_header">%s</th>' % todesc)
        else:
            header_row = ''

        table = self._table_template % dict(
            data_rows=''.join(rows),
            header_row=header_row,
            prefix=self._prefix[1])

        # turn the in-band change markers and the preserved tab characters
        # into their final HTML form
        return table.replace('\0+','<span class="diff_add">'). \
                     replace('\0-','<span class="diff_sub">'). \
                     replace('\0^','<span class="diff_chg">'). \
                     replace('\1','</span>'). \
                     replace('\t','&nbsp;')
1955
# Remove the name `re` from this module's namespace.
# NOTE(review): presumably `re` is imported earlier in the file only for
# module-level setup and is not meant to be part of the public namespace --
# confirm against the import site, which is outside this section.
del re
1957
def restore(delta, which):
    r"""
    Generate one of the two sequences that generated a delta.

    Given a `delta` produced by `Differ.compare()` or `ndiff()`, extract
    lines originating from file 1 or 2 (parameter `which`), stripping off line
    prefixes.

    `which` is converted with int() and must then equal 1 or 2; any other
    value results in a ValueError.  Because this function is a generator,
    that error is raised when iteration starts, not when restore() is
    called.

    Examples:

    >>> diff = ndiff('one\ntwo\nthree\n'.splitlines(1),
    ...              'ore\ntree\nemu\n'.splitlines(1))
    >>> diff = list(diff)
    >>> print ''.join(restore(diff, 1)),
    one
    two
    three
    >>> print ''.join(restore(diff, 2)),
    ore
    tree
    emu
    """
    try:
        tag = {1: "- ", 2: "+ "}[int(which)]
    except KeyError:
        # Call form of raise: identical behaviour to the old
        # "raise ValueError, (...)" statement, but also accepted by
        # interpreters that no longer support the comma form.
        raise ValueError('unknown delta choice (must be 1 or 2): %r'
                         % which)
    # Lines common to both sequences start with two spaces; lines unique to
    # the requested sequence start with its tag.  Everything else (lines of
    # the other sequence, "? " hint lines) is dropped.
    prefixes = ("  ", tag)
    for line in delta:
        if line[:2] in prefixes:
            yield line[2:]
1989
1990 def _test():
1991     import doctest, difflib
1992     return doctest.testmod(difflib)
1993
# When this file is executed as a script (rather than imported), run the
# module's doctest suite via _test().
if __name__ == "__main__":
    _test()