cola/diffparse.py

   1 from __future__ import absolute_import, division, print_function, unicode_literals
   2 import math
   3 import re
   4 from collections import Counter
   5 from itertools import groupby
   6
   7 from . import compat
   8
   9
# One-character prefixes that identify the type of a line inside a diff hunk.
DIFF_CONTEXT = ' '  # line counted on both the old and the new side
DIFF_ADDITION = '+'  # line counted on the new side only
DIFF_DELETION = '-'  # line counted on the old side only
DIFF_NO_NEWLINE = '\\'  # first character of a "\ No newline at end of file" line
  14
  15
  16 def parse_range_str(range_str):
  17     if ',' in range_str:
  18         begin, end = range_str.split(',', 1)
  19         return int(begin), int(end)
  20     return int(range_str), 1
  21
  22
  23 def _format_range(start, count):
  24     if count == 1:
  25         return str(start)
  26     return '%d,%d' % (start, count)
  27
  28
  29 def _format_hunk_header(old_start, old_count, new_start, new_count, heading=''):
  30     return '@@ -%s +%s @@%s\n' % (
  31         _format_range(old_start, old_count),
  32         _format_range(new_start, new_count),
  33         heading,
  34     )
  35
  36
  37 def digits(number):
  38     """Return the number of digits needed to display a number"""
  39     if number >= 0:
  40         result = int(math.log10(number)) + 1
  41     else:
  42         result = 1
  43     return result
  44
  45
  46 class LineCounter(object):
  47     """Keep track of a diff range's values"""
  48
  49     def __init__(self, value=0, max_value=-1):
  50         self.value = value
  51         self.max_value = max_value
  52         self._initial_max_value = max_value
  53
  54     def reset(self):
  55         """Reset the max counter and return self for convenience"""
  56         self.max_value = self._initial_max_value
  57         return self
  58
  59     def parse(self, range_str):
  60         """Parse a diff range and setup internal state"""
  61         start, count = parse_range_str(range_str)
  62         self.value = start
  63         self.max_value = max(start + count - 1, self.max_value)
  64
  65     def tick(self, amount=1):
  66         """Return the current value and increment to the next"""
  67         value = self.value
  68         self.value += amount
  69         return value
  70
  71
  72 class DiffLines(object):
  73     """Parse diffs and gather line numbers"""
  74
  75     EMPTY = -1
  76     DASH = -2
  77
  78     def __init__(self):
  79         self.merge = False
  80
  81         # diff <old> <new>
  82         # merge <ours> <theirs> <new>
  83         self.old = LineCounter()
  84         self.new = LineCounter()
  85         self.ours = LineCounter()
  86         self.theirs = LineCounter()
  87
  88     def digits(self):
  89         return digits(
  90             max(
  91                 self.old.max_value,
  92                 self.new.max_value,
  93                 self.ours.max_value,
  94                 self.theirs.max_value,
  95             )
  96         )
  97
  98     def parse(self, diff_text):
  99         lines = []
 100         diff_state = 1
 101         state = initial_state = 0
 102         merge = self.merge = False
 103         no_newline = r'\ No newline at end of file'
 104
 105         old = self.old.reset()
 106         new = self.new.reset()
 107         ours = self.ours.reset()
 108         theirs = self.theirs.reset()
 109
 110         for text in diff_text.split('\n'):
 111             if text.startswith('@@ -'):
 112                 parts = text.split(' ', 4)
 113                 if parts[0] == '@@' and parts[3] == '@@':
 114                     state = diff_state
 115                     old.parse(parts[1][1:])
 116                     new.parse(parts[2][1:])
 117                     lines.append((self.DASH, self.DASH))
 118                     continue
 119             if text.startswith('@@@ -'):
 120                 self.merge = merge = True
 121                 parts = text.split(' ', 5)
 122                 if parts[0] == '@@@' and parts[4] == '@@@':
 123                     state = diff_state
 124                     ours.parse(parts[1][1:])
 125                     theirs.parse(parts[2][1:])
 126                     new.parse(parts[3][1:])
 127                     lines.append((self.DASH, self.DASH, self.DASH))
 128                     continue
 129             if state == initial_state or text.rstrip() == no_newline:
 130                 if merge:
 131                     lines.append((self.EMPTY, self.EMPTY, self.EMPTY))
 132                 else:
 133                     lines.append((self.EMPTY, self.EMPTY))
 134             elif not merge and text.startswith('-'):
 135                 lines.append((old.tick(), self.EMPTY))
 136             elif merge and text.startswith('- '):
 137                 lines.append((ours.tick(), self.EMPTY, self.EMPTY))
 138             elif merge and text.startswith(' -'):
 139                 lines.append((self.EMPTY, theirs.tick(), self.EMPTY))
 140             elif merge and text.startswith('--'):
 141                 lines.append((ours.tick(), theirs.tick(), self.EMPTY))
 142             elif not merge and text.startswith('+'):
 143                 lines.append((self.EMPTY, new.tick()))
 144             elif merge and text.startswith('++'):
 145                 lines.append((self.EMPTY, self.EMPTY, new.tick()))
 146             elif merge and text.startswith('+ '):
 147                 lines.append((self.EMPTY, theirs.tick(), new.tick()))
 148             elif merge and text.startswith(' +'):
 149                 lines.append((ours.tick(), self.EMPTY, new.tick()))
 150             elif not merge and text.startswith(' '):
 151                 lines.append((old.tick(), new.tick()))
 152             elif merge and text.startswith('  '):
 153                 lines.append((ours.tick(), theirs.tick(), new.tick()))
 154             elif not text:
 155                 new.tick()
 156                 old.tick()
 157                 ours.tick()
 158                 theirs.tick()
 159             else:
 160                 state = initial_state
 161                 if merge:
 162                     lines.append((self.EMPTY, self.EMPTY, self.EMPTY))
 163                 else:
 164                     lines.append((self.EMPTY, self.EMPTY))
 165
 166         return lines
 167
 168
 169 class FormatDigits(object):
 170     """Format numbers for use in diff line numbers"""
 171
 172     DASH = DiffLines.DASH
 173     EMPTY = DiffLines.EMPTY
 174
 175     def __init__(self, dash='', empty=''):
 176         self.fmt = ''
 177         self.empty = ''
 178         self.dash = ''
 179         self._dash = dash or compat.uchr(0xB7)
 180         self._empty = empty or ' '
 181
 182     def set_digits(self, value):
 183         self.fmt = '%%0%dd' % value
 184         self.empty = self._empty * value
 185         self.dash = self._dash * value
 186
 187     def value(self, old, new):
 188         old_str = self._format(old)
 189         new_str = self._format(new)
 190         return '%s %s' % (old_str, new_str)
 191
 192     def merge_value(self, old, base, new):
 193         old_str = self._format(old)
 194         base_str = self._format(base)
 195         new_str = self._format(new)
 196         return '%s %s %s' % (old_str, base_str, new_str)
 197
 198     def number(self, value):
 199         return self.fmt % value
 200
 201     def _format(self, value):
 202         if value == self.DASH:
 203             result = self.dash
 204         elif value == self.EMPTY:
 205             result = self.empty
 206         else:
 207             result = self.number(value)
 208         return result
 209
 210
 211 class _HunkGrouper:
 212     _HUNK_HEADER_RE = re.compile(r'^@@ -([0-9,]+) \+([0-9,]+) @@(.*)')
 213
 214     def __init__(self):
 215         self.match = None
 216
 217     def __call__(self, line):
 218         match = self._HUNK_HEADER_RE.match(line)
 219         if match is not None:
 220             self.match = match
 221         return self.match
 222
 223
 224 class _DiffHunk:
 225     def __init__(self, old_start, start_offset, heading, content_lines):
 226         type_counts = Counter(line[:1] for line in content_lines)
 227         self.old_count = type_counts[DIFF_CONTEXT] + type_counts[DIFF_DELETION]
 228         self.new_count = type_counts[DIFF_CONTEXT] + type_counts[DIFF_ADDITION]
 229
 230         if self.old_count == 0:
 231             self.old_start = 0
 232         else:
 233             self.old_start = old_start
 234
 235         if self.new_count == 0:
 236             self.new_start = 0
 237         elif self.old_start == 0:
 238             self.new_start = 1
 239         else:
 240             self.new_start = self.old_start + start_offset
 241
 242         self.heading = heading
 243
 244         self.lines = [
 245             _format_hunk_header(
 246                 self.old_start,
 247                 self.old_count,
 248                 self.new_start,
 249                 self.new_count,
 250                 heading,
 251             ),
 252             *content_lines,
 253         ]
 254         self.content_lines = content_lines
 255
 256         self.changes = type_counts[DIFF_DELETION] + type_counts[DIFF_ADDITION]
 257
 258     def has_changes(self):
 259         return bool(self.changes)
 260
 261     def line_delta(self):
 262         return self.new_count - self.old_count
 263
 264
 265 class Patch:
 266     """Parse and rewrite diffs to produce edited patches
 267
 268     This parser is used for modifying the worktree and index by constructing
 269     temporary patches that are applied using "git apply".
 270
 271     """
 272
 273     def __init__(self, filename, hunks, header_line_count=0):
 274         self.filename = filename
 275         self.hunks = hunks
 276         self.header_line_count = header_line_count
 277
 278     @classmethod
 279     def parse(cls, filename, diff_text):
 280         header_line_count = 0
 281         hunks = []
 282         start_offset = 0
 283         for match, hunk_lines in groupby(diff_text.split('\n'), _HunkGrouper()):
 284             if match is not None:
 285                 # Skip the hunk range header line as it will be regenerated by the
 286                 # _DiffHunk.
 287                 next(hunk_lines)
 288                 hunk = _DiffHunk(
 289                     old_start=parse_range_str(match.group(1))[0],
 290                     start_offset=start_offset,
 291                     heading=match.group(3),
 292                     content_lines=[line + '\n' for line in hunk_lines if line],
 293                 )
 294                 if hunk.has_changes():
 295                     hunks.append(hunk)
 296                     start_offset += hunk.line_delta()
 297             else:
 298                 header_line_count = len(list(hunk_lines))
 299         return cls(filename, hunks, header_line_count)
 300
 301     def has_changes(self):
 302         return bool(self.hunks)
 303
 304     def as_text(self, *, file_headers=True):
 305         lines = []
 306         if self.hunks:
 307             if file_headers:
 308                 lines.append('--- a/%s\n' % self.filename)
 309                 lines.append('+++ b/%s\n' % self.filename)
 310             for hunk in self.hunks:
 311                 lines.extend(hunk.lines)
 312         return ''.join(lines)
 313
 314     def _hunk_iter(self):
 315         hunk_last_line_idx = self.header_line_count - 1
 316         for hunk in self.hunks:
 317             hunk_first_line_idx = hunk_last_line_idx + 1
 318             hunk_last_line_idx += len(hunk.lines)
 319             yield hunk_first_line_idx, hunk_last_line_idx, hunk
 320
 321     @staticmethod
 322     def _reverse_content_lines(content_lines):
 323         # Normally in a diff, deletions come before additions.  In order to preserve
 324         # this property in reverse patches, when this function encounters a deletion
 325         # line and switches it to addition, it appends the line to the pending_additions
 326         # list, while additions that get switched to deletions are appended directly to
 327         # the content_lines list.  Each time a context line is encountered, any pending
 328         # additions are then appended to the content_lines list immmediately before the
 329         # context line and the pending_additions list is cleared.
 330         new_content_lines = []
 331         pending_additions = []
 332         line_type = None
 333         for line in content_lines:
 334             prev_line_type = line_type
 335             line_type = line[:1]
 336             if line_type == DIFF_ADDITION:
 337                 new_content_lines.append(DIFF_DELETION + line[1:])
 338             elif line_type == DIFF_DELETION:
 339                 pending_additions.append(DIFF_ADDITION + line[1:])
 340             elif line_type == DIFF_NO_NEWLINE:
 341                 if prev_line_type == DIFF_DELETION:
 342                     # Previous line was a deletion that was switched to an
 343                     # addition, so the "No newline" line goes with it.
 344                     pending_additions.append(line)
 345                 else:
 346                     new_content_lines.append(line)
 347             else:
 348                 new_content_lines.extend(pending_additions)
 349                 new_content_lines.append(line)
 350                 pending_additions = []
 351         new_content_lines.extend(pending_additions)
 352         return new_content_lines
 353
 354     def extract_subset(self, first_line_idx, last_line_idx, *, reverse=False):
 355         new_hunks = []
 356         start_offset = 0
 357         for hunk_first_line_idx, hunk_last_line_idx, hunk in self._hunk_iter():
 358             # Skip hunks until reaching the one that contains the first selected line.
 359             if hunk_last_line_idx < first_line_idx:
 360                 continue
 361
 362             # Stop once the hunk that contains the last selected line has been
 363             # processed.
 364             if hunk_first_line_idx > last_line_idx:
 365                 break
 366
 367             content_lines = []
 368
 369             prev_skipped = False
 370             for hunk_line_idx, line in enumerate(
 371                 hunk.content_lines, start=hunk_first_line_idx + 1
 372             ):
 373                 line_type = line[:1]
 374                 if not first_line_idx <= hunk_line_idx <= last_line_idx:
 375                     if line_type == DIFF_ADDITION:
 376                         if reverse:
 377                             # Change unselected additions to context for reverse diffs.
 378                             line = DIFF_CONTEXT + line[1:]
 379                         else:
 380                             # Skip unselected additions for normal diffs.
 381                             prev_skipped = True
 382                             continue
 383                     elif line_type == DIFF_DELETION:
 384                         if not reverse:
 385                             # Change unselected deletions to context for normal diffs.
 386                             line = DIFF_CONTEXT + line[1:]
 387                         else:
 388                             # Skip unselected deletions for reverse diffs.
 389                             prev_skipped = True
 390                             continue
 391
 392                 if line_type == DIFF_NO_NEWLINE and prev_skipped:
 393                     # If the line immediately before a "No newline" line was skipped
 394                     # (e.g.  because it was an unselected addition) skip the "No
 395                     # newline" line as well
 396                     continue
 397
 398                 content_lines.append(line)
 399
 400             if reverse:
 401                 old_start = hunk.new_start
 402                 content_lines = self._reverse_content_lines(content_lines)
 403             else:
 404                 old_start = hunk.old_start
 405             new_hunk = _DiffHunk(
 406                 old_start=old_start,
 407                 start_offset=start_offset,
 408                 heading=hunk.heading,
 409                 content_lines=content_lines,
 410             )
 411             if new_hunk.has_changes():
 412                 new_hunks.append(new_hunk)
 413                 start_offset += new_hunk.line_delta()
 414
 415         return Patch(self.filename, new_hunks)
 416
 417     def extract_hunk(self, line_idx, *, reverse=False):
 418         """Return a new patch containing only the hunk containing the specified line"""
 419         new_hunks = []
 420         for _, hunk_last_line_idx, hunk in self._hunk_iter():
 421             if line_idx <= hunk_last_line_idx:
 422                 if reverse:
 423                     old_start = hunk.new_start
 424                     content_lines = self._reverse_content_lines(hunk.content_lines)
 425                 else:
 426                     old_start = hunk.old_start
 427                     content_lines = hunk.content_lines
 428                 new_hunks = [
 429                     _DiffHunk(
 430                         old_start=old_start,
 431                         start_offset=0,
 432                         heading=hunk.heading,
 433                         content_lines=content_lines,
 434                     )
 435                 ]
 436                 break
 437         return Patch(self.filename, new_hunks)