cola/diffparse.py

   1 import math
   2 import re
   3 from collections import Counter
   4 from itertools import groupby
   5
   6 from . import compat
   7
   8
# Single-character prefixes that identify the type of each line in a diff hunk.
# Hunk content lines are classified by comparing their first character
# against these values.
DIFF_CONTEXT = ' '  # unchanged (context) line
DIFF_ADDITION = '+'  # added line
DIFF_DELETION = '-'  # removed line
DIFF_NO_NEWLINE = '\\'  # first character of a "No newline" marker line
  13
  14
  15 def parse_range_str(range_str):
  16     if ',' in range_str:
  17         begin, end = range_str.split(',', 1)
  18         return int(begin), int(end)
  19     return int(range_str), 1
  20
  21
  22 def _format_range(start, count):
  23     if count == 1:
  24         return str(start)
  25     return '%d,%d' % (start, count)
  26
  27
  28 def _format_hunk_header(old_start, old_count, new_start, new_count, heading=''):
  29     return '@@ -{} +{} @@{}\n'.format(
  30         _format_range(old_start, old_count),
  31         _format_range(new_start, new_count),
  32         heading,
  33     )
  34
  35
  36 def digits(number):
  37     """Return the number of digits needed to display a number"""
  38     if number >= 0:
  39         result = int(math.log10(number)) + 1
  40     else:
  41         result = 1
  42     return result
  43
  44
  45 class LineCounter:
  46     """Keep track of a diff range's values"""
  47
  48     def __init__(self, value=0, max_value=-1):
  49         self.value = value
  50         self.max_value = max_value
  51         self._initial_max_value = max_value
  52
  53     def reset(self):
  54         """Reset the max counter and return self for convenience"""
  55         self.max_value = self._initial_max_value
  56         return self
  57
  58     def parse(self, range_str):
  59         """Parse a diff range and setup internal state"""
  60         start, count = parse_range_str(range_str)
  61         self.value = start
  62         self.max_value = max(start + count - 1, self.max_value)
  63
  64     def tick(self, amount=1):
  65         """Return the current value and increment to the next"""
  66         value = self.value
  67         self.value += amount
  68         return value
  69
  70
  71 class DiffLines:
  72     """Parse diffs and gather line numbers"""
  73
  74     EMPTY = -1
  75     DASH = -2
  76
  77     def __init__(self):
  78         self.merge = False
  79
  80         # diff <old> <new>
  81         # merge <ours> <theirs> <new>
  82         self.old = LineCounter()
  83         self.new = LineCounter()
  84         self.ours = LineCounter()
  85         self.theirs = LineCounter()
  86
  87     def digits(self):
  88         return digits(
  89             max(
  90                 self.old.max_value,
  91                 self.new.max_value,
  92                 self.ours.max_value,
  93                 self.theirs.max_value,
  94             )
  95         )
  96
  97     def parse(self, diff_text):
  98         lines = []
  99         diff_state = 1
 100         state = initial_state = 0
 101         merge = self.merge = False
 102         no_newline = r'\ No newline at end of file'
 103
 104         old = self.old.reset()
 105         new = self.new.reset()
 106         ours = self.ours.reset()
 107         theirs = self.theirs.reset()
 108
 109         for text in diff_text.split('\n'):
 110             if text.startswith('@@ -'):
 111                 parts = text.split(' ', 4)
 112                 if parts[0] == '@@' and parts[3] == '@@':
 113                     state = diff_state
 114                     old.parse(parts[1][1:])
 115                     new.parse(parts[2][1:])
 116                     lines.append((self.DASH, self.DASH))
 117                     continue
 118             if text.startswith('@@@ -'):
 119                 self.merge = merge = True
 120                 parts = text.split(' ', 5)
 121                 if parts[0] == '@@@' and parts[4] == '@@@':
 122                     state = diff_state
 123                     ours.parse(parts[1][1:])
 124                     theirs.parse(parts[2][1:])
 125                     new.parse(parts[3][1:])
 126                     lines.append((self.DASH, self.DASH, self.DASH))
 127                     continue
 128             if state == initial_state or text.rstrip() == no_newline:
 129                 if merge:
 130                     lines.append((self.EMPTY, self.EMPTY, self.EMPTY))
 131                 else:
 132                     lines.append((self.EMPTY, self.EMPTY))
 133             elif not merge and text.startswith('-'):
 134                 lines.append((old.tick(), self.EMPTY))
 135             elif merge and text.startswith('- '):
 136                 lines.append((ours.tick(), self.EMPTY, self.EMPTY))
 137             elif merge and text.startswith(' -'):
 138                 lines.append((self.EMPTY, theirs.tick(), self.EMPTY))
 139             elif merge and text.startswith('--'):
 140                 lines.append((ours.tick(), theirs.tick(), self.EMPTY))
 141             elif not merge and text.startswith('+'):
 142                 lines.append((self.EMPTY, new.tick()))
 143             elif merge and text.startswith('++'):
 144                 lines.append((self.EMPTY, self.EMPTY, new.tick()))
 145             elif merge and text.startswith('+ '):
 146                 lines.append((self.EMPTY, theirs.tick(), new.tick()))
 147             elif merge and text.startswith(' +'):
 148                 lines.append((ours.tick(), self.EMPTY, new.tick()))
 149             elif not merge and text.startswith(' '):
 150                 lines.append((old.tick(), new.tick()))
 151             elif merge and text.startswith('  '):
 152                 lines.append((ours.tick(), theirs.tick(), new.tick()))
 153             elif not text:
 154                 new.tick()
 155                 old.tick()
 156                 ours.tick()
 157                 theirs.tick()
 158             else:
 159                 state = initial_state
 160                 if merge:
 161                     lines.append((self.EMPTY, self.EMPTY, self.EMPTY))
 162                 else:
 163                     lines.append((self.EMPTY, self.EMPTY))
 164
 165         return lines
 166
 167
 168 class FormatDigits:
 169     """Format numbers for use in diff line numbers"""
 170
 171     DASH = DiffLines.DASH
 172     EMPTY = DiffLines.EMPTY
 173
 174     def __init__(self, dash='', empty=''):
 175         self.fmt = ''
 176         self.empty = ''
 177         self.dash = ''
 178         self._dash = dash or compat.uchr(0xB7)
 179         self._empty = empty or ' '
 180
 181     def set_digits(self, value):
 182         self.fmt = '%%0%dd' % value
 183         self.empty = self._empty * value
 184         self.dash = self._dash * value
 185
 186     def value(self, old, new):
 187         old_str = self._format(old)
 188         new_str = self._format(new)
 189         return f'{old_str} {new_str}'
 190
 191     def merge_value(self, old, base, new):
 192         old_str = self._format(old)
 193         base_str = self._format(base)
 194         new_str = self._format(new)
 195         return f'{old_str} {base_str} {new_str}'
 196
 197     def number(self, value):
 198         return self.fmt % value
 199
 200     def _format(self, value):
 201         if value == self.DASH:
 202             result = self.dash
 203         elif value == self.EMPTY:
 204             result = self.empty
 205         else:
 206             result = self.number(value)
 207         return result
 208
 209
 210 class _HunkGrouper:
 211     _HUNK_HEADER_RE = re.compile(r'^@@ -([0-9,]+) \+([0-9,]+) @@(.*)')
 212
 213     def __init__(self):
 214         self.match = None
 215
 216     def __call__(self, line):
 217         match = self._HUNK_HEADER_RE.match(line)
 218         if match is not None:
 219             self.match = match
 220         return self.match
 221
 222
 223 class _DiffHunk:
 224     def __init__(self, old_start, start_offset, heading, content_lines):
 225         type_counts = Counter(line[:1] for line in content_lines)
 226         self.old_count = type_counts[DIFF_CONTEXT] + type_counts[DIFF_DELETION]
 227         self.new_count = type_counts[DIFF_CONTEXT] + type_counts[DIFF_ADDITION]
 228
 229         if self.old_count == 0:
 230             self.old_start = 0
 231         else:
 232             self.old_start = old_start
 233
 234         if self.new_count == 0:
 235             self.new_start = 0
 236         elif self.old_start == 0:
 237             self.new_start = 1
 238         else:
 239             self.new_start = self.old_start + start_offset
 240
 241         self.heading = heading
 242
 243         self.lines = [
 244             _format_hunk_header(
 245                 self.old_start,
 246                 self.old_count,
 247                 self.new_start,
 248                 self.new_count,
 249                 heading,
 250             ),
 251             *content_lines,
 252         ]
 253         self.content_lines = content_lines
 254
 255         self.changes = type_counts[DIFF_DELETION] + type_counts[DIFF_ADDITION]
 256
 257     def has_changes(self):
 258         return bool(self.changes)
 259
 260     def line_delta(self):
 261         return self.new_count - self.old_count
 262
 263
 264 class Patch:
 265     """Parse and rewrite diffs to produce edited patches
 266
 267     This parser is used for modifying the worktree and index by constructing
 268     temporary patches that are applied using "git apply".
 269
 270     """
 271
 272     def __init__(self, filename, hunks, header_line_count=0):
 273         self.filename = filename
 274         self.hunks = hunks
 275         self.header_line_count = header_line_count
 276
 277     @classmethod
 278     def parse(cls, filename, diff_text):
 279         header_line_count = 0
 280         hunks = []
 281         start_offset = 0
 282         for match, hunk_lines in groupby(diff_text.split('\n'), _HunkGrouper()):
 283             if match is not None:
 284                 # Skip the hunk range header line as it will be regenerated by the
 285                 # _DiffHunk.
 286                 next(hunk_lines)
 287                 hunk = _DiffHunk(
 288                     old_start=parse_range_str(match.group(1))[0],
 289                     start_offset=start_offset,
 290                     heading=match.group(3),
 291                     content_lines=[line + '\n' for line in hunk_lines if line],
 292                 )
 293                 if hunk.has_changes():
 294                     hunks.append(hunk)
 295                     start_offset += hunk.line_delta()
 296             else:
 297                 header_line_count = len(list(hunk_lines))
 298         return cls(filename, hunks, header_line_count)
 299
 300     def has_changes(self):
 301         return bool(self.hunks)
 302
 303     def as_text(self, *, file_headers=True):
 304         lines = []
 305         if self.hunks:
 306             if file_headers:
 307                 lines.append('--- a/%s\n' % self.filename)
 308                 lines.append('+++ b/%s\n' % self.filename)
 309             for hunk in self.hunks:
 310                 lines.extend(hunk.lines)
 311         return ''.join(lines)
 312
 313     def _hunk_iter(self):
 314         hunk_last_line_idx = self.header_line_count - 1
 315         for hunk in self.hunks:
 316             hunk_first_line_idx = hunk_last_line_idx + 1
 317             hunk_last_line_idx += len(hunk.lines)
 318             yield hunk_first_line_idx, hunk_last_line_idx, hunk
 319
 320     @staticmethod
 321     def _reverse_content_lines(content_lines):
 322         # Normally in a diff, deletions come before additions.  In order to preserve
 323         # this property in reverse patches, when this function encounters a deletion
 324         # line and switches it to addition, it appends the line to the pending_additions
 325         # list, while additions that get switched to deletions are appended directly to
 326         # the content_lines list.  Each time a context line is encountered, any pending
 327         # additions are then appended to the content_lines list immmediately before the
 328         # context line and the pending_additions list is cleared.
 329         new_content_lines = []
 330         pending_additions = []
 331         line_type = None
 332         for line in content_lines:
 333             prev_line_type = line_type
 334             line_type = line[:1]
 335             if line_type == DIFF_ADDITION:
 336                 new_content_lines.append(DIFF_DELETION + line[1:])
 337             elif line_type == DIFF_DELETION:
 338                 pending_additions.append(DIFF_ADDITION + line[1:])
 339             elif line_type == DIFF_NO_NEWLINE:
 340                 if prev_line_type == DIFF_DELETION:
 341                     # Previous line was a deletion that was switched to an
 342                     # addition, so the "No newline" line goes with it.
 343                     pending_additions.append(line)
 344                 else:
 345                     new_content_lines.append(line)
 346             else:
 347                 new_content_lines.extend(pending_additions)
 348                 new_content_lines.append(line)
 349                 pending_additions = []
 350         new_content_lines.extend(pending_additions)
 351         return new_content_lines
 352
 353     def extract_subset(self, first_line_idx, last_line_idx, *, reverse=False):
 354         new_hunks = []
 355         start_offset = 0
 356         for hunk_first_line_idx, hunk_last_line_idx, hunk in self._hunk_iter():
 357             # Skip hunks until reaching the one that contains the first selected line.
 358             if hunk_last_line_idx < first_line_idx:
 359                 continue
 360
 361             # Stop once the hunk that contains the last selected line has been
 362             # processed.
 363             if hunk_first_line_idx > last_line_idx:
 364                 break
 365
 366             content_lines = []
 367
 368             prev_skipped = False
 369             for hunk_line_idx, line in enumerate(
 370                 hunk.content_lines, start=hunk_first_line_idx + 1
 371             ):
 372                 line_type = line[:1]
 373                 if not first_line_idx <= hunk_line_idx <= last_line_idx:
 374                     if line_type == DIFF_ADDITION:
 375                         if reverse:
 376                             # Change unselected additions to context for reverse diffs.
 377                             line = DIFF_CONTEXT + line[1:]
 378                         else:
 379                             # Skip unselected additions for normal diffs.
 380                             prev_skipped = True
 381                             continue
 382                     elif line_type == DIFF_DELETION:
 383                         if not reverse:
 384                             # Change unselected deletions to context for normal diffs.
 385                             line = DIFF_CONTEXT + line[1:]
 386                         else:
 387                             # Skip unselected deletions for reverse diffs.
 388                             prev_skipped = True
 389                             continue
 390
 391                 if line_type == DIFF_NO_NEWLINE and prev_skipped:
 392                     # If the line immediately before a "No newline" line was skipped
 393                     # (e.g.  because it was an unselected addition) skip the "No
 394                     # newline" line as well
 395                     continue
 396
 397                 content_lines.append(line)
 398
 399             if reverse:
 400                 old_start = hunk.new_start
 401                 content_lines = self._reverse_content_lines(content_lines)
 402             else:
 403                 old_start = hunk.old_start
 404             new_hunk = _DiffHunk(
 405                 old_start=old_start,
 406                 start_offset=start_offset,
 407                 heading=hunk.heading,
 408                 content_lines=content_lines,
 409             )
 410             if new_hunk.has_changes():
 411                 new_hunks.append(new_hunk)
 412                 start_offset += new_hunk.line_delta()
 413
 414         return Patch(self.filename, new_hunks)
 415
 416     def extract_hunk(self, line_idx, *, reverse=False):
 417         """Return a new patch containing only the hunk containing the specified line"""
 418         new_hunks = []
 419         for _, hunk_last_line_idx, hunk in self._hunk_iter():
 420             if line_idx <= hunk_last_line_idx:
 421                 if reverse:
 422                     old_start = hunk.new_start
 423                     content_lines = self._reverse_content_lines(hunk.content_lines)
 424                 else:
 425                     old_start = hunk.old_start
 426                     content_lines = hunk.content_lines
 427                 new_hunks = [
 428                     _DiffHunk(
 429                         old_start=old_start,
 430                         start_offset=0,
 431                         heading=hunk.heading,
 432                         content_lines=content_lines,
 433                     )
 434                 ]
 435                 break
 436         return Patch(self.filename, new_hunks)