gitstats: Licensed GitStats under the Apache License, 2.0
[git-stats.git] / src / git_stats / diff.py
blob800d52c6b71bdb55ba25780eb5ef80b47e4fd4a4
1 #!/usr/bin/env python
3 import collections
4 import os
5 import sys
7 from optparse import OptionParser
9 from git_stats import commit
10 from git_stats import parse
class FileDiff:
    """A class to store the information of a file diff in.

    Fields:
        afile: The file used as the left side of the diff.
        bfile: The file used as the right side of the diff.
        context: The context of this diff.
        astart: Where the left side of the diff starts.
        bstart: Where the right side of the diff starts.
        linesAdded: Which lines were added.
        linesDeleted: Which lines were deleted.
    """

    def __init__(self, diffHeader):
        """Creates an empty diff, taking the file names from the header

        Params:
            diffHeader: The lines of the diff header, the '--- ' and '+++ '
                        lines in it supply afile and bfile.
        """

        self.afile = ""
        self.bfile = ""
        self.context = ""

        # Give the start positions a default so that __str__ does not fail
        # on a diff for which they were never filled in (an empty diff).
        self.astart = 0
        self.bstart = 0

        self.linesAdded = []
        self.linesDeleted = []

        for line in diffHeader:
            if line.startswith("--- "):
                self.afile = line[4:]
            elif line.startswith("+++ "):
                self.bfile = line[4:]

    def __str__(self):
        """Returns a multi-line, human readable description of the diff"""

        return ("Diff for '%s' (%d) against '%s' (%d)\n%s\n%s\n%s" %
                (self.afile, self.astart, self.bfile, self.bstart,
                 self.context, self.linesAdded, self.linesDeleted))
47 def _splitDiff(diff):
48 """Splits off the diff in chunks, one for each file
50 Params:
51 diff: The diff to split up.
53 Returns: A list containing a chunk per file.
54 """
56 chunks = []
58 chunk = []
60 for line in diff:
61 # Start of a new chunk, add the old one if there is one
62 if line.startswith("diff"):
63 if chunk:
64 chunks.append(chunk)
66 chunk = []
68 chunk.append(line)
70 # Add the last one
71 if chunk:
72 chunks.append(chunk)
74 return chunks
76 def _splitFileDiff(diff):
77 """Splits a file diff into chunks, one per area.
79 Params:
80 diff: The diff to split up.
82 Returns: The diff header and a list with all the chunks.
83 """
85 chunks = []
87 header = []
88 chunk = []
90 start = len(diff)
92 # Find out where the header stops
93 for i, line in enumerate(diff):
94 if line.startswith("@@"):
95 # Start at the first hunk, which is this line
96 start = i
97 break
99 header.append(line)
101 # Chop off the header and split up the chunks
102 for line in diff[start:]:
103 # Start a new chunk and add the old one if there is one
104 if line.startswith("@@"):
105 if chunk:
106 chunks.append(chunk)
108 chunk = []
110 chunk.append(line)
112 # Add the last one
113 if chunk:
114 chunks.append(chunk)
116 return header, chunks
def _parseFileDiff(header, chunk, number=True):
    """Takes a file diff and returns the parsed result

    The first line of the chunk has to be a hunk header of the form
    '@@ -<start>[,<count>] +<start>[,<count>] @@', a header that cannot be
    parsed makes int() or the indexing below raise.

    Params:
        header: The diff header.
        chunk: The chunk to parse.
        number: Whether to number the line differences, if set every changed
                line is stored as a (position, text) tuple instead of as text.

    Returns: A FileDiff containing the parsed diff.
    """

    result = FileDiff(header)

    # Empty diff, no need to do anything
    if not chunk:
        return result

    deleted = []
    added = []

    hunkHeader = chunk[0]

    # Find out where the context line ends, skipping the first '@@'
    to = hunkHeader.find("@@", 2)

    # Get the context, skipping the first and last '@@'
    context = hunkHeader[3:to]

    # Split it at the spaces and store the positions, ignoring '-' and '+'
    split = context.split(' ')
    a = split[0][1:]
    b = split[1][1:]

    # Only the start is of interest, drop the line count if there is one
    apos = int(a.split(',')[0])
    bpos = int(b.split(',')[0])

    result.astart = apos
    result.bstart = bpos

    # Start at the first line (skip the context line)
    for line in chunk[1:]:
        if line.startswith("-"):
            deleted.append((apos, line[1:]) if number else line[1:])
            apos += 1
        elif line.startswith("+"):
            added.append((bpos, line[1:]) if number else line[1:])
            bpos += 1
        elif line.startswith(" "):
            # An unchanged line is present on both sides, so both positions
            # move on. Without this the numbering of all changes after it
            # in the hunk would be off.
            apos += 1
            bpos += 1
        # Anything else (such as a '\ No newline at end of file' marker or
        # an empty string) is not a line of either file and is skipped.

    result.linesDeleted = deleted
    result.linesAdded = added
    result.context = context

    return result
def parseCommitDiff(diff, number=True):
    """Parses all the hunks of all the files in a commit diff

    Params:
        diff: The lines of the diff to parse.
        number: Whether to number the line differences, handed on to
                _parseFileDiff for every hunk.

    Returns: A list with one FileDiff per hunk, in the order of the diff.
    """

    parsed = []

    # Go over the diff one file at a time
    for fileDiff in _splitDiff(diff):
        header, hunks = _splitFileDiff(fileDiff)

        # Every hunk of the file gets a FileDiff of its own
        parsed.extend(_parseFileDiff(header, hunk, number) for hunk in hunks)

    return parsed
203 def _compareFileDiffs(adiff, bdiff, invert=False):
204 """Compares two FileDiffs and returns whether they are equal
206 Args:
207 adiff: The first FileDiff.
208 bdiff: The second FileDiff.
209 invert: Whether to compare linesAdded with linesDeleted.
211 Returns: Whether the two diffs are equal.
214 if invert:
215 # Cross compare added with deleted
216 if not adiff.linesAdded == bdiff.linesDeleted:
217 return False
218 # And vise versa
219 if not adiff.linesDeleted == bdiff.linesAdded:
220 return False
221 else:
222 # Do a normal comparison between added lines
223 if not adiff.linesAdded == bdiff.linesAdded:
224 return False
225 # And between the deleted lines
226 if not adiff.linesDeleted == bdiff.linesDeleted:
227 return False
229 # Checked everything, accept
230 return True
232 def _compareDiffs(adiffs, bdiffs, compareChanges=False, invert=False):
233 """Compares the two diffs and returns whether they are equal
235 Args:
236 adiffs: The first set of diffs.
237 bdiffs: The second set of diffs.
238 compareChanges: Whether to compare not only which lines changed.
239 invert: When compareChanges, invert the comparison of deleted/added.
241 Returns: Whether the diffs are equal.
244 for fd in adiffs:
245 # Look for a match in the bdiffs
246 for theirs in bdiffs:
248 # Check for empty diffs
249 if (((not fd.linesAdded and not fd.linesDeleted) and
250 (theirs.linesAdded or theirs.linesDeleted)) or
251 ((not theirs.linesAdded and not theirs.linesDeleted) and
252 (fd.linesAdded and fd.linesDeleted))):
253 return False
255 # Check if both are empty diffs
256 if (not fd.linesAdded and not theirs.linesAdded and
257 not fd.linesDeleted and not theirs.linesDeleted):
258 return True
260 # Looks like we have a match
261 if ((theirs.astart <= fd.astart and theirs.bstart >= fd.bstart) or
262 (invert and theirs.astart <= fd.bstart and theirs.bstart >= fd.astart)):
264 # If we want to compare changes, do they match
265 if compareChanges:
266 # Reject if they are inequal
267 if not _compareFileDiffs(fd, theirs, invert):
268 return False
270 # It was indeed a match, stop searching through bdiffs
271 break
273 else:
274 # Went through all items in bdiffs and couldn't find a matching pair
275 return False
277 # All items in adiffs checked, all had a matching pair, accept.
278 return True
280 def _difference(adiffs, bdiffs, compareChanges=False, invert=False):
281 """Calculates the difference between two diffs and returns it
283 Params:
284 adiffs: The first set of diffs.
285 bdiffs: The second set of diffs.
286 compareChanges: Whether to compare not only which lines changed.
287 invert: When compareChanges, invert the comparison of deleted/added.
289 Returns: Which keys are missing and the difference between both diffs.
292 afiles = collections.defaultdict(list)
293 bfiles = collections.defaultdict(list)
295 missing = []
296 difference = []
298 # Group the diffs by file pair
299 for fd in adiffs:
300 afiles[(fd.afile, fd.bfile)].append(fd)
302 # Group the diffs by file pair
303 for fd in bdiffs:
304 bfiles[(fd.afile, fd.bfile)].append(fd)
306 # Examine all the diffs and see if they match
307 for key, fds in afiles.iteritems():
308 # There is no counterpart for this file, record that
309 if not key in bfiles:
310 missing.append(key)
311 continue
313 theirs = bfiles[key]
315 # Compare the diffs, if not equal record that
316 if not _compareDiffs(fds, theirs, compareChanges, invert):
317 difference.append((fds, theirs))
319 return missing, difference
321 def _getParsedDiff(target, raw_diffs, parsed_diffs):
322 """Retrieves a parsed commit diff for the specified file
324 Args:
325 target: The commit to get the diff for.
326 raw_diffs: A dictionary with commits and their raw diffs.
327 parsed_diffs: A dictionary with commits and their parsed diffs.
330 if not target in parsed_diffs:
331 # Get the diff, but ignore whitespace
332 if not target in raw_diffs:
333 result = commit.getDiff(target, noContext=True)
334 diffTarget = result.split('\n')
335 else:
336 diffTarget = raw_diffs[target]
338 parsedTarget = parseCommitDiff(diffTarget)
339 raw_diffs[target] = diffTarget
340 parsed_diffs[target] = parsedTarget
341 else:
342 parsedTarget = parsed_diffs[target]
344 return parsedTarget
def commitdiffEqual(original,
                    potentialMatch,
                    threshold=0,
                    compareChanges=True,
                    invert=False,
                    verbose=True,
                    raw_diffs=None,
                    parsed_diffs=None):
    """Tests whether a commit matches another by a specified threshold.

    Params:
        original: The original commit that is to be checked.
        potentialMatch: The commit that might match original.
        threshold: The threshold for how close they have to match, this is
                   not used yet.
        compareChanges: Whether to compare the changes made or just the
                        changed lines.
        invert: Whether to compare deletions with insertions instead.
        verbose: Whether to print the missing keys and the differences.
        raw_diffs: A dictionary of commits and their raw diffs, a new one is
                   used when omitted.
        parsed_diffs: A dictionary of commits and their parsed diffs, a new
                      one is used when omitted.

    Returns: Whether the commit diffs are equal.
    """

    # Do not use a dictionary as default value, it would be shared between
    # all the calls that rely on the default. A dictionary that is passed in
    # is used (and filled) as is, even when it is empty.
    if raw_diffs is None:
        raw_diffs = {}

    if parsed_diffs is None:
        parsed_diffs = {}

    # Retrieve the parsed diffs
    parsedOriginal = _getParsedDiff(original, raw_diffs, parsed_diffs)
    parsedPotentialMatch = _getParsedDiff(potentialMatch,
                                          raw_diffs,
                                          parsed_diffs)

    # Get the difference between both
    missing, diff = _difference(parsedOriginal,
                                parsedPotentialMatch,
                                compareChanges=compareChanges,
                                invert=invert)

    if verbose:
        if missing:
            print("Missing the following keys:")
            for key in missing:
                print(key)

        if diff:
            print("Found the following differences:")
            for ours, theirs in diff:
                print("---")
                for fd in ours:
                    print(fd)
                print("\nDoes not match:\n")
                for fd in theirs:
                    print(fd)
                print("----")

    # TODO use threshold

    # Unequal if something missing, or there is a difference
    return not (missing or diff)
def isReverted(commit, potential_revert, raw_diffs=None, parsed_diffs=None):
    """Returns whether the specified commit is reverted by another one

    A commit counts as reverted when the diff of potential_revert is equal to
    its own diff with the additions and the deletions swapped.

    Args:
        commit: The commit that might be reverted. Within this function the
                name hides the commit module, which is not needed here.
        potential_revert: The commit that might be a revert.
        raw_diffs: A dictionary of commits and their raw diffs, a new one is
                   used when omitted.
        parsed_diffs: A dictionary of commits and their parsed diffs, a new
                      one is used when omitted.

    Returns: Whether potential_revert reverts commit.
    """

    # Do not use a dictionary as default value, it would be shared between
    # all the calls that rely on the default.
    if raw_diffs is None:
        raw_diffs = {}

    if parsed_diffs is None:
        parsed_diffs = {}

    return commitdiffEqual(commit,
                           potential_revert,
                           invert=True,
                           verbose=False,
                           raw_diffs=raw_diffs,
                           parsed_diffs=parsed_diffs)
def findReverts(potential_revert,
                raw_diffs=None,
                parsed_diffs=None,
                touched_files=None):
    """Returns all commits that are reverted by the specified commit

    Only the commits that touched the same paths as potential_revert are
    examined.

    Args:
        potential_revert: The commit to check for reverts for.
        raw_diffs: A dictionary with commits and their raw diffs, a new one
                   is used when omitted.
        parsed_diffs: A dictionary with commits and their parsed diffs, a new
                      one is used when omitted.
        touched_files: A dictionary with files and the commits that touched
                       them, a new one is used when omitted.

    Returns: A list with the commits that are reverted by potential_revert.
    """

    # Do not use a dictionary as default value, it would be shared between
    # all the calls that rely on the default. The dictionaries made here are
    # still shared by all the comparisons done below.
    if raw_diffs is None:
        raw_diffs = {}

    if parsed_diffs is None:
        parsed_diffs = {}

    if touched_files is None:
        touched_files = {}

    # Find out what paths this commit touched
    paths = commit.pathsTouchedBy(potential_revert)

    # If no paths were touched, there can't be any reverts
    if not paths:
        return []

    # Retrieve all commits that touched the same paths
    commits = commit.commitsThatTouched(paths, touched_files=touched_files)

    # Check all the found commits to see if they are reverted, but don't
    # compare the commit to itself
    return [aCommit for aCommit in commits
            if aCommit != potential_revert and
            isReverted(aCommit,
                       potential_revert,
                       raw_diffs=raw_diffs,
                       parsed_diffs=parsed_diffs)]
def dispatch(*args):
    """Dispatches diff related commands

    Parses the arguments and then runs the comparison asked for with
    --equals and/or the revert search asked for with --reverts.

    Params:
        args: The command line arguments for the diff command.
    """

    parser = OptionParser(option_class=parse.GitOption,
                          prog=os.path.basename(sys.argv[0]) + " diff")

    # The flags of every option, together with its settings
    optionTable = (
        (("-e", "--equals"),
         dict(type="commit",
              nargs=2,
              help="show whether the two diffs for the specified commits match",
              metavar="COMMIT COMMIT")),
        (("-t", "--threshold"),
         dict(type="int",
              help="the threshold for comparison")),
        (("-n", "--no-compare"),
         dict(action="store_false",
              dest="compare",
              help="do not compare the diff content, just look at which lines were touched")),
        (("-i", "--invert"),
         dict(action="store_true",
              help="compare additions with deletions instead of with additions, and vise versa")),
        (("-r", "--reverts"),
         dict(type="commit",
              help="show only commits that are reverted by the specified commit")),
    )

    for flags, settings in optionTable:
        parser.add_option(*flags, **settings)

    parser.set_defaults(threshold=0, compare=True, invert=False)

    options, _ = parser.parse_args(list(args))

    if options.equals:
        # The two commits of --equals are the positional arguments
        equal = commitdiffEqual(*options.equals,
                                threshold=options.threshold,
                                compareChanges=options.compare,
                                invert=options.invert)

        if equal:
            print("Equal")

    if options.reverts:
        reverted = findReverts(options.reverts)
        commit.prettyPrint(reverted)