6 # This code is part of the LWN git data miner.
8 # Copyright 2007-13 Eklektix, Inc.
9 # Copyright 2007-13 Jonathan Corbet <corbet@lwn.net>
10 # Copyright 2011 Germán Póo-Caamaño <gpoo@gnome.org>
12 # This file may be distributed under the terms of the GNU General
13 # Public License, version 2.
16 import database, csvdump, ConfigFile, reports
17 import getopt, datetime
18 import os, re, sys, rfc822, string, os.path
20 from patterns import patterns
22 Today = datetime.date.today()
25 # Remember author names we have griped about.
27 GripedAuthorNames = [ ]
41 CFName = 'gitdm.config'
46 ReportUnknowns = False
52 # -a Andrew Morton's signoffs shadow Linus's
53 # -b dir Specify the base directory to fetch the configuration files
54 # -c cfile Specify a configuration file
55 # -C company Only consider patches from <company>
56 # -d Output individual developer stats
57 # -D Output date statistics
58 # -f file Write touched-files report to <file>
59 # -h hfile HTML output to hfile
60 # -l count Maximum length for output lists
61 # -n Use numstats instead of generated patch from git log
62 # -o file File for text output
63 # -p prefix Prefix for CSV output
64 # -r pattern Restrict to files matching pattern
65 # -s Ignore author SOB lines
66 # -u Map unknown employers to '(Unknown)'
67 # -U Dump unknown hackers in report
68 # -x file.csv Export raw statistics as CSV
69 # -w Aggregate the raw statistics by weeks instead of months
70 # -y Aggregate the raw statistics by years instead of months
71 # -z Dump out the hacker database at completion
74 global MapUnknown, DevReports
75 global DateStats, AuthorSOBs, FileFilter, AkpmOverLt, DumpDB
76 global CFName, CSVFile, CSVPrefix,DirName, Aggregate, Numstat
77 global ReportByFileType, ReportUnknowns, CompanyFilter, FileReport
79 opts, rest = getopt.getopt(sys.argv[1:], 'ab:dC:c:Df:h:l:no:p:r:stUuwx:yz')
86 CompanyFilter = opt[1]
96 reports.SetHTMLOutput(open(opt[1], 'w'))
98 reports.SetMaxList(int(opt[1]))
102 reports.SetOutput(open(opt[1], 'w'))
106 print 'Filter on "%s"' % (opt[1])
107 FileFilter = re.compile(opt[1])
115 ReportUnknowns = True
117 CSVFile = open(opt[1], 'w')
118 print "open output file " + opt[1] + "\n"
119 elif opt [0] == '-w':
121 elif opt [0] == '-y':
127 # Tracking for file accesses.
133 FileAccesses[path] += 1
135 FileAccesses[path] = 1
137 def NoteFileAccess(paths):
139 # Keep separate track of what we've noted in this set so that each level
140 # of the tree only gets a single note from one patch.
144 if path.startswith('a/') or path.startswith('b/'):
148 path, last = os.path.split(path)
149 while path and path not in ['a', 'b', '/']:
154 path, last = os.path.split(path)
157 # Local version still, for now
def LookupStoreHacker(name, email):
    """Local front end to database.LookupStoreHacker().

    Forwards *name* and *email* to the database module together with the
    module-level MapUnknown setting, and hands back whatever the database
    call returns.
    """
    hacker = database.LookupStoreHacker(name, email, MapUnknown)
    return hacker
168 def AddDateLines(date, lines):
170 print 'Skip big patch (%d)' % lines
173 DateMap[date] += lines
175 DateMap[date] = lines
177 def PrintDateStats():
178 dates = DateMap.keys()
181 datef = open('datelc.csv', 'w')
182 datef.write('Date,Changed,Total Changed\n')
184 total += DateMap[date]
185 datef.write('%d/%02d/%02d,%d,%d\n' % (date.year, date.month, date.day,
186 DateMap[date], total))
190 # Let's slowly try to move some smarts into this class.
193 (ADDED, REMOVED) = range(2)
195 def __init__(self, commit):
197 self.merge = self.added = self.removed = 0
198 self.author = LookupStoreHacker('Unknown hacker', 'unknown@hacker.net')
199 self.email = 'unknown@hacker.net'
def addreviewer(self, reviewer):
    """Append *reviewer* to the reviewers recorded in self.reviews."""
    recorded = self.reviews
    recorded.append(reviewer)
def addtester(self, tester):
    """Append *tester* to the testers recorded in self.testers."""
    recorded = self.testers
    recorded.append(tester)
def addreporter(self, reporter):
    """Append *reporter* to the reporters recorded in self.reports."""
    recorded = self.reports
    recorded.append(reporter)
def addfiletype(self, filetype, added, removed):
    """Accumulate per-file-type line counts for this patch.

    self.filetypes maps a file type to a two-element list indexed by
    self.ADDED and self.REMOVED.  The first time a type is seen its entry
    is created as [added, removed]; afterwards the counts are summed into
    the existing entry.
    """
    # Use the 'in' operator rather than dict.has_key(), which is
    # deprecated and no longer exists in Python 3.
    if filetype in self.filetypes:
        self.filetypes[filetype][self.ADDED] += added
        self.filetypes[filetype][self.REMOVED] += removed
    else:
        self.filetypes[filetype] = [added, removed]
def addfile(self, name):
    """Append *name* to the file names recorded in self.files."""
    touched = self.files
    touched.append(name)
227 def parse_numstat(line, file_filter):
229 Receive a line of text, determine if fits a numstat line and
230 parse the added and removed lines as well as the file type.
232 m = patterns['numstat'].match(line)
234 filename = m.group(3)
235 # If we have a file filter, check for file lines.
236 if file_filter and not file_filter.search(filename):
237 return None, None, None, None
240 added = int(m.group(1))
241 removed = int(m.group(2))
243 # A binary file (image, etc.) is marked with '-'
246 m = patterns['rename'].match(filename)
248 filename = '%s%s%s' % (m.group(1), m.group(3), m.group(4))
250 filetype = database.FileTypes.guess_file_type(os.path.basename(filename))
251 return filename, filetype, added, removed
253 return None, None, None, None
256 # The core hack for grabbing the information about a changeset.
258 def grabpatch(logpatch):
259 m = patterns['commit'].match(logpatch[0])
263 p = patch(m.group(1))
264 ignore = (FileFilter is not None)
266 for Line in logpatch[1:]:
268 # Maybe it's an author line?
270 m = patterns['author'].match(Line)
272 p.email = database.RemapEmail(m.group(2))
273 p.author = LookupStoreHacker(m.group(1), p.email)
276 # Could be a signed-off-by:
278 m = patterns['signed-off-by'].match(Line)
280 email = database.RemapEmail(m.group(2))
281 sobber = LookupStoreHacker(m.group(1), email)
282 if sobber != p.author or AuthorSOBs:
283 p.sobs.append((email, LookupStoreHacker(m.group(1), m.group(2))))
286 # Various other tags of interest.
288 m = patterns['reviewed-by'].match(Line)
290 email = database.RemapEmail(m.group(2))
291 p.addreviewer(LookupStoreHacker(m.group(1), email))
293 m = patterns['tested-by'].match(Line)
295 email = database.RemapEmail(m.group(2))
296 p.addtester(LookupStoreHacker(m.group(1), email))
297 p.author.testcredit(patch)
300 m = patterns['reported-by'].match(Line)
302 email = database.RemapEmail(m.group(2))
303 p.addreporter(LookupStoreHacker(m.group(1), email))
304 p.author.reportcredit(patch)
306 # Reported-and-tested-by:
307 m = patterns['reported-and-tested-by'].match(Line)
309 email = database.RemapEmail(m.group(2))
310 h = LookupStoreHacker(m.group(1), email)
313 p.author.reportcredit(patch)
314 p.author.testcredit(patch)
317 # If this one is a merge, make note of the fact.
319 m = patterns['merge'].match(Line)
324 # See if it's the date.
326 m = patterns['date'].match(Line)
328 dt = rfc822.parsedate(m.group(2))
329 p.date = datetime.date(dt[0], dt[1], dt[2])
331 sys.stderr.write('Funky date: %s\n' % p.date)
336 # If we have a file filter, check for file lines.
339 ignore = ApplyFileFilter(Line, ignore)
341 # If we are tracking files touched, look for a relevant line here.
343 if FileReport and not ignore:
344 m = patterns['filea'].match(Line)
347 if file == '/dev/null':
350 p.addfile(m.group(1))
353 m = patterns['fileb'].match(Line)
355 p.addfile(m.group(1))
359 # OK, maybe it's part of the diff itself.
362 if patterns['add'].match(Line):
365 if patterns['rem'].match(Line):
369 # Grab data in the numstat format.
371 (filename, filetype, added, removed) = parse_numstat(Line, FileFilter)
375 p.addfiletype(filetype, added, removed)
378 if '@' in p.author.name:
379 GripeAboutAuthorName(p.author.name)
383 def GripeAboutAuthorName(name):
384 if name in GripedAuthorNames:
386 GripedAuthorNames.append(name)
387 print '%s is an author name, probably not what you want' % (name)
389 def ApplyFileFilter(line, ignore):
391 # If this is the first file line (--- a/), set ignore one way
394 m = patterns['filea'].match(line)
397 if FileFilter.search(file):
401 # For the second line, we can turn ignore off, but not on
403 m = patterns['fileb'].match(line)
406 if FileFilter.search(file):
410 def is_svntag(logpatch):
412 This is a workaround for a bug on the migration to Git
413 from Subversion found in GNOME. It may happen in other
414 repositories as well.
417 for Line in logpatch:
418 m = patterns['svn-tag'].match(Line.strip())
420 sys.stderr.write('(W) detected a commit on a svn tag: %s\n' %
427 # If this patch is signed off by both Andrew Morton and Linus Torvalds,
428 # remove the (redundant) Linus signoff.
431 if AkpmOverLt == 1 and Linus in p.sobs and Akpm in p.sobs:
436 # Here starts the real program.
441 # Read the config files.
443 ConfigFile.ConfigFile(CFName, DirName)
446 # Let's pre-seed the database with a couple of hackers
447 # we want to remember.
450 Linus = ('torvalds@linux-foundation.org',
451 LookupStoreHacker('Linus Torvalds', 'torvalds@linux-foundation.org'))
452 Akpm = ('akpm@linux-foundation.org',
453 LookupStoreHacker('Andrew Morton', 'akpm@linux-foundation.org'))
455 TotalChanged = TotalAdded = TotalRemoved = 0
460 print >> sys.stderr, 'Grabbing changesets...\r',
462 patches = logparser.LogPatchSplitter(sys.stdin)
463 printcount = CSCount = 0
465 for logpatch in patches:
466 if (printcount % 50) == 0:
467 print >> sys.stderr, 'Grabbing changesets...%d\r' % printcount,
470 # We want to ignore commits on svn tags since in Subversion
471 # that means a copy of the whole repository, which leads to
472 # wrong results. Some migrations from Subversion to Git do
473 # not catch all these tags/copies and import them just as a new
475 if is_svntag(logpatch):
478 p = grabpatch(logpatch)
481 # if p.added > 100000 or p.removed > 100000:
482 # print 'Skipping massive add', p.commit
484 if FileFilter and p.added == 0 and p.removed == 0:
487 # Apply the company filter if it exists.
489 empl = p.author.emailemployer(p.email, p.date)
490 if CompanyFilter and empl.name != CompanyFilter:
493 # Now note the file accesses if need be.
496 NoteFileAccess(p.files)
498 # Record some global information - but only if this patch had
499 # stuff which wasn't ignored.
501 if ((p.added + p.removed) > 0 or not FileFilter) and not p.merge:
502 TotalAdded += p.added
503 TotalRemoved += p.removed
504 TotalChanged += max(p.added, p.removed)
505 AddDateLines(p.date, max(p.added, p.removed))
509 for sobemail, sobber in p.sobs:
510 empl = sobber.emailemployer(sobemail, p.date)
515 for sobemail, sob in p.sobs:
517 for hacker in p.reviews:
519 for hacker in p.testers:
521 for hacker in p.reports:
524 csvdump.AccumulatePatch(p, Aggregate)
525 csvdump.store_patch(p)
526 print >> sys.stderr, 'Grabbing changesets...done '
530 database.MixVirtuals()
535 hlist = database.AllHackers()
536 elist = database.AllEmployers()
539 if len(h.patches) > 0:
544 reports.Write('Processed %d csets from %d developers\n' % (CSCount,
546 reports.Write('%d employers found\n' % (nempl))
547 reports.Write('A total of %d lines added, %d removed (delta %d)\n' %
548 (TotalAdded, TotalRemoved, TotalAdded - TotalRemoved))
549 if TotalChanged == 0:
550 TotalChanged = 1 # HACK to avoid div by zero
555 csvdump.save_csv(CSVPrefix)
558 csvdump.OutputCSV(CSVFile)
562 reports.DevReports(hlist, TotalChanged, CSCount, TotalRemoved)
564 reports.ReportUnknowns(hlist, CSCount)
565 reports.EmplReports(elist, TotalChanged, CSCount)
567 if ReportByFileType and Numstat:
568 reports.ReportByFileType(hlist)
571 reports.FileAccessReport(FileReport, FileAccesses, CSCount)