6 # This code is part of the LWN git data miner.
8 # Copyright 2007-13 Eklektix, Inc.
9 # Copyright 2007-13 Jonathan Corbet <corbet@lwn.net>
10 # Copyright 2011 Germán Póo-Caamaño <gpoo@gnome.org>
12 # This file may be distributed under the terms of the GNU General
13 # Public License, version 2.
16 import database, csvdump, ConfigFile, reports
17 import getopt, datetime
18 import os, re, sys, string, os.path
20 from email.utils import parsedate
21 from patterns import patterns
23 Today = datetime.date.today()
26 # Remember author names we have griped about.
28 GripedAuthorNames = [ ]
43 CFName = 'gitdm.config'
48 ReportUnknowns = False
54 # -a Andrew Morton's signoffs shadow Linus's
55 # -b dir Specify the base directory to fetch the configuration files
56 # -c cfile Specify a configuration file
57 # -C company Only consider patches from <company>
58 # -d Output individual developer stats
59 # -D Output date statistics
60 # -f file Write touched-files report to <file>
61 # -h hfile HTML output to hfile
62 # -R rfile reStructuredText output to rfile
63 # -H file Export individual developer raw data as CSV
64 # -l count Maximum length for output lists
65 # -n Use numstats instead of generated patch from git log
66 # -o file File for text output
67 # -p prefix Prefix for CSV output
68 # -r pattern Restrict to files matching pattern
69 # -s Ignore author SOB lines
70 # -u Map unknown employers to '(Unknown)'
71 # -U Dump unknown hackers in report
72 # -x file.csv Export raw statistics as CSV
# -w	Aggregate the raw statistics by weeks instead of months
# -y	Aggregate the raw statistics by years instead of months
75 # -z Dump out the hacker database at completion
78 global MapUnknown, DevReports
79 global DateStats, AuthorSOBs, FileFilter, AkpmOverLt, DumpDB
80 global CFName, CSVFile, CSVPrefix,DirName, Aggregate, Numstat
81 global ReportByFileType, ReportUnknowns, CompanyFilter, FileReport
84 opts, rest = getopt.getopt(sys.argv[1:], 'ab:dC:c:Df:H:h:l:no:p:r:R:stUuwx:yz')
91 CompanyFilter = opt[1]
101 reports.SetHTMLOutput(open(opt[1], 'w'))
103 HackersCSV = open (opt[1], 'w')
105 reports.SetMaxList(int(opt[1]))
109 reports.SetOutput(open(opt[1], 'w'))
113 print('Filter on "%s"' % (opt[1]))
114 FileFilter = re.compile(opt[1])
116 reports.SetrSTOutput(open(opt[1], 'w'))
124 ReportUnknowns = True
126 CSVFile = open(opt[1], 'w')
127 print("open output file " + opt[1] + "\n")
128 elif opt [0] == '-w':
130 elif opt [0] == '-y':
136 # Tracking for file accesses.
142 FileAccesses[path] += 1
144 FileAccesses[path] = 1
146 def NoteFileAccess(paths):
148 # Keep separate track of what we've noted in this set so that each level
149 # of the tree only gets a single note from one patch.
153 if path.startswith('a/') or path.startswith('b/'):
157 path, last = os.path.split(path)
158 while path and path not in ['a', 'b', '/']:
163 path, last = os.path.split(path)
166 # Local version still, for now
def LookupStoreHacker(name, email):
    """Find or create the hacker record for (name, email).

    Thin local wrapper around database.LookupStoreHacker that supplies
    the module-wide MapUnknown option so callers need not pass it.
    """
    hacker = database.LookupStoreHacker(name, email, MapUnknown)
    return hacker
177 def AddDateLines(date, lines):
179 print('Skip big patch (%d)' % lines)
182 DateMap[date] += lines
184 DateMap[date] = lines
186 def PrintDateStats():
187 dates = DateMap.keys()
190 datef = open('datelc.csv', 'w')
191 datef.write('Date,Changed,Total Changed\n')
193 total += DateMap[date]
194 datef.write('%d/%02d/%02d,%d,%d\n' % (date.year, date.month, date.day,
195 DateMap[date], total))
199 # Let's slowly try to move some smarts into this class.
202 (ADDED, REMOVED) = range(2)
    def __init__(self, commit):
        # Start with neutral counters; they are filled in as the patch
        # body is parsed (see grabpatch).
        self.merge = self.added = self.removed = 0
        # Placeholder author until an Author: line is seen.
        # NOTE(review): extraction gap — the original also appears to
        # initialize self.commit and the sobs/reviews/testers/reports/
        # files/filetypes containers here; those lines are missing from
        # this view.  TODO confirm against the full source.
        self.author = LookupStoreHacker('Unknown hacker', 'unknown@hacker.net')
        self.email = 'unknown@hacker.net'
216 def addreviewer(self, reviewer):
217 self.reviews.append(reviewer)
219 def addtester(self, tester):
220 self.testers.append(tester)
222 def addreporter(self, reporter):
223 self.reports.append(reporter)
225 def addfiletype(self, filetype, added, removed):
226 if filetype in self.filetypes:
227 self.filetypes[filetype][self.ADDED] += added
228 self.filetypes[filetype][self.REMOVED] += removed
230 self.filetypes[filetype] = [added, removed]
232 def addfile(self, name):
233 self.files.append(name)
236 def parse_numstat(line, file_filter):
238 Receive a line of text, determine if fits a numstat line and
239 parse the added and removed lines as well as the file type.
241 m = patterns['numstat'].match(line)
243 filename = m.group(3)
244 # If we have a file filter, check for file lines.
245 if file_filter and not file_filter.search(filename):
246 return None, None, None, None
249 added = int(m.group(1))
250 removed = int(m.group(2))
252 # A binary file (image, etc.) is marked with '-'
255 m = patterns['rename'].match(filename)
257 filename = '%s%s%s' % (m.group(1), m.group(3), m.group(4))
259 filetype = database.FileTypes.guess_file_type(os.path.basename(filename))
260 return filename, filetype, added, removed
262 return None, None, None, None
265 # The core hack for grabbing the information about a changeset.
267 def grabpatch(logpatch):
268 m = patterns['commit'].match(logpatch[0])
272 p = patch(m.group(1))
273 ignore = (FileFilter is not None)
275 for Line in logpatch[1:]:
277 # Maybe it's an author line?
279 m = patterns['author'].match(Line)
281 p.email = database.RemapEmail(m.group(2))
282 p.author = LookupStoreHacker(m.group(1), p.email)
285 # Could be a signed-off-by:
287 m = patterns['signed-off-by'].match(Line)
289 email = database.RemapEmail(m.group(2))
290 sobber = LookupStoreHacker(m.group(1), email)
291 if sobber != p.author or AuthorSOBs:
292 p.sobs.append((email, LookupStoreHacker(m.group(1), m.group(2))))
295 # Various other tags of interest.
297 m = patterns['reviewed-by'].match(Line)
299 email = database.RemapEmail(m.group(2))
300 p.addreviewer(LookupStoreHacker(m.group(1), email))
302 m = patterns['tested-by'].match(Line)
304 email = database.RemapEmail(m.group(2))
305 p.addtester(LookupStoreHacker(m.group(1), email))
306 p.author.testcredit(patch)
309 m = patterns['reported-by'].match(Line)
311 email = database.RemapEmail(m.group(2))
312 p.addreporter(LookupStoreHacker(m.group(1), email))
313 p.author.reportcredit(patch)
316 # Syzbot has its own special reported-by that nobody else uses,
317 # and they get somewhat tetchy if we don't recognize them anyway.
319 m = patterns['reported-by2'].match(Line)
321 email = database.RemapEmail(m.group(1))
322 p.addreporter(LookupStoreHacker(email, email))
323 p.author.reportcredit(patch)
325 # Reported-and-tested-by:
326 m = patterns['reported-and-tested-by'].match(Line)
328 email = database.RemapEmail(m.group(2))
329 h = LookupStoreHacker(m.group(1), email)
332 p.author.reportcredit(patch)
333 p.author.testcredit(patch)
336 # If this one is a merge, make note of the fact.
338 m = patterns['merge'].match(Line)
343 # See if it's the date.
345 m = patterns['date'].match(Line)
347 dt = parsedate(m.group(2))
348 p.date = datetime.date(dt[0], dt[1], dt[2])
350 sys.stderr.write('Funky date: %s\n' % p.date)
355 # If we have a file filter, check for file lines.
358 ignore = ApplyFileFilter(Line, ignore)
360 # If we are tracking files touched, look for a relevant line here.
362 if FileReport and not ignore:
363 m = patterns['filea'].match(Line)
366 if file == '/dev/null':
369 p.addfile(m.group(1))
372 m = patterns['fileb'].match(Line)
374 p.addfile(m.group(1))
378 # OK, maybe it's part of the diff itself.
381 if patterns['add'].match(Line):
384 if patterns['rem'].match(Line):
388 # Grab data in the numstat format.
390 (filename, filetype, added, removed) = parse_numstat(Line, FileFilter)
394 p.addfiletype(filetype, added, removed)
397 if '@' in p.author.name:
398 GripeAboutAuthorName(p.author.name)
def GripeAboutAuthorName(name):
    """Warn once per author name that looks suspicious.

    Called when a commit author's name contains '@' (i.e. an email
    address was used where a real name belongs).  Repeat warnings are
    suppressed via the module-level GripedAuthorNames list.  (The early
    return was lost in this copy of the file, which inverted the
    suppression logic; restored here.)
    """
    if name in GripedAuthorNames:
        return
    GripedAuthorNames.append(name)
    print('%s is an author name, probably not what you want' % (name))
408 def ApplyFileFilter(line, ignore):
410 # If this is the first file line (--- a/), set ignore one way
413 m = patterns['filea'].match(line)
416 if FileFilter.search(file):
420 # For the second line, we can turn ignore off, but not on
422 m = patterns['fileb'].match(line)
425 if FileFilter.search(file):
429 def is_svntag(logpatch):
431 This is a workaround for a bug on the migration to Git
432 from Subversion found in GNOME. It may happen in other
433 repositories as well.
436 for Line in logpatch:
437 m = patterns['svn-tag'].match(Line.strip())
439 sys.stderr.write('(W) detected a commit on a svn tag: %s\n' %
446 # If this patch is signed off by both Andrew Morton and Linus Torvalds,
447 # remove the (redundant) Linus signoff.
450 if AkpmOverLt == 1 and Linus in p.sobs and Akpm in p.sobs:
455 # Here starts the real program.
460 # Read the config files.
462 ConfigFile.ConfigFile(CFName, DirName)
463 database.CheckAliases()
465 # Let's pre-seed the database with a couple of hackers
466 # we want to remember.
469 Linus = ('torvalds@linux-foundation.org',
470 LookupStoreHacker('Linus Torvalds', 'torvalds@linux-foundation.org'))
471 Akpm = ('akpm@linux-foundation.org',
472 LookupStoreHacker('Andrew Morton', 'akpm@linux-foundation.org'))
473 LookupStoreHacker('Syzbot', 'syzkaller@googlegroups.com') # XXX
475 TotalChanged = TotalAdded = TotalRemoved = 0
480 print('Grabbing changesets...\r', file=sys.stderr, end='')
482 patches = logparser.LogPatchSplitter(sys.stdin)
483 printcount = CSCount = 0
485 for logpatch in patches:
486 if (printcount % 50) == 0:
487 print('Grabbing changesets...%d\r' % printcount, file = sys.stderr,
491 # We want to ignore commits on svn tags since in Subversion
# that means a copy of the whole repository, which leads to
# wrong results.  Some migrations from Subversion to Git do
# not catch all these tags/copies and import them just as a new
496 if is_svntag(logpatch):
499 p = grabpatch(logpatch)
502 # if p.added > 100000 or p.removed > 100000:
503 # print 'Skipping massive add', p.commit
505 if FileFilter and p.added == 0 and p.removed == 0:
508 # Apply the company filter if it exists.
510 empl = p.author.emailemployer(p.email, p.date)
511 if CompanyFilter and empl.name != CompanyFilter:
514 # Now note the file accesses if need be.
517 NoteFileAccess(p.files)
519 # Record some global information - but only if this patch had
520 # stuff which wasn't ignored.
522 if ((p.added + p.removed) > 0 or not FileFilter) and not p.merge:
523 TotalAdded += p.added
524 TotalRemoved += p.removed
525 TotalChanged += max(p.added, p.removed)
526 AddDateLines(p.date, max(p.added, p.removed))
530 for sobemail, sobber in p.sobs:
531 empl = sobber.emailemployer(sobemail, p.date)
536 for sobemail, sob in p.sobs:
538 for hacker in p.reviews:
540 for hacker in p.testers:
542 for hacker in p.reports:
545 csvdump.AccumulatePatch(p, Aggregate)
546 csvdump.store_patch(p)
547 print('Grabbing changesets...done ', file = sys.stderr)
551 database.MixVirtuals()
556 hlist = database.AllHackers()
557 elist = database.AllEmployers()
560 if len(h.patches) > 0:
565 reports.Write('Processed %d csets from %d developers\n' % (CSCount,
567 reports.Write('%d employers found\n' % (nempl))
568 reports.Write('A total of %d lines added, %d removed (delta %d)\n' %
569 (TotalAdded, TotalRemoved, TotalAdded - TotalRemoved))
570 if TotalChanged == 0:
571 TotalChanged = 1 # HACK to avoid div by zero
576 csvdump.OutputHackersCSV (HackersCSV, hlist);
580 csvdump.save_csv(CSVPrefix)
583 csvdump.OutputCSV(CSVFile)
587 reports.DevReports(hlist, TotalChanged, CSCount, TotalRemoved)
589 reports.ReportUnknowns(hlist, CSCount)
590 reports.EmplReports(elist, TotalChanged, CSCount)
592 if ReportByFileType and Numstat:
593 reports.ReportByFileType(hlist)
596 reports.FileAccessReport(FileReport, FileAccesses, CSCount)