6 # This code is part of the LWN git data miner.
8 # Copyright 2007-13 Eklektix, Inc.
9 # Copyright 2007-13 Jonathan Corbet <corbet@lwn.net>
10 # Copyright 2011 Germán Póo-Caamaño <gpoo@gnome.org>
12 # This file may be distributed under the terms of the GNU General
13 # Public License, version 2.
16 import database, csvdump, ConfigFile, reports
17 import getopt, datetime
18 import os, re, sys, string, os.path
20 from email.utils import parsedate
21 from patterns import patterns
23 Today = datetime.date.today()
26 # Remember author names we have griped about.
28 GripedAuthorNames = [ ]
43 CFName = 'gitdm.config'
48 ReportUnknowns = False
54 # -a Andrew Morton's signoffs shadow Linus's
55 # -b dir Specify the base directory to fetch the configuration files
56 # -c cfile Specify a configuration file
57 # -C company Only consider patches from <company>
58 # -d Output individual developer stats
59 # -D Output date statistics
60 # -f file Write touched-files report to <file>
61 # -h hfile HTML output to hfile
62 # -H file Export individual developer raw data as CSV
63 # -l count Maximum length for output lists
64 # -n Use numstats instead of generated patch from git log
65 # -o file File for text output
66 # -p prefix Prefix for CSV output
67 # -r pattern Restrict to files matching pattern
68 # -s Ignore author SOB lines
69 # -u Map unknown employers to '(Unknown)'
70 # -U Dump unknown hackers in report
71 # -x file.csv Export raw statistics as CSV
# -w	Aggregate the raw statistics by weeks instead of months
# -y	Aggregate the raw statistics by years instead of months
74 # -z Dump out the hacker database at completion
77 global MapUnknown, DevReports
78 global DateStats, AuthorSOBs, FileFilter, AkpmOverLt, DumpDB
79 global CFName, CSVFile, CSVPrefix,DirName, Aggregate, Numstat
80 global ReportByFileType, ReportUnknowns, CompanyFilter, FileReport
83 opts, rest = getopt.getopt(sys.argv[1:], 'ab:dC:c:Df:H:h:l:no:p:r:stUuwx:yz')
90 CompanyFilter = opt[1]
100 reports.SetHTMLOutput(open(opt[1], 'w'))
102 HackersCSV = open (opt[1], 'w')
104 reports.SetMaxList(int(opt[1]))
108 reports.SetOutput(open(opt[1], 'w'))
112 print('Filter on "%s"' % (opt[1]))
113 FileFilter = re.compile(opt[1])
121 ReportUnknowns = True
123 CSVFile = open(opt[1], 'w')
124 print("open output file " + opt[1] + "\n")
125 elif opt [0] == '-w':
127 elif opt [0] == '-y':
133 # Tracking for file accesses.
139 FileAccesses[path] += 1
141 FileAccesses[path] = 1
143 def NoteFileAccess(paths):
145 # Keep separate track of what we've noted in this set so that each level
146 # of the tree only gets a single note from one patch.
150 if path.startswith('a/') or path.startswith('b/'):
154 path, last = os.path.split(path)
155 while path and path not in ['a', 'b', '/']:
160 path, last = os.path.split(path)
163 # Local version still, for now
def LookupStoreHacker(name, email):
    """Find or create the hacker record for (name, email).

    Thin local wrapper that threads the global MapUnknown policy
    through to database.LookupStoreHacker().
    """
    hacker = database.LookupStoreHacker(name, email, MapUnknown)
    return hacker
174 def AddDateLines(date, lines):
176 print('Skip big patch (%d)' % lines)
179 DateMap[date] += lines
181 DateMap[date] = lines
183 def PrintDateStats():
184 dates = DateMap.keys()
187 datef = open('datelc.csv', 'w')
188 datef.write('Date,Changed,Total Changed\n')
190 total += DateMap[date]
191 datef.write('%d/%02d/%02d,%d,%d\n' % (date.year, date.month, date.day,
192 DateMap[date], total))
196 # Let's slowly try to move some smarts into this class.
199 (ADDED, REMOVED) = range(2)
201 def __init__(self, commit):
203 self.merge = self.added = self.removed = 0
204 self.author = LookupStoreHacker('Unknown hacker', 'unknown@hacker.net')
205 self.email = 'unknown@hacker.net'
def addreviewer(self, reviewer):
    """Record a Reviewed-by: credit for this changeset."""
    self.reviews += [reviewer]
def addtester(self, tester):
    """Record a Tested-by: credit for this changeset."""
    self.testers += [tester]
def addreporter(self, reporter):
    """Record a Reported-by: credit for this changeset."""
    self.reports += [reporter]
222 def addfiletype(self, filetype, added, removed):
223 if filetype in self.filetypes:
224 self.filetypes[filetype][self.ADDED] += added
225 self.filetypes[filetype][self.REMOVED] += removed
227 self.filetypes[filetype] = [added, removed]
def addfile(self, name):
    """Remember one file touched by this patch (for the file-access report)."""
    self.files += [name]
233 def parse_numstat(line, file_filter):
235 Receive a line of text, determine if fits a numstat line and
236 parse the added and removed lines as well as the file type.
238 m = patterns['numstat'].match(line)
240 filename = m.group(3)
241 # If we have a file filter, check for file lines.
242 if file_filter and not file_filter.search(filename):
243 return None, None, None, None
246 added = int(m.group(1))
247 removed = int(m.group(2))
249 # A binary file (image, etc.) is marked with '-'
252 m = patterns['rename'].match(filename)
254 filename = '%s%s%s' % (m.group(1), m.group(3), m.group(4))
256 filetype = database.FileTypes.guess_file_type(os.path.basename(filename))
257 return filename, filetype, added, removed
259 return None, None, None, None
262 # The core hack for grabbing the information about a changeset.
264 def grabpatch(logpatch):
265 m = patterns['commit'].match(logpatch[0])
269 p = patch(m.group(1))
270 ignore = (FileFilter is not None)
272 for Line in logpatch[1:]:
274 # Maybe it's an author line?
276 m = patterns['author'].match(Line)
278 p.email = database.RemapEmail(m.group(2))
279 p.author = LookupStoreHacker(m.group(1), p.email)
282 # Could be a signed-off-by:
284 m = patterns['signed-off-by'].match(Line)
286 email = database.RemapEmail(m.group(2))
287 sobber = LookupStoreHacker(m.group(1), email)
288 if sobber != p.author or AuthorSOBs:
289 p.sobs.append((email, LookupStoreHacker(m.group(1), m.group(2))))
292 # Various other tags of interest.
294 m = patterns['reviewed-by'].match(Line)
296 email = database.RemapEmail(m.group(2))
297 p.addreviewer(LookupStoreHacker(m.group(1), email))
299 m = patterns['tested-by'].match(Line)
301 email = database.RemapEmail(m.group(2))
302 p.addtester(LookupStoreHacker(m.group(1), email))
303 p.author.testcredit(patch)
306 m = patterns['reported-by'].match(Line)
308 email = database.RemapEmail(m.group(2))
309 p.addreporter(LookupStoreHacker(m.group(1), email))
310 p.author.reportcredit(patch)
313 # Syzbot has its own special reported-by that nobody else uses,
314 # and they get somewhat tetchy if we don't recognize them anyway.
316 m = patterns['reported-by2'].match(Line)
318 email = database.RemapEmail(m.group(1))
319 p.addreporter(LookupStoreHacker(email, email))
320 p.author.reportcredit(patch)
322 # Reported-and-tested-by:
323 m = patterns['reported-and-tested-by'].match(Line)
325 email = database.RemapEmail(m.group(2))
326 h = LookupStoreHacker(m.group(1), email)
329 p.author.reportcredit(patch)
330 p.author.testcredit(patch)
333 # If this one is a merge, make note of the fact.
335 m = patterns['merge'].match(Line)
340 # See if it's the date.
342 m = patterns['date'].match(Line)
344 dt = parsedate(m.group(2))
345 p.date = datetime.date(dt[0], dt[1], dt[2])
347 sys.stderr.write('Funky date: %s\n' % p.date)
352 # If we have a file filter, check for file lines.
355 ignore = ApplyFileFilter(Line, ignore)
357 # If we are tracking files touched, look for a relevant line here.
359 if FileReport and not ignore:
360 m = patterns['filea'].match(Line)
363 if file == '/dev/null':
366 p.addfile(m.group(1))
369 m = patterns['fileb'].match(Line)
371 p.addfile(m.group(1))
375 # OK, maybe it's part of the diff itself.
378 if patterns['add'].match(Line):
381 if patterns['rem'].match(Line):
385 # Grab data in the numstat format.
387 (filename, filetype, added, removed) = parse_numstat(Line, FileFilter)
391 p.addfiletype(filetype, added, removed)
394 if '@' in p.author.name:
395 GripeAboutAuthorName(p.author.name)
def GripeAboutAuthorName(name):
    # Complain when an author "name" looks wrong (the caller triggers on
    # names containing '@'), but only once per name per run.
    #
    # As presented, the "if name in GripedAuthorNames:" guard had no body,
    # so every occurrence would gripe again; restore the early return so
    # each suspicious name is reported exactly once.
    if name in GripedAuthorNames:
        return
    GripedAuthorNames.append(name)
    print('%s is an author name, probably not what you want' % (name))
405 def ApplyFileFilter(line, ignore):
407 # If this is the first file line (--- a/), set ignore one way
410 m = patterns['filea'].match(line)
413 if FileFilter.search(file):
417 # For the second line, we can turn ignore off, but not on
419 m = patterns['fileb'].match(line)
422 if FileFilter.search(file):
426 def is_svntag(logpatch):
428 This is a workaround for a bug on the migration to Git
429 from Subversion found in GNOME. It may happen in other
430 repositories as well.
433 for Line in logpatch:
434 m = patterns['svn-tag'].match(Line.strip())
436 sys.stderr.write('(W) detected a commit on a svn tag: %s\n' %
443 # If this patch is signed off by both Andrew Morton and Linus Torvalds,
444 # remove the (redundant) Linus signoff.
447 if AkpmOverLt == 1 and Linus in p.sobs and Akpm in p.sobs:
452 # Here starts the real program.
457 # Read the config files.
459 ConfigFile.ConfigFile(CFName, DirName)
460 database.CheckAliases()
462 # Let's pre-seed the database with a couple of hackers
463 # we want to remember.
# Pre-seed the hacker database with identities that are special-cased
# later: Linus/Akpm for redundant-signoff suppression, and Syzbot for
# its nonstandard Reported-by: form.
Linus = ('torvalds@linux-foundation.org',
         LookupStoreHacker('Linus Torvalds', 'torvalds@linux-foundation.org'))
Akpm = ('akpm@linux-foundation.org',
        LookupStoreHacker('Andrew Morton', 'akpm@linux-foundation.org'))
LookupStoreHacker('Syzbot', 'syzkaller@googlegroups.com') # XXX
472 TotalChanged = TotalAdded = TotalRemoved = 0
477 print('Grabbing changesets...\r', file=sys.stderr, end='')
479 patches = logparser.LogPatchSplitter(sys.stdin)
480 printcount = CSCount = 0
482 for logpatch in patches:
483 if (printcount % 50) == 0:
484 print('Grabbing changesets...%d\r' % printcount, file = sys.stderr,
488 # We want to ignore commits on svn tags since in Subversion
# that means a copy of the whole repository, which leads to
# wrong results. Some migrations from Subversion to Git do
# not catch all these tags/copies and import them just as a new
493 if is_svntag(logpatch):
496 p = grabpatch(logpatch)
499 # if p.added > 100000 or p.removed > 100000:
500 # print 'Skipping massive add', p.commit
502 if FileFilter and p.added == 0 and p.removed == 0:
505 # Apply the company filter if it exists.
507 empl = p.author.emailemployer(p.email, p.date)
508 if CompanyFilter and empl.name != CompanyFilter:
511 # Now note the file accesses if need be.
514 NoteFileAccess(p.files)
516 # Record some global information - but only if this patch had
517 # stuff which wasn't ignored.
519 if ((p.added + p.removed) > 0 or not FileFilter) and not p.merge:
520 TotalAdded += p.added
521 TotalRemoved += p.removed
522 TotalChanged += max(p.added, p.removed)
523 AddDateLines(p.date, max(p.added, p.removed))
527 for sobemail, sobber in p.sobs:
528 empl = sobber.emailemployer(sobemail, p.date)
533 for sobemail, sob in p.sobs:
535 for hacker in p.reviews:
537 for hacker in p.testers:
539 for hacker in p.reports:
542 csvdump.AccumulatePatch(p, Aggregate)
543 csvdump.store_patch(p)
544 print('Grabbing changesets...done ', file = sys.stderr)
548 database.MixVirtuals()
553 hlist = database.AllHackers()
554 elist = database.AllEmployers()
557 if len(h.patches) > 0:
562 reports.Write('Processed %d csets from %d developers\n' % (CSCount,
564 reports.Write('%d employers found\n' % (nempl))
565 reports.Write('A total of %d lines added, %d removed (delta %d)\n' %
566 (TotalAdded, TotalRemoved, TotalAdded - TotalRemoved))
567 if TotalChanged == 0:
568 TotalChanged = 1 # HACK to avoid div by zero
573 csvdump.OutputHackersCSV (HackersCSV, hlist);
577 csvdump.save_csv(CSVPrefix)
580 csvdump.OutputCSV(CSVFile)
584 reports.DevReports(hlist, TotalChanged, CSCount, TotalRemoved)
586 reports.ReportUnknowns(hlist, CSCount)
587 reports.EmplReports(elist, TotalChanged, CSCount)
589 if ReportByFileType and Numstat:
590 reports.ReportByFileType(hlist)
593 reports.FileAccessReport(FileReport, FileAccesses, CSCount)