Get rid of a debugging print statement.
[git-dm.git] / gitdm
blobd580414b43256b93324d00f83a8051def590ef0b
1 #!/usr/bin/python
5 # This code is part of the LWN git data miner.
7 # Copyright 2007 LWN.net
8 # Copyright 2007 Jonathan Corbet <corbet@lwn.net>
10 # This file may be distributed under the terms of the GNU General
11 # Public License, version 2.
14 import database, ConfigFile
15 import getopt, datetime
16 import os, re, sys, rfc822, string
19 # Some people, when confronted with a problem, think "I know, I'll use regular
20 # expressions." Now they have two problems.
21 # -- Jamie Zawinski
# Patterns for the header lines of each changeset in the log.
Pcommit = re.compile(r'^commit ([0-9a-f]+)$')            # group 1: commit id
Pauthor = re.compile(r'^Author: ([^<]+)\s<([^>]+)>$')    # groups: name, address
Pmerge = re.compile(r'^Merge:.*$')
Pdate = re.compile(r'^(Commit)?Date:\s+(.*)$')           # group 2: date text

# Sign-off tags; this one is used with search(), so it is not anchored.
Psob = re.compile(r'Signed-off-by:\s+([^<]+)\s+<([^>]+)>')

# Patterns for the diff body.  The "---" / "+++" file lines are excluded
# from Prem / Padd by requiring a different second character.
Pfilea = re.compile(r'^---\s+(.*)$')                     # group 1: file name
Pfileb = re.compile(r'^\+\+\+\s+(.*)$')                  # group 1: file name
Padd = re.compile(r'^\+[^\+].*$')
Prem = re.compile(r'^-[^-].*$')
class patch:
    """Plain attribute holder for one changeset; fields are set by the parser."""
#
# Control options.  These are the defaults; ParseOpts() overrides them
# from the command line.
#
CFName = 'gitdm.config'     # -c: configuration file name
Outfile = sys.stdout        # -o: where the text report is written
ListCount = 999999          # -l: maximum length for output lists
FileFilter = None           # -r: compiled pattern restricting files
DevReports = AuthorSOBs = 1             # cleared by -d and -s
MapUnknown = DateStats = 0              # set by -u and -D
AkpmOverLt = DumpDB = 0                 # set by -a and -z
51 # Options:
53 # -a Andrew Morton's signoffs shadow Linus's
54 # -c cfile Specify a configuration file
55 # -d Omit the individual developer reports
56 # -D Output date statistics
57 # -h hfile HTML output to hfile
58 # -l count Maximum length for output lists
59 # -o file File for text output
60 # -r pattern Restrict to files matching pattern
61 # -s Ignore author SOB lines
62 # -u Map unknown employers to '(Unknown)'
63 # -z Dump out the hacker database at completion
def ParseOpts():
    """Parse sys.argv[1:] and store the results in the module-level options.

    Recognized options:
        -a          set AkpmOverLt
        -c cfile    set CFName to cfile
        -d          clear DevReports
        -D          set DateStats
        -h hfile    open hfile for writing as HTMLfile
        -l count    set ListCount to int(count)
        -o file     open file for writing as Outfile
        -r pattern  compile pattern into FileFilter
        -s          clear AuthorSOBs
        -u          set MapUnknown
        -z          set DumpDB

    getopt.GetoptError propagates for unrecognized options; non-option
    arguments are ignored.
    """
    global Outfile, ListCount, MapUnknown, HTMLfile, DevReports
    global DateStats, AuthorSOBs, FileFilter, AkpmOverLt, DumpDB
    global CFName

    # The option string must contain 'c:'; without it getopt rejects -c
    # and the branch below can never run.
    opts, rest = getopt.getopt(sys.argv[1:], 'ac:dDh:l:o:r:suz')
    for flag, value in opts:
        if flag == '-a':
            AkpmOverLt = 1
        elif flag == '-c':
            CFName = value
        elif flag == '-d':
            DevReports = 0
        elif flag == '-D':
            DateStats = 1
        elif flag == '-h':
            HTMLfile = open(value, 'w')
        elif flag == '-l':
            ListCount = int(value)
        elif flag == '-o':
            Outfile = open(value, 'w')
        elif flag == '-r':
            print('Filter on "%s"' % (value))
            FileFilter = re.compile(value)
        elif flag == '-s':
            AuthorSOBs = 0
        elif flag == '-u':
            MapUnknown = 1
        elif flag == '-z':
            DumpDB = 1
def LookupStoreHacker(name, email):
    """Return the database entry for (name, email), creating it if needed.

    The address is passed through database.RemapEmail() first.  Lookup is
    tried by address, then by name; a match by name gets the new address
    added to it.  If neither matches, a new entry is stored.
    """
    email = database.RemapEmail(email)
    hacker = database.LookupEmail(email)
    if hacker:
        # This address is already known.
        return hacker
    employers = database.LookupEmployer(email, MapUnknown)
    hacker = database.LookupName(name)
    if not hacker:
        return database.StoreHacker(name, employers, email)
    # Known name, new address.
    hacker.addemail(email, employers)
    return hacker
111 # Date tracking.
# Maps (year, month, day) tuples to the number of lines changed that day.
DateMap = { }

def AddDateLines(date, lines):
    """Add `lines` to the running total for the day of `date`.

    date  -- object with year, month and day attributes
    lines -- number of changed lines to credit to that day

    Counts above 1000000 are reported on stdout and not recorded.
    """
    if lines > 1000000:
        print('Skip big patch (%d)' % lines)
        return
    # Key on the tuple, not the date object: PrintDateStats() reads the
    # keys back as date[0], date[1], date[2].
    dt = (date.year, date.month, date.day)
    DateMap[dt] = DateMap.get(dt, 0) + lines
def PrintDateStats():
    """Write per-date and cumulative line counts to the file 'datelc'.

    One line is written per DateMap key, in sorted key order, holding the
    date as Y/MM/DD, that date's count and the running total.  Keys are
    read as sequences indexed [0], [1], [2] for year, month and day.
    """
    total = 0
    datef = open('datelc', 'w')
    try:
        for date in sorted(DateMap):
            total += DateMap[date]
            datef.write('%d/%02d/%02d %6d %7d\n' % (date[0], date[1], date[2],
                                                   DateMap[date], total))
    finally:
        # Close explicitly so the data is flushed even if a write fails.
        datef.close()
#
# The core hack for grabbing the information about a changeset.
#
def grabpatch():
    """Read the next changeset from standard input and return it.

    The module-level NextLine is a one-line lookahead.  On entry it holds
    the next unprocessed line; on return it holds either the "commit" line
    of the following changeset or an empty string at end of input.

    Returns a patch instance with commit, merge, added, removed, author,
    email and sobs set, plus date when a date line was seen.  Returns None
    if the input ends before another "commit" line is found.

    For changesets that are not merges (and, with a file filter, touched
    matching lines) this also updates TotalAdded, TotalRemoved,
    TotalChanged, the date map and the employer changeset/sign-off counts.

    NOTE(review): p.date is assigned only when a date line matches, and
    the bookkeeping at the end reads it unconditionally.
    """
    global NextLine, TotalAdded, TotalRemoved, TotalChanged

    #
    # Skip forward to the next "commit" line; give up at end of input.
    #
    while True:
        m = Pcommit.match(NextLine)
        if m:
            break
        NextLine = sys.stdin.readline()
        if not NextLine:
            return None

    p = patch()
    p.commit = m.group(1)
    p.merge = p.added = p.removed = 0
    p.author = LookupStoreHacker('Unknown hacker', 'unknown@hacker.net')
    p.email = 'unknown@hacker.net'
    p.sobs = [ ]
    NextLine = sys.stdin.readline()
    # With a file filter, count nothing until a matching file line is seen.
    ignore = (FileFilter is not None)
    while NextLine:
        Line = NextLine
        #
        # If this line starts a new commit, drop out and leave it in
        # NextLine for the next call.
        #
        m = Pcommit.match(Line)
        if m:
            break
        NextLine = sys.stdin.readline()
        #
        # Maybe it's an author line?
        #
        m = Pauthor.match(Line)
        if m:
            p.email = database.RemapEmail(m.group(2))
            p.author = LookupStoreHacker(m.group(1), p.email)
            continue
        #
        # Could be a signed-off-by:
        #
        m = Psob.search(Line)
        if m:
            email = database.RemapEmail(m.group(2))
            sobber = LookupStoreHacker(m.group(1), email)
            if sobber != p.author or AuthorSOBs:
                # Reuse the entry just looked up instead of querying the
                # database a second time for the same person.
                p.sobs.append((email, sobber))
            continue
        #
        # If this one is a merge, make note of the fact.
        #
        m = Pmerge.match(Line)
        if m:
            p.merge = 1
            continue
        #
        # See if it's the date.
        #
        m = Pdate.match(Line)
        if m:
            dt = rfc822.parsedate(m.group(2))
            p.date = datetime.date(dt[0], dt[1], dt[2])
            continue
        #
        # If we have a file filter, check for file lines.
        #
        if FileFilter:
            ignore = ApplyFileFilter(Line, ignore)
        #
        # OK, maybe it's part of the diff itself.
        #
        if not ignore:
            if Padd.match(Line):
                p.added += 1
                continue
            if Prem.match(Line):
                p.removed += 1
    #
    # Record some global information - but only if this patch had
    # stuff which wasn't ignored.  This work should be done
    # elsewhere.
    #
    # NOTE(review): indentation was lost in the source; the sign-off
    # accounting is assumed to sit inside this condition -- confirm.
    #
    if ((p.added + p.removed) > 0 or not FileFilter) and not p.merge:
        TotalAdded += p.added
        TotalRemoved += p.removed
        TotalChanged += max(p.added, p.removed)
        AddDateLines(p.date, max(p.added, p.removed))
        empl = p.author.emailemployer(p.email, p.date)
        empl.AddCSet(p)
        if AkpmOverLt:
            TrimLTSOBs(p)
        for sobemail, sobber in p.sobs:
            empl = sobber.emailemployer(sobemail, p.date)
            empl.AddSOB()
    return p
def ApplyFileFilter(line, ignore):
    """Return the new "ignore" state after looking at one diff line.

    line   -- a line from the log
    ignore -- the current state (true means diff lines are not counted)

    A "---" file line sets the state outright: 0 if the file name matches
    FileFilter, 1 if it does not.  A "+++" file line can only switch
    ignoring off.  Any other line returns `ignore` unchanged.
    """
    first = Pfilea.match(line)
    if first:
        # First file line (--- a/): decide one way or the other.
        path = first.group(1)
        if not FileFilter.search(path):
            return 1
        return 0

    second = Pfileb.match(line)
    if second and FileFilter.search(second.group(1)):
        # Second file line (+++ b/): may turn ignore off, never on.
        return 0
    return ignore
#
# If this patch is signed off by both Andrew Morton and Linus Torvalds,
# remove the (redundant) Linus signoff.
#
def TrimLTSOBs(p):
    """Drop the Linus entry from p.sobs when the Akpm entry is there too.

    p.sobs is modified in place; only the first Linus entry is removed.
    """
    signoffs = p.sobs
    if Linus not in signoffs:
        return
    if Akpm in signoffs:
        signoffs.remove(Linus)
264 # HTML output support stuff.
# Open file for the HTML report, or None when no HTML output is wanted.
HTMLfile = None
# CSS class names used for table rows, and the index of the next one to use.
HClasses = ['Even', 'Odd']
HTMLclass = 0
# Opening markup for one HTML report table; %s is the report title.
THead = '''<p>
<table cellspacing=3>
<tr><th colspan=3>%s</th></tr>
'''

def BeginReport(title):
    """Start a report section headed by `title`.

    Writes the title to Outfile.  If HTML output is enabled, also writes
    the table header to HTMLfile and resets the row-class index.
    """
    global HTMLclass

    Outfile.write('\n%s\n' % title)
    if HTMLfile:
        HTMLfile.write(THead % title)
        HTMLclass = 0
# One HTML table row: row class, label, count, percentage.
TRow = ''' <tr class="%s">
<td>%s</td><td align="right">%d</td><td align="right">%.1f%%</td></tr>
'''

def ReportLine(text, count, pct):
    """Emit one report row; rows whose count is zero are skipped.

    text  -- label for the row
    count -- integer value for the row
    pct   -- percentage, shown with one decimal place

    The row goes to Outfile and, if HTML output is enabled, to HTMLfile
    with row classes alternating between the entries of HClasses.
    """
    global HTMLclass
    if count == 0:
        return
    Outfile.write('%-25s %4d (%.1f%%)\n' % (text, count, pct))
    if HTMLfile:
        HTMLfile.write(TRow % (HClasses[HTMLclass], text, count, pct))
        HTMLclass ^= 1      # flip between index 0 and 1
def EndReport():
    """Close the current HTML table; does nothing without HTML output."""
    if not HTMLfile:
        return
    HTMLfile.write('</table>\n\n')
302 # Comparison and report generation functions.
def ComparePCount(h1, h2):
    """cmp-style comparator: orders hackers by patch count, largest first."""
    first = len(h1.patches)
    second = len(h2.patches)
    return second - first
def ReportByPCount(hlist):
    """Report developers by number of changesets, largest first.

    hlist is sorted in place.  Each developer with at least one patch gets
    a line giving the patch count and its share of CSCount; at most
    ListCount entries are examined.
    """
    hlist.sort(cmp=ComparePCount)
    BeginReport('Developers with the most changesets')
    for index, h in enumerate(hlist):
        pcount = len(h.patches)
        if pcount > 0:
            ReportLine(h.name, pcount, (pcount*100.0)/CSCount)
        if index + 1 >= ListCount:
            break
    EndReport()
def CompareLChanged(h1, h2):
    """cmp-style comparator: orders hackers by max(added, removed), largest first."""
    first = max(h1.added, h1.removed)
    second = max(h2.added, h2.removed)
    return second - first
def ReportByLChanged(hlist):
    """Report developers by lines changed, largest first.

    "Changed" is max(added, removed).  hlist is sorted in place.  Each
    developer who added or removed anything gets a line giving that figure
    and its share of TotalChanged; at most ListCount entries are examined.
    """
    hlist.sort(cmp=CompareLChanged)
    BeginReport('Developers with the most changed lines')
    for index, h in enumerate(hlist):
        if (h.added + h.removed) > 0:
            changed = max(h.added, h.removed)
            ReportLine(h.name, changed, (changed*100.0)/TotalChanged)
        if index + 1 >= ListCount:
            break
    EndReport()
def CompareLRemoved(h1, h2):
    """cmp-style comparator: orders hackers by (removed - added), largest first."""
    first = h1.removed - h1.added
    second = h2.removed - h2.added
    return second - first
def ReportByLRemoved(hlist):
    """Report developers by net lines removed, largest first.

    The figure shown is removed minus added.  hlist is sorted in place.
    Only developers who removed more than they added get a line, with the
    figure's share of TotalRemoved; at most ListCount entries are examined.
    """
    hlist.sort(cmp=CompareLRemoved)
    BeginReport('Developers with the most lines removed')
    for index, h in enumerate(hlist):
        net_removed = h.removed - h.added
        if net_removed > 0:
            ReportLine(h.name, net_removed, (net_removed*100.0)/TotalRemoved)
        if index + 1 >= ListCount:
            break
    EndReport()
def CompareEPCount(e1, e2):
    """cmp-style comparator: orders employers by `count`, largest first."""
    difference = e2.count
    difference -= e1.count
    return difference
def ReportByPCEmpl(elist):
    """Report employers by changeset count, largest first.

    elist is sorted in place.  Each employer with a nonzero count gets a
    line giving the count and its share of CSCount; at most ListCount
    entries are examined.
    """
    elist.sort(cmp=CompareEPCount)
    BeginReport('Top changeset contributors by employer')
    for index, e in enumerate(elist):
        if e.count:
            ReportLine(e.name, e.count, (e.count*100.0)/CSCount)
        if index + 1 >= ListCount:
            break
    EndReport()
def CompareELChanged(e1, e2):
    """cmp-style comparator: orders employers by `changed`, largest first."""
    difference = e2.changed
    difference -= e1.changed
    return difference
def ReportByELChanged(elist):
    """Report employers by lines changed, largest first.

    elist is sorted in place.  Each employer with a nonzero `changed`
    value gets a line giving it and its share of TotalChanged; at most
    ListCount entries are examined.
    """
    elist.sort(cmp=CompareELChanged)
    BeginReport('Top lines changed by employer')
    for index, e in enumerate(elist):
        if e.changed:
            ReportLine(e.name, e.changed, (e.changed*100.0)/TotalChanged)
        if index + 1 >= ListCount:
            break
    EndReport()
def CompareSOBs(h1, h2):
    """cmp-style comparator: orders hackers by sign-off count, largest first."""
    first = len(h1.signoffs)
    second = len(h2.signoffs)
    return second - first
def ReportBySOBs(hlist):
    """Report developers by number of sign-offs, largest first.

    hlist is sorted in place.  The title carries the total sign-off count
    over all of hlist; each developer with at least one sign-off gets a
    line giving the count and its share of that total.  At most ListCount
    entries are examined.
    """
    hlist.sort(cmp=CompareSOBs)
    totalsobs = sum([len(h.signoffs) for h in hlist])
    BeginReport('Developers with the most signoffs (total %d)' % totalsobs)
    for index, h in enumerate(hlist):
        scount = len(h.signoffs)
        if scount > 0:
            ReportLine(h.name, scount, (scount*100.0)/totalsobs)
        if index + 1 >= ListCount:
            break
    EndReport()
def CompareESOBs(e1, e2):
    """cmp-style comparator: orders employers by `sobs`, largest first."""
    difference = e2.sobs
    difference -= e1.sobs
    return difference
def ReportByESOBs(elist):
    """Report employers by number of sign-offs, largest first.

    elist is sorted in place.  The title carries the total of `sobs` over
    all of elist; each employer with a positive value gets a line giving
    it and its share of that total.  At most ListCount entries are
    examined.
    """
    elist.sort(cmp=CompareESOBs)
    totalsobs = sum([e.sobs for e in elist])
    BeginReport('Employers with the most signoffs (total %d)' % totalsobs)
    for index, e in enumerate(elist):
        if e.sobs > 0:
            ReportLine(e.name, e.sobs, (e.sobs*100.0)/totalsobs)
        if index + 1 >= ListCount:
            break
    EndReport()
#
# Here starts the real program.  Parse the command line first: it sets
# CFName, which names the config file read below, and MapUnknown, which
# LookupStoreHacker() reads when the two hackers are pre-seeded.
#
ParseOpts()

#
# Read the config files.
#
ConfigFile.ConfigFile(CFName)

#
# Let's pre-seed the database with a couple of hackers
# we want to remember.  Each is kept as an (email, hacker) pair, the
# same shape as the entries stored in a patch's sobs list.
#
Linus = ('torvalds@linux-foundation.org',
         LookupStoreHacker('Linus Torvalds', 'torvalds@linux-foundation.org'))
Akpm = ('akpm@linux-foundation.org',
        LookupStoreHacker('Andrew Morton', 'akpm@linux-foundation.org'))

# Prime the one-line lookahead used by grabpatch() and zero the totals.
NextLine = sys.stdin.readline()
TotalChanged = TotalAdded = TotalRemoved = 0
#
# Snarf changesets.  Progress goes to stderr, rewritten in place with a
# carriage return after every 50 calls to grabpatch().
#
sys.stderr.write('Grabbing changesets...\r')

printcount = CSCount = 0
while True:
    if not printcount % 50:
        sys.stderr.write('Grabbing changesets...%d\r' % printcount)
    printcount += 1
    p = grabpatch()
    if not p:
        break
    if max(p.added, p.removed) > 100000:
        print('Skipping massive add')
        continue
    # With a file filter, drop changesets that touched no matching lines.
    if FileFilter and not (p.added or p.removed):
        continue
    # NOTE(review): indentation was lost in the source; the sign-off loop
    # is assumed to apply to non-merge changesets only and CSCount to
    # count every changeset that gets this far -- confirm.
    if not p.merge:
        p.author.addpatch(p)
        for sobemail, sob in p.sobs:
            sob.addsob(p)
    CSCount += 1
sys.stderr.write('Grabbing changesets...done\n')
if DumpDB:
    database.DumpDB()

#
# Say something: the summary lines first, then the reports.
#
hlist = database.AllHackers()
elist = database.AllEmployers()
Outfile.write('Processed %d csets from %d developers\n' % (CSCount,
                                                            len(hlist)))
Outfile.write('%d employers found\n' % len(elist))
Outfile.write('A total of %d lines added, %d removed (delta %d)\n' %
              (TotalAdded, TotalRemoved, TotalAdded - TotalRemoved))

# HACK to avoid div by zero in the percentage calculations.
TotalChanged = TotalChanged or 1

# Date statistics replace the normal reports entirely.
if DateStats:
    PrintDateStats()
    sys.exit(0)

if DevReports:
    for report in (ReportByPCount, ReportByLChanged,
                   ReportByLRemoved, ReportBySOBs):
        report(hlist)
for report in (ReportByPCEmpl, ReportByELChanged, ReportByESOBs):
    report(elist)