Made the CSV file aggregating data by weeks or months
[git-dm.git] / gitdm
blob70f813885f63ce903534f549b3f59bb7d20a3474
1 #!/usr/bin/python
5 # This code is part of the LWN git data miner.
7 # Copyright 2007-9 LWN.net
8 # Copyright 2007-9 Jonathan Corbet <corbet@lwn.net>
10 # This file may be distributed under the terms of the GNU General
11 # Public License, version 2.
14 import database, csv, ConfigFile, reports
15 import getopt, datetime
16 import os, re, sys, rfc822, string
17 from patterns import *
# Date of this run; grabpatch() clamps commit dates later than this.
Today = datetime.date.today()

#
# Author names GripeAboutAuthorName() has already complained about.
#
GripedAuthorNames = []

#
# Control options.  These are the defaults; ParseOpts() overrides them
# from the command line.
#
AkpmOverLt = 0              # set by -a
DirName = ''                # -b dir
CFName = 'gitdm.config'     # -c cfile
DevReports = 1              # cleared by -d
DateStats = 0               # set by -D
FileFilter = None           # compiled regex from -r pattern
AuthorSOBs = 1              # cleared by -s
MapUnknown = 0              # set by -u
Aggregate = 'month'         # 'week' when -w is given
CSVFile = None              # open file object from -x file.csv
DumpDB = 0                  # set by -z
42 # Options:
44 # -a Andrew Morton's signoffs shadow Linus's
45 # -b dir Specify the base directory to fetch the configuration files
46 # -c cfile Specify a configuration file
47 # -d Omit individual developer stats
48 # -D Output date statistics
49 # -h hfile HTML output to hfile
50 # -l count Maximum length for output lists
51 # -o file File for text output
52 # -r pattern Restrict to files matching pattern
53 # -s Ignore author SOB lines
54 # -u Map unknown employers to '(Unknown)'
55 # -x file.csv Export raw statistics as CSV
56 # -w Aggregate the raw statistics by weeks instead of months
57 # -z Dump out the hacker database at completion
def ParseOpts ():
    """Parse sys.argv[1:] and update the module-level control options.

    Flags (-a, -d, -D, -s, -u, -w, -z) flip the matching global.  The
    options that carry a value behave as follows:

        -b dir      stored in DirName
        -c cfile    stored in CFName
        -h hfile    opened for writing and handed to reports.SetHTMLOutput
        -l count    converted to int and handed to reports.SetMaxList
        -o file     opened for writing and handed to reports.SetOutput
        -r pattern  compiled into FileFilter
        -x file     opened for writing and stored in CSVFile

    getopt.GetoptError from an unknown option or a missing argument is
    not caught here and propagates to the caller.
    """
    global MapUnknown, DevReports
    global DateStats, AuthorSOBs, FileFilter, AkpmOverLt, DumpDB
    global CFName, CSVFile, DirName, Aggregate

    #
    # -a is a plain flag, so it must not be followed by ':' in the
    # option string; otherwise getopt would consume the next word on
    # the command line as its argument.
    #
    opts, rest = getopt.getopt (sys.argv[1:], 'ab:dc:Dh:l:o:r:suwx:z')
    for opt, value in opts:
        if opt == '-a':
            AkpmOverLt = 1
        elif opt == '-b':
            DirName = value
        elif opt == '-c':
            CFName = value
        elif opt == '-d':
            DevReports = 0
        elif opt == '-D':
            DateStats = 1
        elif opt == '-h':
            reports.SetHTMLOutput (open (value, 'w'))
        elif opt == '-l':
            reports.SetMaxList (int (value))
        elif opt == '-o':
            reports.SetOutput (open (value, 'w'))
        elif opt == '-r':
            print ('Filter on "%s"' % (value))
            FileFilter = re.compile (value)
        elif opt == '-s':
            AuthorSOBs = 0
        elif opt == '-u':
            MapUnknown = 1
        elif opt == '-x':
            CSVFile = open (value, 'w')
            print ("open output file " + value + "\n")
        elif opt == '-w':
            Aggregate = 'week'
        elif opt == '-z':
            DumpDB = 1
def LookupStoreHacker (name, email):
    """Return the database entry for this person, creating it if needed.

    The email is first passed through database.RemapEmail.  The lookup
    then goes by email, then by name; a hit by name gets the new email
    (and its employer list) added to the existing entry.  If neither
    lookup finds anything, a new entry is stored and returned.
    """
    email = database.RemapEmail (email)

    hacker = database.LookupEmail (email)
    if hacker:
        # This address is already known.
        return hacker

    employers = database.LookupEmployer (email, MapUnknown)
    hacker = database.LookupName (name)
    if not hacker:
        # Nothing by email or by name: brand new entry.
        return database.StoreHacker (name, employers, email)

    # Known name turning up with a new address.
    hacker.addemail (email, employers)
    return hacker
112 # Date tracking.
#
# Changed-line counts accumulated by AddDateLines(), keyed by date.
#
DateMap = {}

def AddDateLines(date, lines):
    """Add lines to the running count kept for date in DateMap.

    Counts above 1000000 are reported on stdout and left out of the
    map altogether.
    """
    if lines > 1000000:
        print ('Skip big patch (%d)' % lines)
        return
    DateMap[date] = DateMap.get (date, 0) + lines
def PrintDateStats():
    """Write the contents of DateMap to datelc.csv in the current directory.

    After a header row, one row is written per date in ascending
    order: the date as YYYY/MM/DD, the count recorded for that date,
    and the running total up to and including that date.  The keys of
    DateMap must provide year, month and day attributes.
    """
    total = 0
    datef = open ('datelc.csv', 'w')
    #
    # Close the file explicitly (even if a write fails) so the data is
    # flushed before the rest of the reports run.  try/finally is used
    # instead of "with" to stay usable on older Python 2 interpreters.
    #
    try:
        datef.write ('Date,Changed,Total Changed\n')
        for date in sorted (DateMap):
            total += DateMap[date]
            datef.write ('%d/%02d/%02d,%d,%d\n' % (date.year, date.month,
                                                  date.day, DateMap[date],
                                                  total))
    finally:
        datef.close ()
139 # Let's slowly try to move some smarts into this class.
class patch:
    """What grabpatch() collects about a single changeset."""

    def __init__ (self, commit):
        """Start an empty record for the changeset identified by commit.

        The author starts out as the placeholder 'Unknown hacker'
        entry until a real author is assigned.
        """
        self.commit = commit
        # Merge flag and added/removed line counters.
        self.merge = 0
        self.added = 0
        self.removed = 0
        self.email = 'unknown@hacker.net'
        self.author = LookupStoreHacker ('Unknown hacker',
                                         'unknown@hacker.net')
        self.sobs = []          # (email, hacker) pairs
        self.reviews = []
        self.testers = []
        self.reports = []

    def addreviewer (self, reviewer):
        """Append reviewer to the list of reviewers."""
        self.reviews.append (reviewer)

    def addtester (self, tester):
        """Append tester to the list of testers."""
        self.testers.append (tester)

    def addreporter (self, reporter):
        """Append reporter to the list of reporters."""
        self.reports.append (reporter)
161 # The core hack for grabbing the information about a changeset.
def grabpatch():
    """Read the next changeset from stdin and return it as a patch object.

    The global NextLine is a one-line lookahead: on entry it holds the
    first unread line, and on return it holds the commit line of the
    following changeset (or '' at end of input).

    Returns None when the input runs out before another commit line is
    found.  Otherwise returns a patch with author, signoffs, review /
    test / report tags, merge flag, date and added/removed line counts
    filled in from the lines up to the next commit line.
    """
    global NextLine

    #
    # Skip ahead to the line that opens the next commit.
    #
    while (1):
        m = Pcommit.match (NextLine)
        if m:
            break
        NextLine = sys.stdin.readline ()
        if not NextLine:
            return None

    p = patch (m.group (1))
    NextLine = sys.stdin.readline ()
    # With a file filter active, diff lines are ignored until a file
    # header line says otherwise.
    ignore = (FileFilter is not None)
    while NextLine:
        Line = NextLine
        #
        # If this line starts a new commit, drop out.  NextLine is left
        # alone so the next call starts on this commit line.
        #
        m = Pcommit.match (Line)
        if m:
            break
        NextLine = sys.stdin.readline ()
        #
        # Maybe it's an author line?
        #
        m = Pauthor.match (Line)
        if m:
            p.email = database.RemapEmail (m.group (2))
            p.author = LookupStoreHacker (m.group (1), p.email)
            continue
        #
        # Could be a signed-off-by:
        #
        m = Psob.match (Line)
        if m:
            email = database.RemapEmail (m.group (2))
            sobber = LookupStoreHacker (m.group (1), email)
            # The author's own signoff only counts when AuthorSOBs is set.
            if sobber != p.author or AuthorSOBs:
                p.sobs.append ((email, LookupStoreHacker (m.group (1),
                                                         m.group (2))))
            continue
        #
        # Various other tags of interest.  The credit calls get the
        # patch instance being built, not the patch class itself.
        #
        m = Preview.match (Line)        # Reviewed-by:
        if m:
            email = database.RemapEmail (m.group (2))
            p.addreviewer (LookupStoreHacker (m.group (1), email))
            continue
        m = Ptest.match (Line)          # Tested-by:
        if m:
            email = database.RemapEmail (m.group (2))
            p.addtester (LookupStoreHacker (m.group (1), email))
            p.author.testcredit (p)
            continue
        m = Prep.match (Line)           # Reported-by:
        if m:
            email = database.RemapEmail (m.group (2))
            p.addreporter (LookupStoreHacker (m.group (1), email))
            p.author.reportcredit (p)
            continue
        m = Preptest.match (Line)       # Reported-and-tested-by:
        if m:
            email = database.RemapEmail (m.group (2))
            h = LookupStoreHacker (m.group (1), email)
            p.addreporter (h)
            p.addtester (h)
            p.author.reportcredit (p)
            p.author.testcredit (p)
            continue
        #
        # If this one is a merge, make note of the fact.
        #
        m = Pmerge.match (Line)
        if m:
            p.merge = 1
            continue
        #
        # See if it's the date.  Dates in the future are reported on
        # stderr and replaced by today's date.
        #
        m = Pdate.match (Line)
        if m:
            dt = rfc822.parsedate (m.group (2))
            p.date = datetime.date (dt[0], dt[1], dt[2])
            if p.date > Today:
                sys.stderr.write ('Funky date: %s\n' % p.date)
                p.date = Today
            continue
        #
        # If we have a file filter, check for file lines.
        #
        if FileFilter:
            ignore = ApplyFileFilter (Line, ignore)
        #
        # OK, maybe it's part of the diff itself.
        #
        if not ignore:
            if Padd.match (Line):
                p.added += 1
                continue
            if Prem.match (Line):
                p.removed += 1

    if '@' in p.author.name:
        GripeAboutAuthorName (p.author.name)

    return p
def GripeAboutAuthorName (name):
    """Complain on stdout about name, but only the first time it is seen.

    Names already complained about are remembered in GripedAuthorNames.
    """
    if name not in GripedAuthorNames:
        GripedAuthorNames.append (name)
        print ('%s is an author name, probably not what you want' % (name))
def ApplyFileFilter (line, ignore):
    """Return the new value of the ignore flag after seeing line.

    A line matching Pfilea (the first file line, --- a/) sets the flag
    outright: 0 if FileFilter finds a match in the file name, 1 if not.
    A line matching Pfileb (the second file line) can only turn the
    flag off.  For any other line the incoming flag is returned
    unchanged.
    """
    first = Pfilea.match (line)
    if first:
        if FileFilter.search (first.group (1)):
            return 0
        return 1

    second = Pfileb.match (line)
    if second and FileFilter.search (second.group (1)):
        return 0
    return ignore
299 # If this patch is signed off by both Andrew Morton and Linus Torvalds,
300 # remove the (redundant) Linus signoff.
def TrimLTSOBs (p):
    """Remove the Linus entry from p.sobs when the Akpm entry is also there.

    Does nothing unless AkpmOverLt is 1.  Linus and Akpm are only
    defined in that case, so they are not touched otherwise.
    """
    if AkpmOverLt != 1:
        return
    signoffs = p.sobs
    if Linus in signoffs and Akpm in signoffs:
        signoffs.remove (Linus)
308 # Here starts the real program.
ParseOpts ()

#
# Read the config files: hand the configuration file name and the base
# directory to ConfigFile.
#
ConfigFile.ConfigFile (CFName, DirName)

#
# With -a in effect, build the (email, hacker) signoff entries that
# TrimLTSOBs() compares against.
#
if AkpmOverLt == 1:
    Linus = ('torvalds@linux-foundation.org',
             LookupStoreHacker ('Linus Torvalds',
                                'torvalds@linux-foundation.org'))
    Akpm = ('akpm@linux-foundation.org',
            LookupStoreHacker ('Andrew Morton',
                               'akpm@linux-foundation.org'))

# Prime the one-line lookahead that grabpatch() works from.
NextLine = sys.stdin.readline ()

# Running totals over the changesets that get counted.
TotalAdded = 0
TotalRemoved = 0
TotalChanged = 0
331 # Snarf changesets.
print >> sys.stderr, 'Grabbing changesets...\r',

CSCount = 0
printcount = 0
while (1):
    # Progress indicator on stderr, refreshed every 50 changesets.
    if (printcount % 50) == 0:
        print >> sys.stderr, 'Grabbing changesets...%d\r' % printcount,
    printcount += 1

    p = grabpatch()
    if not p:
        break
#    if p.added > 100000 or p.removed > 100000:
#        print 'Skipping massive add', p.commit
#        continue
    # Under a file filter, changesets with no counted lines are dropped.
    if FileFilter and p.added == 0 and p.removed == 0:
        continue

    if not p.merge:
        #
        # Record some global information - but only if this patch had
        # stuff which wasn't ignored.
        #
        if (p.added + p.removed) > 0 or not FileFilter:
            TotalAdded += p.added
            TotalRemoved += p.removed
            TotalChanged += max (p.added, p.removed)
            AddDateLines (p.date, max (p.added, p.removed))
            empl = p.author.emailemployer (p.email, p.date)
            empl.AddCSet (p)
            if AkpmOverLt:
                TrimLTSOBs (p)
            for sobemail, sobber in p.sobs:
                empl = sobber.emailemployer (sobemail, p.date)
                empl.AddSOB()

        #
        # Per-hacker bookkeeping for every non-merge changeset.
        #
        p.author.addpatch (p)
        for sobemail, sob in p.sobs:
            sob.addsob (p)
        for hacker in p.reviews:
            hacker.addreview (p)
        for hacker in p.testers:
            hacker.addtested (p)
        for hacker in p.reports:
            hacker.addreport (p)
        CSCount += 1

    # NOTE(review): nesting reconstructed from an unindented copy; this
    # call is taken to run for every changeset, merges included - confirm.
    csv.AccumulatePatch (p, Aggregate)
print >> sys.stderr, 'Grabbing changesets...done '
if DumpDB:
    database.DumpDB ()

#
# Summary: count the hackers with at least one patch and the employers
# with a non-zero count, then write the headline numbers.
#
hlist = database.AllHackers ()
elist = database.AllEmployers ()
ndev = len ([h for h in hlist if len (h.patches) > 0])
nempl = len ([e for e in elist if e.count > 0])

reports.Write ('Processed %d csets from %d developers\n' % (CSCount, ndev))
reports.Write ('%d employers found\n' % (nempl))
reports.Write ('A total of %d lines added, %d removed (delta %d)\n' %
               (TotalAdded, TotalRemoved, TotalAdded - TotalRemoved))

if TotalChanged == 0:
    TotalChanged = 1    # HACK to avoid div by zero

if DateStats:
    PrintDateStats ()

# OutputCSV is called whether or not -x supplied a file.
csv.OutputCSV (CSVFile)
if CSVFile is not None:
    CSVFile.close ()

# The developer reports can be switched off with -d; the employer
# reports always run.
if DevReports:
    reports.DevReports (hlist, TotalChanged, CSCount, TotalRemoved)
reports.EmplReports (elist, TotalChanged, CSCount)