Make tag matching stricter
[git-dm.git] / gitdm
blobdce5d3c0da6b5bb6f390fc81404f9a66caab441e
1 #!/usr/bin/python
5 # This code is part of the LWN git data miner.
7 # Copyright 2007-9 LWN.net
8 # Copyright 2007-9 Jonathan Corbet <corbet@lwn.net>
10 # This file may be distributed under the terms of the GNU General
11 # Public License, version 2.
14 import database, csv, ConfigFile, reports
15 import getopt, datetime
16 import os, re, sys, rfc822, string
17 from patterns import *
# The date of this run; used as the fallback/upper bound for patch dates.
Today = datetime.date.today()

#
# Author names which have already been complained about, so that
# each one is reported only once.
#
GripedAuthorNames = []

#
# Control options.  These are the defaults; ParseOpts() overrides them
# from the command line.
#
AkpmOverLt = 0              # -a: trim Linus's signoff when akpm signed too
DirName = ''                # -b: base directory handed to ConfigFile
CFName = 'gitdm.config'     # -c: configuration file name
DevReports = 1              # -d clears this; gates reports.DevReports()
DateStats = 0               # -D: write the date statistics and exit
FileFilter = None           # -r: compiled regex restricting counted files
AuthorSOBs = 1              # -s clears this; count the author's own SOBs
MapUnknown = 0              # -u: passed through to database.LookupEmployer()
CSVFile = None              # -x: open file object for the CSV export
DumpDB = 0                  # -z: call database.DumpDB() when done
#
# Options:
#
# -a            Andrew Morton's signoffs shadow Linus's
# -b dir        Specify the base directory to fetch the configuration files
# -c cfile      Specify a configuration file
# -d            Turn off the individual developer reports (clears DevReports)
# -D            Output date statistics
# -h hfile      HTML output to hfile
# -l count      Maximum length for output lists
# -o file       File for text output
# -r pattern    Restrict to files matching pattern
# -s            Ignore author SOB lines
# -u            Map unknown employers to '(Unknown)'
# -x file.csv   Export raw statistics as CSV
# -z            Dump out the hacker database at completion
#
def ParseOpts():
    """Read the command line (sys.argv[1:]) into the module-level options.

    Most switches simply set one of the global control variables; -h, -l
    and -o are forwarded to the reports module instead, and -x opens the
    CSV output file for writing.  Non-option arguments are ignored.

    Errors are not caught here: getopt.getopt() raises getopt.GetoptError
    for an unknown switch or a missing argument, and int(), open() and
    re.compile() raise their usual exceptions on bad values.
    """
    global MapUnknown, DevReports
    global DateStats, AuthorSOBs, FileFilter, AkpmOverLt, DumpDB
    global CFName, CSVFile, DirName

    switches, leftover = getopt.getopt(sys.argv[1:], 'ab:dc:Dh:l:o:r:sux:z')
    for flag, value in switches:
        if flag == '-a':
            AkpmOverLt = 1
        elif flag == '-b':
            DirName = value
        elif flag == '-c':
            CFName = value
        elif flag == '-d':
            # Note the sense: the reports are on by default, -d drops them.
            DevReports = 0
        elif flag == '-D':
            DateStats = 1
        elif flag == '-h':
            reports.SetHTMLOutput(open(value, 'w'))
        elif flag == '-l':
            reports.SetMaxList(int(value))
        elif flag == '-o':
            reports.SetOutput(open(value, 'w'))
        elif flag == '-r':
            print('Filter on "%s"' % value)
            FileFilter = re.compile(value)
        elif flag == '-s':
            AuthorSOBs = 0
        elif flag == '-u':
            MapUnknown = 1
        elif flag == '-x':
            CSVFile = open(value, 'w')
            print("open output file " + value + "\n")
        elif flag == '-z':
            DumpDB = 1
def LookupStoreHacker(name, email):
    """Find the hacker entry for (name, email), creating it if necessary.

    The address is first passed through database.RemapEmail().  Lookup is
    by address first; failing that, by name, in which case the address is
    added to the existing entry.  If neither is known a new entry is
    stored.  Returns the hacker object in every case.
    """
    email = database.RemapEmail(email)
    known = database.LookupEmail(email)
    if known:
        # This address has been seen before.
        return known

    # New address: work out its employer information (presumably a list,
    # going by the "elist" name used originally), honouring -u.
    employers = database.LookupEmployer(email, MapUnknown)
    known = database.LookupName(name)
    if not known:
        return database.StoreHacker(name, employers, email)

    # Known person turning up under a new address.
    known.addemail(email, employers)
    return known
#
# Date tracking.
#
# Maps a date to the number of changed lines recorded for it.
DateMap = {}

def AddDateLines(date, lines):
    """Add `lines` to the running count kept in DateMap for `date`.

    Patches of more than 1000000 lines are reported on stdout and left
    out of the statistics altogether.
    """
    if lines > 1000000:
        print('Skip big patch (%d)' % lines)
        return
    DateMap[date] = DateMap.get(date, 0) + lines
def PrintDateStats():
    """Write the per-date line counts to the file 'datelc'.

    One line is written per date in DateMap, in ascending date order,
    holding the date as Y/MM/DD, the lines recorded for that date and the
    running total so far.  The keys of DateMap must provide .year, .month
    and .day (the main loop stores datetime.date objects).

    The output file is created (or truncated) in the current directory
    and is always closed again, even if a write fails.
    """
    total = 0
    datef = open('datelc', 'w')
    try:
        for date in sorted(DateMap):
            total += DateMap[date]
            datef.write('%d/%02d/%02d %6d %7d\n' %
                        (date.year, date.month, date.day,
                         DateMap[date], total))
    finally:
        datef.close()
#
# Let's slowly try to move some smarts into this class.
#
class patch:
    """Everything recorded about a single changeset.

    Attributes:
      commit   -- identifier handed to the constructor
      merge    -- nonzero once the changeset is known to be a merge
      added    -- count of added lines
      removed  -- count of removed lines
      author   -- hacker object for the author (a placeholder until the
                  author line has been parsed)
      email    -- author address (placeholder 'unknown@hacker.net')
      date     -- date of the changeset; starts out as Today so that a
                  changeset without a parseable date line can still be
                  accounted for instead of raising AttributeError later
      sobs     -- signoffs, appended by the parser as (email, hacker)
      reviews, testers, reports -- hackers credited by the matching tags
    """

    def __init__(self, commit):
        self.commit = commit
        self.merge = self.added = self.removed = 0
        self.author = LookupStoreHacker('Unknown hacker', 'unknown@hacker.net')
        self.email = 'unknown@hacker.net'
        # Same fallback the parser uses for dates lying in the future.
        self.date = Today
        self.sobs = []
        self.reviews = []
        self.testers = []
        self.reports = []

    def addreviewer(self, reviewer):
        """Record `reviewer` as having reviewed this changeset."""
        self.reviews.append(reviewer)

    def addtester(self, tester):
        """Record `tester` as having tested this changeset."""
        self.testers.append(tester)

    def addreporter(self, reporter):
        """Record `reporter` as having reported the problem fixed here."""
        self.reports.append(reporter)
#
# The core hack for grabbing the information about a changeset.
#
def _TagHacker(m):
    """Return (email, hacker) for a match with groups (1=name, 2=email).

    The address is run through database.RemapEmail() before the lookup.
    """
    email = database.RemapEmail(m.group(2))
    return email, LookupStoreHacker(m.group(1), email)


def grabpatch():
    """Parse the next changeset from stdin and return it as a patch.

    The global NextLine holds the one line of lookahead shared between
    calls: on entry it is the first unconsumed line, on return it is the
    header of the following commit (or '' at end of input).

    Returns None when no further commit header can be found.
    """
    global NextLine

    #
    # Skip forward to the next commit header.
    #
    while 1:
        m = Pcommit.match(NextLine)
        if m:
            break
        NextLine = sys.stdin.readline()
        if not NextLine:
            return None

    p = patch(m.group(1))
    NextLine = sys.stdin.readline()
    # With a file filter in force nothing is counted until a file header
    # accepted by the filter has been seen.
    ignore = (FileFilter is not None)
    while NextLine:
        Line = NextLine
        #
        # If this line starts a new commit, drop out.  NextLine is left
        # pointing at the header for the benefit of the next call.
        #
        if Pcommit.match(Line):
            break
        NextLine = sys.stdin.readline()
        #
        # Maybe it's an author line?
        #
        m = Pauthor.match(Line)
        if m:
            p.email, p.author = _TagHacker(m)
            continue
        #
        # Could be a signed-off-by:
        #
        m = Psob.match(Line)
        if m:
            email, sobber = _TagHacker(m)
            # The author's own signoff only counts when AuthorSOBs is set.
            if sobber != p.author or AuthorSOBs:
                # Reuse the hacker just looked up rather than searching
                # the database a second time for the same person.
                p.sobs.append((email, sobber))
            continue
        #
        # Various other tags of interest.
        #
        m = Preview.match(Line)  # Reviewed-by:
        if m:
            p.addreviewer(_TagHacker(m)[1])
            continue
        m = Ptest.match(Line)  # Tested-by:
        if m:
            p.addtester(_TagHacker(m)[1])
            # Credit goes to the author, for this changeset (the instance,
            # not the patch class).
            p.author.testcredit(p)
            continue
        m = Prep.match(Line)  # Reported-by:
        if m:
            p.addreporter(_TagHacker(m)[1])
            p.author.reportcredit(p)
            continue
        m = Preptest.match(Line)  # Reported-and-tested-by:
        if m:
            h = _TagHacker(m)[1]
            p.addreporter(h)
            p.addtester(h)
            p.author.reportcredit(p)
            p.author.testcredit(p)
            continue
        #
        # If this one is a merge, make note of the fact.
        #
        if Pmerge.match(Line):
            p.merge = 1
            continue
        #
        # See if it's the date.
        #
        m = Pdate.match(Line)
        if m:
            dt = rfc822.parsedate(m.group(2))
            if dt is None:
                # parsedate() hands back None for text it cannot parse;
                # treat that like any other bogus date.
                sys.stderr.write('Unparseable date: %s\n'
                                 % m.group(2).strip())
                p.date = Today
                continue
            p.date = datetime.date(dt[0], dt[1], dt[2])
            if p.date > Today:
                sys.stderr.write('Funky date: %s\n' % p.date)
                p.date = Today
            continue
        #
        # If we have a file filter, check for file lines.
        #
        if FileFilter:
            ignore = ApplyFileFilter(Line, ignore)
        #
        # OK, maybe it's part of the diff itself.
        #
        if not ignore:
            if Padd.match(Line):
                p.added += 1
                continue
            if Prem.match(Line):
                p.removed += 1

    # A name containing '@' is most likely an email address.
    if '@' in p.author.name:
        GripeAboutAuthorName(p.author.name)

    return p
def GripeAboutAuthorName(name):
    """Complain on stdout about a suspicious author name, once per name.

    Names already listed in GripedAuthorNames are passed over silently;
    new ones are added to that list before the message is printed.
    """
    if name not in GripedAuthorNames:
        GripedAuthorNames.append(name)
        print('%s is an author name, probably not what you want' % name)
def ApplyFileFilter(line, ignore):
    """Return the new value of the "ignore" flag after looking at `line`.

    `ignore` is the current flag; it comes back unchanged unless `line`
    is one of the two file header lines of a diff.  The result is 0 when
    lines should be counted and true when they should be skipped.
    """
    #
    # If this is the first file line (--- a/), set ignore one way
    # or the other.
    #
    header = Pfilea.match(line)
    if header:
        if not FileFilter.search(header.group(1)):
            return 1
        return 0
    #
    # For the second line, we can turn ignore off, but not on
    #
    header = Pfileb.match(line)
    if header and FileFilter.search(header.group(1)):
        return 0
    return ignore
#
# If this patch is signed off by both Andrew Morton and Linus Torvalds,
# remove the (redundant) Linus signoff.
#
def TrimLTSOBs(p):
    """Drop the Linus entry from p.sobs when the Akpm entry is there too.

    Linus and Akpm are the module-level (email, hacker) tuples; p.sobs is
    modified in place and nothing is returned.
    """
    signoffs = p.sobs
    if Akpm in signoffs and Linus in signoffs:
        signoffs.remove(Linus)
#
# Here starts the real program.
#
ParseOpts()

#
# Read the config files.
#
ConfigFile.ConfigFile(CFName, DirName)

#
# Let's pre-seed the database with a couple of hackers
# we want to remember.  Each is kept as an (email, hacker) pair, the
# same shape as the entries of patch.sobs.
#
Linus = ('torvalds@linux-foundation.org',
         LookupStoreHacker('Linus Torvalds', 'torvalds@linux-foundation.org'))
Akpm = ('akpm@linux-foundation.org',
        LookupStoreHacker('Andrew Morton', 'akpm@linux-foundation.org'))

# Prime the one-line lookahead used by grabpatch().
NextLine = sys.stdin.readline()
TotalChanged = TotalAdded = TotalRemoved = 0

#
# Snarf changesets.
#
sys.stderr.write('Grabbing changesets...\r')

printcount = CSCount = 0
while True:
    # Progress indicator on stderr, refreshed every 50 changesets.
    if not printcount % 50:
        sys.stderr.write('Grabbing changesets...%d\r' % printcount)
    printcount += 1
    p = grabpatch()
    if not p:
        break
#    if p.added > 100000 or p.removed > 100000:
#        print 'Skipping massive add', p.commit
#        continue
    # Under a file filter, changesets touching no matching file are
    # dropped entirely.
    if FileFilter and p.added == 0 and p.removed == 0:
        continue

    #
    # Record some global information - but only if this patch had
    # stuff which wasn't ignored.
    #
    touched = (not FileFilter) or (p.added + p.removed) > 0
    if touched and not p.merge:
        changed = max(p.added, p.removed)
        TotalAdded += p.added
        TotalRemoved += p.removed
        TotalChanged += changed
        AddDateLines(p.date, changed)
        empl = p.author.emailemployer(p.email, p.date)
        empl.AddCSet(p)
        if AkpmOverLt:
            TrimLTSOBs(p)
        for sobemail, sobber in p.sobs:
            empl = sobber.emailemployer(sobemail, p.date)
            empl.AddSOB()

    #
    # Per-hacker bookkeeping; merges are not credited to anybody.
    #
    if not p.merge:
        p.author.addpatch(p)
        for sobemail, sob in p.sobs:
            sob.addsob(p)
        for hacker in p.reviews:
            hacker.addreview(p)
        for hacker in p.testers:
            hacker.addtested(p)
        for hacker in p.reports:
            hacker.addreport(p)
        CSCount += 1
    # NOTE(review): the indentation of this file was lost; this call is
    # taken to sit at loop level (merges included) - confirm.
    csv.AccumulatePatch(p)
sys.stderr.write('Grabbing changesets...done \n')

if DumpDB:
    database.DumpDB()

#
# Say something
#
hlist = database.AllHackers()
elist = database.AllEmployers()
# Only developers with at least one patch, and employers with a
# nonzero count, are included in the summary figures.
ndev = len([h for h in hlist if len(h.patches) > 0])
nempl = len([e for e in elist if e.count > 0])
reports.Write('Processed %d csets from %d developers\n' % (CSCount,
                                                            ndev))
reports.Write('%d employers found\n' % (nempl))
reports.Write('A total of %d lines added, %d removed (delta %d)\n' %
              (TotalAdded, TotalRemoved, TotalAdded - TotalRemoved))
if TotalChanged == 0:
    TotalChanged = 1  # HACK to avoid div by zero

# With -D only the date statistics are produced; the program stops
# before the CSV export and the developer/employer reports.
if DateStats:
    PrintDateStats()
    sys.exit(0)

csv.OutputCSV(CSVFile)
if CSVFile is not None:
    CSVFile.close()

if DevReports:
    reports.DevReports(hlist, TotalChanged, CSCount, TotalRemoved)
reports.EmplReports(elist, TotalChanged, CSCount)