inittags: some updates
[git-dm.git] / database.py
blobb50b70cac3782076fba92c0b1ecd9f16e5ae8701
2 # The "database".
4 # This code is part of the LWN git data miner.
6 # Copyright 2007-11 Eklektix, Inc.
7 # Copyright 2007-11 Jonathan Corbet <corbet@lwn.net>
9 # This file may be distributed under the terms of the GNU General
10 # Public License, version 2.
12 import sys, datetime
class Hacker:
    """One developer, with the addresses, employers and credits recorded for them.

    ``email`` and ``employer`` are parallel lists: ``employer[i]`` is the
    sequence of ``(date, employer)`` pairs that goes with ``email[i]``.
    """

    def __init__(self, name, id, elist, email):
        self.name = name
        self.id = id
        self.employer = [elist]
        self.email = [email]
        # Line counts accumulated by addpatch().
        self.changed = 0
        self.added = 0
        self.removed = 0
        # Patches this developer authored / was credited on, per tag type.
        self.patches = []
        self.signoffs = []
        self.reviews = []
        self.tested = []
        self.reports = []
        self.testcred = 0
        self.repcred = 0
        # Start with an inverted window so the first patch sets both ends.
        self.activity_start = datetime.date.max
        self.activity_end = datetime.date.min
        self.versions = []

    def addemail(self, email, elist):
        """Record another address (with its employer list) and index it."""
        self.email.append(email)
        self.employer.append(elist)
        HackersByEmail[email] = self

    def emailemployer(self, email, date):
        """Return the employer for ``email`` at ``date``.

        An ``email`` of None matches every address.  For each matching
        address the first entry whose date is later than ``date`` wins.
        Prints a diagnostic and returns None when nothing matches.
        """
        for idx, addr in enumerate(self.email):
            if email is not None and addr != email:
                continue
            for edate, empl in self.employer[idx]:
                if edate > date:
                    return empl
        print('OOPS. ', self.name, self.employer, self.email, email, date)
        return None  # Should not happen

    def addpatch(self, patch):
        """Fold one authored patch into the totals and the activity window."""
        self.added += patch.added
        self.removed += patch.removed
        self.changed += max(patch.added, patch.removed)
        self.patches.append(patch)
        self.activity_start = min(self.activity_start, patch.date)
        self.activity_end = max(self.activity_end, patch.date)

    def addversion(self, release):
        """Note that the author is represented in this release."""
        if release in self.versions:
            return
        self.versions.append(release)

    # One trivial recorder per tag type.

    def addsob(self, patch):
        self.signoffs.append(patch)

    def addreview(self, patch):
        self.reviews.append(patch)

    def addtested(self, patch):
        self.tested.append(patch)

    def addreport(self, patch):
        self.reports.append(patch)

    # Credit counters; the patch argument is accepted but not used.

    def reportcredit(self, patch):
        self.repcred += 1

    def testcredit(self, patch):
        self.testcred += 1
# Three indexes over the same Hacker objects.
HackersByName = {}
HackersByEmail = {}
HackersByID = {}
# The id that StoreHacker() will hand to the next new Hacker.
MaxID = 0
def StoreHacker(name, elist, email):
    """Create a Hacker with the next free id and enter it in all three indexes.

    Returns the new Hacker.  Any entry already stored under the same
    name or email is replaced in that index.
    """
    global MaxID

    # Claim the id before constructing, then advance the counter.
    new_id, MaxID = MaxID, MaxID + 1
    hacker = Hacker(name, new_id, elist, email)
    HackersByName[name] = HackersByEmail[email] = HackersByID[new_id] = hacker
    return hacker
def LookupEmail(addr):
    """Return the Hacker indexed under ``addr``, or None if there is none."""
    return HackersByEmail.get(addr)
def LookupName(name):
    """Return the Hacker indexed under ``name``, or None if there is none."""
    return HackersByName.get(name)
def LookupID(id):
    """Return the Hacker indexed under ``id``, or None if there is none."""
    return HackersByID.get(id)
def LookupStoreHacker(name, email, mapunknown=True):
    """Find the Hacker for ``email``/``name``, creating one if necessary.

    The address is first run through RemapEmail().  A hit on the
    remapped address wins; failing that, a hit on the name gains the
    address as an additional email; otherwise a new Hacker is stored.
    ``mapunknown`` is passed through to LookupEmployer().
    """
    email = RemapEmail(email)

    # Known address: nothing more to do.
    by_email = LookupEmail(email)
    if by_email:
        return by_email

    # Work out the employer list for this address, then try the name.
    elist = LookupEmployer(email, mapunknown)
    by_name = LookupName(name)
    if not by_name:
        # Something new, remember it.
        return StoreHacker(name, elist, email)

    # Known person, new address.
    by_name.addemail(email, elist)
    return by_name
def AllHackers():
    """Return a new list holding every Hacker in the id index."""
    return [*HackersByID.values()]
def DumpDB():
    """Write a human-readable dump of every known hacker to 'database.dump'.

    The file is created (or truncated) in the current directory.
    Hackers are listed in sorted name order; each entry shows the id,
    name, patch count, lines added/removed and signoff count, followed
    by every email address with its dated employer list and, if any
    were recorded, the versions the hacker appears in.
    """
    # Use a context manager so the file is flushed and closed even if a
    # write raises part-way through.
    with open('database.dump', 'w') as out:
        for name in sorted(HackersByName):
            h = HackersByName[name]
            out.write('%4d %s %d p (+%d -%d) sob: %d\n' % (h.id, h.name,
                                                          len(h.patches),
                                                          h.added, h.removed,
                                                          len(h.signoffs)))
            # h.email and h.employer are parallel lists.
            for i in range(0, len(h.email)):
                out.write('\t%s -> \n' % (h.email[i]))
                for date, empl in h.employer[i]:
                    out.write('\t\t %d-%d-%d %s\n' % (date.year, date.month,
                                                     date.day, empl.name))
            if h.versions:
                out.write('\tVersions: %s\n' % ','.join(h.versions))
157 # Hack: The first visible tag comes a ways into the stream; when we see it,
158 # push it backward through the changes we've already seen.
def ApplyFirstTag(tag):
    """Replace every non-empty ``versions`` list with ``[tag]``.

    Hackers with no versions recorded are left alone.
    """
    for hacker in HackersByName.values():
        if not hacker.versions:
            continue
        hacker.versions = [tag]
166 # Employer info.
class Employer:
    """Running totals of the changesets and signoffs credited to one employer."""

    def __init__(self, name):
        self.name = name
        self.added = 0
        self.removed = 0
        self.count = 0
        self.changed = 0
        self.sobs = 0
        # Distinct patch authors seen so far, in first-seen order.
        self.hackers = []

    def AddCSet(self, patch):
        """Credit one changeset: line counts, changeset count and its author."""
        self.count += 1
        self.added += patch.added
        self.removed += patch.removed
        self.changed += max(patch.added, patch.removed)
        author = patch.author
        if author not in self.hackers:
            self.hackers.append(author)

    def AddSOB(self):
        """Count one signoff."""
        self.sobs += 1
# Every known employer object, keyed by employer name.
Employers = {}
def GetEmployer(name):
    """Return the employer registered under ``name``, creating it on first use."""
    if name not in Employers:
        Employers[name] = Employer(name)
    return Employers[name]
def AllEmployers():
    """Return a new list holding every registered employer object."""
    return [*Employers.values()]
200 # Certain obnoxious developers, who will remain nameless (because we
201 # would never want to run afoul of Thomas) want their work split among
202 # multiple companies. Let's try to cope with that. Let's also hope
203 # this doesn't spread.
class VirtualEmployer(Employer):
    """An employer whose credits are divided among other, real employers.

    ``splits`` is a list of ``(employer name, fraction)`` pairs.
    """

    def __init__(self, name):
        Employer.__init__(self, name)
        self.splits = []

    def addsplit(self, name, fraction):
        """Add a real employer ``name`` that receives ``fraction`` of the credit."""
        self.splits.append((name, fraction))

    #
    # Go through and (destructively) apply our credits to the
    # real employer.  Only one level of weirdness is supported.
    #
    def applysplits(self):
        """Hand each split its share of our totals, then reset ourselves.

        Each share is truncated to an int before being added to the real
        employer's added/removed/changed/count totals.  Afterwards this
        object is re-initialised under its own name, which zeroes its
        counts and also empties ``splits``.
        """
        for name, fraction in self.splits:
            real = GetEmployer(name)
            real.added += int(self.added*fraction)
            real.removed += int(self.removed*fraction)
            real.changed += int(self.changed*fraction)
            real.count += int(self.count*fraction)
        # Reset counts just in case.  Use our own name here: the loop
        # variable would rename us after the last split, and is not even
        # bound when there are no splits.
        self.__init__(self.name)

    def store(self):
        """Register this virtual employer in Employers under its name.

        Warns on stderr if the name is already taken (the existing entry
        is printed to stdout and then replaced) or if no splits have
        been added.
        """
        if self.name in Employers:
            print(Employers[self.name])
            print('WARNING: Virtual empl %s overwrites another' % (self.name),
                  file = sys.stderr)
        if len(self.splits) == 0:
            print('WARNING: Virtual empl %s has no splits' % (self.name),
                  file = sys.stderr)
            # Should check that they add up too, but I'm lazy
        Employers[self.name] = self
class FileType:
    """Classify file names by matching them against per-type patterns.

    ``patterns`` maps a file-type name to an iterable of objects with a
    ``search(filename)`` method; ``order`` lists the type names in the
    order they should be tried.
    """

    def __init__(self, patterns=None, order=None):
        # Build fresh containers per instance; mutable default arguments
        # would be shared by every FileType created without arguments.
        self.patterns = {} if patterns is None else patterns
        self.order = [] if order is None else order

    def guess_file_type(self, filename, patterns=None, order=None):
        """Return the first type in ``order`` with a pattern matching ``filename``.

        ``patterns`` and ``order`` override the instance's own settings
        when given and non-empty; an empty or None value falls back to
        the instance attribute.  Types named in ``order`` but missing
        from ``patterns`` are skipped.  Returns 'unknown' when nothing
        matches.
        """
        patterns = patterns or self.patterns
        order = order or self.order

        for file_type in order:
            if file_type in patterns:
                for patt in patterns[file_type]:
                    if patt.search(filename):
                        return file_type

        return 'unknown'
# The module-wide classifier starts out empty: no patterns, no order.
FileTypes = FileType(patterns={}, order=[])
260 # Mix all the virtual employers into their real destinations.
def MixVirtuals():
    """Apply the splits of every registered VirtualEmployer."""
    virtuals = [e for e in AllEmployers() if isinstance(e, VirtualEmployer)]
    for virtual in virtuals:
        virtual.applysplits()
#
# The email map: exact aliases (variant -> canonical) and a list of
# (regex, canonical) pairs tried in the order they were added.
#
EmailAliases = {}
RXEmailAliases = []
def AddEmailAlias(variant, canonical):
    """Map the address ``variant`` to ``canonical``.

    A repeated ``variant`` is reported on stderr; the new mapping still
    replaces the old one.
    """
    already_known = variant in EmailAliases
    if already_known:
        sys.stderr.write('Duplicate email alias for %s\n' % variant)
    EmailAliases[variant] = canonical
def RemapEmail(email):
    """Return the canonical form of ``email``.

    The address is lowercased, then looked up in the exact alias table;
    if it is not there, the regex aliases get a turn.
    """
    lowered = email.lower()
    if lowered in EmailAliases:
        return EmailAliases[lowered]
    return RXRemapEmail(lowered)
def AddRXEmailAlias(regex, canonical):
    """Register ``regex`` so that addresses it matches map to ``canonical``."""
    alias = (regex, canonical)
    RXEmailAliases.append(alias)
def RXRemapEmail(email):
    """Return the canonical address of the first regex alias matching ``email``.

    Aliases are tried in registration order using ``regex.match``; the
    address is returned unchanged when none of them match.
    """
    hits = (canonical for regex, canonical in RXEmailAliases
            if regex.match(email))
    return next(hits, email)
#
# Email-to-employer mapping.  Each value is a list of
# (end date, employer) pairs.
#
EmailToEmployer = {}
# Default end date for a mapping: 365 days after the module is loaded.
nextyear = datetime.date.today() + datetime.timedelta(days=365)
def AddEmailEmployerMapping(email, employer, end=nextyear):
    """Record that ``email`` maps to ``employer`` until ``end``.

    ``email`` is lowercased; ``employer`` is an employer name resolved
    through GetEmployer().  An ``end`` of None means ``nextyear``.  The
    new entry goes in front of the first existing entry with a later end
    date, or at the back if there is none.  An existing entry with the
    same end date triggers a warning on stdout but does not stop the
    insertion.
    """
    if end is None:
        end = nextyear
    email = email.lower()
    entry = (end, GetEmployer(employer))

    # First mapping for this address: start a new list.
    if email not in EmailToEmployer:
        EmailToEmployer[email] = [entry]
        return

    history = EmailToEmployer[email]
    for pos, (date, _) in enumerate(history):
        if date == end:  # probably both nextyear
            print('WARNING: duplicate email/empl for %s' % (email))
        if date > end:
            history.insert(pos, entry)
            return
    history.append(entry)
def MapToEmployer(email, unknown=0):
    """Return the list of (end date, employer) pairs for ``email``.

    Lookup order: the full address, then domain suffixes from the
    shortest (last two labels) up to the whole domain.  An address
    without an '@' is reported on stdout and credited to 'Funky'.  If
    nothing matches, the result is a single entry ending at ``nextyear``
    for the '(Unknown)' employer when ``unknown`` is true, and for an
    employer named after the address itself otherwise.
    """
    # Somebody sometimes does s/@/ at /; let's fix it.
    email = email.lower().replace(' at ', '@')
    if email in EmailToEmployer:
        return EmailToEmployer[email]

    pieces = email.split('@')
    if len(pieces) < 2:
        print('Oops...funky email %s' % email)
        return [(nextyear, GetEmployer('Funky'))]

    labels = pieces[1].split('.')
    # Start indexes run from len(labels) - 2 down to 0, so a domain with
    # a single label is never looked up on its own.
    for start in reversed(range(len(labels) - 1)):
        suffix = '.'.join(labels[start:])
        if suffix in EmailToEmployer:
            return EmailToEmployer[suffix]

    # We don't know who they work for.
    fallback = '(Unknown)' if unknown else email
    return [(nextyear, GetEmployer(fallback))]
def LookupEmployer(email, mapunknown=0):
    """Return MapToEmployer()'s employer list for ``email``."""
    return MapToEmployer(email, mapunknown)
349 # Make sure aliases don't mask other entries.
def CheckAliases():
    """Warn on stdout about employer-map addresses that RemapEmail() rewrites."""
    for addr in EmailToEmployer:
        target = RemapEmail(addr)
        if target == addr:
            continue
        print(f'WARNING: {addr} is masked by an alias entry ({target})')