Merge new stemmers from Snowball
[xapian.git] / xapian-maintainer-tools / audit.py
blobed28441759a86bc391c94c4dade396bee861281e
1 #!/usr/bin/env python
3 # Copyright (C) 2007 Lemur Consulting Ltd
5 # This program is free software; you can redistribute it and/or modify
6 # it under the terms of the GNU General Public License as published by
7 # the Free Software Foundation; either version 2 of the License, or
8 # (at your option) any later version.
10 # This program is distributed in the hope that it will be useful,
11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 # GNU General Public License for more details.
15 # You should have received a copy of the GNU General Public License along
16 # with this program; if not, write to the Free Software Foundation, Inc.,
17 # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
18 r"""audit.py: Simple script to check code ownership and license messages.
20 Currently assumes that the xapian code can be found in "../xapian-core".
22 """
24 import csv
25 import re
26 import os, os.path
27 from pprint import pprint
28 import sys
# A copyright line holding only the years; the holder's name is then expected
# on the following line of the comment.
copy_re = re.compile(r'Copyright\s+(\([Cc]\))?\s*(?P<dates>([0-9]{2,4})((,\s*|-)[0-9]{2,4})*),?\s*$')
# A copyright line holding the years followed by the holder's name.
copy2_re = re.compile(r'Copyright\s+(\([Cc]\))?\s*(?P<dates>([0-9]{2,4})((,\s*|-)[0-9]{2,4})*),?\s+(?P<name>.+)\s*$')
# Fallback: any other mention of "Copyright" is reported as unrecognised.
copy_unrec_re = re.compile(r'Copyright')

# Lines matching this (preprocessor "#error" directives) are blanked before
# comments are extracted.
directive_re = re.compile(r'\s*#\s*error')

# Copyright holders which mean code is GPL only.
gplonly = [
    'BrightStation PLC',
    'Ananova Ltd',
]

# (license name, regular expression source) pairs.  Every run of whitespace in
# the expression source (including the leading and trailing newlines) is
# replaced by r'\s+' below, so the wording may be wrapped differently in the
# audited files.
licenses = [
    ('lgpl2+', r'''
is free software; you can redistribute it and\/or modify it under the
terms of the GNU Library General Public License as published by the Free
Software Foundation; either version 2 of the License, or \(at your option\)
any later version.
'''),
    ('gpl2+', r'''
is free software; you can redistribute it and\/or modify it under the terms of
the GNU General Public License as published by the Free Software Foundation;
either version 2( of the License)?, or \(at your option\) any later version.
'''),
    ('sgi-historical', r'''
Permission to use, copy, modify, distribute and sell this software and its
documentation for any purpose is hereby granted without fee, provided that
the above copyright notice appear in all copies and that both that
copyright notice and this permission notice appear in supporting
documentation.
'''),
    ('pub_domain', r'''
The authors of this program disclaim copyright.
'''),
    ('mit_x', r'''
Permission is hereby granted, free of charge, to any person obtaining a
copy of this software and associated documentation files \(the "Software"\),
to deal in the Software without restriction, including without limitation
the rights to use, copy, modify, merge, publish, distribute, sublicense,
and/or sell copies of the Software, and to permit persons to whom the
Software is furnished to do so, subject to the following conditions:
'''),
]

# FIXME markers, most specific first: the first pattern that matches a comment
# wins, so the milestone form must precede the bare form.
fixmes = [
    r'''FIXME:(?P<milestone>[\d.]+)''',
    r'''FIXME''',
]

whitespace_re = re.compile(r'\s+')

# Compiled (license name, pattern) pairs.  The replacement template must be
# r'\\s+' (an escaped backslash): a bare '\s' in a template is rejected as a
# bad escape by Python 3.7+.
license_patterns = [
    (name, re.compile(whitespace_re.sub(r'\\s+', pattern)))
    for name, pattern in licenses
]

fixme_patterns = [re.compile(pattern) for pattern in fixmes]
class FileDetails:
    """Everything the audit has recorded about one source file."""

    def __init__(self, path):
        """Create an empty record for the file identified by *path*."""
        # Key identifying the file (as passed in by the caller).
        self.path = path
        # List of (holder name, list of years) tuples.
        self.holders = []
        # Names of the licenses whose wording was found in the file.
        self.licenses = []
        # Number of lines in the file.
        self.length = 0
        # List of (milestone, text) tuples, one per FIXME comment found.
        self.fixmes = []

    def __repr__(self):
        return "FileDetails(%r, %r, %r, %r)" % (
            self.path, self.holders, self.licenses, self.fixmes)
class SourceChecker:
    """Audit copyright, license and FIXME comments in C/C++ source files.

    Results accumulate in ``self.files``, a dict mapping each file's path
    (relative to the top directory) to its FileDetails record.
    """

    def __init__(self, toppath):
        """Prepare to check *toppath*, which may be a directory or one file."""
        self.files = {}

        self.toppath = os.path.normpath(os.path.abspath(toppath))
        if os.path.isdir(self.toppath):
            self.topdirpath = self.toppath
        else:
            self.topdirpath = os.path.dirname(self.toppath)
        # Relative path of the file being checked; used in warnings and as
        # the default key for get_file_details().
        self.current_path = None

    def warn(self, msg):
        """Print a warning about the file currently being checked."""
        print("Warning in %s: %s" % (self.current_path, msg))

    def get_file_details(self, path=None):
        """Return the FileDetails for *path*, creating it on first use.

        *path* defaults to the file currently being checked.
        """
        if path is None:
            path = self.current_path
        try:
            return self.files[path]
        except KeyError:
            details = FileDetails(path)
            self.files[path] = details
            return details

    def parse_date_list(self, dates):
        """Expand a copyright date string into a list of integer years.

        *dates* is a comma separated list whose items are single years or
        ``begin-end`` ranges.  A year below 100 that follows an earlier item
        is taken to be in the same century as that item.  Invalid ranges and
        years are reported with warn() rather than raising.
        """
        newdates = []
        prevdate = None
        for item in dates.split(','):
            if '-' in item:
                # A chained range such as "1999-2000-2001" is treated as
                # running from its first year to its last.
                parts = item.split('-')
                begin = int(parts[0])
                end = int(parts[-1])
                if end < begin:
                    self.warn('Invalid date range %r in copyright' % item)
                    newdates.append(begin)
                newdates.extend(range(begin, end + 1))
                prevdate = end
            else:
                year = int(item)
                if year < 1000:
                    if prevdate is None or year >= 100:
                        self.warn('Invalid date %r in copyright' % year)
                    else:
                        year = (prevdate // 100) * 100 + year
                newdates.append(year)
                prevdate = year
        return newdates

    def add_copyright_holder(self, name, dates):
        """Record *name* as a holder of the current file for *dates*."""
        details = self.get_file_details()
        details.holders.append((name, self.parse_date_list(dates)))

    def parse_copyrights(self, comments):
        """Record the copyright holders named in *comments*.

        A line with dates and a name records a holder directly.  A line with
        dates only takes the whole of the next line as the holder's name.
        Any other line mentioning "Copyright" produces a warning.
        """
        dates = None
        got_date_line = False
        for comment in comments:
            for line in comment.split('\n'):
                if got_date_line:
                    # The previous line held only the dates.
                    self.add_copyright_holder(line, dates)
                    got_date_line = False

                m = copy_re.search(line)
                if m:
                    dates = m.group('dates')
                    got_date_line = True
                    continue
                m2 = copy2_re.search(line)
                if m2:
                    dates = m2.group('dates')
                    self.add_copyright_holder(m2.group('name'), dates)
                elif copy_unrec_re.search(line):
                    self.warn("Unrecognised copyright line: %r" % line)

    def parse_licenses(self, comments):
        """Record which known licenses appear in *comments*.

        Warns if none of the known license wordings is found.
        """
        licenses = []
        for comment in comments:
            # Flatten the comment so wording wrapped over lines still matches.
            comment = comment.replace('\n', ' ').replace('\r', '').strip()
            for license_name, pattern in license_patterns:
                if pattern.search(comment):
                    licenses.append(license_name)
        if len(licenses) == 0:
            self.warn("No license found: %s" % self.current_path)

        details = self.get_file_details()
        details.licenses.extend(licenses)

    def parse_fixmes(self, comments):
        """Record at most one (milestone, text) FIXME entry per comment.

        The text is whatever follows the marker, minus one leading ':' and
        then one leading '-'.  The milestone is '' when the marker has none.
        """
        fixmes = []
        for comment in comments:
            comment = comment.replace('\n', ' ').replace('\r', '').strip()
            for pattern in fixme_patterns:
                g = pattern.search(comment)
                if not g:
                    continue
                fixmetext = comment[g.end():].strip()
                if fixmetext.startswith(':'):
                    fixmetext = fixmetext[1:].strip()
                if fixmetext.startswith('-'):
                    fixmetext = fixmetext[1:].strip()
                try:
                    milestone = g.group('milestone')
                except IndexError:
                    # This pattern has no milestone group.
                    milestone = ''
                fixmes.append((milestone, fixmetext))
                # Patterns are ordered most specific first; stop at the
                # first one that matches.
                break

        details = self.get_file_details()
        details.fixmes.extend(fixmes)

    def strip_quotes(self, line, incomment, was_cpp_comment):
        """Remove any quoted strings from a line.

        Character literals ('a', '\\n', '\\x41') and double quoted strings
        are removed unless they lie inside a comment, so that comment
        markers inside literals are not mistaken for comments.

        *incomment* is the text of a comment still open from the previous
        line, or None; *was_cpp_comment* says whether that comment is a
        ``//`` comment (which cannot extend into this line by itself).
        Scanning stops at the first ``//`` outside a literal or C comment.

        Returns the line with the literals removed.  If a literal is not
        terminated, a warning is issued and the line as processed so far is
        returned.
        """
        if incomment is not None:
            # Only an unterminated C comment carries over into this line.
            incomment = not was_cpp_comment

        pos = 0
        in_quote = False
        while pos < len(line):
            if incomment:
                # Skip comment text until the closing "*/".
                if pos + 1 < len(line) and line[pos:pos+2] == '*/':
                    pos += 2
                    incomment = False
                else:
                    pos += 1
                continue

            if not in_quote and pos + 1 < len(line):
                marker = line[pos:pos+2]
                if marker == '/*':
                    pos += 2
                    incomment = True
                    continue
                if marker == '//':
                    break

            if not in_quote:
                if line[pos] == "'":
                    start = pos
                    try:
                        pos += 1
                        if line[pos] == '\\':
                            pos += 1
                            if line[pos] == 'x':
                                # Hex escape: step to its second digit.
                                pos += 2
                        pos += 1
                        if line[pos] != "'":
                            self.warn("Unmatched single quote: %r" % line)
                            pos = start + 1
                        else:
                            # Cut the whole literal out and rescan from
                            # where it started.
                            line = line[:start] + line[pos+1:]
                            pos = start
                        continue
                    except IndexError:
                        self.warn("Unfinished single quote: %r" % line)
                        return line

                if line[pos] == '"':
                    start = pos
                    in_quote = True
            else:
                if line[pos] == '\\':
                    # Skip the escaped character.
                    pos += 2
                    if pos >= len(line):
                        self.warn("Unfinished double quote: %r" % line)
                        return line
                    continue
                if line[pos] == '"':
                    in_quote = False
                    line = line[:start] + line[pos+1:]
                    pos = start
                    continue

            pos += 1
        return line

    def strip_directives(self, line):
        """Return '' for a line matching directive_re, else *line* itself."""
        if directive_re.match(line):
            return ''
        return line

    def join_slashed_lines(self, lines):
        r"""Join lines terminated with \ together.

        The backslash is dropped and the next line appended directly.
        """
        newlines = []
        had_slash = False
        for line in lines:
            if had_slash:
                newlines[-1] += line
            else:
                newlines.append(line)

            had_slash = line.endswith('\\')
            if had_slash:
                newlines[-1] = newlines[-1][:-1]
        return newlines

    def get_comments(self, lines):
        """Get the C or C++ style comments from a set of lines.

        Returns a list of strings, one per comment, without the comment
        delimiters.  The lines of a multi-line C comment, and of a run of
        consecutive ``//`` lines, are joined with newlines into a single
        entry; a leading "*" on continuation lines of a C comment and the
        extra "*" opening a doc comment are dropped.
        """
        comments = []
        incomment = None
        was_cpp_comment = False
        lines = self.join_slashed_lines(lines)

        for line in lines:
            line = line.strip()
            if len(line) == 0:
                continue
            line = self.strip_directives(line)
            line = self.strip_quotes(line, incomment, was_cpp_comment)
            pos = 0
            if incomment is not None:
                if not was_cpp_comment:
                    # Look for the end of a C comment
                    end = line.find('*/', 0)

                    # Check for leading "*"s.  startswith() is used because
                    # the line may be empty here (a blanked directive).
                    if end != 0 and line.startswith('*'):
                        line = line[1:].strip()
                        # Stripping can remove more than one character, so
                        # locate the terminator again.
                        end = line.find('*/')

                    # End the comment if an end was found
                    if len(incomment) != 0 and incomment[-1] != '\n':
                        incomment += '\n'
                    if end >= 0:
                        pos = end + 2
                        incomment += line[:end]
                        comments.append(incomment)
                        incomment = None
                    else:
                        incomment += line
                else:
                    # Look for a continuation C++ comment at the start of
                    # the line.
                    if line.find('//', 0) == 0:
                        incomment += '\n'
                        incomment += line[2:]
                    else:
                        comments.append(incomment)
                        incomment = None

            if incomment is None:
                # Look for the start of a comment
                cc_start = line.find('/*', pos)
                while cc_start != -1:
                    if (line[cc_start + 2:cc_start + 3] == '*'
                            and line[cc_start + 3:cc_start + 4] != '/'):
                        # Skip extra * at start of comment, indicating a
                        # doccomment (but leave "/**/" alone).
                        cc_start += 1
                    end = line.find('*/', cc_start + 1)
                    if end == -1:
                        # The comment continues on the following lines.
                        incomment = line[cc_start + 2:]
                        was_cpp_comment = False
                        break
                    pos = end + 2
                    comments.append(line[cc_start + 2:end])
                    cc_start = line.find('/*', pos)

            if incomment is None:
                # Look for the start of a C++ comment
                cpp_start = line.find('//', pos)
                if cpp_start != -1:
                    incomment = line[cpp_start + 2:]
                    was_cpp_comment = True

        if incomment:
            comments.append(incomment)
        return comments

    def check_file(self, path):
        """Check the copyright status of a file.

        Reads the file at *path* (which must lie under the top directory)
        and stores its holders, licenses, FIXMEs and line count in the
        file's FileDetails record.  Returns nothing.
        """
        with open(path) as fd:
            lines = [line.strip() for line in fd]
        assert path.startswith(self.topdirpath)
        # relpath rather than slicing, so a top directory of "/" works too.
        self.current_path = os.path.relpath(path, self.topdirpath)

        comments = self.get_comments(lines)
        self.parse_copyrights(comments)
        self.parse_licenses(comments)
        self.parse_fixmes(comments)

        details = self.get_file_details()
        details.length = len(lines)

    def check(self):
        """Check the top path: every .cc/.c/.h file under it, or it alone."""
        if not os.path.isdir(self.toppath):
            self.check_file(self.toppath)
            return
        for dirpath, dirnames, filenames in os.walk(self.toppath):
            for filename in filenames:
                if filename.endswith(('.cc', '.c', '.h')):
                    self.check_file(os.path.join(dirpath, filename))

    def get_relicense_classses(self):
        """Group the checked files by how they could be relicensed.

        Returns a dict mapping 'nongpl' (no 'gpl2+' license found),
        'gplonly' (GPL with a holder listed in gplonly) and 'gpl' (any other
        GPL file) to lists of file paths.  Classes with no files are absent.
        """
        classes = {}
        for path, details in self.files.items():
            if 'gpl2+' not in details.licenses:
                cls = 'nongpl'
            else:
                holders = [item[0] for item in details.holders]
                if any(holder in holders for holder in gplonly):
                    cls = 'gplonly'
                else:
                    cls = 'gpl'
            classes.setdefault(cls, []).append(path)
        return classes

    def get_ownership(self):
        """Summarise how much code each holder owns under each license.

        Returns a tuple of (license, owners) pairs, in descending order of
        the total "lines proportion (equal)" for the license.  ``owners`` is
        a list of tuples, in descending order of file count:

            (holder name, number of files, lines in those files,
             sum of per-file proportion with holders weighted equally,
             sum of per-file proportion weighted by number of years,
             sum of equal proportion * file length,
             sum of year-weighted proportion * file length)
        """
        # Get a dictionary, keyed by license, holding dictionaries keyed by
        # copyright holder, holding a list of values representing the
        # contribution of that holder.
        owners = {}
        for details in self.files.values():
            file_ownership = {}
            holder_count = len(details.holders)
            holder_date_count = sum(len(holder_dates)
                                    for holder_name, holder_dates
                                    in details.holders)
            for holder_name, holder_dates in details.holders:
                proportion_equal = float(1) / holder_count
                proportion_date = float(len(holder_dates)) / holder_date_count
                file_ownership[holder_name] = [
                    1,
                    details.length,
                    proportion_equal,
                    proportion_date,
                    proportion_equal * details.length,
                    proportion_date * details.length,
                ]

            for license_name in details.licenses:
                license_owners = owners.setdefault(license_name, {})
                for holder_name, holder_values in file_ownership.items():
                    try:
                        totals = license_owners[holder_name]
                    except KeyError:
                        totals = [0] * len(holder_values)
                        license_owners[holder_name] = totals
                    for i, value in enumerate(holder_values):
                        totals[i] += value

        # Get a list of the total number of lines for each license, and sort
        # into descending order.
        license_total_lines = sorted(
            ((sum(values[4] for values in license_owners.values()),
              license_name)
             for license_name, license_owners in owners.items()),
            reverse=True)

        # Get a list of the contributors for each license, in descending
        # order of file count.
        result = []
        for total_lines, license_name in license_total_lines:
            rows = [(owner,) + tuple(values)
                    for owner, values in owners[license_name].items()]
            rows.sort(key=lambda row: row[1])
            rows.reverse()
            result.append((license_name, rows))
        return tuple(result)

    def get_fixmes(self):
        """Get the FIXMEs found, grouped by milestone.

        Returns a list of (milestone, entries) tuples sorted by milestone,
        with the '' milestone (FIXMEs naming no milestone) last.  ``entries``
        is a sorted list of (file path, fixme text) tuples.
        """
        milestones = {}
        for details in self.files.values():
            for milestone, fixmetext in details.fixmes:
                milestones.setdefault(milestone, []).append(
                    (details.path, fixmetext))
        # False sorts before True, which puts the empty milestone last.
        return sorted(
            [(milestone, sorted(entries))
             for milestone, entries in milestones.items()],
            key=lambda item: (item[0] == '', item[0]))
def _open_csv(path):
    """Open *path* for writing in the mode this Python's csv module needs.

    Python 3 wants a text file opened with newline=''; Python 2 wants a
    binary file.
    """
    if sys.version_info[0] >= 3:
        return open(path, "w", newline="")
    return open(path, "wb")


# Audit the tree named on the command line, defaulting to the sibling
# xapian-core checkout.
toppath = '../xapian-core'
if len(sys.argv) > 1:
    toppath = sys.argv[1]
checker = SourceChecker(toppath)
checker.check()

#pprint(checker.files)

# fixmes.csv: one row per FIXME comment, grouped by milestone.
#pprint(checker.get_fixmes())
with _open_csv("fixmes.csv") as fixmefd:
    writer = csv.writer(fixmefd)
    writer.writerow(("Milestone", "File", "Message",))
    for milestone, fixmes in checker.get_fixmes():
        for filepath, fixmetext in fixmes:
            writer.writerow((milestone, filepath, fixmetext))

# copyright.csv: one row per (license, copyright holder) pair.
#pprint(checker.get_ownership())
with _open_csv("copyright.csv") as copyrightfd:
    writer = csv.writer(copyrightfd)
    writer.writerow(("License", "Author", "File count", "Lines touched",
                     "File proportion (equal)", "File proportion (biased)",
                     "Lines proportion (equal)", "Lines proportion (biased)",))
    for license_name, holders in checker.get_ownership():
        for holder in holders:
            writer.writerow([license_name] + list(holder))

# Summary on stdout of how many files fall into each relicensing class.
relicense_classes = checker.get_relicense_classses()
print ('%d files:' % len(checker.files))
print ('%d files "tainted" by unrelicensable GPL code' %
       len(relicense_classes.get('gplonly', ())))
print ('%d files "tainted" by relicensable GPL code' %
       len(relicense_classes.get('gpl', ())))
print ('%d files "untainted" by GPL code' %
       len(relicense_classes.get('nongpl', ())))

# license_classes.csv: one row per file, giving its relicensing class.
status_labels = {
    'gpl': "GPL, but probably relicensable",
    'nongpl': "License other than GPL",
    'gplonly': "GPL, probably non-relicensable",
}
with _open_csv("license_classes.csv") as fd:
    writer = csv.writer(fd)
    writer.writerow(("Status", "File path"))
    for cls, paths in sorted(relicense_classes.items()):
        status = status_labels[cls]
        for path in sorted(paths):
            writer.writerow((status, path))