xapian-maintainer-tools/audit.py

   1 #!/usr/bin/env python
   2 #
   3 # Copyright (C) 2007 Lemur Consulting Ltd
   4 #
   5 # This program is free software; you can redistribute it and/or modify
   6 # it under the terms of the GNU General Public License as published by
   7 # the Free Software Foundation; either version 2 of the License, or
   8 # (at your option) any later version.
   9 #
  10 # This program is distributed in the hope that it will be useful,
  11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13 # GNU General Public License for more details.
  14 #
  15 # You should have received a copy of the GNU General Public License along
  16 # with this program; if not, write to the Free Software Foundation, Inc.,
  17 # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
r"""audit.py: Simple script to check code ownership and license messages.

Audits the xapian code found in "../xapian-core" by default; pass a different
file or directory as the first command-line argument to audit that instead.

"""
  23
  24 import csv
  25 import re
  26 import os, os.path
  27 from pprint import pprint
  28 import sys
  29
# Matches a copyright line that carries only dates, e.g.
# "Copyright (C) 2002,2003": the word "Copyright", an optional "(C)"/"(c)",
# then 2-4 digit years separated by commas or hyphens (captured as `dates`),
# an optional trailing comma, and nothing but whitespace up to end of line.
copy_re = re.compile(r'Copyright\s+(\([Cc]\))?\s*(?P<dates>([0-9]{2,4})((,\s*|-)[0-9]{2,4})*),?\s*$')
# Same as copy_re, but the dates are followed by whitespace and further text,
# which is captured as the `name` group (the copyright holder).
copy2_re = re.compile(r'Copyright\s+(\([Cc]\))?\s*(?P<dates>([0-9]{2,4})((,\s*|-)[0-9]{2,4})*),?\s+(?P<name>.+)\s*$')
# Fallback: any mention of "Copyright".  Used to warn about lines that
# neither of the two patterns above understands.
copy_unrec_re = re.compile(r'Copyright')

# A line starting (after optional whitespace) with a "#error" directive.
directive_re = re.compile(r'\s*#\s*error')

# Copyright holders whose presence means a GPL file is classified as
# "gplonly" (probably not relicensable) rather than "gpl".
gplonly = [
    'BrightStation PLC',
    'Ananova Ltd',
]
  41
  42 licenses = [
  43     ('lgpl2+', r'''
  44      is free software; you can redistribute it and\/or modify it under the
  45      terms of the GNU Library General Public License as published by the Free
  46      Software Foundation; either version 2 of the License, or \(at your option\)
  47      any later version.
  48      '''),
  49     ('gpl2+', r'''
  50      is free software; you can redistribute it and\/or modify it under the terms of
  51      the GNU General Public License as published by the Free Software Foundation;
  52      either version 2( of the License)?, or \(at your option\) any later version.
  53      '''),
  54     ('sgi-historical', r'''
  55     Permission to use, copy, modify, distribute and sell this software and its
  56     documentation for any purpose is hereby granted without fee, provided that
  57     the above copyright notice appear in all copies and that both that
  58     copyright notice and this permission notice appear in supporting
  59     documentation.
  60     '''),
  61     ('pub_domain', r'''
  62     The authors of this program disclaim copyright.
  63     '''),
  64     ('mit_x', r'''
  65     Permission is hereby granted, free of charge, to any person obtaining a
  66     copy of this software and associated documentation files \(the "Software"\),
  67     to deal in the Software without restriction, including without limitation
  68     the rights to use, copy, modify, merge, publish, distribute, sublicense,
  69     and/or sell copies of the Software, and to permit persons to whom the
  70     Software is furnished to do so, subject to the following conditions:
  71     '''),
  72 ]
  73
  74 fixmes = [
  75     r'''FIXME:(?P<milestone>[\d.]+)''',
  76     r'''FIXME''',
  77 ]
  78
  79 whitespace_re = re.compile(r'\s+')
  80
  81 license_patterns = []
  82 for name, pattern in licenses:
  83     pattern = whitespace_re.sub('\s+', pattern)
  84     license_patterns.append((name, re.compile(pattern)))
  85
  86 fixme_patterns = []
  87 for pattern in fixmes:
  88     fixme_patterns.append(re.compile(pattern))
  89
  90 class FileDetails:
  91     def __init__(self, path):
  92         self.path = path
  93         self.holders = []
  94         self.licenses = []
  95         self.length = 0
  96         self.fixmes = []
  97
  98     def __repr__(self):
  99         return "FileDetails(%r, %r, %r, %r)" % (self.path, self.holders, self.licenses, self.fixmes)
 100
 101 class SourceChecker:
 102     def __init__(self, toppath):
 103         self.files = {}
 104
 105         self.toppath = os.path.normpath(os.path.abspath(toppath))
 106         if os.path.isdir(self.toppath):
 107             self.topdirpath = self.toppath
 108         else:
 109             self.topdirpath = os.path.dirname(self.toppath)
 110         self.current_path = None
 111
 112     def warn(self, msg):
 113         print("Warning in %s: %s" % (self.current_path, msg))
 114
 115     def get_file_details(self, path=None):
 116         if path is None:
 117             path = self.current_path
 118         try:
 119             return self.files[path]
 120         except KeyError:
 121             details = FileDetails(path)
 122             self.files[path] = details
 123             return details
 124
 125     def parse_date_list(self, dates):
 126         newdates = []
 127         prevdate = None
 128         for date in dates.split(','):
 129             if '-' in date:
 130                 begin, end = date.split('-')
 131                 begin = int(begin)
 132                 end = int(end)
 133                 if end < begin:
 134                     self.warn('Invalid date range %r in copyright' % date)
 135                     newdates.append(begin)
 136                 for date in xrange(begin, end + 1):
 137                     newdates.append(date)
 138                 prevdate = end
 139             else:
 140                 date = int(date)
 141                 if date < 1000:
 142                     if prevdate is None or date >= 100:
 143                         self.warn('Invalid date %r in copyright' % date)
 144                     else:
 145                         date = (prevdate // 100) * 100 + date
 146                 newdates.append(int(date))
 147                 prevdate = date
 148         return newdates
 149
 150     def add_copyright_holder(self, name, dates):
 151         file = self.get_file_details()
 152
 153         dates = self.parse_date_list(dates)
 154
 155         file.holders.append((name, dates))
 156
 157     def parse_copyrights(self, comments):
 158         seen_copyright = False
 159         dates = None
 160         got_date_line = False
 161         for comment in comments:
 162             for line in comment.split('\n'):
 163                 if got_date_line:
 164                     self.add_copyright_holder(line, dates)
 165                     got_date_line = False
 166
 167                 m = copy_re.search(line)
 168                 m2 = copy2_re.search(line)
 169                 if m:
 170                     dates = m.group('dates')
 171                     got_date_line = True
 172                 elif m2:
 173                     name = m2.group('name')
 174                     dates = m2.group('dates')
 175                     self.add_copyright_holder(name, dates)
 176                     seen_copyright = True
 177                 elif copy_unrec_re.search(line):
 178                     self.warn("Unrecognised copyright line: %r" % line)
 179
 180     def parse_licenses(self, comments):
 181         licenses = []
 182         for comment in comments:
 183             comment = comment.replace('\n', ' ').replace('\r', '').strip()
 184             for license, pattern in license_patterns:
 185                 if pattern.search(comment):
 186                     licenses.append(license)
 187         if len(licenses) == 0:
 188             self.warn("No license found: %s" % self.current_path)
 189
 190         file = self.get_file_details()
 191         file.licenses.extend(licenses)
 192
 193     def parse_fixmes(self, comments):
 194         fixmes = []
 195         for comment in comments:
 196             comment = comment.replace('\n', ' ').replace('\r', '').strip()
 197             for pattern in fixme_patterns:
 198                 g = pattern.search(comment)
 199                 if g:
 200                     fixmetext = comment[g.end():].strip()
 201                     if fixmetext.startswith(':'):
 202                         fixmetext = fixmetext[1:].strip()
 203                     if fixmetext.startswith('-'):
 204                         fixmetext = fixmetext[1:].strip()
 205                     try:
 206                         milestone = g.group('milestone')
 207                     except IndexError:
 208                         milestone = ''
 209                     fixmes.append((milestone, fixmetext))
 210                     break
 211
 212         file = self.get_file_details()
 213         file.fixmes.extend(fixmes)
 214
    def strip_quotes(self, line, incomment, was_cpp_comment):
        """Remove any quoted strings from a line.

        Character literals and double-quoted strings found outside comments
        are cut out of `line` (quotes included) and the remainder returned.
        Text inside a C comment is skipped untouched, and scanning stops at
        a C++ comment marker, leaving the rest of the line as it is.

        `incomment` is None when the line does not start inside a comment.
        Otherwise the line starts inside a comment: a C comment unless
        `was_cpp_comment` is true, in which case scanning starts as for
        ordinary code.

        Malformed literals produce a warning; see the inline notes for what
        is returned in each case.
        """
        # Reduce the caller's state to a flag: are we inside a C comment?
        if incomment is not None:
            if was_cpp_comment:
                incomment = False
            else:
                incomment = True

        pos = 0
        in_quote = False
        while pos < len(line):
            if incomment:
                # Inside a C comment: ignore everything up to its terminator.
                if pos + 1 < len(line) and line[pos:pos+2] == '*/':
                    pos += 2
                    incomment = False
                    continue
                else:
                    pos += 1
                    continue

            if not incomment and not in_quote:
                if pos + 1 < len(line):
                    if line[pos:pos+2] == '/*':
                        pos += 2
                        incomment = True
                        continue
                    if line[pos:pos+2] == '//':
                        # Rest of the line is a C++ comment; leave it alone.
                        break

            if not in_quote:
                if line[pos] == "'":
                    # Character literal: one character, a backslash plus one
                    # character, or a backslash, "x" and two more characters.
                    start = pos
                    try:
                        pos += 1
                        if line[pos] == '\\':
                            pos += 1
                            if line[pos] == 'x':
                                pos += 2
                        pos += 1
                        if line[pos] != "'":
                            self.warn("Unmatched single quote: %r" % line)
                            # Resume scanning just after the opening quote.
                            pos = start + 1
                            continue
                        else:
                            # Cut the literal out and rescan from its position.
                            line = line[:start] + line[pos+1:]
                            pos = start
                            continue
                    except IndexError:
                        # The line ended in the middle of the literal.
                        self.warn("Unfinished single quote: %r" % line)
                        return line

                if line[pos] == '"':
                    # Remember where the string starts so it can be cut out
                    # once the closing quote is found.
                    start = pos
                    in_quote = True
            else:
                if line[pos] == '\\':
                    # Skip the escaped character so an escaped quote does not
                    # end the string.
                    pos += 2
                    if pos >= len(line):
                        self.warn("Unfinished double quote: %r" % line)
                        return line
                    continue
                if line[pos] == '"':
                    in_quote = False
                    line = line[:start] + line[pos+1:]
                    pos = start
                    continue

            pos += 1
        # A string still open here is left in the returned line unchanged.
        return line
 286
 287     def strip_directives(self, line):
 288         if directive_re.match(line):
 289             return ''
 290         return line
 291
 292     def join_slashed_lines(self, lines):
 293         "Join lines terminated with \ together"
 294         newlines = []
 295         had_slash = False
 296         for line in lines:
 297             if had_slash:
 298                 newlines[-1] += line
 299             else:
 300                 newlines.append(line)
 301
 302             had_slash = False
 303             if line.endswith('\\'):
 304                 had_slash = True
 305                 newlines[-1] = newlines[-1][:-1]
 306         return newlines
 307
 308     def get_comments(self, lines):
 309         """Get the C or C++ style comments from a set of lines.
 310
 311         """
 312         comments = []
 313         incomment = None
 314         was_cpp_comment = False
 315         lines = self.join_slashed_lines(lines)
 316
 317         for line in lines:
 318             line = line.strip()
 319             if len(line) == 0:
 320                 continue
 321             line = self.strip_directives(line)
 322             line = self.strip_quotes(line, incomment, was_cpp_comment)
 323             pos = 0
 324             if incomment is not None:
 325                 if not was_cpp_comment:
 326                     # Look for the end of a C comment
 327                     end = line.find('*/', 0)
 328
 329                     # Check for leading "*"s
 330                     if end != 0 and line[0] == '*':
 331                         line = line[1:].strip()
 332                         end -= 1
 333
 334                     # End the comment if an end was found
 335                     if len(incomment) != 0 and incomment[-1] != '\n':
 336                         incomment += '\n'
 337                     if end >= 0:
 338                         pos = end + 2
 339                         incomment += line[:end]
 340                         comments.append(incomment)
 341                         incomment = None
 342                     else:
 343                         incomment += line
 344
 345                 if was_cpp_comment:
 346                     # Look for a continuation C++ comment at the start of the line.
 347                     cpp_start = line.find('//', 0)
 348                     if cpp_start == 0:
 349                         incomment += '\n'
 350                         incomment += line[2:]
 351                     else:
 352                         comments.append(incomment)
 353                         incomment = None
 354
 355             if incomment is None:
 356                 # Look for the start of a comment
 357                 cc_start = line.find('/*', pos)
 358                 while cc_start != -1:
 359                     if line[cc_start] == '*' and line[cc_start+1] != '/':
 360                         # Skip extra * at start of comment, indicating a
 361                         # doccommment.
 362                         cc_start += 1
 363                     end = line.find('*/', cc_start+1)
 364                     if end == -1:
 365                         incomment = line[cc_start + 2:]
 366                         was_cpp_comment = False
 367                         break
 368                     pos = end + 2
 369                     comments.append(line[cc_start + 2:end])
 370                     cc_start = line.find('/*', pos)
 371
 372             if incomment is None:
 373                 # Look for the start of a C++ comment
 374                 cpp_start = line.find('//', pos)
 375                 if cpp_start != -1:
 376                     incomment = line[cpp_start + 2:]
 377                     was_cpp_comment = True
 378
 379         if incomment:
 380             comments.append(incomment)
 381         return comments
 382
 383
 384     def check_file(self, path):
 385         '''Check the copyright status of a file.
 386
 387         Returns a tuple of form (name, (year, year,))
 388
 389         '''
 390         fd = open(path)
 391         lines = [line.strip() for line in fd.readlines()]
 392         assert(path.startswith(self.topdirpath))
 393         self.current_path = path[len(self.topdirpath) + 1:]
 394
 395         comments = self.get_comments(lines)
 396         self.parse_copyrights(comments)
 397         self.parse_licenses(comments)
 398         self.parse_fixmes(comments)
 399
 400         file = self.get_file_details()
 401         file.length = len(lines)
 402
 403     def check(self):
 404         if os.path.isdir(self.toppath):
 405             for dirpath, dirnames, filenames in os.walk(self.toppath):
 406                 for filename in filenames:
 407                     if filename.endswith('.cc') or \
 408                        filename.endswith('.c') or \
 409                        filename.endswith('.h'):
 410                         path = os.path.join(dirpath, filename)
 411                         self.check_file(path)
 412         else:
 413             self.check_file(self.toppath)
 414
 415     def get_relicense_classses(self):
 416         classes = {}
 417         for path, details in self.files.iteritems():
 418             if 'gpl2+' not in details.licenses:
 419                 classes.setdefault('nongpl', []).append(path)
 420                 continue
 421             cls = 'gpl'
 422             holders = [item[0] for item in details.holders]
 423             for holder in gplonly:
 424                 if holder in holders:
 425                     cls = 'gplonly'
 426                     break
 427             classes.setdefault(cls, []).append(path)
 428         return classes
 429
 430     def get_ownership(self):
 431         """Get a dict holding ownership, keyed by copyright holder.
 432
 433         The values are tuples, (number of files, sum of proportion of files
 434         held, sum of proportion weighted by number of years of files held)
 435
 436         """
 437         # Get a dictionary, keyed by license, holding dictionaries keyed by
 438         # copyright holder, holding a list of values representing the
 439         # contribution of that holder.
 440         owners = {}
 441         for file in self.files.itervalues():
 442             file_ownership = {}
 443             holder_count = len(file.holders)
 444             holder_date_count = 0
 445             for holder_name, holder_dates in file.holders:
 446                 holder_date_count += len(holder_dates)
 447             for holder_name, holder_dates in file.holders:
 448                 proportion_equal = float(1)/holder_count
 449                 proportion_date = float(len(holder_dates)) / holder_date_count
 450                 file_ownership[holder_name] = [1, file.length,
 451                                                proportion_equal,
 452                                                proportion_date,
 453                                                proportion_equal * file.length,
 454                                                proportion_date * file.length,]
 455
 456             for license in file.licenses:
 457                 try:
 458                     license_owners = owners[license]
 459                 except KeyError:
 460                     license_owners = {}
 461                     owners[license] = license_owners
 462
 463                 for holder_name, holder_values in file_ownership.iteritems():
 464                     try:
 465                         license_owner = license_owners[holder_name]
 466                     except KeyError:
 467                         license_owner = [0] * len(holder_values)
 468                         license_owners[holder_name] = license_owner
 469                     for i in xrange(len(holder_values)):
 470                         license_owner[i] += holder_values[i]
 471
 472         # Get a list of the total number of lines for each license, and sort
 473         # into descending order.
 474         license_total_lines = []
 475         for license, owner in owners.iteritems():
 476             total_lines = 0
 477             for holder_values in owner.itervalues():
 478                 total_lines += holder_values[4]
 479             license_total_lines.append((total_lines, license))
 480         license_total_lines.sort()
 481         license_total_lines.reverse()
 482
 483         # Get a list of the contributors for each license, in descending order of total number of lines
 484         result = []
 485         for total_lines, license in license_total_lines:
 486             license_owners = []
 487             for owner, values in owners[license].iteritems():
 488                 item = [owner]
 489                 item.extend(values)
 490                 license_owners.append(tuple(item))
 491             license_owners.sort(cmp=lambda x,y:cmp(x[1],y[1]))
 492             license_owners.reverse()
 493             result.append((license, license_owners))
 494         return tuple(result)
 495
 496     def get_fixmes(self):
 497         """Get a dict holding fixmes, keyed by milestone.
 498
 499         """
 500         milestones = {}
 501         for file in self.files.itervalues():
 502             for milestone, fixmetext in file.fixmes:
 503                 if milestone not in milestones:
 504                     milestones[milestone] = []
 505                 milestones[milestone].append((file.path, fixmetext))
 506         def cmpfn(a, b):
 507             if (a[0] == '') ^ (b[0] == ''):
 508                 return -cmp(a, b)
 509             return cmp(a, b)
 510         return sorted([(milestone, sorted(milestones[milestone]))
 511                       for milestone in milestones.iterkeys()],
 512                       cmp=cmpfn)
 513
 514
 515 toppath = '../xapian-core'
 516 if len(sys.argv) > 1:
 517     toppath = sys.argv[1]
 518 checker = SourceChecker(toppath)
 519 checker.check()
 520
 521 #pprint(checker.files)
 522
 523 #pprint(checker.get_fixmes())
 524 fixmefd = open("fixmes.csv", "wb")
 525 writer = csv.writer(fixmefd)
 526 writer.writerow(("Milestone", "File", "Message",))
 527 for milestone, fixmes in checker.get_fixmes():
 528     for filepath, fixmetext in fixmes:
 529        writer.writerow((milestone, filepath, fixmetext))
 530 fixmefd.close()
 531
 532
 533 #pprint(checker.get_ownership())
 534
 535 copyrightfd = open("copyright.csv", "wb")
 536 writer = csv.writer(copyrightfd)
 537 writer.writerow(("License", "Author", "File count", "Lines touched",
 538                  "File proportion (equal)", "File proportion (biased)",
 539                  "Lines proportion (equal)", "Lines proportion (biased)",))
 540 for license in checker.get_ownership():
 541     for holder in license[1]:
 542         value = [license[0]]
 543         value.extend(holder)
 544         writer.writerow(value)
 545 copyrightfd.close()
 546
 547 relicense_classes = checker.get_relicense_classses()
 548 print ('%d files:' % len(checker.files))
 549 print ('%d files "tainted" by unrelicensable GPL code' %
 550        len(relicense_classes.get('gplonly', ())))
 551 print ('%d files "tainted" by relicensable GPL code' %
 552        len(relicense_classes.get('gpl', ())))
 553 print ('%d files "untainted" by GPL code' %
 554        len(relicense_classes.get('nongpl', ())))
 555
 556 fd = open("license_classes.csv", "wb")
 557 writer = csv.writer(fd)
 558 writer.writerow(("Status", "File path"))
 559 for cls, paths in sorted(relicense_classes.iteritems()):
 560     status = {
 561         'gpl': "GPL, but probably relicensable",
 562         'nongpl': "License other than GPL",
 563         'gplonly': "GPL, probably non-relicensable",
 564     }[cls]
 565     for path in sorted(paths):
 566         writer.writerow((status, path))
 567 fd.close()