3 # Copyright (C) 2007 Lemur Consulting Ltd
5 # This program is free software; you can redistribute it and/or modify
6 # it under the terms of the GNU General Public License as published by
7 # the Free Software Foundation; either version 2 of the License, or
8 # (at your option) any later version.
10 # This program is distributed in the hope that it will be useful,
11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 # GNU General Public License for more details.
15 # You should have received a copy of the GNU General Public License along
16 # with this program; if not, write to the Free Software Foundation, Inc.,
17 # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
18 r
"""audit.py: Simple script to check code ownership and license messages.
20 Currently assumes that the xapian code can be found in "../xapian-core".
27 from pprint
import pprint
# A copyright line that carries only a list of years, e.g.
# "Copyright (C) 2006,2007".  The optional "(C)"/"(c)" marker is followed by
# years of 2-4 digits, separated by commas or joined into ranges with "-";
# nothing except an optional trailing comma and whitespace may follow.
copy_re = re.compile(
    r'Copyright\s+(\([Cc]\))?\s*(?P<dates>([0-9]{2,4})((,\s*|-)[0-9]{2,4})*),?\s*$'
)

# A copyright line with the same year list followed by free text, which is
# captured as the "name" group.
copy2_re = re.compile(
    r'Copyright\s+(\([Cc]\))?\s*(?P<dates>([0-9]{2,4})((,\s*|-)[0-9]{2,4})*),?\s+(?P<name>.+)\s*$'
)

# Fallback: any mention of the word "Copyright" at all.
copy_unrec_re = re.compile(r'Copyright')

# A preprocessor "#error" directive, allowing whitespace before and after
# the "#".
directive_re = re.compile(r'\s*#\s*error')
36 # Copyright holders which mean code is GPL only.
44 is free software; you can redistribute it and\/or modify it under the
45 terms of the GNU Library General Public License as published by the Free
46 Software Foundation; either version 2 of the License, or \(at your option\)
50 is free software; you can redistribute it and\/or modify it under the terms of
51 the GNU General Public License as published by the Free Software Foundation;
52 either version 2( of the License)?, or \(at your option\) any later version.
54 ('sgi-historical', r
'''
55 Permission to use, copy, modify, distribute and sell this software and its
56 documentation for any purpose is hereby granted without fee, provided that
57 the above copyright notice appear in all copies and that both that
58 copyright notice and this permission notice appear in supporting
62 The authors of this program disclaim copyright.
65 Permission is hereby granted, free of charge, to any person obtaining a
66 copy of this software and associated documentation files \(the "Software"\),
67 to deal in the Software without restriction, including without limitation
68 the rights to use, copy, modify, merge, publish, distribute, sublicense,
69 and/or sell copies of the Software, and to permit persons to whom the
70 Software is furnished to do so, subject to the following conditions:
75 r
'''FIXME:(?P<milestone>[\d.]+)''',
# Matches any run of whitespace inside a license template.
whitespace_re = re.compile(r'\s+')

# Compile each (name, template) pair from `licenses`.  Every run of literal
# whitespace in a template is first rewritten to the regex text "\s+", so the
# compiled pattern accepts any amount of whitespace at that point.
for name, pattern in licenses:
    # The second argument of sub() is a replacement template, in which a
    # backslash has to be doubled to be emitted literally: r'\\s+' inserts
    # the text "\s+".  A bare '\s+' is an unknown escape, which re.sub()
    # rejects with re.error on Python 3.7 and later (and which is also an
    # invalid escape sequence in a non-raw string literal).
    pattern = whitespace_re.sub(r'\\s+', pattern)
    license_patterns.append((name, re.compile(pattern)))
# Compile every pattern listed in `fixmes` once, keeping the original order,
# and add the compiled objects to `fixme_patterns`.
fixme_patterns.extend(re.compile(pattern) for pattern in fixmes)
91 def __init__(self
, path
):
99 return "FileDetails(%r, %r, %r, %r)" % (self
.path
, self
.holders
, self
.licenses
, self
.fixmes
)
102 def __init__(self
, toppath
):
105 self
.toppath
= os
.path
.normpath(os
.path
.abspath(toppath
))
106 if os
.path
.isdir(self
.toppath
):
107 self
.topdirpath
= self
.toppath
109 self
.topdirpath
= os
.path
.dirname(self
.toppath
)
110 self
.current_path
= None
113 print("Warning in %s: %s" % (self
.current_path
, msg
))
115 def get_file_details(self
, path
=None):
117 path
= self
.current_path
119 return self
.files
[path
]
121 details
= FileDetails(path
)
122 self
.files
[path
] = details
125 def parse_date_list(self
, dates
):
128 for date
in dates
.split(','):
130 begin
, end
= date
.split('-')
134 self
.warn('Invalid date range %r in copyright' % date
)
135 newdates
.append(begin
)
136 for date
in xrange(begin
, end
+ 1):
137 newdates
.append(date
)
142 if prevdate
is None or date
>= 100:
143 self
.warn('Invalid date %r in copyright' % date
)
145 date
= (prevdate
// 100) * 100 + date
146 newdates
.append(int(date
))
150 def add_copyright_holder(self
, name
, dates
):
151 file = self
.get_file_details()
153 dates
= self
.parse_date_list(dates
)
155 file.holders
.append((name
, dates
))
157 def parse_copyrights(self
, comments
):
158 seen_copyright
= False
160 got_date_line
= False
161 for comment
in comments
:
162 for line
in comment
.split('\n'):
164 self
.add_copyright_holder(line
, dates
)
165 got_date_line
= False
167 m
= copy_re
.search(line
)
168 m2
= copy2_re
.search(line
)
170 dates
= m
.group('dates')
173 name
= m2
.group('name')
174 dates
= m2
.group('dates')
175 self
.add_copyright_holder(name
, dates
)
176 seen_copyright
= True
177 elif copy_unrec_re
.search(line
):
178 self
.warn("Unrecognised copyright line: %r" % line
)
180 def parse_licenses(self
, comments
):
182 for comment
in comments
:
183 comment
= comment
.replace('\n', ' ').replace('\r', '').strip()
184 for license
, pattern
in license_patterns
:
185 if pattern
.search(comment
):
186 licenses
.append(license
)
187 if len(licenses
) == 0:
188 self
.warn("No license found: %s" % self
.current_path
)
190 file = self
.get_file_details()
191 file.licenses
.extend(licenses
)
193 def parse_fixmes(self
, comments
):
195 for comment
in comments
:
196 comment
= comment
.replace('\n', ' ').replace('\r', '').strip()
197 for pattern
in fixme_patterns
:
198 g
= pattern
.search(comment
)
200 fixmetext
= comment
[g
.end():].strip()
201 if fixmetext
.startswith(':'):
202 fixmetext
= fixmetext
[1:].strip()
203 if fixmetext
.startswith('-'):
204 fixmetext
= fixmetext
[1:].strip()
206 milestone
= g
.group('milestone')
209 fixmes
.append((milestone
, fixmetext
))
212 file = self
.get_file_details()
213 file.fixmes
.extend(fixmes
)
215 def strip_quotes(self
, line
, incomment
, was_cpp_comment
):
216 """Remove any quoted strings from a line.
219 if incomment
is not None:
227 while pos
< len(line
):
229 if pos
+ 1 < len(line
) and line
[pos
:pos
+2] == '*/':
237 if not incomment
and not in_quote
:
238 if pos
+ 1 < len(line
):
239 if line
[pos
:pos
+2] == '/*':
243 if line
[pos
:pos
+2] == '//':
251 if line
[pos
] == '\\':
257 self
.warn("Unmatched single quote: %r" % line
)
261 line
= line
[:start
] + line
[pos
+1:]
265 self
.warn("Unfinished single quote: %r" % line
)
272 if line
[pos
] == '\\':
275 self
.warn("Unfinished double quote: %r" % line
)
280 line
= line
[:start
] + line
[pos
+1:]
287 def strip_directives(self
, line
):
288 if directive_re
.match(line
):
292 def join_slashed_lines(self
, lines
):
293 "Join lines terminated with \ together"
300 newlines
.append(line
)
303 if line
.endswith('\\'):
305 newlines
[-1] = newlines
[-1][:-1]
308 def get_comments(self
, lines
):
309 """Get the C or C++ style comments from a set of lines.
314 was_cpp_comment
= False
315 lines
= self
.join_slashed_lines(lines
)
321 line
= self
.strip_directives(line
)
322 line
= self
.strip_quotes(line
, incomment
, was_cpp_comment
)
324 if incomment
is not None:
325 if not was_cpp_comment
:
326 # Look for the end of a C comment
327 end
= line
.find('*/', 0)
329 # Check for leading "*"s
330 if end
!= 0 and line
[0] == '*':
331 line
= line
[1:].strip()
334 # End the comment if an end was found
335 if len(incomment
) != 0 and incomment
[-1] != '\n':
339 incomment
+= line
[:end
]
340 comments
.append(incomment
)
346 # Look for a continuation C++ comment at the start of the line.
347 cpp_start
= line
.find('//', 0)
350 incomment
+= line
[2:]
352 comments
.append(incomment
)
355 if incomment
is None:
356 # Look for the start of a comment
357 cc_start
= line
.find('/*', pos
)
358 while cc_start
!= -1:
359 if line
[cc_start
] == '*' and line
[cc_start
+1] != '/':
360 # Skip extra * at start of comment, indicating a
363 end
= line
.find('*/', cc_start
+1)
365 incomment
= line
[cc_start
+ 2:]
366 was_cpp_comment
= False
369 comments
.append(line
[cc_start
+ 2:end
])
370 cc_start
= line
.find('/*', pos
)
372 if incomment
is None:
373 # Look for the start of a C++ comment
374 cpp_start
= line
.find('//', pos
)
376 incomment
= line
[cpp_start
+ 2:]
377 was_cpp_comment
= True
380 comments
.append(incomment
)
384 def check_file(self
, path
):
385 '''Check the copyright status of a file.
387 Returns a tuple of form (name, (year, year,))
391 lines
= [line
.strip() for line
in fd
.readlines()]
392 assert(path
.startswith(self
.topdirpath
))
393 self
.current_path
= path
[len(self
.topdirpath
) + 1:]
395 comments
= self
.get_comments(lines
)
396 self
.parse_copyrights(comments
)
397 self
.parse_licenses(comments
)
398 self
.parse_fixmes(comments
)
400 file = self
.get_file_details()
401 file.length
= len(lines
)
404 if os
.path
.isdir(self
.toppath
):
405 for dirpath
, dirnames
, filenames
in os
.walk(self
.toppath
):
406 for filename
in filenames
:
407 if filename
.endswith('.cc') or \
408 filename
.endswith('.c') or \
409 filename
.endswith('.h'):
410 path
= os
.path
.join(dirpath
, filename
)
411 self
.check_file(path
)
413 self
.check_file(self
.toppath
)
415 def get_relicense_classses(self
):
417 for path
, details
in self
.files
.iteritems():
418 if 'gpl2+' not in details
.licenses
:
419 classes
.setdefault('nongpl', []).append(path
)
422 holders
= [item
[0] for item
in details
.holders
]
423 for holder
in gplonly
:
424 if holder
in holders
:
427 classes
.setdefault(cls
, []).append(path
)
430 def get_ownership(self
):
431 """Get a dict holding ownership, keyed by copyright holder.
433 The values are tuples, (number of files, sum of proportion of files
434 held, sum of proportion weighted by number of years of files held)
437 # Get a dictionary, keyed by license, holding dictionaries keyed by
438 # copyright holder, holding a list of values representing the
439 # contribution of that holder.
441 for file in self
.files
.itervalues():
443 holder_count
= len(file.holders
)
444 holder_date_count
= 0
445 for holder_name
, holder_dates
in file.holders
:
446 holder_date_count
+= len(holder_dates
)
447 for holder_name
, holder_dates
in file.holders
:
448 proportion_equal
= float(1)/holder_count
449 proportion_date
= float(len(holder_dates
)) / holder_date_count
450 file_ownership
[holder_name
] = [1, file.length
,
453 proportion_equal
* file.length
,
454 proportion_date
* file.length
,]
456 for license
in file.licenses
:
458 license_owners
= owners
[license
]
461 owners
[license
] = license_owners
463 for holder_name
, holder_values
in file_ownership
.iteritems():
465 license_owner
= license_owners
[holder_name
]
467 license_owner
= [0] * len(holder_values
)
468 license_owners
[holder_name
] = license_owner
469 for i
in xrange(len(holder_values
)):
470 license_owner
[i
] += holder_values
[i
]
472 # Get a list of the total number of lines for each license, and sort
473 # into descending order.
474 license_total_lines
= []
475 for license
, owner
in owners
.iteritems():
477 for holder_values
in owner
.itervalues():
478 total_lines
+= holder_values
[4]
479 license_total_lines
.append((total_lines
, license
))
480 license_total_lines
.sort()
481 license_total_lines
.reverse()
483 # Get a list of the contributors for each license, in descending order of total number of lines
485 for total_lines
, license
in license_total_lines
:
487 for owner
, values
in owners
[license
].iteritems():
490 license_owners
.append(tuple(item
))
491 license_owners
.sort(cmp=lambda x
,y
:cmp(x
[1],y
[1]))
492 license_owners
.reverse()
493 result
.append((license
, license_owners
))
496 def get_fixmes(self
):
497 """Get a dict holding fixmes, keyed by milestone.
501 for file in self
.files
.itervalues():
502 for milestone
, fixmetext
in file.fixmes
:
503 if milestone
not in milestones
:
504 milestones
[milestone
] = []
505 milestones
[milestone
].append((file.path
, fixmetext
))
507 if (a
[0] == '') ^
(b
[0] == ''):
510 return sorted([(milestone
, sorted(milestones
[milestone
]))
511 for milestone
in milestones
.iterkeys()],
515 toppath
= '../xapian-core'
516 if len(sys
.argv
) > 1:
517 toppath
= sys
.argv
[1]
518 checker
= SourceChecker(toppath
)
521 #pprint(checker.files)
523 #pprint(checker.get_fixmes())
524 fixmefd
= open("fixmes.csv", "wb")
525 writer
= csv
.writer(fixmefd
)
526 writer
.writerow(("Milestone", "File", "Message",))
527 for milestone
, fixmes
in checker
.get_fixmes():
528 for filepath
, fixmetext
in fixmes
:
529 writer
.writerow((milestone
, filepath
, fixmetext
))
533 #pprint(checker.get_ownership())
535 copyrightfd
= open("copyright.csv", "wb")
536 writer
= csv
.writer(copyrightfd
)
537 writer
.writerow(("License", "Author", "File count", "Lines touched",
538 "File proportion (equal)", "File proportion (biased)",
539 "Lines proportion (equal)", "Lines proportion (biased)",))
540 for license
in checker
.get_ownership():
541 for holder
in license
[1]:
544 writer
.writerow(value
)
547 relicense_classes
= checker
.get_relicense_classses()
548 print ('%d files:' % len(checker
.files
))
549 print ('%d files "tainted" by unrelicensable GPL code' %
550 len(relicense_classes
.get('gplonly', ())))
551 print ('%d files "tainted" by relicensable GPL code' %
552 len(relicense_classes
.get('gpl', ())))
553 print ('%d files "untainted" by GPL code' %
554 len(relicense_classes
.get('nongpl', ())))
556 fd
= open("license_classes.csv", "wb")
557 writer
= csv
.writer(fd
)
558 writer
.writerow(("Status", "File path"))
559 for cls
, paths
in sorted(relicense_classes
.iteritems()):
561 'gpl': "GPL, but probably relicensable",
562 'nongpl': "License other than GPL",
563 'gplonly': "GPL, probably non-relicensable",
565 for path
in sorted(paths
):
566 writer
.writerow((status
, path
))