hunspell: Cleanup to fix the header include guards under google/ directory.
[chromium-blink-merge.git] / tools / copyright_scanner / copyright_scanner.py
blob439603a2132fa0b0acab37cd81e2fcc599c4df52
1 # Copyright 2014 The Chromium Authors. All rights reserved.
2 # Use of this source code is governed by a BSD-style license that can be
3 # found in the LICENSE file.
5 """Utilities for scanning source files to determine code authorship.
6 """
8 import itertools
def ForwardSlashesToOsPathSeps(input_api, path):
  """Rewrites a '/'-separated relative path using the host OS separator.

  Intended for paths that originate outside the tool (e.g. whitelist file
  contents) and therefore always use UNIX separators. Only works for
  relative paths!

  Args:
    input_api: InputAPI, as in presubmit scripts.
    path: The path to convert.
  Returns:
    Converted path.
  """
  components = path.split('/')
  return input_api.os_path.join(*components)
def FindFiles(input_api, root_dir, start_paths_list, excluded_dirs_list):
  """Lists source code files under the given paths, like UNIX find(1).

  Only files whose extension looks like source code are kept, and
  third_party directories (plus a fixed set of well-known non-shipping
  directories) are always skipped.

  Args:
    input_api: InputAPI, as in presubmit scripts.
    root_dir: The root directory, to which all other paths are relative.
    start_paths_list: The list of paths to start search from. Each path can
      be a file or a directory.
    excluded_dirs_list: The list of directories to skip.
  Returns:
    The list of source code files found, relative to |root_dir|.
  """
  path_join = input_api.os_path.join
  # A single 'third_party' entry covers every third-party dir, which keeps
  # the exclusion list (and hence each blacklist check) short.
  excluded_dirs = [d for d in excluded_dirs_list if not 'third_party' in d]
  excluded_dirs.append('third_party')
  excluded_dirs.extend([
      # VCS dirs
      path_join('.git'),
      path_join('.svn'),
      # Build output
      path_join('out', 'Debug'),
      path_join('out', 'Release'),
      # 'Copyright' appears in license agreements
      path_join('chrome', 'app', 'resources'),
      # Quickoffice js files from internal src used on buildbots.
      # crbug.com/350472.
      path_join('chrome', 'browser', 'resources', 'chromeos', 'quickoffice'),
      # This is a test output directory
      path_join('chrome', 'tools', 'test', 'reference_build'),
      # blink style copy right headers.
      path_join('content', 'shell', 'renderer', 'test_runner'),
      # blink style copy right headers.
      path_join('content', 'shell', 'tools', 'plugin'),
      # This is tests directory, doesn't exist in the snapshot
      path_join('content', 'test', 'data'),
      # This is a tests directory that doesn't exist in the shipped product.
      path_join('gin', 'test'),
      # This is a test output directory
      path_join('data', 'dom_perf'),
      # This is a tests directory that doesn't exist in the shipped product.
      path_join('tools', 'perf', 'page_sets'),
      path_join('tools', 'perf', 'page_sets', 'tough_animation_cases'),
      # Histogram tools, doesn't exist in the snapshot
      path_join('tools', 'histograms'),
      # Swarming tools, doesn't exist in the snapshot
      path_join('tools', 'swarming_client'),
      # ARM sysroot, doesn't exist in the snapshot
      path_join('build', 'linux', 'debian_wheezy_arm-sysroot'),
      # Old location (TODO(sbc): Remove this once it no longer exists on any
      # bots)
      path_join('chrome', 'installer', 'linux', 'debian_wheezy_arm-sysroot'),
      # Data is not part of open source chromium, but are included on some
      # bots.
      path_join('data'),
      # This is not part of open source chromium, but are included on some
      # bots.
      path_join('skia', 'tools', 'clusterfuzz-data'),
      # Not shipped, only relates to Chrome for Android, but not to WebView
      path_join('clank'),
  ])

  # Surround each directory name with OS path separators ('<sep>dir<sep>')
  # so a plain substring test matches it at any depth inside a path.
  blacklist_fragments = [path_join('.', d, '')[1:] for d in excluded_dirs if d]

  def _IsBlacklistedDir(rel_path):
    return any(fragment in rel_path for fragment in blacklist_fragments)

  source_file_re = input_api.re.compile(
      r'\.(asm|c(c|pp|xx)?|h(h|pp|xx)?|p(l|m)|xs|sh|php|py(|x)'
      '|rb|idl|java|el|sc(i|e)|cs|pas|inc|js|pac|html|dtd|xsl|mod|mm?'
      '|tex|mli?)$')

  found_files = []
  base_path_len = len(root_dir)
  for start_path in start_paths_list:
    full_path = path_join(root_dir, start_path)
    if input_api.os_path.isfile(full_path):
      # The slice keeps the leading separator on purpose, so blacklist
      # fragments can match the topmost path component too.
      if (source_file_re.search(start_path) and
          not _IsBlacklistedDir(full_path[base_path_len:])):
        found_files.append(start_path)
    else:
      for dirpath, dirnames, filenames in input_api.os_walk(full_path):
        # Prune blacklisted subdirs in place so os_walk never descends
        # into them (faster scanning).
        dirnames[:] = [
            d for d in dirnames
            if not _IsBlacklistedDir(path_join(dirpath, d)[base_path_len + 1:])
        ]
        for filename in filenames:
          rel_path = path_join(dirpath, filename)[base_path_len + 1:]
          if (source_file_re.search(rel_path) and
              not _IsBlacklistedDir(rel_path)):
            found_files.append(rel_path)
  return found_files
121 class _GeneratedFilesDetector(object):
122 GENERATED_FILE = 'GENERATED FILE'
123 NO_COPYRIGHT = '*No copyright*'
125 def __init__(self, input_api):
126 self.python_multiline_string_double_re = \
127 input_api.re.compile(r'"""[^"]*(?:"""|$)', flags=input_api.re.MULTILINE)
128 self.python_multiline_string_single_re = \
129 input_api.re.compile(r"'''[^']*(?:'''|$)", flags=input_api.re.MULTILINE)
130 self.automatically_generated_re = input_api.re.compile(
131 r'(All changes made in this file will be lost'
132 '|DO NOT (EDIT|delete this file)'
133 '|Generated (at|automatically|data)'
134 '|Automatically generated'
135 '|\Wgenerated\s+(?:\w+\s+)*file\W)', flags=input_api.re.IGNORECASE)
137 def IsGeneratedFile(self, header):
138 header = header.upper()
139 if '"""' in header:
140 header = self.python_multiline_string_double_re.sub('', header)
141 if "'''" in header:
142 header = self.python_multiline_string_single_re.sub('', header)
143 # First do simple strings lookup to save time.
144 if 'ALL CHANGES MADE IN THIS FILE WILL BE LOST' in header:
145 return True
146 if 'DO NOT EDIT' in header or 'DO NOT DELETE' in header or \
147 'GENERATED' in header:
148 return self.automatically_generated_re.search(header)
149 return False
class _CopyrightsScanner(object):
  """Extracts copyright attributions from source lines, one line at a time.

  An instance is stateful: it remembers on which lines '(a)'/'(b)' list
  items last appeared inside C++ comments, so that a nearby '(c)' list item
  is not mistaken for a copyright sign. Use a fresh instance per file.
  """

  @staticmethod
  def StaticInit(input_api):
    """Compiles the shared regexps. Must be called before scanning."""
    # Matches a complete C string literal (handling escaped characters),
    # so string contents can be removed before scanning a line.
    _CopyrightsScanner._c_comment_re = \
        input_api.re.compile(r'''"[^"\\]*(?:\\.[^"\\]*)*"''')
    # 'copyright', 'copr.', the UTF-8 encoded copyright sign, or '(c)'.
    _CopyrightsScanner._copyright_indicator = \
        r'(?:copyright|copr\.|\xc2\xa9|\(c\))'
    _CopyrightsScanner._full_copyright_indicator_re = input_api.re.compile(
        r'(?:\W|^)' + _CopyrightsScanner._copyright_indicator + \
        r'(?::\s*|\s+)(\w.*)$', input_api.re.IGNORECASE)
    # Phrases like 'copyright information' or 'copyright notice' are not
    # authorship statements.
    _CopyrightsScanner._copyright_disindicator_re = input_api.re.compile(
        r'\s*\b(?:info(?:rmation)?|notice|and|or)\b', input_api.re.IGNORECASE)

  def __init__(self, input_api):
    self.max_line_numbers_proximity = 3
    self.last_a_item_line_number = -200
    self.last_b_item_line_number = -100
    self.re = input_api.re

  def _CloseLineNumbers(self, a, b):
    """True if line |a| follows line |b| within the proximity window."""
    return 0 <= a - b <= self.max_line_numbers_proximity

  def MatchLine(self, line_number, line):
    """Returns the prettified copyright holder found in |line|, or None."""
    if '"' in line:
      line = _CopyrightsScanner._c_comment_re.sub('', line)
    upcase_line = line.upper()
    # Record '(a)' and '(b)' last occurences in C++ comments.
    # This is to filter out '(c)' used as a list item inside C++ comments.
    # E.g. "// blah-blah (a) blah\n// blah-blah (b) and (c) blah"
    cpp_comment_idx = upcase_line.find('//')
    if cpp_comment_idx != -1:
      if upcase_line.find('(A)') > cpp_comment_idx:
        self.last_a_item_line_number = line_number
      if upcase_line.find('(B)') > cpp_comment_idx:
        self.last_b_item_line_number = line_number
    # Fast bailout, uses the same patterns as _copyright_indicator regexp.
    if not 'COPYRIGHT' in upcase_line and not 'COPR.' in upcase_line \
       and not '\xc2\xa9' in upcase_line:
      c_item_index = upcase_line.find('(C)')
      if c_item_index == -1:
        return None
      if c_item_index > cpp_comment_idx and \
         self._CloseLineNumbers(line_number,
                                self.last_b_item_line_number) and \
         self._CloseLineNumbers(self.last_b_item_line_number,
                                self.last_a_item_line_number):
        return None
    copyr = None
    m = _CopyrightsScanner._full_copyright_indicator_re.search(line)
    if m and \
       not _CopyrightsScanner._copyright_disindicator_re.match(m.group(1)):
      copyr = m.group(0)
      # Prettify the authorship string.
      # BUG FIX: this pattern used to be r'([,.])?\s*$/' -- the stray
      # trailing '/' after '$' (a Perl s/.../../ leftover) made it
      # impossible to match, so trailing punctuation was never stripped.
      copyr = self.re.sub(r'([,.])?\s*$', '', copyr)
      copyr = self.re.sub(
          _CopyrightsScanner._copyright_indicator, '', copyr,
          flags=self.re.IGNORECASE)
      copyr = self.re.sub(r'^\s+', '', copyr)
      copyr = self.re.sub(r'\s{2,}', ' ', copyr)
      copyr = self.re.sub(r'\\@', '@', copyr)
    return copyr
def FindCopyrights(input_api, root_dir, files_to_scan):
  """Determines code authorship, and finds generated files.

  Args:
    input_api: InputAPI, as in presubmit scripts.
    root_dir: The root directory, to which all other paths are relative.
    files_to_scan: The list of file names to scan.
  Returns:
    The list of copyrights associated with each of the files given.
    If the certain file is generated, the corresponding list consists a single
    entry -- 'GENERATED_FILE' string. If the file has no copyright info,
    the corresponding list contains 'NO_COPYRIGHT' string.
  """
  generated_files_detector = _GeneratedFilesDetector(input_api)
  _CopyrightsScanner.StaticInit(input_api)
  # Generated-file markers are only looked for near the top of each file.
  header_line_count = 25
  all_copyrights = []
  for file_name in files_to_scan:
    # Scanner state (list-item tracking) must not leak between files.
    scanner = _CopyrightsScanner(input_api)
    contents = input_api.ReadFile(
        input_api.os_path.join(root_dir, file_name), 'r')
    lines = contents.split('\n')
    file_copyrights = []
    for line_number, line in enumerate(lines, 1):
      match = scanner.MatchLine(line_number, line)
      if match:
        file_copyrights.append(match)
    header = '\n'.join(lines[:header_line_count])
    if generated_files_detector.IsGeneratedFile(header):
      all_copyrights.append([_GeneratedFilesDetector.GENERATED_FILE])
    elif file_copyrights:
      all_copyrights.append(file_copyrights)
    else:
      all_copyrights.append([_GeneratedFilesDetector.NO_COPYRIGHT])
  return all_copyrights
def FindCopyrightViolations(input_api, root_dir, files_to_scan):
  """Looks for files that do not belong exclusively to the Chromium Authors.

  Args:
    input_api: InputAPI, as in presubmit scripts.
    root_dir: The root directory, to which all other paths are relative.
    files_to_scan: The list of file names to scan.
  Returns:
    The list of file names that contain non-Chromium copyrights.
  """
  copyrights = FindCopyrights(input_api, root_dir, files_to_scan)
  offending_files = []
  allowed_copyrights_re = input_api.re.compile(
      r'^(?:20[0-9][0-9](?:-20[0-9][0-9])? The Chromium Authors\. '
      'All rights reserved.*)$')
  # BUG FIX: was itertools.izip(), which no longer exists under Python 3;
  # the builtin zip() is equivalent here (both arguments are lists that
  # are iterated exactly once).
  for f, cs in zip(files_to_scan, copyrights):
    # Generated files and files without any copyright cannot offend.
    if cs[0] in (_GeneratedFilesDetector.GENERATED_FILE,
                 _GeneratedFilesDetector.NO_COPYRIGHT):
      continue
    for c in cs:
      if not allowed_copyrights_re.match(c):
        offending_files.append(input_api.os_path.normpath(f))
        break
  return offending_files
def _GetWhitelistFileName(input_api):
  """Returns the whitelist file path, relative to the repository root."""
  components = (
      'tools', 'copyright_scanner', 'third_party_files_whitelist.txt')
  return input_api.os_path.join(*components)
def _ProcessWhitelistedFilesList(input_api, lines):
  """Parses whitelist lines into OS-native relative paths.

  A line contributes its leading run of non-'#', non-whitespace characters
  as a '/'-separated path; lines starting with '#' or whitespace (including
  blank lines) are skipped.
  """
  whitelisted_files = []
  for line in lines:
    match = input_api.re.match(r'([^#\s]+)', line)
    if not match:
      continue
    whitelisted_files.append(
        ForwardSlashesToOsPathSeps(input_api, match.group(1)))
  return whitelisted_files
def LoadWhitelistedFilesList(input_api):
  """Loads and parses the 3rd party code whitelist file.

  Args:
    input_api: InputAPI of presubmit scripts.
  Returns:
    The list of files.
  """
  whitelist_path = input_api.os_path.join(
      input_api.change.RepositoryRoot(), _GetWhitelistFileName(input_api))
  contents = input_api.ReadFile(whitelist_path, 'rb')
  return _ProcessWhitelistedFilesList(input_api, contents.splitlines())
def AnalyzeScanResults(input_api, whitelisted_files, offending_files):
  """Compares whitelist contents with the results of file scanning.

  Args:
    input_api: InputAPI of presubmit scripts.
    whitelisted_files: Whitelisted files list.
    offending_files: Files that contain 3rd party code.
  Returns:
    A triplet of "unknown", "missing", and "stale" file lists.
    "Unknown" are files that contain 3rd party code but are not whitelisted.
    "Missing" are files that are whitelisted but do not really exist.
    "Stale" are files that are whitelisted unnecessarily.
  """
  offending = set(offending_files)
  whitelisted = set(whitelisted_files)
  repo_root = input_api.change.RepositoryRoot()
  unknown = offending - whitelisted
  missing = [f for f in whitelisted_files
             if not input_api.os_path.isfile(
                 input_api.os_path.join(repo_root, f))]
  stale = whitelisted - offending - set(missing)
  return (list(unknown), missing, list(stale))
def _GetDeletedContents(affected_file):
  """Returns a list of all deleted lines.

  AffectedFile class from presubmit_support is lacking this functionality.
  """
  diff_lines = affected_file.GenerateScmDiff().splitlines()
  # A single leading '-' marks a removed line; '--' lines are the
  # '--- <file>' unified-diff headers and must be ignored.
  return [line[1:] for line in diff_lines
          if line.startswith('-') and not line.startswith('--')]
def _DoScanAtPresubmit(input_api, whitelisted_files, files_to_check):
  """Scans |files_to_check| for copyright violations and analyzes results.

  We pass an empty 'known third-party' dirs list here. Since this is a patch
  for the Chromium's src tree, it must contain properly licensed Chromium
  code. Any third-party code must be put into a directory named
  'third_party', and such dirs are automatically excluded by FindFiles.
  """
  repo_root = input_api.change.RepositoryRoot()
  files_to_scan = FindFiles(input_api, repo_root, files_to_check, [])
  offending_files = FindCopyrightViolations(
      input_api, repo_root, files_to_scan)
  return AnalyzeScanResults(input_api, whitelisted_files, offending_files)
def ScanAtPresubmit(input_api, output_api):
  """Invoked at change presubmit time. Verifies that updated non third-party
  code doesn't contain external copyrighted code.

  Args:
    input_api: InputAPI of presubmit scripts.
    output_api: OutputAPI of presubmit scripts.
  """
  files_to_check = set()
  deleted_files = set()
  whitelist_contents_changed = False
  for f in input_api.AffectedFiles():
    if f.LocalPath() == _GetWhitelistFileName(input_api):
      whitelist_contents_changed = True
      # Entries removed from the whitelist must be re-checked too.
      deleted_files |= set(_ProcessWhitelistedFilesList(
          input_api, _GetDeletedContents(f)))
    elif f.Action() == 'D':
      deleted_files.add(f.LocalPath())
    else:
      files_to_check.add(f.LocalPath())

  whitelisted_files = set(LoadWhitelistedFilesList(input_api))
  if whitelist_contents_changed:
    # Need to re-check the entire contents of the whitelist file.
    # Also add files removed from the whitelist. If the file has indeed been
    # deleted, the scanner will not complain.
    files_to_check |= whitelisted_files | deleted_files
  else:
    # Only the whitelist entries touched by this change are relevant.
    whitelisted_files &= files_to_check | deleted_files

  unknown_files, missing_files, stale_files = _DoScanAtPresubmit(
      input_api, list(whitelisted_files), list(files_to_check))

  results = []
  if unknown_files:
    results.append(output_api.PresubmitError(
        'The following files contain a third-party license but are not in '
        'a listed third-party directory and are not whitelisted. You must '
        'add the following files to the whitelist file %s\n'
        '(Note that if the code you are adding does not actually contain '
        'any third-party code, it may contain the word "copyright", which '
        'should be masked out, e.g. by writing it as "copy-right"):'
        '' % _GetWhitelistFileName(input_api),
        sorted(unknown_files)))
  if missing_files:
    results.append(output_api.PresubmitPromptWarning(
        'The following files are whitelisted in %s, '
        'but do not exist or not files:' % _GetWhitelistFileName(input_api),
        sorted(missing_files)))
  if stale_files:
    results.append(output_api.PresubmitPromptWarning(
        'The following files are whitelisted unnecessarily. You must '
        'remove the following files from the whitelist file '
        '%s:' % _GetWhitelistFileName(input_api),
        sorted(stale_files)))
  return results