1 # Copyright 2014 The Chromium Authors. All rights reserved.
2 # Use of this source code is governed by a BSD-style license that can be
3 # found in the LICENSE file.
5 """Utilities for scanning source files to determine code authorship.
def ForwardSlashesToOsPathSeps(input_api, path):
  """Converts forward slashes ('/') in the input path to OS-specific
  path separators. Used when the paths come from outside and are using
  UNIX path separators. Only works for relative paths!

  Args:
    input_api: InputAPI, as in presubmit scripts.
    path: The path to convert.

  Returns:
    The components of |path| re-joined with input_api.os_path.join.
  """
  # Splitting on '/' and re-joining lets os_path pick the native separator.
  return input_api.os_path.join(*path.split('/'))
22 def FindFiles(input_api
, root_dir
, start_paths_list
, excluded_dirs_list
):
23 """Similar to UNIX utility find(1), searches for files in the directories.
24 Automatically leaves out only source code files and excludes third_party
27 input_api: InputAPI, as in presubmit scripts.
28 root_dir: The root directory, to which all other paths are relative.
29 start_paths_list: The list of paths to start search from. Each path can
30 be a file or a directory.
31 excluded_dirs_list: The list of directories to skip.
33 The list of source code files found, relative to |root_dir|.
35 excluded_dirs_list
= [d
for d
in excluded_dirs_list
if not 'third_party' in d
]
36 # Using a common pattern for third-partyies makes the ignore regexp shorter
37 excluded_dirs_list
.append('third_party')
39 path_join
= input_api
.os_path
.join
40 EXTRA_EXCLUDED_DIRS
= [
45 path_join('out', 'Debug'),
46 path_join('out', 'Release'),
47 # 'Copyright' appears in license agreements
48 path_join('chrome', 'app', 'resources'),
49 # Quickoffice js files from internal src used on buildbots.
51 path_join('chrome', 'browser', 'resources', 'chromeos', 'quickoffice'),
52 # This is a test output directory
53 path_join('chrome', 'tools', 'test', 'reference_build'),
54 # blink style copy right headers.
55 path_join('content', 'shell', 'renderer', 'test_runner'),
56 # blink style copy right headers.
57 path_join('content', 'shell', 'tools', 'plugin'),
58 # This is tests directory, doesn't exist in the snapshot
59 path_join('content', 'test', 'data'),
60 # This is a tests directory that doesn't exist in the shipped product.
61 path_join('gin', 'test'),
62 # This is a test output directory
63 path_join('data', 'dom_perf'),
64 # This is a tests directory that doesn't exist in the shipped product.
65 path_join('tools', 'perf', 'page_sets'),
66 path_join('tools', 'perf', 'page_sets', 'tough_animation_cases'),
67 # Histogram tools, doesn't exist in the snapshot
68 path_join('tools', 'histograms'),
69 # Swarming tools, doesn't exist in the snapshot
70 path_join('tools', 'swarming_client'),
71 # ARM sysroot, doesn't exist in the snapshot
72 path_join('build', 'linux', 'debian_wheezy_arm-sysroot'),
73 # Old location (TODO(sbc): Remove this once it no longer exists on any bots)
74 path_join('chrome', 'installer', 'linux', 'debian_wheezy_arm-sysroot'),
75 # Data is not part of open source chromium, but are included on some bots.
77 # This is not part of open source chromium, but are included on some bots.
78 path_join('skia', 'tools', 'clusterfuzz-data'),
79 # Not shipped, only relates to Chrome for Android, but not to WebView
82 excluded_dirs_list
.extend(EXTRA_EXCLUDED_DIRS
)
84 # Surround the directory names with OS path separators.
85 dirs_blacklist
= [path_join('.', d
, '')[1:] for d
in excluded_dirs_list
if d
]
86 def IsBlacklistedDir(d
):
87 for item
in dirs_blacklist
:
92 files_whitelist_re
= input_api
.re
.compile(
93 r
'\.(asm|c(c|pp|xx)?|h(h|pp|xx)?|p(l|m)|xs|sh|php|py(|x)'
94 '|rb|idl|java|el|sc(i|e)|cs|pas|inc|js|pac|html|dtd|xsl|mod|mm?'
98 base_path_len
= len(root_dir
)
99 for path
in start_paths_list
:
100 full_path
= path_join(root_dir
, path
)
101 if input_api
.os_path
.isfile(full_path
):
102 if files_whitelist_re
.search(path
) and \
103 not IsBlacklistedDir(full_path
[base_path_len
:]): # Keep '/' prefix.
106 for dirpath
, dirnames
, filenames
in input_api
.os_walk(full_path
):
107 # Remove excluded subdirs for faster scanning.
108 for item
in dirnames
[:]:
110 path_join(dirpath
, item
)[base_path_len
+ 1:]):
111 dirnames
.remove(item
)
112 for filename
in filenames
:
114 path_join(dirpath
, filename
)[base_path_len
+ 1:]
115 if files_whitelist_re
.search(filepath
) and \
116 not IsBlacklistedDir(filepath
):
117 files
.append(filepath
)
class _GeneratedFilesDetector(object):
  """Decides from the text of a file header whether the file is generated."""

  # Marker strings used in copyright lists in place of real copyright lines.
  GENERATED_FILE = 'GENERATED FILE'
  NO_COPYRIGHT = '*No copyright*'

  def __init__(self, input_api):
    """Compiles the regular expressions used by IsGeneratedFile.

    Args:
      input_api: InputAPI, as in presubmit scripts. Only its |re| attribute
        is used here.
    """
    re_module = input_api.re
    # Triple-quoted Python strings; an unterminated one runs to end of line.
    self.python_multiline_string_double_re = re_module.compile(
        r'"""[^"]*(?:"""|$)', flags=re_module.MULTILINE)
    self.python_multiline_string_single_re = re_module.compile(
        r"'''[^']*(?:'''|$)", flags=re_module.MULTILINE)
    # Every piece is a raw string so that \W, \s and \w reach the regexp
    # engine without relying on Python keeping unknown escapes intact.
    self.automatically_generated_re = re_module.compile(
        r'(All changes made in this file will be lost'
        r'|DO NOT (EDIT|delete this file)'
        r'|Generated (at|automatically|data)'
        r'|Automatically generated'
        r'|\Wgenerated\s+(?:\w+\s+)*file\W)', flags=re_module.IGNORECASE)

  def IsGeneratedFile(self, header):
    """Tells whether the header text carries a "generated file" marker.

    Args:
      header: The text of the file header.

    Returns:
      True if a marker is found outside of triple-quoted Python strings,
      False otherwise.
    """
    header = header.upper()
    # Drop triple-quoted strings so that markers inside them are ignored.
    if '"""' in header:
      header = self.python_multiline_string_double_re.sub('', header)
    if "'''" in header:
      header = self.python_multiline_string_single_re.sub('', header)
    # First do simple strings lookup to save time.
    if 'ALL CHANGES MADE IN THIS FILE WILL BE LOST' in header:
      return True
    if ('DO NOT EDIT' in header or 'DO NOT DELETE' in header or
        'GENERATED' in header):
      return bool(self.automatically_generated_re.search(header))
    return False
class _CopyrightsScanner(object):
  """Extracts copyright statements from source lines, one line at a time.

  StaticInit must be called once before MatchLine is used, because the
  regular expressions are stored on the class, not on instances.
  """

  @staticmethod
  def StaticInit(input_api):
    """Compiles the class-level regular expressions.

    Args:
      input_api: InputAPI, as in presubmit scripts. Only its |re| attribute
        is used here.
    """
    re_module = input_api.re
    # Despite the name, this pattern matches double-quoted string literals
    # (with backslash escapes); MatchLine strips them before scanning.
    _CopyrightsScanner._c_comment_re = re_module.compile(
        r'''"[^"\\]*(?:\\.[^"\\]*)*"''')
    # NOTE(review): '\xc2\xa9' looks like the UTF-8 byte pair of the
    # copyright sign, i.e. this presumably expects byte strings -- confirm
    # before running on decoded text.
    _CopyrightsScanner._copyright_indicator = \
        r'(?:copyright|copr\.|\xc2\xa9|\(c\))'
    # Group 1 is the text that follows the indicator.
    _CopyrightsScanner._full_copyright_indicator_re = re_module.compile(
        r'(?:\W|^)' + _CopyrightsScanner._copyright_indicator +
        r'(?::\s*|\s+)(\w.*)$', re_module.IGNORECASE)
    # Words right after the indicator that mean the line is not a copyright
    # statement (e.g. "copyright notice", "copyright information").
    _CopyrightsScanner._copyright_disindicator_re = re_module.compile(
        r'\s*\b(?:info(?:rmation)?|notice|and|or)\b', re_module.IGNORECASE)

  def __init__(self, input_api):
    """Resets the per-file scanning state.

    Args:
      input_api: InputAPI, as in presubmit scripts.
    """
    self.max_line_numbers_proximity = 3
    # Far apart and negative, so that no real line is "close" to them.
    self.last_a_item_line_number = -200
    self.last_b_item_line_number = -100
    self.re = input_api.re

  def _CloseLineNumbers(self, a, b):
    """Returns True if line |a| is at most max_line_numbers_proximity lines
    after line |b| (or is the same line)."""
    return 0 <= a - b <= self.max_line_numbers_proximity

  def MatchLine(self, line_number, line):
    """Looks for a copyright statement in a single line.

    Args:
      line_number: The number of the line within its file.
      line: The text of the line.

    Returns:
      The prettified copyright text, or None if the line has none.
    """
    # Text inside double-quoted string literals is not scanned.
    if '"' in line:
      line = _CopyrightsScanner._c_comment_re.sub('', line)
    upcase_line = line.upper()
    # Record '(a)' and '(b)' last occurrences in C++ comments.
    # This is to filter out '(c)' used as a list item inside C++ comments.
    # E.g. "// blah-blah (a) blah\n// blah-blah (b) and (c) blah"
    cpp_comment_idx = upcase_line.find('//')
    if cpp_comment_idx != -1:
      if upcase_line.find('(A)') > cpp_comment_idx:
        self.last_a_item_line_number = line_number
      if upcase_line.find('(B)') > cpp_comment_idx:
        self.last_b_item_line_number = line_number
    # Fast bailout, uses the same patterns as _copyright_indicator regexp.
    if ('COPYRIGHT' not in upcase_line and 'COPR.' not in upcase_line
        and '\xc2\xa9' not in upcase_line):
      c_item_index = upcase_line.find('(C)')
      if c_item_index == -1:
        return None
      # '(c)' in a comment shortly after '(b)', itself shortly after '(a)',
      # is treated as a list item rather than a copyright sign.
      if (c_item_index > cpp_comment_idx and
          self._CloseLineNumbers(line_number,
                                 self.last_b_item_line_number) and
          self._CloseLineNumbers(self.last_b_item_line_number,
                                 self.last_a_item_line_number)):
        return None
    copyr = None
    m = _CopyrightsScanner._full_copyright_indicator_re.search(line)
    if m and \
        not _CopyrightsScanner._copyright_disindicator_re.match(m.group(1)):
      copyr = m.group(0)
      # Prettify the authorship string.
      # NOTE(review): the '/' after '$' means this pattern can never match,
      # so trailing punctuation is currently kept. Left as is because
      # changing it would alter the reported strings -- confirm intent.
      copyr = self.re.sub(r'([,.])?\s*$/', '', copyr)
      copyr = self.re.sub(
          _CopyrightsScanner._copyright_indicator, '', copyr,
          flags=self.re.IGNORECASE)
      copyr = self.re.sub(r'^\s+', '', copyr)
      copyr = self.re.sub(r'\s{2,}', ' ', copyr)
      # Un-escape '\@'.
      copyr = self.re.sub(r'\\@', '@', copyr)
    return copyr
215 def FindCopyrights(input_api
, root_dir
, files_to_scan
):
216 """Determines code autorship, and finds generated files.
218 input_api: InputAPI, as in presubmit scripts.
219 root_dir: The root directory, to which all other paths are relative.
220 files_to_scan: The list of file names to scan.
222 The list of copyrights associated with each of the files given.
223 If the certain file is generated, the corresponding list consists a single
224 entry -- 'GENERATED_FILE' string. If the file has no copyright info,
225 the corresponding list contains 'NO_COPYRIGHT' string.
227 generated_files_detector
= _GeneratedFilesDetector(input_api
)
228 _CopyrightsScanner
.StaticInit(input_api
)
230 for file_name
in files_to_scan
:
234 scanner
= _CopyrightsScanner(input_api
)
235 contents
= input_api
.ReadFile(
236 input_api
.os_path
.join(root_dir
, file_name
), 'r')
237 for l
in contents
.split('\n'):
241 c
= scanner
.MatchLine(linenum
, l
)
243 file_copyrights
.append(c
)
244 if generated_files_detector
.IsGeneratedFile('\n'.join(header
)):
245 copyrights
.append([_GeneratedFilesDetector
.GENERATED_FILE
])
246 elif file_copyrights
:
247 copyrights
.append(file_copyrights
)
249 copyrights
.append([_GeneratedFilesDetector
.NO_COPYRIGHT
])
def FindCopyrightViolations(input_api, root_dir, files_to_scan):
  """Looks for files that do not belong exclusively to the Chromium Authors.

  Args:
    input_api: InputAPI, as in presubmit scripts.
    root_dir: The root directory, to which all other paths are relative.
    files_to_scan: The list of file names to scan.

  Returns:
    The list of file names that contain non-Chromium copyrights.
  """
  copyrights = FindCopyrights(input_api, root_dir, files_to_scan)
  offending_files = []
  allowed_copyrights_re = input_api.re.compile(
      r'^(?:20[0-9][0-9](?:-20[0-9][0-9])? The Chromium Authors\. '
      'All rights reserved.*)$')
  # Generated files and files without any copyright are never offending.
  skipped_markers = (_GeneratedFilesDetector.GENERATED_FILE,
                     _GeneratedFilesDetector.NO_COPYRIGHT)
  # The builtin zip is used because itertools.izip exists only in Python 2.
  for f, cs in zip(files_to_scan, copyrights):
    if cs[0] in skipped_markers:
      continue
    # One non-Chromium copyright is enough; report each file only once.
    for c in cs:
      if not allowed_copyrights_re.match(c):
        offending_files.append(input_api.os_path.normpath(f))
        break
  return offending_files
def _GetWhitelistFileName(input_api):
  """Returns the path of the third-party files whitelist.

  Args:
    input_api: InputAPI, as in presubmit scripts.

  Returns:
    The path built with input_api.os_path.join from the components 'tools',
    'copyright_scanner' and 'third_party_files_whitelist.txt'.
  """
  return input_api.os_path.join(
      'tools', 'copyright_scanner', 'third_party_files_whitelist.txt')
def _ProcessWhitelistedFilesList(input_api, lines):
  """Extracts file paths from the lines of the whitelist file.

  Args:
    input_api: InputAPI, as in presubmit scripts.
    lines: The lines of the whitelist file.

  Returns:
    The list of whitelisted paths, converted with ForwardSlashesToOsPathSeps.
  """
  whitelisted_files = []
  for line in lines:
    # Take the leading run of characters that are neither '#' nor whitespace.
    # Lines that start with '#' or with whitespace yield no match and are
    # skipped; anything after the first such character is ignored.
    match = input_api.re.match(r'([^#\s]+)', line)
    if match:
      whitelisted_files.append(
          ForwardSlashesToOsPathSeps(input_api, match.group(1)))
  return whitelisted_files
def LoadWhitelistedFilesList(input_api):
  """Loads and parses the 3rd party code whitelist file.

  Args:
    input_api: InputAPI of presubmit scripts.

  Returns:
    The list of whitelisted file paths, as produced by
    _ProcessWhitelistedFilesList from the lines of the file.
  """
  full_file_name = input_api.os_path.join(
      input_api.change.RepositoryRoot(), _GetWhitelistFileName(input_api))
  file_data = input_api.ReadFile(full_file_name, 'rb')
  return _ProcessWhitelistedFilesList(input_api, file_data.splitlines())
def AnalyzeScanResults(input_api, whitelisted_files, offending_files):
  """Compares whitelist contents with the results of file scanning.

  Args:
    input_api: InputAPI of presubmit scripts.
    whitelisted_files: Whitelisted files list.
    offending_files: Files that contain 3rd party code.

  Returns:
    A triplet of "unknown", "missing", and "stale" file lists.
    "Unknown" are files that contain 3rd party code but are not whitelisted.
    "Missing" are files that are whitelisted but don't really exist.
    "Stale" are files that are whitelisted unnecessarily.
    "Unknown" and "stale" are sorted; "missing" keeps the whitelist order.
  """
  whitelisted = set(whitelisted_files)
  offending = set(offending_files)
  repo_root = input_api.change.RepositoryRoot()
  path_join = input_api.os_path.join
  is_file = input_api.os_path.isfile

  unknown = offending - whitelisted
  missing = [f for f in whitelisted_files
             if not is_file(path_join(repo_root, f))]
  stale = whitelisted - offending - set(missing)
  # Sorted so that the result does not depend on set iteration order.
  return (sorted(unknown), missing, sorted(stale))
def _GetDeletedContents(affected_file):
  """Returns a list of all deleted lines.
  AffectedFile class from presubmit_support is lacking this functionality.

  Args:
    affected_file: An object whose GenerateScmDiff() returns the diff text.

  Returns:
    The diff lines that start with a single '-', with that '-' removed.
  """
  # Lines starting with '--' are skipped, which excludes the '---' file
  # header of the diff.
  return [line[1:]
          for line in affected_file.GenerateScmDiff().splitlines()
          if line.startswith('-') and not line.startswith('--')]
def _DoScanAtPresubmit(input_api, whitelisted_files, files_to_check):
  """Scans the given files and compares the findings with the whitelist.

  Args:
    input_api: InputAPI of presubmit scripts.
    whitelisted_files: Whitelisted files list.
    files_to_check: The paths to scan, relative to the repository root.

  Returns:
    The result of AnalyzeScanResults for the scanned files.
  """
  repo_root = input_api.change.RepositoryRoot()
  # We pass empty 'known third-party' dirs list here. Since this is a patch
  # for the Chromium's src tree, it must contain properly licensed Chromium
  # code. Any third-party code must be put into a directory named 'third_party',
  # and such dirs are automatically excluded by FindFiles.
  files_to_scan = FindFiles(input_api, repo_root, files_to_check, [])
  offending_files = FindCopyrightViolations(
      input_api, repo_root, files_to_scan)
  return AnalyzeScanResults(input_api, whitelisted_files, offending_files)
344 def ScanAtPresubmit(input_api
, output_api
):
345 """Invoked at change presubmit time. Verifies that updated non third-party
346 code doesn't contain external copyrighted code.
347 input_api: InputAPI of presubmit scripts.
348 output_api: OutputAPI of presubmit scripts.
350 files_to_check
= set([])
351 deleted_files
= set([])
352 whitelist_contents_changed
= False
353 for f
in input_api
.AffectedFiles():
354 if f
.LocalPath() == _GetWhitelistFileName(input_api
):
355 whitelist_contents_changed
= True
356 deleted_files |
= set(_ProcessWhitelistedFilesList(
357 input_api
, _GetDeletedContents(f
)))
359 if f
.Action() != 'D':
360 files_to_check
.add(f
.LocalPath())
362 deleted_files
.add(f
.LocalPath())
363 whitelisted_files
= set(LoadWhitelistedFilesList(input_api
))
364 if not whitelist_contents_changed
:
365 whitelisted_files
&= files_to_check | deleted_files
367 # Need to re-check the entire contents of the whitelist file.
368 # Also add files removed from the whitelist. If the file has indeed been
369 # deleted, the scanner will not complain.
370 files_to_check |
= whitelisted_files | deleted_files
372 (unknown_files
, missing_files
, stale_files
) = _DoScanAtPresubmit(
373 input_api
, list(whitelisted_files
), list(files_to_check
))
376 results
.append(output_api
.PresubmitError(
377 'The following files contain a third-party license but are not in ' \
378 'a listed third-party directory and are not whitelisted. You must ' \
379 'add the following files to the whitelist file %s\n' \
380 '(Note that if the code you are adding does not actually contain ' \
381 'any third-party code, it may contain the word "copyright", which ' \
382 'should be masked out, e.g. by writing it as "copy-right"):' \
383 '' % _GetWhitelistFileName(input_api
),
384 sorted(unknown_files
)))
386 results
.append(output_api
.PresubmitPromptWarning(
387 'The following files are whitelisted in %s, ' \
388 'but do not exist or not files:' % _GetWhitelistFileName(input_api
),
389 sorted(missing_files
)))
391 results
.append(output_api
.PresubmitPromptWarning(
392 'The following files are whitelisted unnecessarily. You must ' \
393 'remove the following files from the whitelist file ' \
394 '%s:' % _GetWhitelistFileName(input_api
),
395 sorted(stale_files
)))