no bug - Bumping Firefox l10n changesets r=release a=l10n-bump DONTBUILD CLOSED TREE
[gecko.git] / tools / lint / fluent-lint / __init__.py
blob3b4c3c570b35bbda217b30d14f7043aa3e4cc56c
1 # This Source Code Form is subject to the terms of the Mozilla Public
2 # License, v. 2.0. If a copy of the MPL was not distributed with this
3 # file, You can obtain one at http://mozilla.org/MPL/2.0/.
4 import bisect
5 import os
6 import re
7 from html.parser import HTMLParser
9 import mozpack.path as mozpath
10 import yaml
11 from fluent.syntax import ast, parse, visitor
12 from mozlint import result
13 from mozlint.pathutils import expand_exclusions
class TextElementHTMLParser(HTMLParser):
    """Collect the plain-text portions of a Fluent TextElement.

    A TextElement value can embed HTML tags whose attributes contain quotes.
    Feeding the value through this parser gathers only the character data
    between tags into ``extracted_text`` so that the quote checks look at
    the real text.
    """

    def __init__(self):
        HTMLParser.__init__(self)
        # Text chunks in the order the parser reports them.
        self.extracted_text = list()

    def handle_data(self, data):
        # Called by HTMLParser for each run of character data.
        chunks = self.extracted_text
        chunks.append(data)
class Linter(visitor.Visitor):
    """Fluent linter implementation.

    This subclasses the Fluent AST visitor. Methods are called corresponding
    to each type of node in the Fluent AST. It is possible to control
    whether a node is recursed into by calling the generic_visit method on
    the superclass.

    Lint findings are accumulated in ``self.results``.

    See the documentation here:
    https://www.projectfluent.org/python-fluent/fluent.syntax/stable/usage.html
    """

    def __init__(
        self, path, config, exclusions, contents, offsets_and_lines, brand_names=None
    ):
        """Set up the linter for a single file.

        path: path of the file being linted (used in results and exclusions).
        config: linter configuration handed to ``result.from_config``.
        exclusions: mapping of rule id to a dict with "files" and "messages".
        contents: full text of the file.
        offsets_and_lines: list of (newline offset, line number) tuples, as
            produced by ``get_offsets_and_lines``.
        brand_names: optional list of hard-coded brand names to flag (CO01).
        """
        super().__init__()
        self.path = path
        self.config = config
        self.exclusions = exclusions
        self.contents = contents
        self.offsets_and_lines = offsets_and_lines

        self.results = []
        self.identifier_re = re.compile(r"[a-z0-9-]+")
        self.apostrophe_re = re.compile(r"\w'")
        self.incorrect_apostrophe_re = re.compile(r"\w\u2018\w")
        self.single_quote_re = re.compile(r"'(.+)'")
        self.double_quote_re = re.compile(r"\".+\"")
        self.ellipsis_re = re.compile(r"\.\.\.")

        # Avoid a shared mutable default: every instance gets its own list
        # when the caller does not supply one.
        self.brand_names = [] if brand_names is None else brand_names
        self.minimum_id_length = 9

        # Id of the message currently being visited. It is None inside terms
        # and before any message or term has been seen. Initialised here so
        # visit_TextElement can always read it.
        self.last_message_id = None

        self.state = {
            # The resource comment should be at the top of the page after the license.
            "node_can_be_resource_comment": True,
            # Group comments must be followed by a message. Two group comments are not
            # allowed in a row.
            "can_have_group_comment": True,
            # Comment bound to the current message
            "comment": "",
            # The current group comment
            "group_comment": "",
            # Variables in the current message
            "variables": [],
        }

        attributes = [
            "label",
            "value",
            "accesskey",
            "alt",
            "title",
            "tooltiptext",
            "placeholder",
            "aria-label",
            "aria-description",
            "aria-valuetext",
            "style",
            # For XUL key/command setup.
            "key",
            "keycode",
            # For download filenames:
            "download",
            # Used in the Firefox prefs
            "searchkeywords",
            # Used by search-textbox.js
            "searchbuttonlabel",
            # Used in toolbar customization.
            "toolbarname",
            # Used in moz-message-bar.
            "message",
            # Used in dialogs (should be moved to using fluent IDs though)
            "buttonlabelaccept",
            "buttonaccesskeyaccept",
            "buttonlabelcancel",
            "buttonaccesskeycancel",
            "buttonlabelextra2",
            "buttonaccesskeyextra2",
            # Used in app menu notifications (should be moved to use fluent IDs)
            "buttonlabel",
            "buttonaccesskey",
            "secondarybuttonlabel",
            "secondarybuttonaccesskey",
            # Commonly used in Lit-based web components
            "heading",
            "description",
        ]
        self.known_attribute_list = [a.lower() for a in attributes]

        # Set this to true to debug print the root node's json. This is useful for
        # writing new lint rules, or debugging existing ones.
        self.debug_print_json = False

    def generic_visit(self, node):
        """Track whether a resource comment is still allowed, then recurse."""
        node_name = type(node).__name__
        # Once any node other than the root, a span or a plain comment has
        # been seen, a resource comment is no longer at the top of the file.
        self.state["node_can_be_resource_comment"] = self.state[
            "node_can_be_resource_comment"
        ] and (
            # This is the root node.
            node_name == "Resource"
            # Empty space is allowed.
            or node_name == "Span"
            # Comments are allowed
            or node_name == "Comment"
        )

        if self.debug_print_json:
            import json

            print(json.dumps(node.to_json(), indent=2))
            # Only debug print the root node.
            self.debug_print_json = False

        super().generic_visit(node)

    def visit_Attribute(self, node):
        # Only visit values for Attribute nodes, the identifier comes from dom.
        super().generic_visit(node.value)

    def visit_FunctionReference(self, node):
        # We don't recurse into function references, the identifiers there are
        # allowed to be free form.
        pass

    def visit_Message(self, node):
        """Visit a message, then run the attribute (VA01) and variable (VC01) checks."""
        # There must be at least one message or term between group comments.
        self.state["can_have_group_comment"] = True
        self.last_message_id = node.id.name

        super().generic_visit(node)

        # Do this here instead as visit_Attribute doesn't have access to the
        # message's comment.
        for attr in node.attributes:
            if attr.id.name.lower() not in self.known_attribute_list:
                comment = self.state["comment"] + self.state["group_comment"]
                if f".{attr.id.name}" not in comment:
                    self.add_error(
                        attr,
                        "VA01",
                        "Use attributes designed for localized content directly."
                        " If script-based processing is necessary, add a comment"
                        f" explaining why. The linter didn't recognize: .{attr.id.name}",
                        "warning",
                    )

        # Check if variables are referenced in comments
        if self.state["variables"]:
            comments = self.state["comment"] + self.state["group_comment"]
            missing_references = [
                v for v in self.state["variables"] if f"${v}" not in comments
            ]
            if missing_references:
                self.add_error(
                    node,
                    "VC01",
                    "Messages including variables should have a comment "
                    "explaining what will replace the variable. "
                    "Missing references: "
                    + ", ".join([f"${m}" for m in missing_references]),
                )

        # Reset current comment and variable references after reading the
        # message.
        self.state["comment"] = ""
        self.state["variables"] = []

    def visit_Term(self, node):
        # There must be at least one message or term between group comments.
        self.state["can_have_group_comment"] = True
        # Terms are not messages; this disables the brand-name check inside them.
        self.last_message_id = None

        super().generic_visit(node)

        # Reset current comment and variable references after reading the term.
        self.state["comment"] = ""
        self.state["variables"] = []

    def visit_MessageReference(self, node):
        # We don't recurse into message references, the identifiers are either
        # checked elsewhere or are attributes and come from DOM.
        pass

    def visit_Identifier(self, node):
        """Check identifier characters (ID01) and minimum length (ID02)."""
        if (
            self.path not in self.exclusions["ID01"]["files"]
            and node.name not in self.exclusions["ID01"]["messages"]
            and not self.identifier_re.fullmatch(node.name)
        ):
            self.add_error(
                node,
                "ID01",
                "Identifiers may only contain lowercase characters and -",
            )
        if (
            len(node.name) < self.minimum_id_length
            and self.path not in self.exclusions["ID02"]["files"]
            and node.name not in self.exclusions["ID02"]["messages"]
        ):
            self.add_error(
                node,
                "ID02",
                f"Identifiers must be at least {self.minimum_id_length} characters long",
            )

    def visit_TextElement(self, node):
        """Run the typography (TE01-TE05) and brand-name (CO01) checks.

        The value is first passed through TextElementHTMLParser so that only
        the text between HTML tags is checked, one chunk at a time.
        """
        parser = TextElementHTMLParser()
        parser.feed(node.value)
        for text in parser.extracted_text:
            # To check for apostrophes, first remove pairs of straight quotes
            # used as delimiters, keeping the quoted text. The replacement
            # must be the raw string r"\1" (a backreference); a plain "\1"
            # is the control character U+0001. The check runs on the
            # extracted chunk, not on node.value, so markup is ignored and
            # the error is not repeated for every chunk of the same node.
            cleaned_str = self.single_quote_re.sub(r"\1", text)
            if self.apostrophe_re.search(cleaned_str):
                self.add_error(
                    node,
                    "TE01",
                    "Strings with apostrophes should use foo\u2019s instead of foo's.",
                )
            if self.incorrect_apostrophe_re.search(text):
                self.add_error(
                    node,
                    "TE02",
                    "Strings with apostrophes should use foo\u2019s instead of foo\u2018s.",
                )
            if self.single_quote_re.search(text):
                self.add_error(
                    node,
                    "TE03",
                    "Single-quoted strings should use Unicode \u2018foo\u2019 instead of 'foo'.",
                )
            if self.double_quote_re.search(text):
                self.add_error(
                    node,
                    "TE04",
                    'Double-quoted strings should use Unicode \u201cfoo\u201d instead of "foo".',
                )
            if self.ellipsis_re.search(text):
                self.add_error(
                    node,
                    "TE05",
                    "Strings with an ellipsis should use the Unicode \u2026 character"
                    " instead of three periods",
                )

            # If part of a message, check for brand names
            if (
                self.last_message_id is not None
                and self.path not in self.exclusions["CO01"]["files"]
                and self.last_message_id not in self.exclusions["CO01"]["messages"]
            ):
                found_brands = [brand for brand in self.brand_names if brand in text]
                if found_brands:
                    self.add_error(
                        node,
                        "CO01",
                        "Strings should use the corresponding terms instead of"
                        f" hard-coded brand names ({', '.join(found_brands)})",
                    )

    def visit_ResourceComment(self, node):
        """Check placement (RC01) and surrounding blank lines (RC02, RC03)."""
        # This node is a comment with: "###"
        if not self.state["node_can_be_resource_comment"]:
            self.add_error(
                node,
                "RC01",
                "Resource comments (###) should be placed at the top of the file, just "
                "after the license header. There should only be one resource comment "
                "per file.",
            )
            return

        lines_after = get_newlines_count_after(node.span, self.contents)
        lines_before = get_newlines_count_before(node.span, self.contents)

        if node.span.end == len(self.contents) - 1:
            # This file only contains a resource comment.
            return

        # Two consecutive newlines correspond to exactly one empty line.
        if lines_after != 2:
            self.add_error(
                node,
                "RC02",
                "Resource comments (###) should be followed by one empty line.",
            )
            return

        if lines_before != 2:
            self.add_error(
                node,
                "RC03",
                "Resource comments (###) should have one empty line above them.",
            )
            return

    def visit_SelectExpression(self, node):
        # We only want to visit the variant values, the identifiers in selectors
        # and keys are allowed to be free form.
        for variant in node.variants:
            super().generic_visit(variant.value)

        # Store the variable used for the SelectExpression, excluding functions
        # like PLATFORM()
        if (
            isinstance(node.selector, ast.VariableReference)
            and node.selector.id.name not in self.state["variables"]
        ):
            self.state["variables"].append(node.selector.id.name)

    def visit_Comment(self, node):
        # This node is a comment with: "#"

        # Store the comment
        self.state["comment"] = node.content

    def visit_GroupComment(self, node):
        """Record the group comment and check GC01-GC04."""
        # This node is a comment with: "##"

        # Store the group comment
        self.state["group_comment"] = node.content

        if not self.state["can_have_group_comment"]:
            self.add_error(
                node,
                "GC04",
                "Group comments (##) must be followed by at least one message "
                "or term. Make sure that a single group comment with multiple "
                "paragraphs is not separated by whitespace, as it will be "
                "interpreted as two different comments.",
            )
            return

        self.state["can_have_group_comment"] = False

        lines_after = get_newlines_count_after(node.span, self.contents)
        lines_before = get_newlines_count_before(node.span, self.contents)

        if node.span.end == len(self.contents) - 1:
            # The group comment is the last thing in the file.

            if node.content == "":
                # Empty comments are allowed at the end of the file.
                return

            self.add_error(
                node,
                "GC01",
                "Group comments (##) should not be at the end of the file, they should "
                "always be above a message. Only an empty group comment is allowed at "
                "the end of a file.",
            )
            return

        if lines_after != 2:
            self.add_error(
                node,
                "GC02",
                "Group comments (##) should be followed by one empty line.",
            )
            return

        if lines_before != 2:
            self.add_error(
                node,
                "GC03",
                "Group comments (##) should have an empty line before them.",
            )
            return

    def visit_VariableReference(self, node):
        # Identifiers are allowed to be free form, but need to store them
        # for comment checks.

        if node.id.name not in self.state["variables"]:
            self.state["variables"].append(node.id.name)

    def add_error(self, node, rule, msg, level=None):
        """Append a lint result for ``node`` to ``self.results``.

        ``level`` is only passed on to the result when it is truthy.
        """
        (col, line) = self.span_to_line_and_col(node.span)
        res = {
            "path": self.path,
            "lineno": line,
            "column": col,
            "rule": rule,
            "message": msg,
        }
        if level:
            res["level"] = level

        self.results.append(result.from_config(self.config, **res))

    def span_to_line_and_col(self, span):
        """Translate ``span.start`` into a ``(col, line)`` tuple.

        Note the return order: column first, then line.
        """
        offsets = self.offsets_and_lines
        i = bisect.bisect_left(offsets, (span.start, 0))
        if i > 0:
            col = span.start - offsets[i - 1][0]
        else:
            col = 1 + span.start

        if i < len(offsets):
            line = offsets[i][1]
        elif offsets:
            # The span starts after the last newline (the file does not end
            # with a newline): it is on the line following the last entry.
            line = offsets[-1][1] + 1
        else:
            # No newline in the file at all: everything is on line 1.
            line = 1
        return (col, line)
def get_offsets_and_lines(contents):
    """Build the lookup table used to map span offsets to line numbers.

    Fluent AST spans are expressed as character offsets into the file. The
    returned list holds one ``(offset, line)`` tuple per newline character in
    ``contents``: the offset of that newline and the 1-based number of the
    line it terminates.
    """
    newline_matches = re.finditer(r"\n", contents)
    return [
        (match.start(), line_number)
        for line_number, match in enumerate(newline_matches, start=1)
    ]
def get_newlines_count_after(span, contents):
    """Count the consecutive newline characters starting at ``span.end``.

    Scanning stops at the first character that is not a newline or at the
    end of ``contents``. Carriage returns are not supported and trip an
    assertion.
    """
    total = 0
    position = span.end
    limit = len(contents)
    while position < limit:
        char = contents[position]
        assert char != "\r", "This linter does not handle \\r characters."
        if char != "\n":
            break
        total += 1
        position += 1

    return total
def get_newlines_count_before(span, contents):
    """Count the consecutive newline characters directly before ``span.start``.

    Scanning walks backwards from the character preceding the span and stops
    at the first character that is not a newline or at the start of
    ``contents``. Carriage returns are not supported and trip an assertion.
    """
    count = 0
    # The stop value is -1 so that index 0 is inspected too; stopping at 0
    # would miss a newline that is the very first character of the file.
    for i in range(span.start - 1, -1, -1):
        assert contents[i] != "\r", "This linter does not handle \\r characters."
        if contents[i] != "\n":
            break
        count += 1

    return count
def get_exclusions(root):
    """Load the linter exclusions from ``tools/lint/fluent-lint/exclusions.yml``.

    Returns the first YAML document of that file, with each rule's "files"
    list replaced by a set of paths joined onto ``root``.
    """
    # Read with an explicit encoding, like the other file reads in this
    # module, so the result does not depend on the platform locale.
    with open(
        mozpath.join(root, "tools", "lint", "fluent-lint", "exclusions.yml"),
        encoding="utf-8",
    ) as f:
        exclusions = list(yaml.safe_load_all(f))[0]
        for error_type in exclusions:
            exclusions[error_type]["files"] = {
                mozpath.join(root, x) for x in exclusions[error_type]["files"]
            }
        return exclusions
def get_branding_list(root, brand_files):
    """Collect brand names from the given Fluent brand files.

    root: directory that the entries of ``brand_files`` are relative to.
    brand_files: iterable of relative paths, or None for "no brand files".
        Paths that do not exist are skipped.

    Returns a sorted list of unique strings: for every term, the value of
    the first text element visited after the term's identifier.
    """

    class MessageExtractor(visitor.Visitor):
        def __init__(self):
            super().__init__()
            self.brands = []
            self.last_message_id = None

        def visit_Term(self, node):
            self.last_message_id = node.id.name
            self.generic_visit(node)

        def visit_TextElement(self, node):
            if self.last_message_id:
                self.brands.append(node.value)
                # Only the first text element after a term id is recorded.
                self.last_message_id = None
            self.generic_visit(node)

    extractor = MessageExtractor()

    # A missing "brand-files" configuration entry reaches us as None.
    for brand_path in brand_files or []:
        brand_file = mozpath.join(root, brand_path)
        if os.path.exists(brand_file):
            with open(brand_file, encoding="utf-8") as f:
                messages = parse(f.read())
                extractor.visit(messages)

    # Sorted so that the order (and therefore the CO01 message text built
    # from it) is stable between runs.
    return sorted(set(extractor.brands))
def lint(paths, config, fix=None, **lintargs):
    """Lint every Fluent file selected by ``paths`` and return the results.

    paths: paths to lint, expanded through ``expand_exclusions``.
    config: linter configuration; its "brand-files" entry is read here.
    fix: accepted as part of the call signature; not used by this linter.
    lintargs: must contain "root".

    Returns a list with the results of all linted files.
    """
    root = lintargs["root"]
    files = list(expand_exclusions(paths, config, root))
    exclusions = get_exclusions(root)
    brand_files = config.get("brand-files")
    brand_names = get_branding_list(root, brand_files)
    results = []
    for path in files:
        # Use a context manager so the file handle is closed right away
        # instead of whenever the garbage collector gets to it.
        with open(path, "r", encoding="utf-8") as f:
            contents = f.read()
        linter = Linter(
            path,
            config,
            exclusions,
            contents,
            get_offsets_and_lines(contents),
            brand_names,
        )
        linter.visit(parse(contents))
        results.extend(linter.results)
    return results