Daily bump.
[official-gcc.git] / contrib / check-internal-format-escaping.py
blobe06752666b83057198f73d5f3534bf1cd0ade346
1 #!/usr/bin/env python3
3 # Check gcc.pot file for stylistic issues as described in
4 # https://gcc.gnu.org/onlinedocs/gccint/Guidelines-for-Diagnostics.html,
5 # especially in gcc-internal-format messages.
7 # This file is part of GCC.
9 # GCC is free software; you can redistribute it and/or modify it under
10 # the terms of the GNU General Public License as published by the Free
11 # Software Foundation; either version 3, or (at your option) any later
12 # version.
14 # GCC is distributed in the hope that it will be useful, but WITHOUT ANY
15 # WARRANTY; without even the implied warranty of MERCHANTABILITY or
16 # FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
17 # for more details.
19 # You should have received a copy of the GNU General Public License
20 # along with GCC; see the file COPYING3. If not see
21 # <http://www.gnu.org/licenses/>.
23 import argparse
24 import re
25 from collections import Counter
26 from typing import Dict, Match
28 import polib
# Number of times each diagnostic text has been reported by warn();
# main() reads it to print the summary.
seen_warnings = Counter()
def location(msg: polib.POEntry):
    """
    Returns a short position string for a message, built from the
    first two fields of its first occurrence joined by a colon
    (presumably file name and line number).  Messages without any
    occurrence yield a fixed placeholder text.
    """

    if not msg.occurrences:
        return '<unknown location>'

    first = msg.occurrences[0]
    return '{}:{}'.format(first[0], first[1])
def warn(msg: polib.POEntry,
         diagnostic_id: str, diagnostic: str, include_msgid=True):
    """
    Reports a diagnostic for a message on stdout and counts it in
    seen_warnings, keyed by the diagnostic text.

    To suppress a warning for a particular message,
    add a line "#, gcclint:ignore:{diagnostic_id}" to the message.

    If include_msgid is true, the repr of the msgid is appended to
    the printed line.
    """

    suppression_flag = f'gcclint:ignore:{diagnostic_id}'
    if suppression_flag in msg.flags:
        return

    seen_warnings[diagnostic] += 1

    text = f'{location(msg)}: {diagnostic}'
    if include_msgid:
        text += f' in {repr(msg.msgid)}'
    print(text)
def lint_gcc_internal_format(msg: polib.POEntry):
    """
    Checks a single message that has the gcc-internal-format. These
    messages use a variety of placeholders like %qs, %<quotes%> and
    %q#E.

    Every finding is reported through warn(); nothing is returned.
    """

    msgid: str = msg.msgid

    def outside_quotes(m: Match[str]):
        """
        Tells whether the match starts outside of %<quotes%>, that is,
        whether the text before the match has as many %< as %>.
        """
        before = msgid[:m.start(0)]
        return before.count("%<") == before.count("%>")

    def lint_matching_placeholders():
        """
        Warns when literal values in placeholders are not exactly equal
        in the translation. This can happen when doing copy-and-paste
        translations of similar messages.

        To avoid these mismatches in the first place,
        structurally equal messages are found by
        lint_diagnostics_differing_only_in_placeholders.

        This check only applies when checking a finished translation
        such as de.po, not gcc.pot.
        """

        if not msg.translated():
            return

        in_msgid = re.findall('%<[^%]+%>', msgid)
        in_msgstr = re.findall('%<[^%]+%>', msg.msgstr)

        # Compared as sets: order and repetition of the quoted
        # literals do not matter, only which literals occur.
        if set(in_msgid) != set(in_msgstr):
            warn(msg,
                 'placeholder-mismatch',
                 f'placeholder mismatch: msgid has {in_msgid}, '
                 f'msgstr has {in_msgstr}',
                 include_msgid=False)

    def lint_option_outside_quotes():
        """
        Warns about whitespace-separated words outside of %<quotes%>
        that look like a command line option (a dash followed by a
        letter) or like a builtin function (prefix __builtin_).
        """
        for match in re.finditer(r'\S+', msgid):
            part = match.group()
            if not outside_quotes(match):
                continue

            if part.startswith('-'):
                if len(part) >= 2 and part[1].isalpha():
                    # -INF is exempt from the option check.
                    if part == '-INF':
                        continue

                    warn(msg,
                         'option-outside-quotes',
                         'command line option outside %<quotes%>')

            if part.startswith('__builtin_'):
                warn(msg,
                     'builtin-outside-quotes',
                     'builtin function outside %<quotes%>')

    def lint_plain_apostrophe():
        """
        Warns about each apostrophe outside of %<quotes%> that is not
        preceded by a percent sign.
        """
        # The lookbehind makes the match start at the apostrophe itself
        # instead of at the character before it.  That way
        # outside_quotes() also sees a %< or %> that directly precedes
        # the apostrophe, and an apostrophe at the very beginning of
        # the message is found as well.
        for match in re.finditer(r"(?<!%)'", msgid):
            if outside_quotes(match):
                warn(msg, 'apostrophe', 'apostrophe without leading %')

    def lint_space_before_quote():
        """
        A missing space before %< is often the result of string
        literals that are joined by the C compiler and neither literal
        has a space to separate the words.
        """

        for match in re.finditer("(.?[a-zA-Z0-9])%<", msgid):
            # A %s placeholder directly before %< is not a word.
            if match.group(1) != '%s':
                warn(msg,
                     'no-space-before-quote',
                     '%< directly following a letter or digit')

    def lint_underscore_outside_quotes():
        """
        An underscore outside of quotes is used in several contexts,
        and many of them violate the GCC Guidelines for Diagnostics:

        * names of GCC-internal compiler functions
        * names of GCC-internal data structures
        * static_cast and the like (which are legitimate)
        """

        for match in re.finditer("_", msgid):
            if outside_quotes(match):
                warn(msg,
                     'underscore-outside-quotes',
                     'underscore outside of %<quotes%>')
                # One warning per message is enough.
                return

    def lint_may_not():
        """
        The term "may not" may either mean "it could be the case"
        or "should not". These two different meanings are sometimes
        hard to tell apart.
        """

        if re.search(r'\bmay not\b', msgid):
            warn(msg,
                 'ambiguous-may-not',
                 'the term "may not" is ambiguous')

    def lint_unbalanced_quotes():
        """
        Warns when the number of %< differs from the number of %>,
        in the msgid and, for translated messages, in the msgstr.
        """
        if msgid.count("%<") != msgid.count("%>"):
            warn(msg,
                 'unbalanced-quotes',
                 'unbalanced %< and %> quotes')

        if msg.translated():
            if msg.msgstr.count("%<") != msg.msgstr.count("%>"):
                warn(msg,
                     'unbalanced-quotes',
                     'unbalanced %< and %> quotes')

    def lint_single_space_after_sentence():
        """
        After a sentence there should be two spaces.
        """

        if re.search(r'[.] [A-Z]', msgid):
            warn(msg,
                 'single-space-after-sentence',
                 'single space after sentence')

    def lint_non_canonical_quotes():
        """
        Catches %<%s%>, which can be written in the shorter form %qs.
        """
        match = re.search("%<%s%>|'%s'|\"%s\"|`%s'", msgid)
        if match:
            warn(msg,
                 'non-canonical-quotes',
                 f'placeholder {match.group()} should be written as %qs')

    lint_option_outside_quotes()
    lint_plain_apostrophe()
    lint_space_before_quote()
    lint_underscore_outside_quotes()
    lint_may_not()
    lint_unbalanced_quotes()
    lint_matching_placeholders()
    lint_single_space_after_sentence()
    lint_non_canonical_quotes()
def lint_diagnostics_differing_only_in_placeholders(po: polib.POFile):
    """
    Finds messages that share the same structure and differ only in
    the plain strings they put inside %<quotes%>.  Merging such
    messages prevents copy-and-paste mistakes by the translators.

    Each message is reduced to a pattern by replacing every quoted
    literal with %qs; a message whose pattern was already recorded is
    reported together with the message that was recorded for it.

    See bug 90119.
    """

    first_by_pattern: Dict[str, polib.POEntry] = {}

    for msg in po:
        msg: polib.POEntry
        msgid = msg.msgid
        pattern = re.sub('%<[^%]+%>', '%qs', msgid)

        if pattern in first_by_pattern:
            earlier = first_by_pattern[pattern]
            warn(msg,
                 'same-pattern',
                 f'same pattern for {repr(msgid)} and '
                 f'{repr(earlier.msgid)} in {location(earlier)}',
                 include_msgid=False)
        else:
            # Record the message under its pattern and under its
            # unmodified msgid.
            first_by_pattern[pattern] = msg
            first_by_pattern[msgid] = msg
def lint_file(po: polib.POFile):
    """
    Runs all checks on a message catalog.

    The per-message checks are applied to every message that is
    neither obsolete nor fuzzy and carries the gcc-internal-format
    flag.  Afterwards the whole catalog is passed to
    lint_diagnostics_differing_only_in_placeholders.
    """
    for msg in po:
        msg: polib.POEntry

        if msg.obsolete or msg.fuzzy:
            continue
        if 'gcc-internal-format' in msg.flags:
            lint_gcc_internal_format(msg)

    lint_diagnostics_differing_only_in_placeholders(po)
def main():
    """
    Command line entry point: parses the file argument, lints that
    file and prints a summary of the diagnostics that were reported
    more than once, most frequent first.
    """
    arg_parser = argparse.ArgumentParser(description='')
    arg_parser.add_argument('file', help='pot file')
    options = arg_parser.parse_args()

    lint_file(polib.pofile(options.file))

    print()
    print('summary:')
    for diagnostic, count in seen_warnings.most_common():
        # Diagnostics that occurred only once are left out.
        if count > 1:
            print(f'{count}\t{diagnostic}')
# Run the linter only when this file is executed as a script.
if __name__ == '__main__':
    main()