locale: Fix localedata/sort-test undefined behavior
[glibc.git] / scripts / check-obsolete-constructs.py
blob69d7cade229dcd4b352ad65b98bbeb7b02b6e3f5
1 #! /usr/bin/python3
2 # Copyright (C) 2019-2021 Free Software Foundation, Inc.
3 # This file is part of the GNU C Library.
5 # The GNU C Library is free software; you can redistribute it and/or
6 # modify it under the terms of the GNU Lesser General Public
7 # License as published by the Free Software Foundation; either
8 # version 2.1 of the License, or (at your option) any later version.
10 # The GNU C Library is distributed in the hope that it will be useful,
11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 # Lesser General Public License for more details.
15 # You should have received a copy of the GNU Lesser General Public
16 # License along with the GNU C Library; if not, see
17 # <https://www.gnu.org/licenses/>.
19 """Verifies that installed headers do not use any obsolete constructs:
20 * legacy BSD typedefs superseded by <stdint.h>:
21 ushort uint ulong u_char u_short u_int u_long u_intNN_t quad_t u_quad_t
22 (sys/types.h is allowed to _define_ these types, but not to use them
23 to define anything else).
24 """
26 import argparse
27 import collections
28 import re
29 import sys
31 # Simplified lexical analyzer for C preprocessing tokens.
32 # Does not implement trigraphs.
33 # Does not implement backslash-newline in the middle of any lexical
34 # item other than a string literal.
35 # Does not implement universal-character-names in identifiers.
36 # Treats prefixed strings (e.g. L"...") as two tokens (L and "...")
37 # Accepts non-ASCII characters only within comments and strings.
39 # Caution: The order of the outermost alternation matters.
40 # STRING must be before BAD_STRING, CHARCONST before BAD_CHARCONST,
41 # BLOCK_COMMENT before BAD_BLOCK_COM before PUNCTUATOR, and OTHER must
42 # be last.
43 # Caution: There should be no capturing groups other than the named
44 # captures in the outermost alternation.
46 # For reference, these are all of the C punctuators as of C11:
47 # [ ] ( ) { } , ; ? ~
48 # ! != * *= / /= ^ ^= = ==
49 # # ##
50 # % %= %> %: %:%:
51 # & &= &&
52 # | |= ||
53 # + += ++
54 # - -= -- ->
55 # . ...
56 # : :>
57 # < <% <: << <<= <=
58 # > >= >> >>=
60 # The BAD_* tokens are not part of the official definition of pp-tokens;
61 # they match unclosed strings, character constants, and block comments,
62 # so that the regex engine doesn't have to backtrack all the way to the
63 # beginning of a broken construct and then emit dozens of junk tokens.
# One alternative per token kind; mo.lastgroup names the kind that matched.
# The BAD_* groups deliberately match unterminated constructs so that
# tokenize_c can repair and report them (see lexer notes above).
PP_TOKEN_RE_ = re.compile(r"""
    (?P<STRING>        \"(?:[^\"\\\r\n]|\\(?:[\r\n -~]|\r\n))*\")
   |(?P<BAD_STRING>    \"(?:[^\"\\\r\n]|\\[ -~])*)
   |(?P<CHARCONST>     \'(?:[^\'\\\r\n]|\\(?:[\r\n -~]|\r\n))*\')
   |(?P<BAD_CHARCONST> \'(?:[^\'\\\r\n]|\\[ -~])*)
   |(?P<BLOCK_COMMENT> /\*(?:\*(?!/)|[^*])*\*/)
   |(?P<BAD_BLOCK_COM> /\*(?:\*(?!/)|[^*])*\*?)
   |(?P<LINE_COMMENT>  //[^\r\n]*)
   |(?P<IDENT>         [_a-zA-Z][_a-zA-Z0-9]*)
   |(?P<PP_NUMBER>     \.?[0-9](?:[0-9a-df-oq-zA-DF-OQ-Z_.]|[eEpP][+-]?)*)
   |(?P<PUNCTUATOR>
       [,;?~(){}\[\]]
     | [!*/^=]=?
     | \#\#?
     | %(?:[=>]|:(?:%:)?)?
     | &[=&]?
     |\|[=|]?
     |\+[=+]?
     | -[=->]?
     |\.(?:\.\.)?
     | :>?
     | <(?:[%:]|<(?:=|<=?)?)?
     | >(?:=|>=?)?)
   |(?P<ESCNL>         \\(?:\r|\n|\r\n))
   |(?P<WHITESPACE>    [ \t\n\r\v\f]+)
   |(?P<OTHER>         .)
""", re.DOTALL | re.VERBOSE)

# Header-name tokens (<...> or "...") are only recognized immediately
# after '#include'; tokenize_c switches to this pattern in that state.
HEADER_NAME_RE_ = re.compile(r"""
    < [^>\r\n]+ >
  | " [^"\r\n]+ "
""", re.DOTALL | re.VERBOSE)

# Matches one line terminator.  Note the \r|\n alternatives come first,
# so a \r\n pair is consumed as two separate matches by finditer; the
# line count still comes out right because both halves advance the count
# only when tokenize_c iterates over a kind that may contain newlines.
# NOTE(review): for a \r\n pair this increments the line counter twice
# in tokenize_c — presumably glibc headers are LF-only; confirm.
ENDLINE_RE_ = re.compile(r"""\r|\n|\r\n""")
# based on the sample code in the Python re documentation
Token_ = collections.namedtuple("Token", (
    "kind", "text", "line", "column", "context"))
Token_.__doc__ = """
    One C preprocessing token, comment, or chunk of whitespace.

    'kind' identifies the token type, which will be one of:
       STRING, CHARCONST, BLOCK_COMMENT, LINE_COMMENT, IDENT,
       PP_NUMBER, PUNCTUATOR, ESCNL, WHITESPACE, HEADER_NAME,
       or OTHER.  The BAD_* alternatives in PP_TOKEN_RE_ are
       handled within tokenize_c, below.

    'text' is the sequence of source characters making up the token;
       no decoding whatsoever is performed.

    'line' and 'column' give the position of the first character of the
       token within the source file.  They are both 1-based.

    'context' indicates whether or not this token occurred within a
       preprocessing directive; it will be None for running text,
       '<null>' for the leading '#' of a directive line (because '#'
       all by itself on a line is a "null directive"), or the name of
       the directive for tokens within a directive line, starting with
       the IDENT for the name itself.
"""
def tokenize_c(file_contents, reporter):
    """Yield a series of Token objects, one for each preprocessing
       token, comment, or chunk of whitespace within FILE_CONTENTS.
       The REPORTER object is expected to have one method,
       reporter.error(token, message), which will be called to
       indicate a lexical error at the position of TOKEN.
       If MESSAGE contains the four-character sequence '{!r}', that
       is expected to be replaced by repr(token.text).
    """
    # Bind module-level names to locals: fastest lookup inside the scan loop.
    Token = Token_
    PP_TOKEN_RE = PP_TOKEN_RE_
    ENDLINE_RE = ENDLINE_RE_
    HEADER_NAME_RE = HEADER_NAME_RE_

    line_num = 1          # 1-based line of the next token
    line_start = 0        # offset of the start of the current line
    pos = 0               # scan position within file_contents
    limit = len(file_contents)
    directive = None      # see Token_.__doc__ for the state values
    at_bol = True         # at beginning of line (so '#' starts a directive)
    while pos < limit:
        # Immediately after '#include', a <...> or "..." sequence is a
        # single HEADER_NAME token; everywhere else use the generic scanner.
        if directive == "include":
            mo = HEADER_NAME_RE.match(file_contents, pos)
            if mo:
                kind = "HEADER_NAME"
                directive = "after_include"
            else:
                mo = PP_TOKEN_RE.match(file_contents, pos)
                kind = mo.lastgroup
                if kind != "WHITESPACE":
                    directive = "after_include"
        else:
            mo = PP_TOKEN_RE.match(file_contents, pos)
            kind = mo.lastgroup

        text = mo.group()
        line = line_num
        column = mo.start() - line_start
        adj_line_start = 0
        # only these kinds can contain a newline
        if kind in ("WHITESPACE", "BLOCK_COMMENT", "LINE_COMMENT",
                    "STRING", "CHARCONST", "BAD_BLOCK_COM", "ESCNL"):
            for tmo in ENDLINE_RE.finditer(text):
                line_num += 1
                adj_line_start = tmo.end()
            if adj_line_start:
                line_start = mo.start() + adj_line_start

        # Track whether or not we are scanning a preprocessing directive.
        if kind == "LINE_COMMENT" or (kind == "WHITESPACE" and adj_line_start):
            at_bol = True
            directive = None
        else:
            if kind == "PUNCTUATOR" and text == "#" and at_bol:
                directive = "<null>"
            elif kind == "IDENT" and directive == "<null>":
                directive = text
            at_bol = False

        # Report ill-formed tokens and rewrite them as their well-formed
        # equivalents, so downstream processing doesn't have to know about them.
        # (Rewriting instead of discarding provides better error recovery.)
        if kind == "BAD_BLOCK_COM":
            reporter.error(Token("BAD_BLOCK_COM", "", line, column+1, ""),
                           "unclosed block comment")
            text += "*/"
            kind = "BLOCK_COMMENT"
        elif kind == "BAD_STRING":
            reporter.error(Token("BAD_STRING", "", line, column+1, ""),
                           "unclosed string")
            text += "\""
            kind = "STRING"
        elif kind == "BAD_CHARCONST":
            reporter.error(Token("BAD_CHARCONST", "", line, column+1, ""),
                           "unclosed char constant")
            text += "'"
            kind = "CHARCONST"

        tok = Token(kind, text, line, column+1,
                    "include" if directive == "after_include" else directive)
        # Do not complain about OTHER tokens inside macro definitions.
        # $ and @ appear in macros defined by headers intended to be
        # included from assembly language, e.g. sysdeps/mips/sys/asm.h.
        if kind == "OTHER" and directive != "define":
            # Bug fix: this is a module-level function, not a method, so
            # 'self' was undefined here and any stray character outside a
            # #define crashed the script with NameError.  Report through
            # the REPORTER object as documented above.
            reporter.error(tok, "stray {!r} in program")

        yield tok
        pos = mo.end()
216 # Base and generic classes for individual checks.
class ConstructChecker:
    """Abstract base for per-header checks over a stream of C
       preprocessing tokens.  REPORTER, supplied to __init__, exposes a
       single method, reporter.error(token, message), used to flag a
       problem at TOKEN's position; a '{!r}' embedded in MESSAGE is
       replaced with a textual rendering of the token.
    """
    def __init__(self, reporter):
        self.reporter = reporter

    def examine(self, tok):
        """Inspect one token of a header file; subclasses must override.
           Problems are reported via self.reporter.error.
        """
        raise NotImplementedError

    def eof(self):
        """End-of-stream hook.  The default is a no-op; subclasses
           override it only when they buffer state across tokens."""
        pass
class NoCheck(ConstructChecker):
    """Checker that accepts everything.  Used in place of a real
       checker for files exempt from a particular check."""

    def examine(self, tok):
        # Deliberately ignore every token.
        pass
# Check for obsolete type names.

# The obsolete type names we're looking for:
# Group 1 captures an optional '__' prefix (checkers below use
# m.group(1) to distinguish private '__ulong'-style names from the
# public ones); group 2 captures the bare obsolete name itself.
OBSOLETE_TYPE_RE_ = re.compile(r"""\A
  (__)?
  (   quad_t
  | u(?: short | int | long
       | _(?: char | short | int(?:[0-9]+_t)? | long | quad_t )))
\Z""", re.VERBOSE)
class ObsoleteNotAllowed(ConstructChecker):
    """Strictest policy: every occurrence of an obsolete typedef
       name is an error."""
    def examine(self, tok):
        hit = OBSOLETE_TYPE_RE_.match(tok.text)
        if hit is not None:
            self.reporter.error(tok, "use of {!r}")
class ObsoletePrivateDefinitionsAllowed(ConstructChecker):
    """Allow definitions of the private versions of the
       obsolete typedefs; that is, 'typedef [anything] __obsolete;'
    """
    def __init__(self, reporter):
        super().__init__(reporter)
        # True while scanning between a 'typedef' keyword and its ';'.
        self.in_typedef = False
        # Most recently seen token (any kind), or None.
        self.prev_token = None

    def examine(self, tok):
        # bits/types.h hides 'typedef' in a macro sometimes.
        if (tok.kind == "IDENT"
            and tok.text in ("typedef", "__STD_TYPE")
            and tok.context is None):
            self.in_typedef = True
        elif tok.kind == "PUNCTUATOR" and tok.text == ";" and self.in_typedef:
            # End of a typedef: the identifier just before the ';' is
            # the name being defined.  Defining a '__'-prefixed obsolete
            # name is permitted; defining a public one is not.
            self.in_typedef = False
            if self.prev_token.kind == "IDENT":
                m = OBSOLETE_TYPE_RE_.match(self.prev_token.text)
                if m and m.group(1) != "__":
                    self.reporter.error(self.prev_token, "use of {!r}")
            # NOTE(review): this None is immediately overwritten by the
            # unconditional assignment at the end of this method, so it
            # appears to be redundant — confirm before removing.
            self.prev_token = None
        else:
            # Outside a typedef tail, any obsolete identifier that has
            # already gone by is an error.
            self._check_prev()

        self.prev_token = tok

    def eof(self):
        # Flush a pending identifier left at end of stream.
        self._check_prev()

    def _check_prev(self):
        # Report the previously seen token if it is an obsolete
        # identifier (with or without the '__' prefix).
        if (self.prev_token is not None
            and self.prev_token.kind == "IDENT"
            and OBSOLETE_TYPE_RE_.match(self.prev_token.text)):
            self.reporter.error(self.prev_token, "use of {!r}")
class ObsoletePublicDefinitionsAllowed(ConstructChecker):
    """Allow definitions of the public versions of the obsolete
       typedefs.  Only specific forms of definition are allowed:

           typedef __obsolete obsolete;  // identifiers must agree
           typedef __uintN_t u_intN_t;   // N must agree
           typedef unsigned long int ulong;
           typedef unsigned short int ushort;
           typedef unsigned int uint;
    """
    def __init__(self, reporter):
        super().__init__(reporter)
        # Tokens accumulated since the opening 'typedef' keyword;
        # empty whenever we are not inside a typedef.
        self.typedef_tokens = []

    def examine(self, tok):
        # Whitespace and comments are never significant here.
        # NOTE(review): "NL" is also skipped, but the tokenizer in this
        # file does not appear to emit that kind — confirm.
        if tok.kind in ("WHITESPACE", "BLOCK_COMMENT",
                        "LINE_COMMENT", "NL", "ESCNL"):
            pass

        elif (tok.kind == "IDENT" and tok.text == "typedef"
              and tok.context is None):
            if self.typedef_tokens:
                self.reporter.error(tok, "typedef inside typedef")
                self._reset()
            self.typedef_tokens.append(tok)

        elif tok.kind == "PUNCTUATOR" and tok.text == ";":
            self._finish()

        elif self.typedef_tokens:
            self.typedef_tokens.append(tok)

    def eof(self):
        # Anything still buffered at end of stream was an unterminated
        # typedef; report obsolete names inside it.
        self._reset()

    def _reset(self):
        # Drain the buffer, reporting every obsolete identifier in it.
        while self.typedef_tokens:
            tok = self.typedef_tokens.pop(0)
            if tok.kind == "IDENT" and OBSOLETE_TYPE_RE_.match(tok.text):
                self.reporter.error(tok, "use of {!r}")

    def _finish(self):
        # Called at the terminating ';' of a typedef.  If the whole
        # construct is one of the permitted public definitions, discard
        # it silently; otherwise fall through to _reset, which reports
        # any obsolete names it contains.
        if not self.typedef_tokens: return
        if self.typedef_tokens[-1].kind == "IDENT":
            m = OBSOLETE_TYPE_RE_.match(self.typedef_tokens[-1].text)
            if m:
                if self._permissible_public_definition(m):
                    self.typedef_tokens.clear()
        self._reset()

    def _permissible_public_definition(self, m):
        # M matched the final identifier of the typedef against
        # OBSOLETE_TYPE_RE_.  Return True only for the exact forms
        # listed in the class docstring.
        if m.group(1) == "__": return False
        name = m.group(2)
        toks = self.typedef_tokens
        ntok = len(toks)
        # 'typedef __x x;' -- three tokens: typedef, defn, name.
        if ntok == 3 and toks[1].kind == "IDENT":
            defn = toks[1].text
            n = OBSOLETE_TYPE_RE_.match(defn)
            if n and n.group(1) == "__" and n.group(2) == name:
                return True

            # 'typedef __uintN_t u_intN_t;' -- the N digits must agree.
            if (name[:5] == "u_int" and name[-2:] == "_t"
                and defn[:6] == "__uint" and defn[-2:] == "_t"
                and name[5:-2] == defn[6:-2]):
                return True

            return False

        # The three spelled-out unsigned forms.
        if (name == "ulong" and ntok == 5
            and toks[1].kind == "IDENT" and toks[1].text == "unsigned"
            and toks[2].kind == "IDENT" and toks[2].text == "long"
            and toks[3].kind == "IDENT" and toks[3].text == "int"):
            return True

        if (name == "ushort" and ntok == 5
            and toks[1].kind == "IDENT" and toks[1].text == "unsigned"
            and toks[2].kind == "IDENT" and toks[2].text == "short"
            and toks[3].kind == "IDENT" and toks[3].text == "int"):
            return True

        if (name == "uint" and ntok == 4
            and toks[1].kind == "IDENT" and toks[1].text == "unsigned"
            and toks[2].kind == "IDENT" and toks[2].text == "int"):
            return True

        return False
def ObsoleteTypedefChecker(reporter, fname):
    """Factory: produce an instance of the appropriate
       obsolete-typedef checker for FNAME."""

    def in_directory(d):
        # FNAME is installed below directory D, whether D appears at
        # the top of the install path or deeper within it.
        return fname.startswith(d) or "/" + d in fname

    def is_header(relpath):
        # FNAME is exactly RELPATH, or RELPATH under some prefix.
        return fname == relpath or fname.endswith("/" + relpath)

    # The obsolete rpc/ and rpcsvc/ headers are allowed to use the
    # obsolete types, because it would be more trouble than it's
    # worth to remove them from headers that we intend to stop
    # installing eventually anyway.
    if in_directory("rpc/") or in_directory("rpcsvc/"):
        return NoCheck(reporter)

    # bits/types.h is allowed to define the __-versions of the
    # obsolete types.
    if is_header("bits/types.h"):
        return ObsoletePrivateDefinitionsAllowed(reporter)

    # sys/types.h is allowed to use the __-versions of the
    # obsolete types, but only to define the unprefixed versions.
    if is_header("sys/types.h"):
        return ObsoletePublicDefinitionsAllowed(reporter)

    return ObsoleteNotAllowed(reporter)
419 # Master control
class HeaderChecker:
    """Perform all of the checks on each header.  This is also the
       "reporter" object expected by tokenize_c and ConstructChecker.
    """
    def __init__(self):
        self.fname = None   # header currently being checked
        self.status = 0     # process exit status; 1 once any error is seen

    def error(self, tok, message):
        """Report a problem at TOK's position in the current header and
           record overall failure.  A '{!r}' in MESSAGE is replaced
           with the token's text."""
        self.status = 1
        if '{!r}' in message:
            message = message.format(tok.text)
        sys.stderr.write("{}:{}:{}: error: {}\n".format(
            self.fname, tok.line, tok.column, message))

    def check(self, fname):
        """Run all checks on the single header file FNAME."""
        self.fname = fname
        try:
            with open(fname, "rt", encoding="utf-8") as fp:
                contents = fp.read()
        except OSError as e:
            sys.stderr.write("{}: {}\n".format(fname, e.strerror))
            self.status = 1
            return

        typedef_checker = ObsoleteTypedefChecker(self, self.fname)

        for tok in tokenize_c(contents, self):
            typedef_checker.examine(tok)

        # Bug fix: deliver the end-of-stream notification promised by
        # the ConstructChecker interface.  Without it, state buffered
        # at EOF (e.g. a pending identifier in
        # ObsoletePrivateDefinitionsAllowed, or an unterminated typedef
        # in ObsoletePublicDefinitionsAllowed) was never flushed, so
        # errors at the very end of a header went unreported.
        typedef_checker.eof()
def main():
    """Command-line entry point: check each named header, exiting with
       status 1 if any check failed or any file was unreadable."""
    ap = argparse.ArgumentParser(description=__doc__)
    ap.add_argument("headers", metavar="header", nargs="+",
                    help="one or more headers to scan for obsolete constructs")
    args = ap.parse_args()

    checker = HeaderChecker()
    for fname in args.headers:
        # Headers whose installed name begins with "finclude/" contain
        # Fortran, not C, and this program should completely ignore them.
        if not (fname.startswith("finclude/") or "/finclude/" in fname):
            checker.check(fname)
    sys.exit(checker.status)

# Guard the entry point so importing this module (e.g. from tests or
# other tooling) has no side effects; running it as a script behaves
# exactly as before.
if __name__ == "__main__":
    main()