2 # Copyright (C) 2019-2021 Free Software Foundation, Inc.
3 # This file is part of the GNU C Library.
5 # The GNU C Library is free software; you can redistribute it and/or
6 # modify it under the terms of the GNU Lesser General Public
7 # License as published by the Free Software Foundation; either
8 # version 2.1 of the License, or (at your option) any later version.
10 # The GNU C Library is distributed in the hope that it will be useful,
11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 # Lesser General Public License for more details.
15 # You should have received a copy of the GNU Lesser General Public
16 # License along with the GNU C Library; if not, see
17 # <https://www.gnu.org/licenses/>.
19 """Verifies that installed headers do not use any obsolete constructs:
20 * legacy BSD typedefs superseded by <stdint.h>:
21 ushort uint ulong u_char u_short u_int u_long u_intNN_t quad_t u_quad_t
22 (sys/types.h is allowed to _define_ these types, but not to use them
23 to define anything else).
31 # Simplified lexical analyzer for C preprocessing tokens.
32 # Does not implement trigraphs.
33 # Does not implement backslash-newline in the middle of any lexical
34 # item other than a string literal.
35 # Does not implement universal-character-names in identifiers.
36 # Treats prefixed strings (e.g. L"...") as two tokens (L and "...")
37 # Accepts non-ASCII characters only within comments and strings.
39 # Caution: The order of the outermost alternation matters.
40 # STRING must be before BAD_STRING, CHARCONST before BAD_CHARCONST,
41 # BLOCK_COMMENT before BAD_BLOCK_COM before PUNCTUATOR, and OTHER must
43 # Caution: There should be no capturing groups other than the named
44 # captures in the outermost alternation.
46 # For reference, these are all of the C punctuators as of C11:
48 # ! != * *= / /= ^ ^= = ==
60 # The BAD_* tokens are not part of the official definition of pp-tokens;
61 # they match unclosed strings, character constants, and block comments,
62 # so that the regex engine doesn't have to backtrack all the way to the
63 # beginning of a broken construct and then emit dozens of junk tokens.
# Master pattern for one C preprocessing token, comment, or run of
# whitespace.  Order of the outermost alternation matters: STRING must
# precede BAD_STRING, CHARCONST must precede BAD_CHARCONST,
# BLOCK_COMMENT must precede BAD_BLOCK_COM which must precede
# PUNCTUATOR, and OTHER must be last.  There are no capturing groups
# other than the named captures in the outermost alternation, so
# mo.lastgroup always names the token kind.
PP_TOKEN_RE_ = re.compile(r"""
    (?P<STRING>        \"(?:[^\"\\\r\n]|\\(?:[\r\n -~]|\r\n))*\")
   |(?P<BAD_STRING>    \"(?:[^\"\\\r\n]|\\[ -~])*)
   |(?P<CHARCONST>     \'(?:[^\'\\\r\n]|\\(?:[\r\n -~]|\r\n))*\')
   |(?P<BAD_CHARCONST> \'(?:[^\'\\\r\n]|\\[ -~])*)
   |(?P<BLOCK_COMMENT> /\*(?:\*(?!/)|[^*])*\*/)
   |(?P<BAD_BLOCK_COM> /\*(?:\*(?!/)|[^*])*\*?)
   |(?P<LINE_COMMENT>  //[^\r\n]*)
   |(?P<IDENT>         [_a-zA-Z][_a-zA-Z0-9]*)
   |(?P<PP_NUMBER>     \.?[0-9](?:[0-9a-df-oq-zA-DF-OQ-Z_.]|[eEpP][+-]?)*)
   |(?P<PUNCTUATOR>
       [,;?~(){}\[\]]
     | [!*/^=]=?
     | \#\#?
     | %(?:[=>]|:(?:%:)?)?
     | &[=&]?
     |\|[=|]?
     |\+[=+]?
     | -[-=>]?
     |\.(?:\.\.)?
     | :>?
     | <(?:[%:=]|<=?)?
     | >(?:=|>=?)?)
   |(?P<ESCNL>         \\(?:\r\n?|\n))
   |(?P<WHITESPACE>    [ \t\n\r\v\f]+)
   |(?P<OTHER>         .)
""", re.DOTALL | re.VERBOSE)
# A header name (<...> or "..."), which is a single token, but only
# immediately after #include; tokenize_c applies this pattern only in
# that context.  Newlines are not allowed inside a header name.
HEADER_NAME_RE_ = re.compile(r"""
    < [^>\r\n]+ >
  | " [^"\r\n]+ "
""", re.DOTALL | re.VERBOSE)
98 ENDLINE_RE_ = re.compile(r"""\r|\n|\r\n""")
# based on the sample code in the Python re documentation
Token_ = collections.namedtuple(
    "Token", ("kind", "text", "line", "column", "context"))
Token_.__doc__ = """One C preprocessing token, comment, or chunk of whitespace.

   'kind' identifies the token type, which will be one of:
       STRING, CHARCONST, BLOCK_COMMENT, LINE_COMMENT, IDENT,
       PP_NUMBER, PUNCTUATOR, ESCNL, WHITESPACE, HEADER_NAME,
       or OTHER.  The BAD_* alternatives in PP_TOKEN_RE_ are
       handled within tokenize_c, below.

   'text' is the sequence of source characters making up the token;
       no decoding whatsoever is performed.

   'line' and 'column' give the position of the first character of the
      token within the source file.  They are both 1-based.

   'context' indicates whether or not this token occurred within a
      preprocessing directive; it will be None for running text,
      '<null>' for the leading '#' of a directive line (because '#'
      all by itself on a line is a "null directive"), or the name of
      the directive for tokens within a directive line, starting with
      the IDENT for the name itself.
"""
def tokenize_c(file_contents, reporter):
    """Yield a series of Token objects, one for each preprocessing
       token, comment, or chunk of whitespace within FILE_CONTENTS.
       The REPORTER object is expected to have one method,
       reporter.error(token, message), which will be called to
       indicate a lexical error at the position of TOKEN.
       If MESSAGE contains the four-character sequence '{!r}', that
       is expected to be replaced by repr(token.text).
    """
    # Bind the module-level globals as locals for speed inside the loop.
    Token          = Token_
    PP_TOKEN_RE    = PP_TOKEN_RE_
    ENDLINE_RE     = ENDLINE_RE_
    HEADER_NAME_RE = HEADER_NAME_RE_

    line_num   = 1      # current line number, 1-based
    line_start = 0      # offset of the start of the current line
    pos        = 0      # scan position
    limit      = len(file_contents)
    directive  = None   # name of the directive being scanned, if any
    at_bol     = True   # at beginning of line (ignoring whitespace)?
    while pos < limit:
        # Immediately after '#include', a header name (<...> or "...")
        # is a single token; everywhere else the same characters are
        # punctuators and string literals.
        if directive == "include":
            mo = HEADER_NAME_RE.match(file_contents, pos)
            if mo:
                kind = "HEADER_NAME"
                directive = "after_include"
            else:
                mo = PP_TOKEN_RE.match(file_contents, pos)
                kind = mo.lastgroup
                if kind != "WHITESPACE":
                    directive = "after_include"
        else:
            mo = PP_TOKEN_RE.match(file_contents, pos)
            kind = mo.lastgroup

        text = mo.group()
        line = line_num
        column = mo.start() - line_start
        adj_line_start = 0
        # only these kinds can contain a newline
        if kind in ("WHITESPACE", "BLOCK_COMMENT", "LINE_COMMENT",
                    "STRING", "CHARCONST", "BAD_BLOCK_COM", "ESCNL"):
            for tmo in ENDLINE_RE.finditer(text):
                line_num += 1
                adj_line_start = tmo.end()
            if adj_line_start:
                line_start = mo.start() + adj_line_start

        # Track whether or not we are scanning a preprocessing directive.
        if kind == "LINE_COMMENT" or (kind == "WHITESPACE" and adj_line_start):
            at_bol = True
            directive = None
        else:
            if kind == "PUNCTUATOR" and text == "#" and at_bol:
                directive = "<null>"
            elif kind == "IDENT" and directive == "<null>":
                directive = text
            at_bol = False

        # Report ill-formed tokens and rewrite them as their well-formed
        # equivalents, so downstream processing doesn't have to know about them.
        # (Rewriting instead of discarding provides better error recovery.)
        if kind == "BAD_BLOCK_COM":
            reporter.error(Token("BAD_BLOCK_COM", "", line, column+1, ""),
                           "unclosed block comment")
            text += "*/"
            kind = "BLOCK_COMMENT"
        elif kind == "BAD_STRING":
            reporter.error(Token("BAD_STRING", "", line, column+1, ""),
                           "unclosed string")
            text += "\""
            kind = "STRING"
        elif kind == "BAD_CHARCONST":
            reporter.error(Token("BAD_CHARCONST", "", line, column+1, ""),
                           "unclosed char constant")
            text += "'"
            kind = "CHARCONST"

        tok = Token(kind, text, line, column+1,
                    "include" if directive == "after_include" else directive)
        # Do not complain about OTHER tokens inside macro definitions.
        # $ and @ appear in macros defined by headers intended to be
        # included from assembly language, e.g. sysdeps/mips/sys/asm.h.
        if kind == "OTHER" and directive != "define":
            # Bug fix: this is a plain function, not a method; there is
            # no 'self' here.  Errors must go through REPORTER.
            reporter.error(tok, "stray {!r} in program")

        yield tok
        pos = mo.end()
# Base and generic classes for individual checks.

class ConstructChecker:
    """Scan a stream of C preprocessing tokens and possibly report
       problems with them.  The REPORTER object passed to __init__ has
       one method, reporter.error(token, message), which should be
       called to indicate a problem detected at the position of TOKEN.
       If MESSAGE contains the four-character sequence '{!r}' then that
       will be replaced with a textual representation of TOKEN.
    """
    def __init__(self, reporter):
        # The shared error sink; subclasses report through it.
        self.reporter = reporter

    def examine(self, tok):
        """Called once for each token in a header file.
           Call self.reporter.error if a problem is detected.
        """
        raise NotImplementedError

    def eof(self):
        """Called once at the end of the stream.  Subclasses need only
           override this if it might have something to do."""
class NoCheck(ConstructChecker):
    """Generic checker class which doesn't do anything.  Substitute this
       class for a real checker when a particular check should be skipped
       for some file."""

    def examine(self, tok):
        pass
#
# Check for obsolete type names.
#

# The obsolete type names we're looking for: group 1 captures an
# optional '__' prefix (the implementation-namespace spelling), group 2
# the bare obsolete name.
OBSOLETE_TYPE_RE_ = re.compile(r"""\A
  (__)?
  (   quad_t
    | u(?: short | int | long
         | _(?: char | short | int(?:[0-9]+_t)? | long | quad_t )))
\Z""", re.VERBOSE)
class ObsoleteNotAllowed(ConstructChecker):
    """Don't allow any use of the obsolete typedefs."""
    def examine(self, tok):
        # Any identifier matching the obsolete-name pattern, with or
        # without the '__' prefix, is reported.
        if OBSOLETE_TYPE_RE_.match(tok.text):
            self.reporter.error(tok, "use of {!r}")
class ObsoletePrivateDefinitionsAllowed(ConstructChecker):
    """Allow definitions of the private versions of the
       obsolete typedefs; that is, 'typedef [anything] __obsolete;'
    """
    def __init__(self, reporter):
        super().__init__(reporter)
        self.in_typedef = False    # currently inside a typedef declaration?
        self.prev_token = None     # most recent token pending a check

    def examine(self, tok):
        # bits/types.h hides 'typedef' in a macro sometimes.
        if (tok.kind == "IDENT"
            and tok.text in ("typedef", "__STD_TYPE")
            and tok.context is None):
            self.in_typedef = True
        elif tok.kind == "PUNCTUATOR" and tok.text == ";" and self.in_typedef:
            self.in_typedef = False
            # The token just before ';' is the name being defined; only
            # the __-prefixed (private) spelling is permitted there.
            if self.prev_token.kind == "IDENT":
                m = OBSOLETE_TYPE_RE_.match(self.prev_token.text)
                if m and m.group(1) != "__":
                    self.reporter.error(self.prev_token, "use of {!r}")
            self.prev_token = None
        else:
            self._check_prev()
            self.prev_token = tok

    def eof(self):
        # The last token of the stream still needs checking.
        self._check_prev()

    def _check_prev(self):
        # Report the remembered token if it names an obsolete type.
        if (self.prev_token is not None
            and self.prev_token.kind == "IDENT"
            and OBSOLETE_TYPE_RE_.match(self.prev_token.text)):
            self.reporter.error(self.prev_token, "use of {!r}")
class ObsoletePublicDefinitionsAllowed(ConstructChecker):
    """Allow definitions of the public versions of the obsolete
       typedefs.  Only specific forms of definition are allowed:

           typedef __obsolete obsolete;  // identifiers must agree
           typedef __uintN_t u_intN_t;   // N must agree
           typedef unsigned long int ulong;
           typedef unsigned short int ushort;
           typedef unsigned int uint;
    """
    def __init__(self, reporter):
        super().__init__(reporter)
        self.typedef_tokens = []   # significant tokens of the open typedef

    def examine(self, tok):
        if tok.kind in ("WHITESPACE", "BLOCK_COMMENT",
                        "LINE_COMMENT", "NL", "ESCNL"):
            pass

        elif (tok.kind == "IDENT" and tok.text == "typedef"
              and tok.context is None):
            if self.typedef_tokens:
                self.reporter.error(tok, "typedef inside typedef")
                self._reset()
            self.typedef_tokens.append(tok)

        elif tok.kind == "PUNCTUATOR" and tok.text == ";":
            self._finish()

        elif self.typedef_tokens:
            self.typedef_tokens.append(tok)

    def eof(self):
        self._reset()

    def _reset(self):
        # Flush the accumulated typedef, reporting every obsolete
        # identifier it contained.
        while self.typedef_tokens:
            tok = self.typedef_tokens.pop(0)
            if tok.kind == "IDENT" and OBSOLETE_TYPE_RE_.match(tok.text):
                self.reporter.error(tok, "use of {!r}")

    def _finish(self):
        # End of a typedef declaration: accept it silently only when it
        # is one of the permitted public-definition forms.
        if not self.typedef_tokens: return
        if self.typedef_tokens[-1].kind == "IDENT":
            m = OBSOLETE_TYPE_RE_.match(self.typedef_tokens[-1].text)
            if m and self._permissible_public_definition(m):
                self.typedef_tokens.clear()
        self._reset()

    def _permissible_public_definition(self, m):
        # M matched the name being defined (last token before ';').
        if m.group(1) == "__": return False
        name = m.group(2)
        toks = self.typedef_tokens
        ntok = len(toks)
        if ntok == 3 and toks[1].kind == "IDENT":
            defn = toks[1].text
            n = OBSOLETE_TYPE_RE_.match(defn)
            # typedef __obsolete obsolete; -- identifiers must agree
            if n and n.group(1) == "__" and n.group(2) == name:
                return True

            # typedef __uintN_t u_intN_t; -- N must agree
            if (name[:5] == "u_int" and name[-2:] == "_t"
                and defn[:6] == "__uint" and defn[-2:] == "_t"
                and name[5:-2] == defn[6:-2]):
                return True

            return False

        # typedef unsigned long int ulong;
        if (name == "ulong" and ntok == 5
            and toks[1].kind == "IDENT" and toks[1].text == "unsigned"
            and toks[2].kind == "IDENT" and toks[2].text == "long"
            and toks[3].kind == "IDENT" and toks[3].text == "int"):
            return True

        # typedef unsigned short int ushort;
        if (name == "ushort" and ntok == 5
            and toks[1].kind == "IDENT" and toks[1].text == "unsigned"
            and toks[2].kind == "IDENT" and toks[2].text == "short"
            and toks[3].kind == "IDENT" and toks[3].text == "int"):
            return True

        # typedef unsigned int uint;
        if (name == "uint" and ntok == 4
            and toks[1].kind == "IDENT" and toks[1].text == "unsigned"
            and toks[2].kind == "IDENT" and toks[2].text == "int"):
            return True

        return False
def ObsoleteTypedefChecker(reporter, fname):
    """Factory: produce an instance of the appropriate
       obsolete-typedef checker for FNAME."""

    # The obsolete rpc/ and rpcsvc/ headers are allowed to use the
    # obsolete types, because it would be more trouble than it's
    # worth to remove them from headers that we intend to stop
    # installing eventually anyway.
    if (fname.startswith("rpc/")
        or fname.startswith("rpcsvc/")
        or "/rpc/" in fname
        or "/rpcsvc/" in fname):
        return NoCheck(reporter)

    # bits/types.h is allowed to define the __-versions of the
    # obsolete types.
    if (fname == "bits/types.h"
        or fname.endswith("/bits/types.h")):
        return ObsoletePrivateDefinitionsAllowed(reporter)

    # sys/types.h is allowed to use the __-versions of the
    # obsolete types, but only to define the unprefixed versions.
    if (fname == "sys/types.h"
        or fname.endswith("/sys/types.h")):
        return ObsoletePublicDefinitionsAllowed(reporter)

    return ObsoleteNotAllowed(reporter)
423 """Perform all of the checks on each header. This is also the
424 "reporter" object expected by tokenize_c and ConstructChecker.
430 def error(self
, tok
, message
):
432 if '{!r}' in message
:
433 message
= message
.format(tok
.text
)
434 sys
.stderr
.write("{}:{}:{}: error: {}\n".format(
435 self
.fname
, tok
.line
, tok
.column
, message
))
437 def check(self
, fname
):
440 with
open(fname
, "rt", encoding
="utf-8") as fp
:
443 sys
.stderr
.write("{}: {}\n".format(fname
, e
.strerror
))
447 typedef_checker
= ObsoleteTypedefChecker(self
, self
.fname
)
449 for tok
in tokenize_c(contents
, self
):
450 typedef_checker
.examine(tok
)
def main():
    """Command-line entry point: scan each header named on the command
       line, report obsolete constructs to stderr, and exit nonzero if
       any were found."""
    ap = argparse.ArgumentParser(description=__doc__)
    ap.add_argument("headers", metavar="header", nargs="+",
                    help="one or more headers to scan for obsolete constructs")
    args = ap.parse_args()

    checker = HeaderChecker()
    for fname in args.headers:
        # Headers whose installed name begins with "finclude/" contain
        # Fortran, not C, and this program should completely ignore them.
        if not (fname.startswith("finclude/") or "/finclude/" in fname):
            checker.check(fname)
    sys.exit(checker.status)