scripts/check-obsolete-constructs.py

   1 #! /usr/bin/python3
   2 # Copyright (C) 2019-2020 Free Software Foundation, Inc.
   3 # This file is part of the GNU C Library.
   4 #
   5 # The GNU C Library is free software; you can redistribute it and/or
   6 # modify it under the terms of the GNU Lesser General Public
   7 # License as published by the Free Software Foundation; either
   8 # version 2.1 of the License, or (at your option) any later version.
   9 #
  10 # The GNU C Library is distributed in the hope that it will be useful,
  11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13 # Lesser General Public License for more details.
  14 #
  15 # You should have received a copy of the GNU Lesser General Public
  16 # License along with the GNU C Library; if not, see
  17 # <https://www.gnu.org/licenses/>.
  18
  19 """Verifies that installed headers do not use any obsolete constructs:
  20  * legacy BSD typedefs superseded by <stdint.h>:
  21    ushort uint ulong u_char u_short u_int u_long u_intNN_t quad_t u_quad_t
  22    (sys/types.h is allowed to _define_ these types, but not to use them
  23     to define anything else).
  24 """
  25
  26 import argparse
  27 import collections
  28 import re
  29 import sys
  30
  31 # Simplified lexical analyzer for C preprocessing tokens.
  32 # Does not implement trigraphs.
  33 # Does not implement backslash-newline in the middle of any lexical
  34 #   item other than a string literal.
  35 # Does not implement universal-character-names in identifiers.
  36 # Treats prefixed strings (e.g. L"...") as two tokens (L and "...")
  37 # Accepts non-ASCII characters only within comments and strings.
  38
  39 # Caution: The order of the outermost alternation matters.
  40 # STRING must be before BAD_STRING, CHARCONST before BAD_CHARCONST,
  41 # BLOCK_COMMENT before BAD_BLOCK_COM before PUNCTUATOR, and OTHER must
  42 # be last.
  43 # Caution: There should be no capturing groups other than the named
  44 # captures in the outermost alternation.
  45
  46 # For reference, these are all of the C punctuators as of C11:
  47 #   [ ] ( ) { } , ; ? ~
  48 #   ! != * *= / /= ^ ^= = ==
  49 #   # ##
  50 #   % %= %> %: %:%:
  51 #   & &= &&
  52 #   | |= ||
  53 #   + += ++
  54 #   - -= -- ->
  55 #   . ...
  56 #   : :>
  57 #   < <% <: << <<= <=
  58 #   > >= >> >>=
  59
  60 # The BAD_* tokens are not part of the official definition of pp-tokens;
  61 # they match unclosed strings, character constants, and block comments,
  62 # so that the regex engine doesn't have to backtrack all the way to the
  63 # beginning of a broken construct and then emit dozens of junk tokens.
  64
  65 PP_TOKEN_RE_ = re.compile(r"""
  66     (?P<STRING>        \"(?:[^\"\\\r\n]|\\(?:[\r\n -~]|\r\n))*\")
  67    |(?P<BAD_STRING>    \"(?:[^\"\\\r\n]|\\[ -~])*)
  68    |(?P<CHARCONST>     \'(?:[^\'\\\r\n]|\\(?:[\r\n -~]|\r\n))*\')
  69    |(?P<BAD_CHARCONST> \'(?:[^\'\\\r\n]|\\[ -~])*)
  70    |(?P<BLOCK_COMMENT> /\*(?:\*(?!/)|[^*])*\*/)
  71    |(?P<BAD_BLOCK_COM> /\*(?:\*(?!/)|[^*])*\*?)
  72    |(?P<LINE_COMMENT>  //[^\r\n]*)
  73    |(?P<IDENT>         [_a-zA-Z][_a-zA-Z0-9]*)
  74    |(?P<PP_NUMBER>     \.?[0-9](?:[0-9a-df-oq-zA-DF-OQ-Z_.]|[eEpP][+-]?)*)
  75    |(?P<PUNCTUATOR>
  76        [,;?~(){}\[\]]
  77      | [!*/^=]=?
  78      | \#\#?
  79      | %(?:[=>]|:(?:%:)?)?
  80      | &[=&]?
  81      |\|[=|]?
  82      |\+[=+]?
  83      | -[=->]?
  84      |\.(?:\.\.)?
  85      | :>?
  86      | <(?:[%:]|<(?:=|<=?)?)?
  87      | >(?:=|>=?)?)
  88    |(?P<ESCNL>         \\(?:\r|\n|\r\n))
  89    |(?P<WHITESPACE>    [ \t\n\r\v\f]+)
  90    |(?P<OTHER>         .)
  91 """, re.DOTALL | re.VERBOSE)
  92
  93 HEADER_NAME_RE_ = re.compile(r"""
  94     < [^>\r\n]+ >
  95   | " [^"\r\n]+ "
  96 """, re.DOTALL | re.VERBOSE)
  97
  98 ENDLINE_RE_ = re.compile(r"""\r|\n|\r\n""")
  99
 100 # based on the sample code in the Python re documentation
 101 Token_ = collections.namedtuple("Token", (
 102     "kind", "text", "line", "column", "context"))
 103 Token_.__doc__ = """
 104    One C preprocessing token, comment, or chunk of whitespace.
 105    'kind' identifies the token type, which will be one of:
 106        STRING, CHARCONST, BLOCK_COMMENT, LINE_COMMENT, IDENT,
 107        PP_NUMBER, PUNCTUATOR, ESCNL, WHITESPACE, HEADER_NAME,
 108        or OTHER.  The BAD_* alternatives in PP_TOKEN_RE_ are
 109        handled within tokenize_c, below.
 110
 111    'text' is the sequence of source characters making up the token;
 112        no decoding whatsoever is performed.
 113
 114    'line' and 'column' give the position of the first character of the
 115       token within the source file.  They are both 1-based.
 116
 117    'context' indicates whether or not this token occurred within a
 118       preprocessing directive; it will be None for running text,
 119       '<null>' for the leading '#' of a directive line (because '#'
 120       all by itself on a line is a "null directive"), or the name of
 121       the directive for tokens within a directive line, starting with
 122       the IDENT for the name itself.
 123 """
 124
 125 def tokenize_c(file_contents, reporter):
 126     """Yield a series of Token objects, one for each preprocessing
 127        token, comment, or chunk of whitespace within FILE_CONTENTS.
 128        The REPORTER object is expected to have one method,
 129        reporter.error(token, message), which will be called to
 130        indicate a lexical error at the position of TOKEN.
 131        If MESSAGE contains the four-character sequence '{!r}', that
 132        is expected to be replaced by repr(token.text).
 133     """
 134
 135     Token = Token_
 136     PP_TOKEN_RE = PP_TOKEN_RE_
 137     ENDLINE_RE = ENDLINE_RE_
 138     HEADER_NAME_RE = HEADER_NAME_RE_
 139
 140     line_num = 1
 141     line_start = 0
 142     pos = 0
 143     limit = len(file_contents)
 144     directive = None
 145     at_bol = True
 146     while pos < limit:
 147         if directive == "include":
 148             mo = HEADER_NAME_RE.match(file_contents, pos)
 149             if mo:
 150                 kind = "HEADER_NAME"
 151                 directive = "after_include"
 152             else:
 153                 mo = PP_TOKEN_RE.match(file_contents, pos)
 154                 kind = mo.lastgroup
 155                 if kind != "WHITESPACE":
 156                     directive = "after_include"
 157         else:
 158             mo = PP_TOKEN_RE.match(file_contents, pos)
 159             kind = mo.lastgroup
 160
 161         text = mo.group()
 162         line = line_num
 163         column = mo.start() - line_start
 164         adj_line_start = 0
 165         # only these kinds can contain a newline
 166         if kind in ("WHITESPACE", "BLOCK_COMMENT", "LINE_COMMENT",
 167                     "STRING", "CHARCONST", "BAD_BLOCK_COM", "ESCNL"):
 168             for tmo in ENDLINE_RE.finditer(text):
 169                 line_num += 1
 170                 adj_line_start = tmo.end()
 171             if adj_line_start:
 172                 line_start = mo.start() + adj_line_start
 173
 174         # Track whether or not we are scanning a preprocessing directive.
 175         if kind == "LINE_COMMENT" or (kind == "WHITESPACE" and adj_line_start):
 176             at_bol = True
 177             directive = None
 178         else:
 179             if kind == "PUNCTUATOR" and text == "#" and at_bol:
 180                 directive = "<null>"
 181             elif kind == "IDENT" and directive == "<null>":
 182                 directive = text
 183             at_bol = False
 184
 185         # Report ill-formed tokens and rewrite them as their well-formed
 186         # equivalents, so downstream processing doesn't have to know about them.
 187         # (Rewriting instead of discarding provides better error recovery.)
 188         if kind == "BAD_BLOCK_COM":
 189             reporter.error(Token("BAD_BLOCK_COM", "", line, column+1, ""),
 190                            "unclosed block comment")
 191             text += "*/"
 192             kind = "BLOCK_COMMENT"
 193         elif kind == "BAD_STRING":
 194             reporter.error(Token("BAD_STRING", "", line, column+1, ""),
 195                            "unclosed string")
 196             text += "\""
 197             kind = "STRING"
 198         elif kind == "BAD_CHARCONST":
 199             reporter.error(Token("BAD_CHARCONST", "", line, column+1, ""),
 200                            "unclosed char constant")
 201             text += "'"
 202             kind = "CHARCONST"
 203
 204         tok = Token(kind, text, line, column+1,
 205                     "include" if directive == "after_include" else directive)
 206         # Do not complain about OTHER tokens inside macro definitions.
 207         # $ and @ appear in macros defined by headers intended to be
 208         # included from assembly language, e.g. sysdeps/mips/sys/asm.h.
 209         if kind == "OTHER" and directive != "define":
 210             self.error(tok, "stray {!r} in program")
 211
 212         yield tok
 213         pos = mo.end()
 214
 215 #
 216 # Base and generic classes for individual checks.
 217 #
 218
 219 class ConstructChecker:
 220     """Scan a stream of C preprocessing tokens and possibly report
 221        problems with them.  The REPORTER object passed to __init__ has
 222        one method, reporter.error(token, message), which should be
 223        called to indicate a problem detected at the position of TOKEN.
 224        If MESSAGE contains the four-character sequence '{!r}' then that
 225        will be replaced with a textual representation of TOKEN.
 226     """
 227     def __init__(self, reporter):
 228         self.reporter = reporter
 229
 230     def examine(self, tok):
 231         """Called once for each token in a header file.
 232            Call self.reporter.error if a problem is detected.
 233         """
 234         raise NotImplementedError
 235
 236     def eof(self):
 237         """Called once at the end of the stream.  Subclasses need only
 238            override this if it might have something to do."""
 239         pass
 240
 241 class NoCheck(ConstructChecker):
 242     """Generic checker class which doesn't do anything.  Substitute this
 243        class for a real checker when a particular check should be skipped
 244        for some file."""
 245
 246     def examine(self, tok):
 247         pass
 248
 249 #
 250 # Check for obsolete type names.
 251 #
 252
# The obsolete type names we're looking for:
#   quad_t ushort uint ulong
#   u_char u_short u_int u_intNN_t u_long u_quad_t
# each optionally prefixed with "__".  The pattern is anchored with
# \A and \Z, so it only matches when the whole string is one of these
# names.  Group 1 is the "__" prefix (None when absent) and group 2
# is the name without that prefix.
OBSOLETE_TYPE_RE_ = re.compile(r"""\A
  (__)?
  (   quad_t
    | u(?: short | int | long
         | _(?: char | short | int(?:[0-9]+_t)? | long | quad_t )))
\Z""", re.VERBOSE)
 260
 261 class ObsoleteNotAllowed(ConstructChecker):
 262     """Don't allow any use of the obsolete typedefs."""
 263     def examine(self, tok):
 264         if OBSOLETE_TYPE_RE_.match(tok.text):
 265             self.reporter.error(tok, "use of {!r}")
 266
 267 class ObsoletePrivateDefinitionsAllowed(ConstructChecker):
 268     """Allow definitions of the private versions of the
 269        obsolete typedefs; that is, 'typedef [anything] __obsolete;'
 270     """
 271     def __init__(self, reporter):
 272         super().__init__(reporter)
 273         self.in_typedef = False
 274         self.prev_token = None
 275
 276     def examine(self, tok):
 277         # bits/types.h hides 'typedef' in a macro sometimes.
 278         if (tok.kind == "IDENT"
 279             and tok.text in ("typedef", "__STD_TYPE")
 280             and tok.context is None):
 281             self.in_typedef = True
 282         elif tok.kind == "PUNCTUATOR" and tok.text == ";" and self.in_typedef:
 283             self.in_typedef = False
 284             if self.prev_token.kind == "IDENT":
 285                 m = OBSOLETE_TYPE_RE_.match(self.prev_token.text)
 286                 if m and m.group(1) != "__":
 287                     self.reporter.error(self.prev_token, "use of {!r}")
 288             self.prev_token = None
 289         else:
 290             self._check_prev()
 291
 292         self.prev_token = tok
 293
 294     def eof(self):
 295         self._check_prev()
 296
 297     def _check_prev(self):
 298         if (self.prev_token is not None
 299             and self.prev_token.kind == "IDENT"
 300             and OBSOLETE_TYPE_RE_.match(self.prev_token.text)):
 301             self.reporter.error(self.prev_token, "use of {!r}")
 302
 303 class ObsoletePublicDefinitionsAllowed(ConstructChecker):
 304     """Allow definitions of the public versions of the obsolete
 305        typedefs.  Only specific forms of definition are allowed:
 306
 307            typedef __obsolete obsolete;  // identifiers must agree
 308            typedef __uintN_t u_intN_t;   // N must agree
 309            typedef unsigned long int ulong;
 310            typedef unsigned short int ushort;
 311            typedef unsigned int uint;
 312     """
 313     def __init__(self, reporter):
 314         super().__init__(reporter)
 315         self.typedef_tokens = []
 316
 317     def examine(self, tok):
 318         if tok.kind in ("WHITESPACE", "BLOCK_COMMENT",
 319                         "LINE_COMMENT", "NL", "ESCNL"):
 320             pass
 321
 322         elif (tok.kind == "IDENT" and tok.text == "typedef"
 323               and tok.context is None):
 324             if self.typedef_tokens:
 325                 self.reporter.error(tok, "typedef inside typedef")
 326                 self._reset()
 327             self.typedef_tokens.append(tok)
 328
 329         elif tok.kind == "PUNCTUATOR" and tok.text == ";":
 330             self._finish()
 331
 332         elif self.typedef_tokens:
 333             self.typedef_tokens.append(tok)
 334
 335     def eof(self):
 336         self._reset()
 337
 338     def _reset(self):
 339         while self.typedef_tokens:
 340             tok = self.typedef_tokens.pop(0)
 341             if tok.kind == "IDENT" and OBSOLETE_TYPE_RE_.match(tok.text):
 342                 self.reporter.error(tok, "use of {!r}")
 343
 344     def _finish(self):
 345         if not self.typedef_tokens: return
 346         if self.typedef_tokens[-1].kind == "IDENT":
 347             m = OBSOLETE_TYPE_RE_.match(self.typedef_tokens[-1].text)
 348             if m:
 349                 if self._permissible_public_definition(m):
 350                     self.typedef_tokens.clear()
 351         self._reset()
 352
 353     def _permissible_public_definition(self, m):
 354         if m.group(1) == "__": return False
 355         name = m.group(2)
 356         toks = self.typedef_tokens
 357         ntok = len(toks)
 358         if ntok == 3 and toks[1].kind == "IDENT":
 359             defn = toks[1].text
 360             n = OBSOLETE_TYPE_RE_.match(defn)
 361             if n and n.group(1) == "__" and n.group(2) == name:
 362                 return True
 363
 364             if (name[:5] == "u_int" and name[-2:] == "_t"
 365                 and defn[:6] == "__uint" and defn[-2:] == "_t"
 366                 and name[5:-2] == defn[6:-2]):
 367                 return True
 368
 369             return False
 370
 371         if (name == "ulong" and ntok == 5
 372             and toks[1].kind == "IDENT" and toks[1].text == "unsigned"
 373             and toks[2].kind == "IDENT" and toks[2].text == "long"
 374             and toks[3].kind == "IDENT" and toks[3].text == "int"):
 375             return True
 376
 377         if (name == "ushort" and ntok == 5
 378             and toks[1].kind == "IDENT" and toks[1].text == "unsigned"
 379             and toks[2].kind == "IDENT" and toks[2].text == "short"
 380             and toks[3].kind == "IDENT" and toks[3].text == "int"):
 381             return True
 382
 383         if (name == "uint" and ntok == 4
 384             and toks[1].kind == "IDENT" and toks[1].text == "unsigned"
 385             and toks[2].kind == "IDENT" and toks[2].text == "int"):
 386             return True
 387
 388         return False
 389
 390 def ObsoleteTypedefChecker(reporter, fname):
 391     """Factory: produce an instance of the appropriate
 392        obsolete-typedef checker for FNAME."""
 393
 394     # The obsolete rpc/ and rpcsvc/ headers are allowed to use the
 395     # obsolete types, because it would be more trouble than it's
 396     # worth to remove them from headers that we intend to stop
 397     # installing eventually anyway.
 398     if (fname.startswith("rpc/")
 399         or fname.startswith("rpcsvc/")
 400         or "/rpc/" in fname
 401         or "/rpcsvc/" in fname):
 402         return NoCheck(reporter)
 403
 404     # bits/types.h is allowed to define the __-versions of the
 405     # obsolete types.
 406     if (fname == "bits/types.h"
 407         or fname.endswith("/bits/types.h")):
 408         return ObsoletePrivateDefinitionsAllowed(reporter)
 409
 410     # sys/types.h is allowed to use the __-versions of the
 411     # obsolete types, but only to define the unprefixed versions.
 412     if (fname == "sys/types.h"
 413         or fname.endswith("/sys/types.h")):
 414         return ObsoletePublicDefinitionsAllowed(reporter)
 415
 416     return ObsoleteNotAllowed(reporter)
 417
 418 #
 419 # Master control
 420 #
 421
 422 class HeaderChecker:
 423     """Perform all of the checks on each header.  This is also the
 424        "reporter" object expected by tokenize_c and ConstructChecker.
 425     """
 426     def __init__(self):
 427         self.fname = None
 428         self.status = 0
 429
 430     def error(self, tok, message):
 431         self.status = 1
 432         if '{!r}' in message:
 433             message = message.format(tok.text)
 434         sys.stderr.write("{}:{}:{}: error: {}\n".format(
 435             self.fname, tok.line, tok.column, message))
 436
 437     def check(self, fname):
 438         self.fname = fname
 439         try:
 440             with open(fname, "rt", encoding="utf-8") as fp:
 441                 contents = fp.read()
 442         except OSError as e:
 443             sys.stderr.write("{}: {}\n".format(fname, e.strerror))
 444             self.status = 1
 445             return
 446
 447         typedef_checker = ObsoleteTypedefChecker(self, self.fname)
 448
 449         for tok in tokenize_c(contents, self):
 450             typedef_checker.examine(tok)
 451
 452 def main():
 453     ap = argparse.ArgumentParser(description=__doc__)
 454     ap.add_argument("headers", metavar="header", nargs="+",
 455                     help="one or more headers to scan for obsolete constructs")
 456     args = ap.parse_args()
 457
 458     checker = HeaderChecker()
 459     for fname in args.headers:
 460         # Headers whose installed name begins with "finclude/" contain
 461         # Fortran, not C, and this program should completely ignore them.
 462         if not (fname.startswith("finclude/") or "/finclude/" in fname):
 463             checker.check(fname)
 464     sys.exit(checker.status)
 465
 466 main()