libcpp/lex.c

   1 /* CPP Library - lexical analysis.
   2    Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005, 2007, 2008 Free Software Foundation, Inc.
   3    Contributed by Per Bothner, 1994-95.
   4    Based on CCCP program by Paul Rubin, June 1986
   5    Adapted to ANSI C, Richard Stallman, Jan 1987
   6    Broken out to separate file, Zack Weinberg, Mar 2000
   7
   8 This program is free software; you can redistribute it and/or modify it
   9 under the terms of the GNU General Public License as published by the
  10 Free Software Foundation; either version 2, or (at your option) any
  11 later version.
  12
  13 This program is distributed in the hope that it will be useful,
  14 but WITHOUT ANY WARRANTY; without even the implied warranty of
  15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  16 GNU General Public License for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with this program; if not, write to the Free Software
  20 Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.  */
  21
  22 #include "config.h"
  23 #include "system.h"
  24 #include "cpplib.h"
  25 #include "internal.h"
  26 /* Spelling category of a token type, as stored in token_spellings.  */
  27 enum spell_type
  28 {
  29   SPELL_OPERATOR = 0,  /* Category given to every OP () table entry.  */
  30   SPELL_IDENT,         /* The remaining categories are chosen per token  */
  31   SPELL_LITERAL,       /* type by the TK () table entries, which paste   */
  32   SPELL_NONE           /* their second argument onto "SPELL_".           */
  33 };
  34 /* One row of the token_spellings table.  */
  35 struct token_spelling
  36 {
  37   enum spell_type category;   /* Spelling category; read via TOKEN_SPELL.  */
  38   const unsigned char *name;  /* Spelling string (OP) or stringified enumerator (TK); read via TOKEN_NAME.  */
  39 };
  40 /* Spellings of the six digraphs.  NOTE(review): how this table is indexed is not visible in this part of the file.  */
  41 static const unsigned char *const digraph_spellings[] =
  42 { UC"%:", UC"%:%:", UC"<:", UC":>", UC"<%", UC"%>" };
  43 /* Build token_spellings by expanding TTYPE_TABLE with temporary OP and TK definitions: OP rows get SPELL_OPERATOR and the spelling S, TK rows get SPELL_<S> and the stringified enumerator E.  */
  44 #define OP(e, s) { SPELL_OPERATOR, UC s  },
  45 #define TK(e, s) { SPELL_ ## s,    UC #e },
  46 static const struct token_spelling token_spellings[N_TTYPES] = { TTYPE_TABLE };
  47 #undef OP
  48 #undef TK
  49 /* Accessors: look up the table row for TOKEN's type and yield its category or name.  */
  50 #define TOKEN_SPELL(token) (token_spellings[(token)->type].category)
  51 #define TOKEN_NAME(token) (token_spellings[(token)->type].name)
  52 /* Forward declarations of file-local helpers that are defined further down in this file.  */
  53 static void add_line_note (cpp_buffer *, const uchar *, unsigned int);
  54 static int skip_line_comment (cpp_reader *);
  55 static void skip_whitespace (cpp_reader *, cppchar_t);
  56 static void lex_string (cpp_reader *, cpp_token *, const uchar *);
  57 static void save_comment (cpp_reader *, cpp_token *, const uchar *, cppchar_t);
  58 static void store_comment (cpp_reader *, cpp_token *);
  59 static void create_literal (cpp_reader *, cpp_token *, const uchar *,
  60                             unsigned int, enum cpp_ttype);
  61 static bool warn_in_comment (cpp_reader *, _cpp_line_note *);
  62 static int name_p (cpp_reader *, const cpp_string *);
  63 static tokenrun *next_tokenrun (tokenrun *);
  64 /* NOTE(review): the definition of new_buff is not in the visible part of the file.  */
  65 static _cpp_buff *new_buff (size_t);
  66
  67
  68 /* Utility routine:
  69
  70    Compares, the token TOKEN to the NUL-terminated string STRING.
  71    TOKEN must be a CPP_NAME.  Returns 1 for equal, 0 for unequal.  */
  72 int
  73 cpp_ideq (const cpp_token *token, const char *string)
  74 {
  75   if (token->type != CPP_NAME)
  76     return 0;
  77
  78   return !ustrcmp (NODE_NAME (token->val.node), (const uchar *) string);
  79 }
  80
  81 /* Record a note TYPE at byte POS into the current cleaned logical
  82    line.  */
  83 static void
  84 add_line_note (cpp_buffer *buffer, const uchar *pos, unsigned int type)
  85 {
  86   if (buffer->notes_used == buffer->notes_cap)
  87     {
  88       buffer->notes_cap = buffer->notes_cap * 2 + 200;
  89       buffer->notes = XRESIZEVEC (_cpp_line_note, buffer->notes,
  90                                   buffer->notes_cap);
  91     }
  92
  93   buffer->notes[buffer->notes_used].pos = pos;
  94   buffer->notes[buffer->notes_used].type = type;
  95   buffer->notes_used++;
  96 }
  97
  98 /* Returns with a logical line that contains no escaped newlines or
  99    trigraphs.  This is a time-critical inner loop.
     
        The line starts at buffer->next_line.  On return buffer->cur and
        buffer->line_base point at its start, the line is terminated by a
        '\n' written into the buffer, the note array holds one note per
        escaped newline / trigraph followed by a sentinel note, and
        buffer->next_line points just past the consumed input.  */
 100 void
 101 _cpp_clean_line (cpp_reader *pfile)
 102 {
 103   cpp_buffer *buffer;
 104   const uchar *s;
 105   uchar c, *d, *p;
 106
 107   buffer = pfile->buffer;
 108   buffer->cur_note = buffer->notes_used = 0;
 109   buffer->cur = buffer->line_base = buffer->next_line;
 110   buffer->need_line = false;
 111   s = buffer->next_line - 1;
 112
 113   if (!buffer->from_stage3)
 114     {
 115       const uchar *pbackslash = NULL;
 116
 117       /* Short circuit for the common case of an un-escaped line with
 118          no trigraphs.  The primary win here is by not writing any
 119          data back to memory until we have to.  */
 120       for (;;)
 121         {
 122           c = *++s;
 123           if (__builtin_expect (c == '\n', false)
 124               || __builtin_expect (c == '\r', false))
 125             {
 126               d = (uchar *) s;
 127
 128               if (__builtin_expect (s == buffer->rlimit, false))
 129                 goto done;
 130
 131               /* DOS line ending? */
 132               if (__builtin_expect (c == '\r', false)
 133                   && s[1] == '\n')
 134                 {
 135                   s++;
 136                   if (s == buffer->rlimit)
 137                     goto done;
 138                 }
 139
                   /* No backslash seen on this line, so the newline cannot be
                      escaped.  */
 140               if (__builtin_expect (pbackslash == NULL, true))
 141                 goto done;
 142
 143               /* Check for escaped newline.  */
 144               p = d;
 145               while (is_nvspace (p[-1]))
 146                 p--;
 147               if (p - 1 != pbackslash)
 148                 goto done;
 149
 150               /* Have an escaped newline; process it and proceed to
 151                  the slow path.  */
 152               add_line_note (buffer, p - 1, p != d ? ' ' : '\\');
 153               d = p - 2;
 154               buffer->next_line = p - 1;
 155               break;
 156             }
 157           if (__builtin_expect (c == '\\', false))
 158             pbackslash = s;
 159           else if (__builtin_expect (c == '?', false)
 160                    && __builtin_expect (s[1] == '?', false)
 161                    && _cpp_trigraph_map[s[2]])
 162             {
 163               /* Have a trigraph.  We may or may not have to convert
 164                  it.  Add a line note regardless, for -Wtrigraphs.  */
 165               add_line_note (buffer, s, s[2]);
 166               if (CPP_OPTION (pfile, trigraphs))
 167                 {
 168                   /* We do, and that means we have to switch to the
 169                      slow path.  */
 170                   d = (uchar *) s;
 171                   *d = _cpp_trigraph_map[s[2]];
 172                   s += 2;
 173                   break;
 174                 }
 175             }
 176         }
 177
 178       /* Slow path: copy each character from S down to D, dropping escaped newlines and replacing trigraphs as we go.  */
 179       for (;;)
 180         {
 181           c = *++s;
 182           *++d = c;
 183
 184           if (c == '\n' || c == '\r')
 185             {
 186                   /* Handle DOS line endings.  */
 187               if (c == '\r' && s != buffer->rlimit && s[1] == '\n')
 188                 s++;
 189               if (s == buffer->rlimit)
 190                 break;
 191
 192               /* Escaped?  */
 193               p = d;
 194               while (p != buffer->next_line && is_nvspace (p[-1]))
 195                 p--;
 196               if (p == buffer->next_line || p[-1] != '\\')
 197                 break;
 198
                   /* The note is ' ' when whitespace separated the backslash
                      from the newline, '\\' otherwise.  */
 199               add_line_note (buffer, p - 1, p != d ? ' ': '\\');
 200               d = p - 2;
 201               buffer->next_line = p - 1;
 202             }
 203           else if (c == '?' && s[1] == '?' && _cpp_trigraph_map[s[2]])
 204             {
 205               /* Add a note regardless, for the benefit of -Wtrigraphs.  */
 206               add_line_note (buffer, d, s[2]);
 207               if (CPP_OPTION (pfile, trigraphs))
 208                 {
 209                   *d = _cpp_trigraph_map[s[2]];
 210                   s += 2;
 211                 }
 212             }
 213         }
 214     }
 215   else
 216     {
       /* No cleaning is done for a from_stage3 buffer: just find the end
          of the line.  */
 217       do
 218         s++;
 219       while (*s != '\n' && *s != '\r');
 220       d = (uchar *) s;
 221
 222       /* Handle DOS line endings.  */
 223       if (*s == '\r' && s != buffer->rlimit && s[1] == '\n')
 224         s++;
 225     }
 226
 227  done:
   /* D is where the cleaned line ends; S is the last input byte consumed.  */
 228   *d = '\n';
 229   /* A sentinel note that should never be processed.  */
 230   add_line_note (buffer, d + 1, '\n');
 231   buffer->next_line = s + 1;
 232 }
 233
 234 /* Return true if the trigraph indicated by NOTE should be warned
 235    about in a comment.  */
 236 static bool
 237 warn_in_comment (cpp_reader *pfile, _cpp_line_note *note)
 238 {
 239   const uchar *p;
 240
 241   /* Within comments we don't warn about trigraphs, unless the
 242      trigraph forms an escaped newline, as that may change
 243      behavior.  */
 244   if (note->type != '/')
 245     return false;
 246
 247   /* If -trigraphs, then this was an escaped newline iff the next note
 248      is coincident.  */
 249   if (CPP_OPTION (pfile, trigraphs))
 250     return note[1].pos == note->pos;
 251
 252   /* Otherwise, see if this forms an escaped newline.  */
 253   p = note->pos + 3;
 254   while (is_nvspace (*p))
 255     p++;
 256
 257   /* There might have been escaped newlines between the trigraph and the
 258      newline we found.  Hence the position test.  */
 259   return (*p == '\n' && p < note[1].pos);
 260 }
 261
 262 /* Process the notes created by add_line_note as far as the current
 263    location.  */
 264 void
 265 _cpp_process_line_notes (cpp_reader *pfile, int in_comment)
 266 {
 267   cpp_buffer *buffer = pfile->buffer;
 268
 269   for (;;)
 270     {
 271       _cpp_line_note *note = &buffer->notes[buffer->cur_note];
 272       unsigned int col;
 273
 274       if (note->pos > buffer->cur)
 275         break;
 276
 277       buffer->cur_note++;
 278       col = CPP_BUF_COLUMN (buffer, note->pos + 1);
 279
 280       if (note->type == '\\' || note->type == ' ')
 281         {
 282           if (note->type == ' ' && !in_comment)
 283             cpp_error_with_line (pfile, CPP_DL_WARNING, pfile->line_table->highest_line, col,
 284                                  "backslash and newline separated by space");
 285
 286           if (buffer->next_line > buffer->rlimit)
 287             {
 288               cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line, col,
 289                                    "backslash-newline at end of file");
 290               /* Prevent "no newline at end of file" warning.  */
 291               buffer->next_line = buffer->rlimit;
 292             }
 293
 294           buffer->line_base = note->pos;
 295           CPP_INCREMENT_LINE (pfile, 0);
 296         }
 297       else if (_cpp_trigraph_map[note->type])
 298         {
 299           if (CPP_OPTION (pfile, warn_trigraphs)
 300               && (!in_comment || warn_in_comment (pfile, note)))
 301             {
 302               if (CPP_OPTION (pfile, trigraphs))
 303                 cpp_error_with_line (pfile, CPP_DL_WARNING, pfile->line_table->highest_line, col,
 304                                      "trigraph ??%c converted to %c",
 305                                      note->type,
 306                                      (int) _cpp_trigraph_map[note->type]);
 307               else
 308                 {
 309                   cpp_error_with_line
 310                     (pfile, CPP_DL_WARNING, pfile->line_table->highest_line, col,
 311                      "trigraph ??%c ignored, use -trigraphs to enable",
 312                      note->type);
 313                 }
 314             }
 315         }
 316       else
 317         abort ();
 318     }
 319 }
 320
 321 /* Skip a C-style block comment.  We find the end of the comment by
 322    seeing if an asterisk is before every '/' we encounter.  Returns
 323    nonzero if comment terminated by EOF, zero otherwise.
 324
 325    Buffer->cur points to the initial asterisk of the comment.  On a
        normal return buffer->cur is just past the closing '/', and the
        line notes up to that point have been processed as comment
        notes.  */
 326 bool
 327 _cpp_skip_block_comment (cpp_reader *pfile)
 328 {
 329   cpp_buffer *buffer = pfile->buffer;
 330   const uchar *cur = buffer->cur;
 331   uchar c;
 332
   /* Step over the opening '*'.  A '/' directly after it is skipped too,
      so that it cannot pair with that '*' to close the comment.  */
 333   cur++;
 334   if (*cur == '/')
 335     cur++;
 336
 337   for (;;)
 338     {
 339       /* People like decorating comments with '*', so check for '/'
 340          instead for efficiency.  */
 341       c = *cur++;
 342
 343       if (c == '/')
 344         {
 345           if (cur[-2] == '*')
 346             break;
 347
 348           /* Warn about potential nested comments, but not if the '/'
 349              comes immediately before the true comment delimiter.
 350              Don't bother to get it right across escaped newlines.  */
 351           if (CPP_OPTION (pfile, warn_comments)
 352               && cur[0] == '*' && cur[1] != '/')
 353             {
 354               buffer->cur = cur;
 355               cpp_error_with_line (pfile, CPP_DL_WARNING,
 356                                    pfile->line_table->highest_line, CPP_BUF_COL (buffer),
 357                                    "\"/*\" within comment");
 358             }
 359         }
 360       else if (c == '\n')
 361         {
 362           unsigned int cols;
           /* End of the current logical line: flush its notes, then load
              the next line, or report EOF if there is none.  */
 363           buffer->cur = cur - 1;
 364           _cpp_process_line_notes (pfile, true);
 365           if (buffer->next_line >= buffer->rlimit)
 366             return true;
 367           _cpp_clean_line (pfile);
 368
 369           cols = buffer->next_line - buffer->line_base;
 370           CPP_INCREMENT_LINE (pfile, cols);
 371
 372           cur = buffer->cur;
 373         }
 374     }
 375
 376   buffer->cur = cur;
 377   _cpp_process_line_notes (pfile, true);
 378   return false;
 379 }
 380
 381 /* Skip a C++ line comment, leaving buffer->cur pointing to the
 382    terminating newline.  Handles escaped newlines.  Returns nonzero
 383    if a multiline comment.  */
 384 static int
 385 skip_line_comment (cpp_reader *pfile)
 386 {
 387   cpp_buffer *buffer = pfile->buffer;
 388   source_location orig_line = pfile->line_table->highest_line;
 389
 390   while (*buffer->cur != '\n')
 391     buffer->cur++;
 392
 393   _cpp_process_line_notes (pfile, true);
 394   return orig_line != pfile->line_table->highest_line;
 395 }
 396
 397 /* Skips whitespace, saving the next non-whitespace character.  */
 398 static void
 399 skip_whitespace (cpp_reader *pfile, cppchar_t c)
 400 {
 401   cpp_buffer *buffer = pfile->buffer;
 402   bool saw_NUL = false;
 403
 404   do
 405     {
 406       /* Horizontal space always OK.  */
 407       if (c == ' ' || c == '\t')
 408         ;
 409       /* Just \f \v or \0 left.  */
 410       else if (c == '\0')
 411         saw_NUL = true;
 412       else if (pfile->state.in_directive && CPP_PEDANTIC (pfile))
 413         cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line,
 414                              CPP_BUF_COL (buffer),
 415                              "%s in preprocessing directive",
 416                              c == '\f' ? "form feed" : "vertical tab");
 417
 418       c = *buffer->cur++;
 419     }
 420   /* We only want non-vertical space, i.e. ' ' \t \f \v \0.  */
 421   while (is_nvspace (c));
 422
 423   if (saw_NUL)
 424     cpp_error (pfile, CPP_DL_WARNING, "null character(s) ignored");
 425
 426   buffer->cur--;
 427 }
 428
 429 /* See if the characters of a number token are valid in a name (no
 430    '.', '+' or '-').  */
 431 static int
 432 name_p (cpp_reader *pfile, const cpp_string *string)
 433 {
 434   unsigned int i;
 435
 436   for (i = 0; i < string->len; i++)
 437     if (!is_idchar (string->text[i]))
 438       return 0;
 439
 440   return 1;
 441 }
 442
 443 /* After parsing an identifier or other sequence, produce a warning about
 444    sequences not in NFC/NFKC.  */
 445 static void
 446 warn_about_normalization (cpp_reader *pfile,
 447                           const cpp_token *token,
 448                           const struct normalize_state *s)
 449 {
 450   if (CPP_OPTION (pfile, warn_normalize) < NORMALIZE_STATE_RESULT (s)
 451       && !pfile->state.skipping)
 452     {
 453       /* Make sure that the token is printed using UCNs, even
 454          if we'd otherwise happily print UTF-8.  */
 455       unsigned char *buf = XNEWVEC (unsigned char, cpp_token_len (token));
 456       size_t sz;
 457
 458       sz = cpp_spell_token (pfile, token, buf, false) - buf;
 459       if (NORMALIZE_STATE_RESULT (s) == normalized_C)
 460         cpp_error_with_line (pfile, CPP_DL_WARNING, token->src_loc, 0,
 461                              "`%.*s' is not in NFKC", (int) sz, buf);
 462       else
 463         cpp_error_with_line (pfile, CPP_DL_WARNING, token->src_loc, 0,
 464                              "`%.*s' is not in NFC", (int) sz, buf);
 465     }
 466 }
 467
 468 /* Returns TRUE if the sequence starting at buffer->cur is invalid in
 469    an identifier.  FIRST is TRUE if this starts an identifier.  */
 470 static bool
 471 forms_identifier_p (cpp_reader *pfile, int first,
 472                     struct normalize_state *state)
 473 {
 474   cpp_buffer *buffer = pfile->buffer;
 475
 476   if (*buffer->cur == '$')
 477     {
 478       if (!CPP_OPTION (pfile, dollars_in_ident))
 479         return false;
 480
 481       buffer->cur++;
 482       if (CPP_OPTION (pfile, warn_dollars) && !pfile->state.skipping)
 483         {
 484           CPP_OPTION (pfile, warn_dollars) = 0;
 485           cpp_error (pfile, CPP_DL_PEDWARN, "'$' in identifier or number");
 486         }
 487
 488       return true;
 489     }
 490
 491   /* Is this a syntactically valid UCN?  */
 492   if (CPP_OPTION (pfile, extended_identifiers)
 493       && *buffer->cur == '\\'
 494       && (buffer->cur[1] == 'u' || buffer->cur[1] == 'U'))
 495     {
 496       buffer->cur += 2;
 497       if (_cpp_valid_ucn (pfile, &buffer->cur, buffer->rlimit, 1 + !first,
 498                           state))
 499         return true;
 500       buffer->cur -= 2;
 501     }
 502
 503   return false;
 504 }
 505
 506 /* Lex an identifier starting at BUFFER->CUR - 1.  BASE is the first
        byte of its spelling.  When STARTS_UCN is true the fast scan is
        skipped and the identifier goes through the UCN-aware path.  NST
        collects normalization state on that path.  Returns the hash
        node for the identifier, with buffer->cur advanced past it.  */
 507 static cpp_hashnode *
 508 lex_identifier (cpp_reader *pfile, const uchar *base, bool starts_ucn,
 509                 struct normalize_state *nst)
 510 {
 511   cpp_hashnode *result;
 512   const uchar *cur;
 513   unsigned int len;
 514   unsigned int hash = HT_HASHSTEP (0, *base);
 515
   /* Fast path: scan plain identifier characters, hashing as we go.  */
 516   cur = pfile->buffer->cur;
 517   if (! starts_ucn)
 518     while (ISIDNUM (*cur))
 519       {
 520         hash = HT_HASHSTEP (hash, *cur);
 521         cur++;
 522       }
 523   pfile->buffer->cur = cur;
 524   if (starts_ucn || forms_identifier_p (pfile, false, nst))
 525     {
 526       /* Slower version for identifiers containing UCNs (or $).  */
 527       do {
 528         while (ISIDNUM (*pfile->buffer->cur))
 529           {
 530             pfile->buffer->cur++;
 531             NORMALIZE_STATE_UPDATE_IDNUM (nst);
 532           }
 533       } while (forms_identifier_p (pfile, false, nst));
 534       result = _cpp_interpret_identifier (pfile, base,
 535                                           pfile->buffer->cur - base);
 536     }
 537   else
 538     {
       /* Plain identifier: look it up with the hash computed during
          the scan.  */
 539       len = cur - base;
 540       hash = HT_HASHFINISH (hash, len);
 541
 542       result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
 543                                                   base, len, hash, HT_ALLOC));
 544     }
 545
 546   /* Rarely, identifiers require diagnostics when lexed.  */
 547   if (__builtin_expect ((result->flags & NODE_DIAGNOSTIC)
 548                         && !pfile->state.skipping, 0))
 549     {
 550       /* It is allowed to poison the same identifier twice.  */
 551       if ((result->flags & NODE_POISONED) && !pfile->state.poisoned_ok)
 552         cpp_error (pfile, CPP_DL_ERROR, "attempt to use poisoned \"%s\"",
 553                    NODE_NAME (result));
 554
 555       /* Constraint 6.10.3.5: __VA_ARGS__ should only appear in the
 556          replacement list of a variadic macro.  */
 557       if (result == pfile->spec_nodes.n__VA_ARGS__
 558           && !pfile->state.va_args_ok)
 559         cpp_error (pfile, CPP_DL_PEDWARN,
 560                    "__VA_ARGS__ can only appear in the expansion"
 561                    " of a C99 variadic macro");
 562     }
 563
 564   return result;
 565 }
 566
 567 /* Lex a number to NUMBER starting at BUFFER->CUR - 1.  */
 568 static void
 569 lex_number (cpp_reader *pfile, cpp_string *number,
 570             struct normalize_state *nst)
 571 {
 572   const uchar *cur;
 573   const uchar *base;
 574   uchar *dest;
 575
 576   base = pfile->buffer->cur - 1;
 577   do
 578     {
 579       cur = pfile->buffer->cur;
 580
 581       /* N.B. ISIDNUM does not include $.  */
 582       while (ISIDNUM (*cur) || *cur == '.' || VALID_SIGN (*cur, cur[-1]))
 583         {
 584           cur++;
 585           NORMALIZE_STATE_UPDATE_IDNUM (nst);
 586         }
 587
 588       pfile->buffer->cur = cur;
 589     }
 590   while (forms_identifier_p (pfile, false, nst));
 591
 592   number->len = cur - base;
 593   dest = _cpp_unaligned_alloc (pfile, number->len + 1);
 594   memcpy (dest, base, number->len);
 595   dest[number->len] = '\0';
 596   number->text = dest;
 597 }
 598
 599 /* Create a token of type TYPE with a literal spelling.  */
 600 static void
 601 create_literal (cpp_reader *pfile, cpp_token *token, const uchar *base,
 602                 unsigned int len, enum cpp_ttype type)
 603 {
 604   uchar *dest = _cpp_unaligned_alloc (pfile, len + 1);
 605
 606   memcpy (dest, base, len);
 607   dest[len] = '\0';
 608   token->type = type;
 609   token->val.str.len = len;
 610   token->val.str.text = dest;
 611 }
 612
 613 /* Lexes a string, character constant, or angle-bracketed header file
 614    name.  The stored string contains the spelling, including opening
 615    quote and leading any leading 'L', 'u' or 'U'.  It returns the type
 616    of the literal, or CPP_OTHER if it was not properly terminated, or
 617    CPP_LESS for an unterminated header name which must be relexed as
 618    normal tokens.
 619
 620    The spelling is NUL-terminated, but it is not guaranteed that this
 621    is the first NUL since embedded NULs are preserved.  */
 622 static void
 623 lex_string (cpp_reader *pfile, cpp_token *token, const uchar *base)
 624 {
 625   bool saw_NUL = false;
 626   const uchar *cur;
 627   cppchar_t terminator;
 628   enum cpp_ttype type;
 629
 630   cur = base;
 631   terminator = *cur++;
 632   if (terminator == 'L' || terminator == 'u' || terminator == 'U')
 633     terminator = *cur++;
 634   if (terminator == '\"')
 635     type = (*base == 'L' ? CPP_WSTRING :
 636             *base == 'U' ? CPP_STRING32 :
 637             *base == 'u' ? CPP_STRING16 : CPP_STRING);
 638   else if (terminator == '\'')
 639     type = (*base == 'L' ? CPP_WCHAR :
 640             *base == 'U' ? CPP_CHAR32 :
 641             *base == 'u' ? CPP_CHAR16 : CPP_CHAR);
 642   else
 643     terminator = '>', type = CPP_HEADER_NAME;
 644
 645   for (;;)
 646     {
 647       cppchar_t c = *cur++;
 648
 649       /* In #include-style directives, terminators are not escapable.  */
 650       if (c == '\\' && !pfile->state.angled_headers && *cur != '\n')
 651         cur++;
 652       else if (c == terminator)
 653         break;
 654       else if (c == '\n')
 655         {
 656           cur--;
 657           /* Unmatched quotes always yield undefined behavior, but
 658              greedy lexing means that what appears to be an unterminated
 659              header name may actually be a legitimate sequence of tokens.  */
 660           if (terminator == '>')
 661             {
 662               token->type = CPP_LESS;
 663               return;
 664             }
 665           type = CPP_OTHER;
 666           break;
 667         }
 668       else if (c == '\0')
 669         saw_NUL = true;
 670     }
 671
 672   if (saw_NUL && !pfile->state.skipping)
 673     cpp_error (pfile, CPP_DL_WARNING,
 674                "null character(s) preserved in literal");
 675
 676   if (type == CPP_OTHER && CPP_OPTION (pfile, lang) != CLK_ASM)
 677     cpp_error (pfile, CPP_DL_PEDWARN, "missing terminating %c character",
 678                (int) terminator);
 679
 680   pfile->buffer->cur = cur;
 681   create_literal (pfile, token, base, cur - base, type);
 682 }
 683
 684 /* Return the comment table. The client may not make any assumption
 685    about the ordering of the table.  */
 686 cpp_comment_table *
 687 cpp_get_comments (cpp_reader *pfile)
 688 {
 689   return &pfile->comments;
 690 }
 691
 692 /* Append a comment to the end of the comment table. */
 693 static void
 694 store_comment (cpp_reader *pfile, cpp_token *token)
 695 {
 696   int len;
 697
 698   if (pfile->comments.allocated == 0)
 699     {
 700       pfile->comments.allocated = 256;
 701       pfile->comments.entries = (cpp_comment *) xmalloc
 702         (pfile->comments.allocated * sizeof (cpp_comment));
 703     }
 704
 705   if (pfile->comments.count == pfile->comments.allocated)
 706     {
 707       pfile->comments.allocated *= 2;
 708       pfile->comments.entries = (cpp_comment *) xrealloc
 709         (pfile->comments.entries,
 710          pfile->comments.allocated * sizeof (cpp_comment));
 711     }
 712
 713   len = token->val.str.len;
 714
 715   /* Copy comment. Note, token may not be NULL terminated. */
 716   pfile->comments.entries[pfile->comments.count].comment =
 717     (char *) xmalloc (sizeof (char) * (len + 1));
 718   memcpy (pfile->comments.entries[pfile->comments.count].comment,
 719           token->val.str.text, len);
 720   pfile->comments.entries[pfile->comments.count].comment[len] = '\0';
 721
 722   /* Set source location. */
 723   pfile->comments.entries[pfile->comments.count].sloc = token->src_loc;
 724
 725   /* Increment the count of entries in the comment table. */
 726   pfile->comments.count++;
 727 }
 728
 729 /* The stored comment includes the comment start and any terminator.  */
 730 static void
 731 save_comment (cpp_reader *pfile, cpp_token *token, const unsigned char *from,
 732               cppchar_t type)
 733 {
 734   unsigned char *buffer;
 735   unsigned int len, clen;
 736
 737   len = pfile->buffer->cur - from + 1; /* + 1 for the initial '/'.  */
 738
 739   /* C++ comments probably (not definitely) have moved past a new
 740      line, which we don't want to save in the comment.  */
 741   if (is_vspace (pfile->buffer->cur[-1]))
 742     len--;
 743
 744   /* If we are currently in a directive, then we need to store all
 745      C++ comments as C comments internally, and so we need to
 746      allocate a little extra space in that case.
 747
 748      Note that the only time we encounter a directive here is
 749      when we are saving comments in a "#define".  */
 750   clen = (pfile->state.in_directive && type == '/') ? len + 2 : len;
 751
 752   buffer = _cpp_unaligned_alloc (pfile, clen);
 753
 754   token->type = CPP_COMMENT;
 755   token->val.str.len = clen;
 756   token->val.str.text = buffer;
 757
 758   buffer[0] = '/';
 759   memcpy (buffer + 1, from, len - 1);
 760
 761   /* Finish conversion to a C comment, if necessary.  */
 762   if (pfile->state.in_directive && type == '/')
 763     {
 764       buffer[1] = '*';
 765       buffer[clen - 2] = '*';
 766       buffer[clen - 1] = '/';
 767     }
 768
 769   /* Finally store this comment for use by clients of libcpp. */
 770   store_comment (pfile, token);
 771 }
 772
 773 /* Allocate COUNT tokens for RUN.  */
 774 void
 775 _cpp_init_tokenrun (tokenrun *run, unsigned int count)
 776 {
 777   run->base = XNEWVEC (cpp_token, count);
 778   run->limit = run->base + count;
 779   run->next = NULL;
 780 }
 781
 782 /* Returns the next tokenrun, or creates one if there is none.  */
 783 static tokenrun *
 784 next_tokenrun (tokenrun *run)
 785 {
 786   if (run->next == NULL)
 787     {
 788       run->next = XNEW (tokenrun);
 789       run->next->prev = run;
 790       _cpp_init_tokenrun (run->next, 250);
 791     }
 792
 793   return run->next;
 794 }
 795
 796 /* Look ahead in the input stream.  */
 797 const cpp_token *
 798 cpp_peek_token (cpp_reader *pfile, int index)
 799 {
 800   cpp_context *context = pfile->context;
 801   const cpp_token *peektok;
 802   int count;
 803
 804   /* First, scan through any pending cpp_context objects.  */
 805   while (context->prev)
 806     {
 807       ptrdiff_t sz = (context->direct_p
 808                       ? LAST (context).token - FIRST (context).token
 809                       : LAST (context).ptoken - FIRST (context).ptoken);
 810
 811       if (index < (int) sz)
 812         return (context->direct_p
 813                 ? FIRST (context).token + index
 814                 : *(FIRST (context).ptoken + index));
 815
 816       index -= (int) sz;
 817       context = context->prev;
 818     }
 819
 820   /* We will have to read some new tokens after all (and do so
 821      without invalidating preceding tokens).  */
 822   count = index;
 823   pfile->keep_tokens++;
 824
 825   do
 826     {
 827       peektok = _cpp_lex_token (pfile);
 828       if (peektok->type == CPP_EOF)
 829         return peektok;
 830     }
 831   while (index--);
 832
 833   _cpp_backup_tokens_direct (pfile, count + 1);
 834   pfile->keep_tokens--;
 835
 836   return peektok;
 837 }
 838
 839 /* Allocate a single token that is invalidated at the same time as the
 840    rest of the tokens on the line.  Has its line and col set to the
 841    same as the last lexed token, so that diagnostics appear in the
 842    right place.  */
 843 cpp_token *
 844 _cpp_temp_token (cpp_reader *pfile)
 845 {
 846   cpp_token *old, *result;
   /* SZ is the number of slots from cur_token to the end of the current
      run; LA is the number of pending lookahead tokens.  */
 847   ptrdiff_t sz = pfile->cur_run->limit - pfile->cur_token;
 848   ptrdiff_t la = (ptrdiff_t) pfile->lookaheads;
 849
 850   old = pfile->cur_token - 1;
 851   /* Any pre-existing lookaheads must not be clobbered.  */
 852   if (la)
 853     {
       /* The lookaheads reach the end of this run, so moving them up
          by one slot spills into the next run.  */
 854       if (sz <= la)
 855         {
 856           tokenrun *next = next_tokenrun (pfile->cur_run);
 857
           /* Shift the lookaheads already in the next run up by one
              first, then carry this run's last token over.  */
 858           if (sz < la)
 859             memmove (next->base + 1, next->base,
 860                      (la - sz) * sizeof (cpp_token));
 861
 862           next->base[0] = pfile->cur_run->limit[-1];
 863         }
 864
       /* Shift the lookaheads that stay in this run up by one slot.  */
 865       if (sz > 1)
 866         memmove (pfile->cur_token + 1, pfile->cur_token,
 867                  MIN (la, sz - 1) * sizeof (cpp_token));
 868     }
 869
   /* No room left in this run: continue at the start of the next one.  */
 870   if (!sz && pfile->cur_token == pfile->cur_run->limit)
 871     {
 872       pfile->cur_run = next_tokenrun (pfile->cur_run);
 873       pfile->cur_token = pfile->cur_run->base;
 874     }
 875
 876   result = pfile->cur_token++;
 877   result->src_loc = old->src_loc;
 878   return result;
 879 }
 880
 881 /* Lex a token into RESULT (external interface).  Takes care of issues
 882    like directive handling, token lookahead, multiple include
 883    optimization and skipping.  Returns the next token: a pending
        lookahead if there is one, otherwise a freshly lexed token or the
        result of a directive.  */
 884 const cpp_token *
 885 _cpp_lex_token (cpp_reader *pfile)
 886 {
 887   cpp_token *result;
 888
 889   for (;;)
 890     {
       /* Move on to the next token run when the current one is used up.  */
 891       if (pfile->cur_token == pfile->cur_run->limit)
 892         {
 893           pfile->cur_run = next_tokenrun (pfile->cur_run);
 894           pfile->cur_token = pfile->cur_run->base;
 895         }
 896       /* We assume that the current token is somewhere in the current
 897          run.  */
 898       if (pfile->cur_token < pfile->cur_run->base
 899           || pfile->cur_token >= pfile->cur_run->limit)
 900         abort ();
 901
       /* Tokens that were backed up are handed out again before any
          new input is lexed.  */
 902       if (pfile->lookaheads)
 903         {
 904           pfile->lookaheads--;
 905           result = pfile->cur_token++;
 906         }
 907       else
 908         result = _cpp_lex_direct (pfile);
 909
 910       if (result->flags & BOL)
 911         {
 912           /* Is this a directive.  If _cpp_handle_directive returns
 913              false, it is an assembler #.  */
 914           if (result->type == CPP_HASH
 915               /* 6.10.3 p 11: Directives in a list of macro arguments
 916                  gives undefined behavior.  This implementation
 917                  handles the directive as normal.  */
 918               && pfile->state.parsing_args != 1)
 919             {
 920               if (_cpp_handle_directive (pfile, result->flags & PREV_WHITE))
 921                 {
                   /* A CPP_PADDING directive result yields no token, so
                      lex the next one.  */
 922                   if (pfile->directive_result.type == CPP_PADDING)
 923                     continue;
 924                   result = &pfile->directive_result;
 925                 }
 926             }
 927           else if (pfile->state.in_deferred_pragma)
 928             result = &pfile->directive_result;
 929
 930           if (pfile->cb.line_change && !pfile->state.skipping)
 931             pfile->cb.line_change (pfile, result, pfile->state.parsing_args);
 932         }
 933
 934       /* We don't skip tokens in directives.  */
 935       if (pfile->state.in_directive || pfile->state.in_deferred_pragma)
 936         break;
 937
 938       /* Outside a directive, invalidate controlling macros.  At file
 939          EOF, _cpp_lex_direct takes care of popping the buffer, so we never
 940          get here and MI optimization works.  */
 941       pfile->mi_valid = false;
 942
       /* While skipping, only CPP_EOF is passed back to the caller.  */
 943       if (!pfile->state.skipping || result->type == CPP_EOF)
 944         break;
 945     }
 946
 947   return result;
 948 }
 949
 950 /* Returns true if a fresh line has been loaded.  */
 951 bool
 952 _cpp_get_fresh_line (cpp_reader *pfile)
 953 {
 954   int return_at_eof;
 955
 956   /* We can't get a new line until we leave the current directive.  */
 957   if (pfile->state.in_directive)
 958     return false;
 959
 960   for (;;)
 961     {
 962       cpp_buffer *buffer = pfile->buffer;
 963
 964       if (!buffer->need_line)
 965         return true;
 966
 967       if (buffer->next_line < buffer->rlimit)
 968         {
 969           _cpp_clean_line (pfile);
 970           return true;
 971         }
 972
 973       /* First, get out of parsing arguments state.  */
 974       if (pfile->state.parsing_args)
 975         return false;
 976
 977       /* End of buffer.  Non-empty files should end in a newline.  */
 978       if (buffer->buf != buffer->rlimit
 979           && buffer->next_line > buffer->rlimit
 980           && !buffer->from_stage3)
 981         {
 982           /* Clip to buffer size.  */
 983           buffer->next_line = buffer->rlimit;
 984         }
 985
 986       return_at_eof = buffer->return_at_eof;
 987       _cpp_pop_buffer (pfile);
 988       if (pfile->buffer == NULL || return_at_eof)
 989         return false;
 990     }
 991 }
 992
 993 #define IF_NEXT_IS(CHAR, THEN_TYPE, ELSE_TYPE)          \
 994   do                                                    \
 995     {                                                   \
 996       result->type = ELSE_TYPE;                         \
 997       if (*buffer->cur == CHAR)                         \
 998         buffer->cur++, result->type = THEN_TYPE;        \
 999     }                                                   \
1000   while (0)
1001
1002 /* Lex a token into pfile->cur_token, which is also incremented, to
1003    get diagnostics pointing to the correct location.
1004
1005    Does not handle issues such as token lookahead, multiple-include
1006    optimization, directives, skipping etc.  This function is only
1007    suitable for use by _cpp_lex_token, and in special cases like
1008    lex_expansion_token which do not care about any of these issues.
1009
1010    When a new line is needed, yields CPP_PRAGMA_EOL inside a deferred
1011    pragma and CPP_EOF if no line can be fetched; otherwise rewinds to
1012    the token buffer's base unless pfile->keep_tokens.  Returns the token.  */
1013 cpp_token *
1014 _cpp_lex_direct (cpp_reader *pfile)
1015 {
1016   cppchar_t c;
1017   cpp_buffer *buffer;
1018   const unsigned char *comment_start;
1019   cpp_token *result = pfile->cur_token++;
1020
1021  fresh_line:  /* Entry point, and re-entered after every newline.  */
1022   result->flags = 0;
1023   buffer = pfile->buffer;
1024   if (buffer->need_line)
1025     {
1026       if (pfile->state.in_deferred_pragma)
1027         {
1028           result->type = CPP_PRAGMA_EOL;
1029           pfile->state.in_deferred_pragma = false;
1030           if (!pfile->state.pragma_allow_expansion)
1031             pfile->state.prevent_expansion--;
1032           return result;
1033         }
1034       if (!_cpp_get_fresh_line (pfile))
1035         {
1036           result->type = CPP_EOF;
1037           if (!pfile->state.in_directive)
1038             {
1039               /* Tell the compiler the line number of the EOF token.  */
1040               result->src_loc = pfile->line_table->highest_line;
1041               result->flags = BOL;
1042             }
1043           return result;
1044         }
1045       if (!pfile->keep_tokens)  /* Storage is reused: rewind to the base run.  */
1046         {
1047           pfile->cur_run = &pfile->base_run;
1048           result = pfile->base_run.base;
1049           pfile->cur_token = result + 1;
1050         }
1051       result->flags = BOL;
1052       if (pfile->state.parsing_args == 2)
1053         result->flags |= PREV_WHITE;
1054     }
1055   buffer = pfile->buffer;
1056  update_tokens_line:  /* Also reached after a discarded comment.  */
1057   result->src_loc = pfile->line_table->highest_line;
1058
1059  skipped_white:  /* Also reached after horizontal whitespace.  */
1060   if (buffer->cur >= buffer->notes[buffer->cur_note].pos
1061       && !pfile->overlaid_buffer)
1062     {
1063       _cpp_process_line_notes (pfile, false);
1064       result->src_loc = pfile->line_table->highest_line;
1065     }
1066   c = *buffer->cur++;  /* buffer->cur now points just past C.  */
1067
1068   LINEMAP_POSITION_FOR_COLUMN (result->src_loc, pfile->line_table,
1069                                CPP_BUF_COLUMN (buffer, buffer->cur));
1070
1071   switch (c)
1072     {
1073     case ' ': case '\t': case '\f': case '\v': case '\0':
1074       result->flags |= PREV_WHITE;
1075       skip_whitespace (pfile, c);
1076       goto skipped_white;
1077
1078     case '\n':
1079       if (buffer->cur < buffer->rlimit)
1080         CPP_INCREMENT_LINE (pfile, 0);
1081       buffer->need_line = true;
1082       goto fresh_line;
1083
1084     case '0': case '1': case '2': case '3': case '4':
1085     case '5': case '6': case '7': case '8': case '9':
1086       {
1087         struct normalize_state nst = INITIAL_NORMALIZE_STATE;
1088         result->type = CPP_NUMBER;
1089         lex_number (pfile, &result->val.str, &nst);
1090         warn_about_normalization (pfile, result, &nst);
1091         break;
1092       }
1093
1094     case 'L':
1095     case 'u':
1096     case 'U':
1097       /* 'L', 'u' or 'U' may introduce wide characters or strings.  */
1098       if (c == 'L' || CPP_OPTION (pfile, uliterals))
1099         {
1100           if (*buffer->cur == '\'' || *buffer->cur == '"')
1101             {
1102               lex_string (pfile, result, buffer->cur - 1);
1103               break;
1104             }
1105         }
1106       /* Fall through.  */
1107
1108     case '_':
1109     case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
1110     case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
1111     case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
1112     case 's': case 't':           case 'v': case 'w': case 'x':
1113     case 'y': case 'z':
1114     case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
1115     case 'G': case 'H': case 'I': case 'J': case 'K':
1116     case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
1117     case 'S': case 'T':           case 'V': case 'W': case 'X':
1118     case 'Y': case 'Z':
1119       result->type = CPP_NAME;
1120       {
1121         struct normalize_state nst = INITIAL_NORMALIZE_STATE;
1122         result->val.node = lex_identifier (pfile, buffer->cur - 1, false,
1123                                            &nst);
1124         warn_about_normalization (pfile, result, &nst);
1125       }
1126
1127       /* Convert named operators to their proper types.  */
1128       if (result->val.node->flags & NODE_OPERATOR)
1129         {
1130           result->flags |= NAMED_OP;
1131           result->type = (enum cpp_ttype) result->val.node->directive_index;
1132         }
1133       break;
1134
1135     case '\'':
1136     case '"':
1137       lex_string (pfile, result, buffer->cur - 1);
1138       break;
1139
1140     case '/':
1141       /* A potential block or line comment.  */
1142       comment_start = buffer->cur;
1143       c = *buffer->cur;
1144
1145       if (c == '*')
1146         {
1147           if (_cpp_skip_block_comment (pfile))
1148             cpp_error (pfile, CPP_DL_ERROR, "unterminated comment");
1149         }
1150       else if (c == '/' && (CPP_OPTION (pfile, cplusplus_comments)
1151                             || cpp_in_system_header (pfile)))
1152         {
1153           /* Warn about comments only if pedantically GNUC89, and not
1154              in system headers.  */
1155           if (CPP_OPTION (pfile, lang) == CLK_GNUC89 && CPP_PEDANTIC (pfile)
1156               && ! buffer->warned_cplusplus_comments)
1157             {
1158               cpp_error (pfile, CPP_DL_PEDWARN,
1159                          "C++ style comments are not allowed in ISO C90");
1160               cpp_error (pfile, CPP_DL_PEDWARN,
1161                          "(this will be reported only once per input file)");
1162               buffer->warned_cplusplus_comments = 1;
1163             }
1164
1165           if (skip_line_comment (pfile) && CPP_OPTION (pfile, warn_comments))
1166             cpp_error (pfile, CPP_DL_WARNING, "multi-line comment");
1167         }
1168       else if (c == '=')
1169         {
1170           buffer->cur++;
1171           result->type = CPP_DIV_EQ;
1172           break;
1173         }
1174       else
1175         {
1176           result->type = CPP_DIV;
1177           break;
1178         }
1179
1180       if (!pfile->state.save_comments)
1181         {
1182           result->flags |= PREV_WHITE;  /* A dropped comment counts as whitespace.  */
1183           goto update_tokens_line;
1184         }
1185
1186       /* Save the comment as a token in its own right.  */
1187       save_comment (pfile, result, comment_start, c);
1188       break;
1189
1190     case '<':
1191       if (pfile->state.angled_headers)
1192         {
1193           lex_string (pfile, result, buffer->cur - 1);
1194           if (result->type != CPP_LESS)  /* NOTE(review): presumably a header-name was lexed -- confirm in lex_string.  */
1195             break;
1196         }
1197
1198       result->type = CPP_LESS;
1199       if (*buffer->cur == '=')
1200         buffer->cur++, result->type = CPP_LESS_EQ;
1201       else if (*buffer->cur == '<')
1202         {
1203           buffer->cur++;
1204           IF_NEXT_IS ('=', CPP_LSHIFT_EQ, CPP_LSHIFT);
1205         }
1206       else if (CPP_OPTION (pfile, digraphs))
1207         {
1208           if (*buffer->cur == ':')
1209             {
1210               buffer->cur++;
1211               result->flags |= DIGRAPH;
1212               result->type = CPP_OPEN_SQUARE;
1213             }
1214           else if (*buffer->cur == '%')
1215             {
1216               buffer->cur++;
1217               result->flags |= DIGRAPH;
1218               result->type = CPP_OPEN_BRACE;
1219             }
1220         }
1221       break;
1222
1223     case '>':
1224       result->type = CPP_GREATER;
1225       if (*buffer->cur == '=')
1226         buffer->cur++, result->type = CPP_GREATER_EQ;
1227       else if (*buffer->cur == '>')
1228         {
1229           buffer->cur++;
1230           IF_NEXT_IS ('=', CPP_RSHIFT_EQ, CPP_RSHIFT);
1231         }
1232       break;
1233
1234     case '%':
1235       result->type = CPP_MOD;
1236       if (*buffer->cur == '=')
1237         buffer->cur++, result->type = CPP_MOD_EQ;
1238       else if (CPP_OPTION (pfile, digraphs))
1239         {
1240           if (*buffer->cur == ':')
1241             {
1242               buffer->cur++;
1243               result->flags |= DIGRAPH;
1244               result->type = CPP_HASH;
1245               if (*buffer->cur == '%' && buffer->cur[1] == ':')
1246                 buffer->cur += 2, result->type = CPP_PASTE;
1247             }
1248           else if (*buffer->cur == '>')
1249             {
1250               buffer->cur++;
1251               result->flags |= DIGRAPH;
1252               result->type = CPP_CLOSE_BRACE;
1253             }
1254         }
1255       break;
1256
1257     case '.':
1258       result->type = CPP_DOT;
1259       if (ISDIGIT (*buffer->cur))
1260         {
1261           struct normalize_state nst = INITIAL_NORMALIZE_STATE;
1262           result->type = CPP_NUMBER;
1263           lex_number (pfile, &result->val.str, &nst);
1264           warn_about_normalization (pfile, result, &nst);
1265         }
1266       else if (*buffer->cur == '.' && buffer->cur[1] == '.')
1267         buffer->cur += 2, result->type = CPP_ELLIPSIS;
1268       else if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
1269         buffer->cur++, result->type = CPP_DOT_STAR;
1270       break;
1271
1272     case '+':
1273       result->type = CPP_PLUS;
1274       if (*buffer->cur == '+')
1275         buffer->cur++, result->type = CPP_PLUS_PLUS;
1276       else if (*buffer->cur == '=')
1277         buffer->cur++, result->type = CPP_PLUS_EQ;
1278       break;
1279
1280     case '-':
1281       result->type = CPP_MINUS;
1282       if (*buffer->cur == '>')
1283         {
1284           buffer->cur++;
1285           result->type = CPP_DEREF;
1286           if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
1287             buffer->cur++, result->type = CPP_DEREF_STAR;
1288         }
1289       else if (*buffer->cur == '-')
1290         buffer->cur++, result->type = CPP_MINUS_MINUS;
1291       else if (*buffer->cur == '=')
1292         buffer->cur++, result->type = CPP_MINUS_EQ;
1293       break;
1294
1295     case '&':
1296       result->type = CPP_AND;
1297       if (*buffer->cur == '&')
1298         buffer->cur++, result->type = CPP_AND_AND;
1299       else if (*buffer->cur == '=')
1300         buffer->cur++, result->type = CPP_AND_EQ;
1301       break;
1302
1303     case '|':
1304       result->type = CPP_OR;
1305       if (*buffer->cur == '|')
1306         buffer->cur++, result->type = CPP_OR_OR;
1307       else if (*buffer->cur == '=')
1308         buffer->cur++, result->type = CPP_OR_EQ;
1309       break;
1310
1311     case ':':
1312       result->type = CPP_COLON;
1313       if (*buffer->cur == ':' && CPP_OPTION (pfile, cplusplus))
1314         buffer->cur++, result->type = CPP_SCOPE;
1315       else if (*buffer->cur == '>' && CPP_OPTION (pfile, digraphs))
1316         {
1317           buffer->cur++;
1318           result->flags |= DIGRAPH;
1319           result->type = CPP_CLOSE_SQUARE;
1320         }
1321       break;
1322
1323     case '*': IF_NEXT_IS ('=', CPP_MULT_EQ, CPP_MULT); break;
1324     case '=': IF_NEXT_IS ('=', CPP_EQ_EQ, CPP_EQ); break;
1325     case '!': IF_NEXT_IS ('=', CPP_NOT_EQ, CPP_NOT); break;
1326     case '^': IF_NEXT_IS ('=', CPP_XOR_EQ, CPP_XOR); break;
1327     case '#': IF_NEXT_IS ('#', CPP_PASTE, CPP_HASH); break;
1328
1329     case '?': result->type = CPP_QUERY; break;
1330     case '~': result->type = CPP_COMPL; break;
1331     case ',': result->type = CPP_COMMA; break;
1332     case '(': result->type = CPP_OPEN_PAREN; break;
1333     case ')': result->type = CPP_CLOSE_PAREN; break;
1334     case '[': result->type = CPP_OPEN_SQUARE; break;
1335     case ']': result->type = CPP_CLOSE_SQUARE; break;
1336     case '{': result->type = CPP_OPEN_BRACE; break;
1337     case '}': result->type = CPP_CLOSE_BRACE; break;
1338     case ';': result->type = CPP_SEMICOLON; break;
1339
1340       /* @ is a punctuator in Objective-C.  */
1341     case '@': result->type = CPP_ATSIGN; break;
1342
1343     case '$':
1344     case '\\':
1345       {
1346         const uchar *base = --buffer->cur;  /* Step back so BASE points at the character just read.  */
1347         struct normalize_state nst = INITIAL_NORMALIZE_STATE;
1348
1349         if (forms_identifier_p (pfile, true, &nst))
1350           {
1351             result->type = CPP_NAME;
1352             result->val.node = lex_identifier (pfile, base, true, &nst);
1353             warn_about_normalization (pfile, result, &nst);
1354             break;
1355           }
1356         buffer->cur++;  /* Not an identifier: consume the character again.  */
1357       }
1358       /* Fall through.  */
1359     default:
1360       create_literal (pfile, result, buffer->cur - 1, 1, CPP_OTHER);
1361       break;
1362     }
1363
1364   return result;
1365 }
1366
1367 /* An upper bound on the number of bytes needed to spell TOKEN.
1368    Does not include preceding whitespace.  */
1369 unsigned int
1370 cpp_token_len (const cpp_token *token)
1371 {
1372   unsigned int len;
1373
1374   switch (TOKEN_SPELL (token))
1375     {
1376     default:            len = 6;                                break;
1377     case SPELL_LITERAL: len = token->val.str.len;               break;
1378     case SPELL_IDENT:   len = NODE_LEN (token->val.node) * 10;  break;
1379     }
1380
1381   return len;
1382 }
1383
/* Decode the UTF-8 sequence starting at NAME and store the matching
   \U escape (backslash, 'U' and eight lowercase hex digits, ten bytes
   in all, not NUL-terminated) at BUFFER.  Returns the number of bytes
   of NAME that were consumed.  Aborts if a continuation byte is not
   of the form 10xxxxxx.  */

static size_t
utf8_to_ucn (unsigned char *buffer, const unsigned char *name)
{
  static const char hex_digits[] = "0123456789abcdef";
  unsigned long code_point;
  unsigned lead = *name;
  int seq_len = 0;
  int remaining;
  int shift;

  /* The count of leading one bits in the first byte is the length of
     the whole sequence.  */
  while (lead & 0x80)
    {
      seq_len++;
      lead <<= 1;
    }

  /* Whatever is left of the first byte after the length marker holds
     the most significant payload bits.  */
  code_point = *name & (0x7F >> seq_len);

  /* Each continuation byte supplies six further bits.  */
  for (remaining = seq_len - 1; remaining > 0; remaining--)
    {
      name++;

      /* Ill-formed UTF-8.  */
      if ((*name & ~0x3F) != 0x80)
        abort ();

      code_point = (code_point << 6) | (*name & 0x3F);
    }

  buffer[0] = '\\';
  buffer[1] = 'U';
  buffer += 2;
  for (shift = 28; shift >= 0; shift -= 4)
    *buffer++ = hex_digits[(code_point >> shift) & 0xF];

  return seq_len;
}
1417
1418
1419 /* Write the spelling of a token TOKEN to BUFFER.  The buffer must
1420    already contain the enough space to hold the token's spelling.
1421    Returns a pointer to the character after the last character written.
1422    FORSTRING is true if this is to be the spelling after translation
1423    phase 1 (this is different for UCNs).
1424    FIXME: Would be nice if we didn't need the PFILE argument.  */
1425 unsigned char *
1426 cpp_spell_token (cpp_reader *pfile, const cpp_token *token,
1427                  unsigned char *buffer, bool forstring)
1428 {
1429   switch (TOKEN_SPELL (token))
1430     {
1431     case SPELL_OPERATOR:
1432       {
1433         const unsigned char *spelling;
1434         unsigned char c;
1435
1436         if (token->flags & DIGRAPH)
1437           spelling
1438             = digraph_spellings[(int) token->type - (int) CPP_FIRST_DIGRAPH];
1439         else if (token->flags & NAMED_OP)
1440           goto spell_ident;
1441         else
1442           spelling = TOKEN_NAME (token);
1443
1444         while ((c = *spelling++) != '\0')
1445           *buffer++ = c;
1446       }
1447       break;
1448
1449     spell_ident:
1450     case SPELL_IDENT:
1451       if (forstring)
1452         {
1453           memcpy (buffer, NODE_NAME (token->val.node),
1454                   NODE_LEN (token->val.node));
1455           buffer += NODE_LEN (token->val.node);
1456         }
1457       else
1458         {
1459           size_t i;
1460           const unsigned char * name = NODE_NAME (token->val.node);
1461
1462           for (i = 0; i < NODE_LEN (token->val.node); i++)
1463             if (name[i] & ~0x7F)
1464               {
1465                 i += utf8_to_ucn (buffer, name + i) - 1;
1466                 buffer += 10;
1467               }
1468             else
1469               *buffer++ = NODE_NAME (token->val.node)[i];
1470         }
1471       break;
1472
1473     case SPELL_LITERAL:
1474       memcpy (buffer, token->val.str.text, token->val.str.len);
1475       buffer += token->val.str.len;
1476       break;
1477
1478     case SPELL_NONE:
1479       cpp_error (pfile, CPP_DL_ICE,
1480                  "unspellable token %s", TOKEN_NAME (token));
1481       break;
1482     }
1483
1484   return buffer;
1485 }
1486
1487 /* Returns TOKEN spelt as a null-terminated string.  The string is
1488    freed when the reader is destroyed.  Useful for diagnostics.  */
1489 unsigned char *
1490 cpp_token_as_text (cpp_reader *pfile, const cpp_token *token)
1491 {
1492   unsigned int len = cpp_token_len (token) + 1;
1493   unsigned char *start = _cpp_unaligned_alloc (pfile, len), *end;
1494
1495   end = cpp_spell_token (pfile, token, start, false);
1496   end[0] = '\0';
1497
1498   return start;
1499 }
1500
1501 /* Used by C front ends, which really should move to using
1502    cpp_token_as_text.  */
1503 const char *
1504 cpp_type2name (enum cpp_ttype type)
1505 {
1506   return (const char *) token_spellings[type].name;
1507 }
1508
1509 /* Writes the spelling of token to FP, without any preceding space.
1510    Separated from cpp_spell_token for efficiency - to avoid stdio
1511    double-buffering.  */
1512 void
1513 cpp_output_token (const cpp_token *token, FILE *fp)
1514 {
1515   switch (TOKEN_SPELL (token))
1516     {
1517     case SPELL_OPERATOR:
1518       {
1519         const unsigned char *spelling;
1520         int c;
1521
1522         if (token->flags & DIGRAPH)
1523           spelling
1524             = digraph_spellings[(int) token->type - (int) CPP_FIRST_DIGRAPH];
1525         else if (token->flags & NAMED_OP)
1526           goto spell_ident;
1527         else
1528           spelling = TOKEN_NAME (token);
1529
1530         c = *spelling;
1531         do
1532           putc (c, fp);
1533         while ((c = *++spelling) != '\0');
1534       }
1535       break;
1536
1537     spell_ident:
1538     case SPELL_IDENT:
1539       {
1540         size_t i;
1541         const unsigned char * name = NODE_NAME (token->val.node);
1542
1543         for (i = 0; i < NODE_LEN (token->val.node); i++)
1544           if (name[i] & ~0x7F)
1545             {
1546               unsigned char buffer[10];
1547               i += utf8_to_ucn (buffer, name + i) - 1;
1548               fwrite (buffer, 1, 10, fp);
1549             }
1550           else
1551             fputc (NODE_NAME (token->val.node)[i], fp);
1552       }
1553       break;
1554
1555     case SPELL_LITERAL:
1556       fwrite (token->val.str.text, 1, token->val.str.len, fp);
1557       break;
1558
1559     case SPELL_NONE:
1560       /* An error, most probably.  */
1561       break;
1562     }
1563 }
1564
1565 /* Compare two tokens.  */
1566 int
1567 _cpp_equiv_tokens (const cpp_token *a, const cpp_token *b)
1568 {
1569   if (a->type == b->type && a->flags == b->flags)
1570     switch (TOKEN_SPELL (a))
1571       {
1572       default:                  /* Keep compiler happy.  */
1573       case SPELL_OPERATOR:
1574         return 1;
1575       case SPELL_NONE:
1576         return (a->type != CPP_MACRO_ARG || a->val.arg_no == b->val.arg_no);
1577       case SPELL_IDENT:
1578         return a->val.node == b->val.node;
1579       case SPELL_LITERAL:
1580         return (a->val.str.len == b->val.str.len
1581                 && !memcmp (a->val.str.text, b->val.str.text,
1582                             a->val.str.len));
1583       }
1584
1585   return 0;
1586 }
1587
1588 /* Returns nonzero if a space should be inserted to avoid an
1589    accidental token paste for output.  For simplicity, it is
1590    conservative, and occasionally advises a space where one is not
1591    needed, e.g. "." and ".2".  */
1592 int
1593 cpp_avoid_paste (cpp_reader *pfile, const cpp_token *token1,
1594                  const cpp_token *token2)
1595 {
1596   enum cpp_ttype a = token1->type, b = token2->type;
1597   cppchar_t c;
1598
1599   if (token1->flags & NAMED_OP)
1600     a = CPP_NAME;
1601   if (token2->flags & NAMED_OP)
1602     b = CPP_NAME;
1603
1604   c = EOF;
1605   if (token2->flags & DIGRAPH)
1606     c = digraph_spellings[(int) b - (int) CPP_FIRST_DIGRAPH][0];
1607   else if (token_spellings[b].category == SPELL_OPERATOR)
1608     c = token_spellings[b].name[0];
1609
1610   /* Quickly get everything that can paste with an '='.  */
1611   if ((int) a <= (int) CPP_LAST_EQ && c == '=')
1612     return 1;
1613
1614   switch (a)
1615     {
1616     case CPP_GREATER:   return c == '>';
1617     case CPP_LESS:      return c == '<' || c == '%' || c == ':';
1618     case CPP_PLUS:      return c == '+';
1619     case CPP_MINUS:     return c == '-' || c == '>';
1620     case CPP_DIV:       return c == '/' || c == '*'; /* Comments.  */
1621     case CPP_MOD:       return c == ':' || c == '>';
1622     case CPP_AND:       return c == '&';
1623     case CPP_OR:        return c == '|';
1624     case CPP_COLON:     return c == ':' || c == '>';
1625     case CPP_DEREF:     return c == '*';
1626     case CPP_DOT:       return c == '.' || c == '%' || b == CPP_NUMBER;
1627     case CPP_HASH:      return c == '#' || c == '%'; /* Digraph form.  */
1628     case CPP_NAME:      return ((b == CPP_NUMBER
1629                                  && name_p (pfile, &token2->val.str))
1630                                 || b == CPP_NAME
1631                                 || b == CPP_CHAR || b == CPP_STRING); /* L */
1632     case CPP_NUMBER:    return (b == CPP_NUMBER || b == CPP_NAME
1633                                 || c == '.' || c == '+' || c == '-');
1634                                       /* UCNs */
1635     case CPP_OTHER:     return ((token1->val.str.text[0] == '\\'
1636                                  && b == CPP_NAME)
1637                                 || (CPP_OPTION (pfile, objc)
1638                                     && token1->val.str.text[0] == '@'
1639                                     && (b == CPP_NAME || b == CPP_STRING)));
1640     default:            break;
1641     }
1642
1643   return 0;
1644 }
1645
1646 /* Output all the remaining tokens on the current line, and a newline
1647    character, to FP.  Leading whitespace is removed.  If there are
1648    macros, special token padding is not performed.  */
1649 void
1650 cpp_output_line (cpp_reader *pfile, FILE *fp)
1651 {
1652   const cpp_token *token;
1653
1654   token = cpp_get_token (pfile);
1655   while (token->type != CPP_EOF)
1656     {
1657       cpp_output_token (token, fp);
1658       token = cpp_get_token (pfile);
1659       if (token->flags & PREV_WHITE)
1660         putc (' ', fp);
1661     }
1662
1663   putc ('\n', fp);
1664 }
1665
1666 /* Return a string representation of all the remaining tokens on the
1667    current line.  The result is allocated using xmalloc and must be
1668    freed by the caller.  */
1669 unsigned char *
1670 cpp_output_line_to_string (cpp_reader *pfile, const unsigned char *dir_name)
1671 {
1672   const cpp_token *token;
1673   unsigned int out = dir_name ? ustrlen (dir_name) : 0;
1674   unsigned int alloced = 120 + out;
1675   unsigned char *result = (unsigned char *) xmalloc (alloced);
1676
1677   /* If DIR_NAME is empty, there are no initial contents.  */
1678   if (dir_name)
1679     {
1680       sprintf ((char *) result, "#%s ", dir_name);
1681       out += 2;
1682     }
1683
1684   token = cpp_get_token (pfile);
1685   while (token->type != CPP_EOF)
1686     {
1687       unsigned char *last;
1688       /* Include room for a possible space and the terminating nul.  */
1689       unsigned int len = cpp_token_len (token) + 2;
1690
1691       if (out + len > alloced)
1692         {
1693           alloced *= 2;
1694           if (out + len > alloced)
1695             alloced = out + len;
1696           result = (unsigned char *) xrealloc (result, alloced);
1697         }
1698
1699       last = cpp_spell_token (pfile, token, &result[out], 0);
1700       out = last - result;
1701
1702       token = cpp_get_token (pfile);
1703       if (token->flags & PREV_WHITE)
1704         result[out++] = ' ';
1705     }
1706
1707   result[out] = '\0';
1708   return result;
1709 }
1710
/* Memory buffers.  Changing these three constants can have a dramatic
   effect on performance.  The values here are reasonable defaults,
   but might be tuned.  If you adjust them, be sure to test across a
   range of uses of cpplib, including heavy nested function-like macro
   expansion.  Also check the change in peak memory usage (NJAMD is a
   good tool for this).

   MIN_BUFF_SIZE is the smallest size used for a new buffer.
   BUFF_SIZE_UPPER_BOUND (MIN_SIZE) is the largest free buffer that is
   still handed out for a request of MIN_SIZE bytes.
   EXTENDED_BUFF_SIZE (BUFF, MIN_EXTRA) is MIN_EXTRA plus twice the
   distance from BUFF->cur to BUFF->limit.  Every macro parameter is
   parenthesized so that any expression can be passed.  */
#define MIN_BUFF_SIZE 8000
#define BUFF_SIZE_UPPER_BOUND(MIN_SIZE) (MIN_BUFF_SIZE + (MIN_SIZE) * 3 / 2)
#define EXTENDED_BUFF_SIZE(BUFF, MIN_EXTRA) \
        ((MIN_EXTRA) + ((BUFF)->limit - (BUFF)->cur) * 2)

#if MIN_BUFF_SIZE > BUFF_SIZE_UPPER_BOUND (0)
  #error BUFF_SIZE_UPPER_BOUND must be at least as large as MIN_BUFF_SIZE!
#endif
1725
1726 /* Create a new allocation buffer.  Place the control block at the end
1727    of the buffer, so that buffer overflows will cause immediate chaos.  */
1728 static _cpp_buff *
1729 new_buff (size_t len)
1730 {
1731   _cpp_buff *result;
1732   unsigned char *base;
1733
1734   if (len < MIN_BUFF_SIZE)
1735     len = MIN_BUFF_SIZE;
1736   len = CPP_ALIGN (len);
1737
1738   base = XNEWVEC (unsigned char, len + sizeof (_cpp_buff));
1739   result = (_cpp_buff *) (base + len);
1740   result->base = base;
1741   result->cur = base;
1742   result->limit = base + len;
1743   result->next = NULL;
1744   return result;
1745 }
1746
1747 /* Place a chain of unwanted allocation buffers on the free list.  */
1748 void
1749 _cpp_release_buff (cpp_reader *pfile, _cpp_buff *buff)
1750 {
1751   _cpp_buff *end = buff;
1752
1753   while (end->next)
1754     end = end->next;
1755   end->next = pfile->free_buffs;
1756   pfile->free_buffs = buff;
1757 }
1758
1759 /* Return a free buffer of size at least MIN_SIZE.  */
1760 _cpp_buff *
1761 _cpp_get_buff (cpp_reader *pfile, size_t min_size)
1762 {
1763   _cpp_buff *result, **p;
1764
1765   for (p = &pfile->free_buffs;; p = &(*p)->next)
1766     {
1767       size_t size;
1768
1769       if (*p == NULL)
1770         return new_buff (min_size);
1771       result = *p;
1772       size = result->limit - result->base;
1773       /* Return a buffer that's big enough, but don't waste one that's
1774          way too big.  */
1775       if (size >= min_size && size <= BUFF_SIZE_UPPER_BOUND (min_size))
1776         break;
1777     }
1778
1779   *p = result->next;
1780   result->next = NULL;
1781   result->cur = result->base;
1782   return result;
1783 }
1784
1785 /* Creates a new buffer with enough space to hold the uncommitted
1786    remaining bytes of BUFF, and at least MIN_EXTRA more bytes.  Copies
1787    the excess bytes to the new buffer.  Chains the new buffer after
1788    BUFF, and returns the new buffer.  */
1789 _cpp_buff *
1790 _cpp_append_extend_buff (cpp_reader *pfile, _cpp_buff *buff, size_t min_extra)
1791 {
1792   size_t size = EXTENDED_BUFF_SIZE (buff, min_extra);
1793   _cpp_buff *new_buff = _cpp_get_buff (pfile, size);
1794
1795   buff->next = new_buff;
1796   memcpy (new_buff->base, buff->cur, BUFF_ROOM (buff));
1797   return new_buff;
1798 }
1799
1800 /* Creates a new buffer with enough space to hold the uncommitted
1801    remaining bytes of the buffer pointed to by BUFF, and at least
1802    MIN_EXTRA more bytes.  Copies the excess bytes to the new buffer.
1803    Chains the new buffer before the buffer pointed to by BUFF, and
1804    updates the pointer to point to the new buffer.  */
1805 void
1806 _cpp_extend_buff (cpp_reader *pfile, _cpp_buff **pbuff, size_t min_extra)
1807 {
1808   _cpp_buff *new_buff, *old_buff = *pbuff;
1809   size_t size = EXTENDED_BUFF_SIZE (old_buff, min_extra);
1810
1811   new_buff = _cpp_get_buff (pfile, size);
1812   memcpy (new_buff->base, old_buff->cur, BUFF_ROOM (old_buff));
1813   new_buff->next = old_buff;
1814   *pbuff = new_buff;
1815 }
1816
1817 /* Free a chain of buffers starting at BUFF.  */
1818 void
1819 _cpp_free_buff (_cpp_buff *buff)
1820 {
1821   _cpp_buff *next;
1822
1823   for (; buff; buff = next)
1824     {
1825       next = buff->next;
1826       free (buff->base);
1827     }
1828 }
1829
1830 /* Allocate permanent, unaligned storage of length LEN.  */
1831 unsigned char *
1832 _cpp_unaligned_alloc (cpp_reader *pfile, size_t len)
1833 {
1834   _cpp_buff *buff = pfile->u_buff;
1835   unsigned char *result = buff->cur;
1836
1837   if (len > (size_t) (buff->limit - result))
1838     {
1839       buff = _cpp_get_buff (pfile, len);
1840       buff->next = pfile->u_buff;
1841       pfile->u_buff = buff;
1842       result = buff->cur;
1843     }
1844
1845   buff->cur = result + len;
1846   return result;
1847 }
1848
1849 /* Allocate permanent, unaligned storage of length LEN from a_buff.
1850    That buffer is used for growing allocations when saving macro
1851    replacement lists in a #define, and when parsing an answer to an
1852    assertion in #assert, #unassert or #if (and therefore possibly
1853    whilst expanding macros).  It therefore must not be used by any
1854    code that they might call: specifically the lexer and the guts of
1855    the macro expander.
1856
1857    All existing other uses clearly fit this restriction: storing
1858    registered pragmas during initialization.  */
1859 unsigned char *
1860 _cpp_aligned_alloc (cpp_reader *pfile, size_t len)
1861 {
1862   _cpp_buff *buff = pfile->a_buff;
1863   unsigned char *result = buff->cur;
1864
1865   if (len > (size_t) (buff->limit - result))
1866     {
1867       buff = _cpp_get_buff (pfile, len);
1868       buff->next = pfile->a_buff;
1869       pfile->a_buff = buff;
1870       result = buff->cur;
1871     }
1872
1873   buff->cur = result + len;
1874   return result;
1875 }
1876
1877 /* Say which field of TOK is in use.  */
1878
1879 enum cpp_token_fld_kind
1880 cpp_token_val_index (cpp_token *tok)
1881 {
1882   switch (TOKEN_SPELL (tok))
1883     {
1884     case SPELL_IDENT:
1885       return CPP_TOKEN_FLD_NODE;
1886     case SPELL_LITERAL:
1887       return CPP_TOKEN_FLD_STR;
1888     case SPELL_NONE:
1889       if (tok->type == CPP_MACRO_ARG)
1890         return CPP_TOKEN_FLD_ARG_NO;
1891       else if (tok->type == CPP_PADDING)
1892         return CPP_TOKEN_FLD_SOURCE;
1893       else if (tok->type == CPP_PRAGMA)
1894         return CPP_TOKEN_FLD_PRAGMA;
1895       /* else fall through */
1896     default:
1897       return CPP_TOKEN_FLD_NONE;
1898     }
1899 }