libcpp/lex.c

   1 /* CPP Library - lexical analysis.
   2    Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005, 2007, 2008 Free Software Foundation, Inc.
   3    Contributed by Per Bothner, 1994-95.
   4    Based on CCCP program by Paul Rubin, June 1986
   5    Adapted to ANSI C, Richard Stallman, Jan 1987
   6    Broken out to separate file, Zack Weinberg, Mar 2000
   7
   8 This program is free software; you can redistribute it and/or modify it
   9 under the terms of the GNU General Public License as published by the
  10 Free Software Foundation; either version 2, or (at your option) any
  11 later version.
  12
  13 This program is distributed in the hope that it will be useful,
  14 but WITHOUT ANY WARRANTY; without even the implied warranty of
  15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  16 GNU General Public License for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with this program; if not, write to the Free Software
  20 Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.  */
  21
  22 #include "config.h"
  23 #include "system.h"
  24 #include "cpplib.h"
  25 #include "internal.h"
  26
  /* Spelling category recorded for each token type in token_spellings.  */
  27 enum spell_type
  28 {
  29   SPELL_OPERATOR = 0,  /* Category of every entry generated by OP().  */
  30   SPELL_IDENT,         /* These three are selected by the second    */
  31   SPELL_LITERAL,       /* argument of a TK() table entry, through   */
  32   SPELL_NONE           /* the SPELL_ ## s token paste.              */
  33 };
  34
  /* One entry of token_spellings: the spelling category of a token
     type together with the text recorded for it.  */
  35 struct token_spelling
  36 {
  37   enum spell_type category;
  38   const unsigned char *name;  /* OP(): the operator text; TK(): the stringified enumerator.  */
  39 };
  40
  /* Spellings of the digraphs.  NOTE(review): the code that indexes
     this array is outside this chunk; confirm the required order
     there before reordering entries.  */
  41 static const unsigned char *const digraph_spellings[] =
  42 { UC"%:", UC"%:%:", UC"<:", UC":>", UC"<%", UC"%>" };
  43
  /* Build token_spellings by expanding TTYPE_TABLE with temporary
     definitions of OP and TK: an OP (e, s) entry becomes
     { SPELL_OPERATOR, s }, a TK (e, s) entry becomes
     { SPELL_<s>, "<e>" }.  Both helper macros are undefined again
     once the table has been generated.  */
  44 #define OP(e, s) { SPELL_OPERATOR, UC s  },
  45 #define TK(e, s) { SPELL_ ## s,    UC #e },
  46 static const struct token_spelling token_spellings[N_TTYPES] = { TTYPE_TABLE };
  47 #undef OP
  48 #undef TK
  49
  /* Look up the spelling category / recorded name of TOKEN, using
     TOKEN->type as the index into token_spellings.  */
  50 #define TOKEN_SPELL(token) (token_spellings[(token)->type].category)
  51 #define TOKEN_NAME(token) (token_spellings[(token)->type].name)
  52
  /* Forward declarations of file-local helpers.  NOTE(review): the
     definition of new_buff is not part of this chunk.  */
  53 static void add_line_note (cpp_buffer *, const uchar *, unsigned int);
  54 static int skip_line_comment (cpp_reader *);
  55 static void skip_whitespace (cpp_reader *, cppchar_t);
  56 static void lex_string (cpp_reader *, cpp_token *, const uchar *);
  57 static void save_comment (cpp_reader *, cpp_token *, const uchar *, cppchar_t);
  58 static void create_literal (cpp_reader *, cpp_token *, const uchar *,
  59                             unsigned int, enum cpp_ttype);
  60 static bool warn_in_comment (cpp_reader *, _cpp_line_note *);
  61 static int name_p (cpp_reader *, const cpp_string *);
  62 static tokenrun *next_tokenrun (tokenrun *);
  63
  64 static _cpp_buff *new_buff (size_t);
  65
  66
  67 /* Utility routine:
  68
  69    Compares, the token TOKEN to the NUL-terminated string STRING.
  70    TOKEN must be a CPP_NAME.  Returns 1 for equal, 0 for unequal.  */
  71 int
  72 cpp_ideq (const cpp_token *token, const char *string)
  73 {
  74   if (token->type != CPP_NAME)
  75     return 0;
  76
  77   return !ustrcmp (NODE_NAME (token->val.node), (const uchar *) string);
  78 }
  79
  80 /* Record a note TYPE at byte POS into the current cleaned logical
  81    line.  */
  82 static void
  83 add_line_note (cpp_buffer *buffer, const uchar *pos, unsigned int type)
  84 {
  85   if (buffer->notes_used == buffer->notes_cap)
  86     {
  87       buffer->notes_cap = buffer->notes_cap * 2 + 200;
  88       buffer->notes = XRESIZEVEC (_cpp_line_note, buffer->notes,
  89                                   buffer->notes_cap);
  90     }
  91
  92   buffer->notes[buffer->notes_used].pos = pos;
  93   buffer->notes[buffer->notes_used].type = type;
  94   buffer->notes_used++;
  95 }
  96
  97 /* Returns with a logical line that contains no escaped newlines or
  98    trigraphs.  This is a time-critical inner loop.  */
  99 void
 100 _cpp_clean_line (cpp_reader *pfile)
 101 {
 102   cpp_buffer *buffer;
 103   const uchar *s;
 104   uchar c, *d, *p;
 105
 106   buffer = pfile->buffer;
 107   buffer->cur_note = buffer->notes_used = 0;
 108   buffer->cur = buffer->line_base = buffer->next_line;
 109   buffer->need_line = false;
 110   s = buffer->next_line - 1;
 111
 112   if (!buffer->from_stage3)
 113     {
 114       const uchar *pbackslash = NULL;
 115
 116       /* Short circuit for the common case of an un-escaped line with
 117          no trigraphs.  The primary win here is by not writing any
 118          data back to memory until we have to.  */
 119       for (;;)
 120         {
 121           c = *++s;
 122           if (__builtin_expect (c == '\n', false)
 123               || __builtin_expect (c == '\r', false))
 124             {
 125               d = (uchar *) s;
 126
 127               if (__builtin_expect (s == buffer->rlimit, false))
 128                 goto done;
 129
 130               /* DOS line ending? */
 131               if (__builtin_expect (c == '\r', false)
 132                   && s[1] == '\n')
 133                 {
 134                   s++;
 135                   if (s == buffer->rlimit)
 136                     goto done;
 137                 }
 138
 139               if (__builtin_expect (pbackslash == NULL, true))
 140                 goto done;
 141
 142               /* Check for escaped newline.  */
 143               p = d;
 144               while (is_nvspace (p[-1]))
 145                 p--;
 146               if (p - 1 != pbackslash)
 147                 goto done;
 148
 149               /* Have an escaped newline; process it and proceed to
 150                  the slow path.  */
 151               add_line_note (buffer, p - 1, p != d ? ' ' : '\\');
 152               d = p - 2;
 153               buffer->next_line = p - 1;
 154               break;
 155             }
 156           if (__builtin_expect (c == '\\', false))
 157             pbackslash = s;
 158           else if (__builtin_expect (c == '?', false)
 159                    && __builtin_expect (s[1] == '?', false)
 160                    && _cpp_trigraph_map[s[2]])
 161             {
 162               /* Have a trigraph.  We may or may not have to convert
 163                  it.  Add a line note regardless, for -Wtrigraphs.  */
 164               add_line_note (buffer, s, s[2]);
 165               if (CPP_OPTION (pfile, trigraphs))
 166                 {
 167                   /* We do, and that means we have to switch to the
 168                      slow path.  */
 169                   d = (uchar *) s;
 170                   *d = _cpp_trigraph_map[s[2]];
 171                   s += 2;
 172                   break;
 173                 }
 174             }
 175         }
 176
 177
 178       for (;;)
 179         {
 180           c = *++s;
 181           *++d = c;
 182
 183           if (c == '\n' || c == '\r')
 184             {
 185                   /* Handle DOS line endings.  */
 186               if (c == '\r' && s != buffer->rlimit && s[1] == '\n')
 187                 s++;
 188               if (s == buffer->rlimit)
 189                 break;
 190
 191               /* Escaped?  */
 192               p = d;
 193               while (p != buffer->next_line && is_nvspace (p[-1]))
 194                 p--;
 195               if (p == buffer->next_line || p[-1] != '\\')
 196                 break;
 197
 198               add_line_note (buffer, p - 1, p != d ? ' ': '\\');
 199               d = p - 2;
 200               buffer->next_line = p - 1;
 201             }
 202           else if (c == '?' && s[1] == '?' && _cpp_trigraph_map[s[2]])
 203             {
 204               /* Add a note regardless, for the benefit of -Wtrigraphs.  */
 205               add_line_note (buffer, d, s[2]);
 206               if (CPP_OPTION (pfile, trigraphs))
 207                 {
 208                   *d = _cpp_trigraph_map[s[2]];
 209                   s += 2;
 210                 }
 211             }
 212         }
 213     }
 214   else
 215     {
 216       do
 217         s++;
 218       while (*s != '\n' && *s != '\r');
 219       d = (uchar *) s;
 220
 221       /* Handle DOS line endings.  */
 222       if (*s == '\r' && s != buffer->rlimit && s[1] == '\n')
 223         s++;
 224     }
 225
 226  done:
 227   *d = '\n';
 228   /* A sentinel note that should never be processed.  */
 229   add_line_note (buffer, d + 1, '\n');
 230   buffer->next_line = s + 1;
 231 }
 232
 233 /* Return true if the trigraph indicated by NOTE should be warned
 234    about in a comment.  */
 235 static bool
 236 warn_in_comment (cpp_reader *pfile, _cpp_line_note *note)
 237 {
 238   const uchar *p;
 239
 240   /* Within comments we don't warn about trigraphs, unless the
 241      trigraph forms an escaped newline, as that may change
 242      behavior.  */
 243   if (note->type != '/')
 244     return false;
 245
 246   /* If -trigraphs, then this was an escaped newline iff the next note
 247      is coincident.  */
 248   if (CPP_OPTION (pfile, trigraphs))
 249     return note[1].pos == note->pos;
 250
 251   /* Otherwise, see if this forms an escaped newline.  */
 252   p = note->pos + 3;
 253   while (is_nvspace (*p))
 254     p++;
 255
 256   /* There might have been escaped newlines between the trigraph and the
 257      newline we found.  Hence the position test.  */
 258   return (*p == '\n' && p < note[1].pos);
 259 }
 260
 261 /* Process the notes created by add_line_note as far as the current
 262    location.  */
 263 void
 264 _cpp_process_line_notes (cpp_reader *pfile, int in_comment)
 265 {
 266   cpp_buffer *buffer = pfile->buffer;
 267
 268   for (;;)
 269     {
 270       _cpp_line_note *note = &buffer->notes[buffer->cur_note];
 271       unsigned int col;
 272
 273       if (note->pos > buffer->cur)
 274         break;
 275
 276       buffer->cur_note++;
 277       col = CPP_BUF_COLUMN (buffer, note->pos + 1);
 278
 279       if (note->type == '\\' || note->type == ' ')
 280         {
 281           if (note->type == ' ' && !in_comment)
 282             cpp_error_with_line (pfile, CPP_DL_WARNING, pfile->line_table->highest_line, col,
 283                                  "backslash and newline separated by space");
 284
 285           if (buffer->next_line > buffer->rlimit)
 286             {
 287               cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line, col,
 288                                    "backslash-newline at end of file");
 289               /* Prevent "no newline at end of file" warning.  */
 290               buffer->next_line = buffer->rlimit;
 291             }
 292
 293           buffer->line_base = note->pos;
 294           CPP_INCREMENT_LINE (pfile, 0);
 295         }
 296       else if (_cpp_trigraph_map[note->type])
 297         {
 298           if (CPP_OPTION (pfile, warn_trigraphs)
 299               && (!in_comment || warn_in_comment (pfile, note)))
 300             {
 301               if (CPP_OPTION (pfile, trigraphs))
 302                 cpp_error_with_line (pfile, CPP_DL_WARNING, pfile->line_table->highest_line, col,
 303                                      "trigraph ??%c converted to %c",
 304                                      note->type,
 305                                      (int) _cpp_trigraph_map[note->type]);
 306               else
 307                 {
 308                   cpp_error_with_line
 309                     (pfile, CPP_DL_WARNING, pfile->line_table->highest_line, col,
 310                      "trigraph ??%c ignored, use -trigraphs to enable",
 311                      note->type);
 312                 }
 313             }
 314         }
 315       else
 316         abort ();
 317     }
 318 }
 319
 320 /* Skip a C-style block comment.  We find the end of the comment by
 321    seeing if an asterisk is before every '/' we encounter.  Returns
 322    nonzero if comment terminated by EOF, zero otherwise.
 323
 324    Buffer->cur points to the initial asterisk of the comment.  */
 325 bool
 326 _cpp_skip_block_comment (cpp_reader *pfile)
 327 {
 328   cpp_buffer *buffer = pfile->buffer;
 329   const uchar *cur = buffer->cur;
 330   uchar c;
 331
 332   cur++;
 333   if (*cur == '/')
 334     cur++;
 335
 336   for (;;)
 337     {
 338       /* People like decorating comments with '*', so check for '/'
 339          instead for efficiency.  */
 340       c = *cur++;
 341
 342       if (c == '/')
 343         {
 344           if (cur[-2] == '*')
 345             break;
 346
 347           /* Warn about potential nested comments, but not if the '/'
 348              comes immediately before the true comment delimiter.
 349              Don't bother to get it right across escaped newlines.  */
 350           if (CPP_OPTION (pfile, warn_comments)
 351               && cur[0] == '*' && cur[1] != '/')
 352             {
 353               buffer->cur = cur;
 354               cpp_error_with_line (pfile, CPP_DL_WARNING,
 355                                    pfile->line_table->highest_line, CPP_BUF_COL (buffer),
 356                                    "\"/*\" within comment");
 357             }
 358         }
 359       else if (c == '\n')
 360         {
 361           unsigned int cols;
 362           buffer->cur = cur - 1;
 363           _cpp_process_line_notes (pfile, true);
 364           if (buffer->next_line >= buffer->rlimit)
 365             return true;
 366           _cpp_clean_line (pfile);
 367
 368           cols = buffer->next_line - buffer->line_base;
 369           CPP_INCREMENT_LINE (pfile, cols);
 370
 371           cur = buffer->cur;
 372         }
 373     }
 374
 375   buffer->cur = cur;
 376   _cpp_process_line_notes (pfile, true);
 377   return false;
 378 }
 379
 380 /* Skip a C++ line comment, leaving buffer->cur pointing to the
 381    terminating newline.  Handles escaped newlines.  Returns nonzero
 382    if a multiline comment.  */
 383 static int
 384 skip_line_comment (cpp_reader *pfile)
 385 {
 386   cpp_buffer *buffer = pfile->buffer;
 387   unsigned int orig_line = pfile->line_table->highest_line;
 388
 389   while (*buffer->cur != '\n')
 390     buffer->cur++;
 391
 392   _cpp_process_line_notes (pfile, true);
 393   return orig_line != pfile->line_table->highest_line;
 394 }
 395
 396 /* Skips whitespace, saving the next non-whitespace character.  */
 397 static void
 398 skip_whitespace (cpp_reader *pfile, cppchar_t c)
 399 {
 400   cpp_buffer *buffer = pfile->buffer;
 401   bool saw_NUL = false;
 402
 403   do
 404     {
 405       /* Horizontal space always OK.  */
 406       if (c == ' ' || c == '\t')
 407         ;
 408       /* Just \f \v or \0 left.  */
 409       else if (c == '\0')
 410         saw_NUL = true;
 411       else if (pfile->state.in_directive && CPP_PEDANTIC (pfile))
 412         cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line,
 413                              CPP_BUF_COL (buffer),
 414                              "%s in preprocessing directive",
 415                              c == '\f' ? "form feed" : "vertical tab");
 416
 417       c = *buffer->cur++;
 418     }
 419   /* We only want non-vertical space, i.e. ' ' \t \f \v \0.  */
 420   while (is_nvspace (c));
 421
 422   if (saw_NUL)
 423     cpp_error (pfile, CPP_DL_WARNING, "null character(s) ignored");
 424
 425   buffer->cur--;
 426 }
 427
 428 /* See if the characters of a number token are valid in a name (no
 429    '.', '+' or '-').  */
 430 static int
 431 name_p (cpp_reader *pfile, const cpp_string *string)
 432 {
 433   unsigned int i;
 434
 435   for (i = 0; i < string->len; i++)
 436     if (!is_idchar (string->text[i]))
 437       return 0;
 438
 439   return 1;
 440 }
 441
 442 /* After parsing an identifier or other sequence, produce a warning about
 443    sequences not in NFC/NFKC.  */
 444 static void
 445 warn_about_normalization (cpp_reader *pfile,
 446                           const cpp_token *token,
 447                           const struct normalize_state *s)
 448 {
 449   if (CPP_OPTION (pfile, warn_normalize) < NORMALIZE_STATE_RESULT (s)
 450       && !pfile->state.skipping)
 451     {
 452       /* Make sure that the token is printed using UCNs, even
 453          if we'd otherwise happily print UTF-8.  */
 454       unsigned char *buf = XNEWVEC (unsigned char, cpp_token_len (token));
 455       size_t sz;
 456
 457       sz = cpp_spell_token (pfile, token, buf, false) - buf;
 458       if (NORMALIZE_STATE_RESULT (s) == normalized_C)
 459         cpp_error_with_line (pfile, CPP_DL_WARNING, token->src_loc, 0,
 460                              "`%.*s' is not in NFKC", (int) sz, buf);
 461       else
 462         cpp_error_with_line (pfile, CPP_DL_WARNING, token->src_loc, 0,
 463                              "`%.*s' is not in NFC", (int) sz, buf);
 464     }
 465 }
 466
 467 /* Returns TRUE if the sequence starting at buffer->cur is invalid in
 468    an identifier.  FIRST is TRUE if this starts an identifier.  */
 469 static bool
 470 forms_identifier_p (cpp_reader *pfile, int first,
 471                     struct normalize_state *state)
 472 {
 473   cpp_buffer *buffer = pfile->buffer;
 474
 475   if (*buffer->cur == '$')
 476     {
 477       if (!CPP_OPTION (pfile, dollars_in_ident))
 478         return false;
 479
 480       buffer->cur++;
 481       if (CPP_OPTION (pfile, warn_dollars) && !pfile->state.skipping)
 482         {
 483           CPP_OPTION (pfile, warn_dollars) = 0;
 484           cpp_error (pfile, CPP_DL_PEDWARN, "'$' in identifier or number");
 485         }
 486
 487       return true;
 488     }
 489
 490   /* Is this a syntactically valid UCN?  */
 491   if (CPP_OPTION (pfile, extended_identifiers)
 492       && *buffer->cur == '\\'
 493       && (buffer->cur[1] == 'u' || buffer->cur[1] == 'U'))
 494     {
 495       buffer->cur += 2;
 496       if (_cpp_valid_ucn (pfile, &buffer->cur, buffer->rlimit, 1 + !first,
 497                           state))
 498         return true;
 499       buffer->cur -= 2;
 500     }
 501
 502   return false;
 503 }
 504
 505 /* Lex an identifier starting at BUFFER->CUR - 1.  */
 506 static cpp_hashnode *
 507 lex_identifier (cpp_reader *pfile, const uchar *base, bool starts_ucn,
 508                 struct normalize_state *nst)
 509 {
 510   cpp_hashnode *result;
 511   const uchar *cur;
 512   unsigned int len;
 513   unsigned int hash = HT_HASHSTEP (0, *base);
 514
 515   cur = pfile->buffer->cur;
 516   if (! starts_ucn)
 517     while (ISIDNUM (*cur))
 518       {
 519         hash = HT_HASHSTEP (hash, *cur);
 520         cur++;
 521       }
 522   pfile->buffer->cur = cur;
 523   if (starts_ucn || forms_identifier_p (pfile, false, nst))
 524     {
 525       /* Slower version for identifiers containing UCNs (or $).  */
 526       do {
 527         while (ISIDNUM (*pfile->buffer->cur))
 528           {
 529             pfile->buffer->cur++;
 530             NORMALIZE_STATE_UPDATE_IDNUM (nst);
 531           }
 532       } while (forms_identifier_p (pfile, false, nst));
 533       result = _cpp_interpret_identifier (pfile, base,
 534                                           pfile->buffer->cur - base);
 535     }
 536   else
 537     {
 538       len = cur - base;
 539       hash = HT_HASHFINISH (hash, len);
 540
 541       result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
 542                                                   base, len, hash, HT_ALLOC));
 543     }
 544
 545   /* Rarely, identifiers require diagnostics when lexed.  */
 546   if (__builtin_expect ((result->flags & NODE_DIAGNOSTIC)
 547                         && !pfile->state.skipping, 0))
 548     {
 549       /* It is allowed to poison the same identifier twice.  */
 550       if ((result->flags & NODE_POISONED) && !pfile->state.poisoned_ok)
 551         cpp_error (pfile, CPP_DL_ERROR, "attempt to use poisoned \"%s\"",
 552                    NODE_NAME (result));
 553
 554       /* Constraint 6.10.3.5: __VA_ARGS__ should only appear in the
 555          replacement list of a variadic macro.  */
 556       if (result == pfile->spec_nodes.n__VA_ARGS__
 557           && !pfile->state.va_args_ok)
 558         cpp_error (pfile, CPP_DL_PEDWARN,
 559                    "__VA_ARGS__ can only appear in the expansion"
 560                    " of a C99 variadic macro");
 561     }
 562
 563   return result;
 564 }
 565
 566 /* Lex a number to NUMBER starting at BUFFER->CUR - 1.  */
 567 static void
 568 lex_number (cpp_reader *pfile, cpp_string *number,
 569             struct normalize_state *nst)
 570 {
 571   const uchar *cur;
 572   const uchar *base;
 573   uchar *dest;
 574
 575   base = pfile->buffer->cur - 1;
 576   do
 577     {
 578       cur = pfile->buffer->cur;
 579
 580       /* N.B. ISIDNUM does not include $.  */
 581       while (ISIDNUM (*cur) || *cur == '.' || VALID_SIGN (*cur, cur[-1]))
 582         {
 583           cur++;
 584           NORMALIZE_STATE_UPDATE_IDNUM (nst);
 585         }
 586
 587       pfile->buffer->cur = cur;
 588     }
 589   while (forms_identifier_p (pfile, false, nst));
 590
 591   number->len = cur - base;
 592   dest = _cpp_unaligned_alloc (pfile, number->len + 1);
 593   memcpy (dest, base, number->len);
 594   dest[number->len] = '\0';
 595   number->text = dest;
 596 }
 597
 598 /* Create a token of type TYPE with a literal spelling.  */
 599 static void
 600 create_literal (cpp_reader *pfile, cpp_token *token, const uchar *base,
 601                 unsigned int len, enum cpp_ttype type)
 602 {
 603   uchar *dest = _cpp_unaligned_alloc (pfile, len + 1);
 604
 605   memcpy (dest, base, len);
 606   dest[len] = '\0';
 607   token->type = type;
 608   token->val.str.len = len;
 609   token->val.str.text = dest;
 610 }
 611
 612 /* Lexes a string, character constant, or angle-bracketed header file
 613    name.  The stored string contains the spelling, including opening
 614    quote and leading any leading 'L', 'u' or 'U'.  It returns the type
 615    of the literal, or CPP_OTHER if it was not properly terminated.
 616
 617    The spelling is NUL-terminated, but it is not guaranteed that this
 618    is the first NUL since embedded NULs are preserved.  */
 619 static void
 620 lex_string (cpp_reader *pfile, cpp_token *token, const uchar *base)
 621 {
 622   bool saw_NUL = false;
 623   const uchar *cur;
 624   cppchar_t terminator;
 625   enum cpp_ttype type;
 626
 627   cur = base;
 628   terminator = *cur++;
 629   if (terminator == 'L' || terminator == 'u' || terminator == 'U')
 630     terminator = *cur++;
 631   if (terminator == '\"')
 632     type = (*base == 'L' ? CPP_WSTRING :
 633             *base == 'U' ? CPP_STRING32 :
 634             *base == 'u' ? CPP_STRING16 : CPP_STRING);
 635   else if (terminator == '\'')
 636     type = (*base == 'L' ? CPP_WCHAR :
 637             *base == 'U' ? CPP_CHAR32 :
 638             *base == 'u' ? CPP_CHAR16 : CPP_CHAR);
 639   else
 640     terminator = '>', type = CPP_HEADER_NAME;
 641
 642   for (;;)
 643     {
 644       cppchar_t c = *cur++;
 645
 646       /* In #include-style directives, terminators are not escapable.  */
 647       if (c == '\\' && !pfile->state.angled_headers && *cur != '\n')
 648         cur++;
 649       else if (c == terminator)
 650         break;
 651       else if (c == '\n')
 652         {
 653           cur--;
 654           type = CPP_OTHER;
 655           break;
 656         }
 657       else if (c == '\0')
 658         saw_NUL = true;
 659     }
 660
 661   if (saw_NUL && !pfile->state.skipping)
 662     cpp_error (pfile, CPP_DL_WARNING,
 663                "null character(s) preserved in literal");
 664
 665   if (type == CPP_OTHER && CPP_OPTION (pfile, lang) != CLK_ASM)
 666     cpp_error (pfile, CPP_DL_PEDWARN, "missing terminating %c character",
 667                (int) terminator);
 668
 669   pfile->buffer->cur = cur;
 670   create_literal (pfile, token, base, cur - base, type);
 671 }
 672
 673 /* The stored comment includes the comment start and any terminator.  */
 674 static void
 675 save_comment (cpp_reader *pfile, cpp_token *token, const unsigned char *from,
 676               cppchar_t type)
 677 {
 678   unsigned char *buffer;
 679   unsigned int len, clen;
 680
 681   len = pfile->buffer->cur - from + 1; /* + 1 for the initial '/'.  */
 682
 683   /* C++ comments probably (not definitely) have moved past a new
 684      line, which we don't want to save in the comment.  */
 685   if (is_vspace (pfile->buffer->cur[-1]))
 686     len--;
 687
 688   /* If we are currently in a directive, then we need to store all
 689      C++ comments as C comments internally, and so we need to
 690      allocate a little extra space in that case.
 691
 692      Note that the only time we encounter a directive here is
 693      when we are saving comments in a "#define".  */
 694   clen = (pfile->state.in_directive && type == '/') ? len + 2 : len;
 695
 696   buffer = _cpp_unaligned_alloc (pfile, clen);
 697
 698   token->type = CPP_COMMENT;
 699   token->val.str.len = clen;
 700   token->val.str.text = buffer;
 701
 702   buffer[0] = '/';
 703   memcpy (buffer + 1, from, len - 1);
 704
 705   /* Finish conversion to a C comment, if necessary.  */
 706   if (pfile->state.in_directive && type == '/')
 707     {
 708       buffer[1] = '*';
 709       buffer[clen - 2] = '*';
 710       buffer[clen - 1] = '/';
 711     }
 712 }
 713
 714 /* Allocate COUNT tokens for RUN.  */
 715 void
 716 _cpp_init_tokenrun (tokenrun *run, unsigned int count)
 717 {
 718   run->base = XNEWVEC (cpp_token, count);
 719   run->limit = run->base + count;
 720   run->next = NULL;
 721 }
 722
 723 /* Returns the next tokenrun, or creates one if there is none.  */
 724 static tokenrun *
 725 next_tokenrun (tokenrun *run)
 726 {
 727   if (run->next == NULL)
 728     {
 729       run->next = XNEW (tokenrun);
 730       run->next->prev = run;
 731       _cpp_init_tokenrun (run->next, 250);
 732     }
 733
 734   return run->next;
 735 }
 736
 737 /* Allocate a single token that is invalidated at the same time as the
 738    rest of the tokens on the line.  Has its line and col set to the
 739    same as the last lexed token, so that diagnostics appear in the
 740    right place.  */
 741 cpp_token *
 742 _cpp_temp_token (cpp_reader *pfile)
 743 {
 744   cpp_token *old, *result;
 745
 746   old = pfile->cur_token - 1;
 747   if (pfile->cur_token == pfile->cur_run->limit)
 748     {
 749       pfile->cur_run = next_tokenrun (pfile->cur_run);
 750       pfile->cur_token = pfile->cur_run->base;
 751     }
 752
 753   result = pfile->cur_token++;
 754   result->src_loc = old->src_loc;
 755   return result;
 756 }
 757
 758 /* Lex a token into RESULT (external interface).  Takes care of issues
 759    like directive handling, token lookahead, multiple include
 760    optimization and skipping.  */
 761 const cpp_token *
 762 _cpp_lex_token (cpp_reader *pfile)
 763 {
 764   cpp_token *result;
 765
 766   for (;;)
 767     {
 768       if (pfile->cur_token == pfile->cur_run->limit)
 769         {
 770           pfile->cur_run = next_tokenrun (pfile->cur_run);
 771           pfile->cur_token = pfile->cur_run->base;
 772         }
 773       /* We assume that the current token is somewhere in the current
 774          run.  */
 775       if (pfile->cur_token < pfile->cur_run->base
 776           || pfile->cur_token >= pfile->cur_run->limit)
 777         abort ();
 778
 779       if (pfile->lookaheads)
 780         {
 781           pfile->lookaheads--;
 782           result = pfile->cur_token++;
 783         }
 784       else
 785         result = _cpp_lex_direct (pfile);
 786
 787       if (result->flags & BOL)
 788         {
 789           /* Is this a directive.  If _cpp_handle_directive returns
 790              false, it is an assembler #.  */
 791           if (result->type == CPP_HASH
 792               /* 6.10.3 p 11: Directives in a list of macro arguments
 793                  gives undefined behavior.  This implementation
 794                  handles the directive as normal.  */
 795               && pfile->state.parsing_args != 1)
 796             {
 797               if (_cpp_handle_directive (pfile, result->flags & PREV_WHITE))
 798                 {
 799                   if (pfile->directive_result.type == CPP_PADDING)
 800                     continue;
 801                   result = &pfile->directive_result;
 802                 }
 803             }
 804           else if (pfile->state.in_deferred_pragma)
 805             result = &pfile->directive_result;
 806
 807           if (pfile->cb.line_change && !pfile->state.skipping)
 808             pfile->cb.line_change (pfile, result, pfile->state.parsing_args);
 809         }
 810
 811       /* We don't skip tokens in directives.  */
 812       if (pfile->state.in_directive || pfile->state.in_deferred_pragma)
 813         break;
 814
 815       /* Outside a directive, invalidate controlling macros.  At file
 816          EOF, _cpp_lex_direct takes care of popping the buffer, so we never
 817          get here and MI optimization works.  */
 818       pfile->mi_valid = false;
 819
 820       if (!pfile->state.skipping || result->type == CPP_EOF)
 821         break;
 822     }
 823
 824   return result;
 825 }
 826
 827 /* Returns true if a fresh line has been loaded.  */
 828 bool
 829 _cpp_get_fresh_line (cpp_reader *pfile)
 830 {
 831   int return_at_eof;
 832
 833   /* We can't get a new line until we leave the current directive.  */
 834   if (pfile->state.in_directive)
 835     return false;
 836
 837   for (;;)
 838     {
 839       cpp_buffer *buffer = pfile->buffer;
 840
 841       if (!buffer->need_line)
 842         return true;
 843
 844       if (buffer->next_line < buffer->rlimit)
 845         {
 846           _cpp_clean_line (pfile);
 847           return true;
 848         }
 849
 850       /* First, get out of parsing arguments state.  */
 851       if (pfile->state.parsing_args)
 852         return false;
 853
 854       /* End of buffer.  Non-empty files should end in a newline.  */
 855       if (buffer->buf != buffer->rlimit
 856           && buffer->next_line > buffer->rlimit
 857           && !buffer->from_stage3)
 858         {
 859           /* Clip to buffer size.  */
 860           buffer->next_line = buffer->rlimit;
 861         }
 862
 863       return_at_eof = buffer->return_at_eof;
 864       _cpp_pop_buffer (pfile);
 865       if (pfile->buffer == NULL || return_at_eof)
 866         return false;
 867     }
 868 }
 869
 /* Set result->type to ELSE_TYPE; then, if the next character in the
    buffer equals CHAR, consume it and set result->type to THEN_TYPE
    instead.  Relies on variables named `result' and `buffer' being in
    scope where the macro is expanded.  */
 870 #define IF_NEXT_IS(CHAR, THEN_TYPE, ELSE_TYPE)          \
 871   do                                                    \
 872     {                                                   \
 873       result->type = ELSE_TYPE;                         \
 874       if (*buffer->cur == CHAR)                         \
 875         buffer->cur++, result->type = THEN_TYPE;        \
 876     }                                                   \
 877   while (0)
 878
 879 /* Lex a token into pfile->cur_token, which is also incremented, to
 880    get diagnostics pointing to the correct location.
 881
 882    Does not handle issues such as token lookahead, multiple-include
 883    optimization, directives, skipping etc.  This function is only
 884    suitable for use by _cpp_lex_token, and in special cases like
 885    lex_expansion_token which doesn't care for any of these issues.
 886
 887    When meeting a newline, returns CPP_EOF if parsing a directive,
 888    otherwise returns to the start of the token buffer if permissible.
 889    Returns a pointer to the lexed token.  */
 890 cpp_token *
 891 _cpp_lex_direct (cpp_reader *pfile)
 892 {
 893   cppchar_t c;
 894   cpp_buffer *buffer;
 895   const unsigned char *comment_start;
 896   cpp_token *result = pfile->cur_token++;
 897
 898  fresh_line:
 899   result->flags = 0;
 900   buffer = pfile->buffer;
 901   if (buffer->need_line)
 902     {
 903       if (pfile->state.in_deferred_pragma)
 904         {
 905           result->type = CPP_PRAGMA_EOL;
 906           pfile->state.in_deferred_pragma = false;
 907           if (!pfile->state.pragma_allow_expansion)
 908             pfile->state.prevent_expansion--;
 909           return result;
 910         }
 911       if (!_cpp_get_fresh_line (pfile))
 912         {
 913           result->type = CPP_EOF;
 914           if (!pfile->state.in_directive)
 915             {
 916               /* Tell the compiler the line number of the EOF token.  */
 917               result->src_loc = pfile->line_table->highest_line;
 918               result->flags = BOL;
 919             }
 920           return result;
 921         }
 922       if (!pfile->keep_tokens)
 923         {
 924           pfile->cur_run = &pfile->base_run;
 925           result = pfile->base_run.base;
 926           pfile->cur_token = result + 1;
 927         }
 928       result->flags = BOL;
 929       if (pfile->state.parsing_args == 2)
 930         result->flags |= PREV_WHITE;
 931     }
 932   buffer = pfile->buffer;
 933  update_tokens_line:
 934   result->src_loc = pfile->line_table->highest_line;
 935
 936  skipped_white:
 937   if (buffer->cur >= buffer->notes[buffer->cur_note].pos
 938       && !pfile->overlaid_buffer)
 939     {
 940       _cpp_process_line_notes (pfile, false);
 941       result->src_loc = pfile->line_table->highest_line;
 942     }
 943   c = *buffer->cur++;  /* buffer->cur now points just past C.  */
 944
 945   LINEMAP_POSITION_FOR_COLUMN (result->src_loc, pfile->line_table,
 946                                CPP_BUF_COLUMN (buffer, buffer->cur));
 947
 948   switch (c)
 949     {
 950     case ' ': case '\t': case '\f': case '\v': case '\0':
 951       result->flags |= PREV_WHITE;
 952       skip_whitespace (pfile, c);
 953       goto skipped_white;
 954
 955     case '\n':
 956       if (buffer->cur < buffer->rlimit)
 957         CPP_INCREMENT_LINE (pfile, 0);
 958       buffer->need_line = true;
 959       goto fresh_line;
 960
 961     case '0': case '1': case '2': case '3': case '4':
 962     case '5': case '6': case '7': case '8': case '9':
 963       {
 964         struct normalize_state nst = INITIAL_NORMALIZE_STATE;
 965         result->type = CPP_NUMBER;
 966         lex_number (pfile, &result->val.str, &nst);
 967         warn_about_normalization (pfile, result, &nst);
 968         break;
 969       }
 970
 971     case 'L':
 972     case 'u':
 973     case 'U':
 974       /* 'L', 'u' or 'U' may introduce wide characters or strings.  */
 975       if (c == 'L' || CPP_OPTION (pfile, uliterals))
 976         {
 977           if (*buffer->cur == '\'' || *buffer->cur == '"')
 978             {
 979               lex_string (pfile, result, buffer->cur - 1);
 980               break;
 981             }
 982         }
 983       /* Fall through.  */
 984
 985     case '_':
 986     case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
 987     case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
 988     case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
 989     case 's': case 't':           case 'v': case 'w': case 'x':
 990     case 'y': case 'z':
 991     case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
 992     case 'G': case 'H': case 'I': case 'J': case 'K':
 993     case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
 994     case 'S': case 'T':           case 'V': case 'W': case 'X':
 995     case 'Y': case 'Z':
 996       result->type = CPP_NAME;
 997       {
 998         struct normalize_state nst = INITIAL_NORMALIZE_STATE;
 999         result->val.node = lex_identifier (pfile, buffer->cur - 1, false,
1000                                            &nst);
1001         warn_about_normalization (pfile, result, &nst);
1002       }
1003
1004       /* Convert named operators to their proper types.  */
1005       if (result->val.node->flags & NODE_OPERATOR)
1006         {
1007           result->flags |= NAMED_OP;
1008           result->type = (enum cpp_ttype) result->val.node->directive_index;
1009         }
1010       break;
1011
1012     case '\'':
1013     case '"':
1014       lex_string (pfile, result, buffer->cur - 1);
1015       break;
1016
1017     case '/':
1018       /* A potential block or line comment.  */
1019       comment_start = buffer->cur;
1020       c = *buffer->cur;
1021
1022       if (c == '*')
1023         {
1024           if (_cpp_skip_block_comment (pfile))
1025             cpp_error (pfile, CPP_DL_ERROR, "unterminated comment");
1026         }
1027       else if (c == '/' && (CPP_OPTION (pfile, cplusplus_comments)
1028                             || cpp_in_system_header (pfile)))
1029         {
1030           /* Warn about comments only if pedantically GNUC89, and not
1031              in system headers.  */
1032           if (CPP_OPTION (pfile, lang) == CLK_GNUC89 && CPP_PEDANTIC (pfile)
1033               && ! buffer->warned_cplusplus_comments)
1034             {
1035               cpp_error (pfile, CPP_DL_PEDWARN,
1036                          "C++ style comments are not allowed in ISO C90");
1037               cpp_error (pfile, CPP_DL_PEDWARN,
1038                          "(this will be reported only once per input file)");
1039               buffer->warned_cplusplus_comments = 1;
1040             }
1041
1042           if (skip_line_comment (pfile) && CPP_OPTION (pfile, warn_comments))
1043             cpp_error (pfile, CPP_DL_WARNING, "multi-line comment");
1044         }
1045       else if (c == '=')
1046         {
1047           buffer->cur++;
1048           result->type = CPP_DIV_EQ;
1049           break;
1050         }
1051       else
1052         {
1053           result->type = CPP_DIV;
1054           break;
1055         }
1056       /* Only the two comment branches above get this far.  */
1057       if (!pfile->state.save_comments)
1058         {
1059           result->flags |= PREV_WHITE;
1060           goto update_tokens_line;
1061         }
1062
1063       /* Save the comment as a token in its own right.  */
1064       save_comment (pfile, result, comment_start, c);
1065       break;
1066
1067     case '<':
1068       if (pfile->state.angled_headers)
1069         {
1070           lex_string (pfile, result, buffer->cur - 1);
1071           break;
1072         }
1073
1074       result->type = CPP_LESS;
1075       if (*buffer->cur == '=')
1076         buffer->cur++, result->type = CPP_LESS_EQ;
1077       else if (*buffer->cur == '<')
1078         {
1079           buffer->cur++;
1080           IF_NEXT_IS ('=', CPP_LSHIFT_EQ, CPP_LSHIFT);
1081         }
1082       else if (CPP_OPTION (pfile, digraphs))
1083         {
1084           if (*buffer->cur == ':')
1085             {
1086               buffer->cur++;
1087               result->flags |= DIGRAPH;
1088               result->type = CPP_OPEN_SQUARE;
1089             }
1090           else if (*buffer->cur == '%')
1091             {
1092               buffer->cur++;
1093               result->flags |= DIGRAPH;
1094               result->type = CPP_OPEN_BRACE;
1095             }
1096         }
1097       break;
1098
1099     case '>':
1100       result->type = CPP_GREATER;
1101       if (*buffer->cur == '=')
1102         buffer->cur++, result->type = CPP_GREATER_EQ;
1103       else if (*buffer->cur == '>')
1104         {
1105           buffer->cur++;
1106           IF_NEXT_IS ('=', CPP_RSHIFT_EQ, CPP_RSHIFT);
1107         }
1108       break;
1109
1110     case '%':
1111       result->type = CPP_MOD;
1112       if (*buffer->cur == '=')
1113         buffer->cur++, result->type = CPP_MOD_EQ;
1114       else if (CPP_OPTION (pfile, digraphs))
1115         {
1116           if (*buffer->cur == ':')
1117             {
1118               buffer->cur++;
1119               result->flags |= DIGRAPH;
1120               result->type = CPP_HASH;
1121               if (*buffer->cur == '%' && buffer->cur[1] == ':')
1122                 buffer->cur += 2, result->type = CPP_PASTE;
1123             }
1124           else if (*buffer->cur == '>')
1125             {
1126               buffer->cur++;
1127               result->flags |= DIGRAPH;
1128               result->type = CPP_CLOSE_BRACE;
1129             }
1130         }
1131       break;
1132
1133     case '.':
1134       result->type = CPP_DOT;
1135       if (ISDIGIT (*buffer->cur))
1136         {
1137           struct normalize_state nst = INITIAL_NORMALIZE_STATE;
1138           result->type = CPP_NUMBER;
1139           lex_number (pfile, &result->val.str, &nst);
1140           warn_about_normalization (pfile, result, &nst);
1141         }
1142       else if (*buffer->cur == '.' && buffer->cur[1] == '.')
1143         buffer->cur += 2, result->type = CPP_ELLIPSIS;
1144       else if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
1145         buffer->cur++, result->type = CPP_DOT_STAR;
1146       break;
1147
1148     case '+':
1149       result->type = CPP_PLUS;
1150       if (*buffer->cur == '+')
1151         buffer->cur++, result->type = CPP_PLUS_PLUS;
1152       else if (*buffer->cur == '=')
1153         buffer->cur++, result->type = CPP_PLUS_EQ;
1154       break;
1155
1156     case '-':
1157       result->type = CPP_MINUS;
1158       if (*buffer->cur == '>')
1159         {
1160           buffer->cur++;
1161           result->type = CPP_DEREF;
1162           if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
1163             buffer->cur++, result->type = CPP_DEREF_STAR;
1164         }
1165       else if (*buffer->cur == '-')
1166         buffer->cur++, result->type = CPP_MINUS_MINUS;
1167       else if (*buffer->cur == '=')
1168         buffer->cur++, result->type = CPP_MINUS_EQ;
1169       break;
1170
1171     case '&':
1172       result->type = CPP_AND;
1173       if (*buffer->cur == '&')
1174         buffer->cur++, result->type = CPP_AND_AND;
1175       else if (*buffer->cur == '=')
1176         buffer->cur++, result->type = CPP_AND_EQ;
1177       break;
1178
1179     case '|':
1180       result->type = CPP_OR;
1181       if (*buffer->cur == '|')
1182         buffer->cur++, result->type = CPP_OR_OR;
1183       else if (*buffer->cur == '=')
1184         buffer->cur++, result->type = CPP_OR_EQ;
1185       break;
1186
1187     case ':':
1188       result->type = CPP_COLON;
1189       if (*buffer->cur == ':' && CPP_OPTION (pfile, cplusplus))
1190         buffer->cur++, result->type = CPP_SCOPE;
1191       else if (*buffer->cur == '>' && CPP_OPTION (pfile, digraphs))
1192         {
1193           buffer->cur++;
1194           result->flags |= DIGRAPH;
1195           result->type = CPP_CLOSE_SQUARE;
1196         }
1197       break;
1198
1199     case '*': IF_NEXT_IS ('=', CPP_MULT_EQ, CPP_MULT); break;
1200     case '=': IF_NEXT_IS ('=', CPP_EQ_EQ, CPP_EQ); break;
1201     case '!': IF_NEXT_IS ('=', CPP_NOT_EQ, CPP_NOT); break;
1202     case '^': IF_NEXT_IS ('=', CPP_XOR_EQ, CPP_XOR); break;
1203     case '#': IF_NEXT_IS ('#', CPP_PASTE, CPP_HASH); break;
1204
1205     case '?': result->type = CPP_QUERY; break;
1206     case '~': result->type = CPP_COMPL; break;
1207     case ',': result->type = CPP_COMMA; break;
1208     case '(': result->type = CPP_OPEN_PAREN; break;
1209     case ')': result->type = CPP_CLOSE_PAREN; break;
1210     case '[': result->type = CPP_OPEN_SQUARE; break;
1211     case ']': result->type = CPP_CLOSE_SQUARE; break;
1212     case '{': result->type = CPP_OPEN_BRACE; break;
1213     case '}': result->type = CPP_CLOSE_BRACE; break;
1214     case ';': result->type = CPP_SEMICOLON; break;
1215
1216       /* @ is a punctuator in Objective-C.  */
1217     case '@': result->type = CPP_ATSIGN; break;
1218
1219     case '$':
1220     case '\\':
1221       {
1222         const uchar *base = --buffer->cur;
1223         struct normalize_state nst = INITIAL_NORMALIZE_STATE;
1224
1225         if (forms_identifier_p (pfile, true, &nst))
1226           {
1227             result->type = CPP_NAME;
1228             result->val.node = lex_identifier (pfile, base, true, &nst);
1229             warn_about_normalization (pfile, result, &nst);
1230             break;
1231           }
1232         buffer->cur++;  /* Step past the character again.  */
1233       }
1234       /* Fall through.  */
1235     default:
1236       create_literal (pfile, result, buffer->cur - 1, 1, CPP_OTHER);
1237       break;
1238     }
1239
1240   return result;
1241 }
1242
1243 /* An upper bound on the number of bytes needed to spell TOKEN, not
1244    including preceding whitespace.  A named operator is spelt from
1245    its identifier node by cpp_spell_token, so it is sized like an
1246    identifier: 10 bytes (one \U escape) for each byte of its name.  */
1247 unsigned int
1248 cpp_token_len (const cpp_token *token)
1249 {
1250   enum spell_type spell = TOKEN_SPELL (token);
1251
1252   if (spell == SPELL_LITERAL)
1253     return token->val.str.len;
1254   if (spell == SPELL_IDENT
1255       || (spell == SPELL_OPERATOR && (token->flags & NAMED_OP)))
1256     return NODE_LEN (token->val.node) * 10;
1257   return 4;  /* Enough for the longest digraph, "%:%:".  */
1258 }
1259
1260 /* Decode the UTF-8 sequence that starts at NAME and store the
1261    equivalent ten-byte \U escape (\U plus eight lowercase hex digits)
1262    in BUFFER.  Returns how many bytes of NAME were consumed.  */
1263
1264 static size_t
1265 utf8_to_ucn (unsigned char *buffer, const unsigned char *name)
1266 {
1267   static const char hex_digits[] = "0123456789abcdef";
1268   const unsigned char lead = *name;
1269   unsigned long code_point;
1270   unsigned int bit;
1271   int seq_len = 0, idx, shift;
1272
1273   /* The count of leading one bits in the first byte is the length
1274      of the whole sequence.  */
1275   for (bit = 0x80; bit != 0 && (lead & bit) != 0; bit >>= 1)
1276     seq_len++;
1277
1278   /* The rest of the first byte holds the topmost payload bits.  */
1279   code_point = lead & (0x7F >> seq_len);
1280   for (idx = 1; idx < seq_len; idx++)
1281     {
1282       /* A trailing byte that is not 10xxxxxx is ill-formed UTF-8.  */
1283       if ((name[idx] & 0xC0) != 0x80)
1284         abort ();
1285       code_point = (code_point << 6) | (name[idx] & 0x3F);
1286     }
1287   *buffer++ = '\\';
1288   *buffer++ = 'U';
1289   for (shift = 28; shift >= 0; shift -= 4)
1290     *buffer++ = hex_digits[(code_point >> shift) & 0xF];
1291   return seq_len;
1292 }
1293
1294
1295 /* Write the spelling of TOKEN to BUFFER, which must already have
1296    enough space to hold it, and return a pointer to the character
1297    after the last character written.  FORSTRING is true if this is
1298    to be the spelling after translation phase 1: identifier bytes
1299    are then copied as they are rather than rewritten as UCNs.
1300    FIXME: Would be nice if we didn't need the PFILE argument.  */
1301 unsigned char *
1302 cpp_spell_token (cpp_reader *pfile, const cpp_token *token,
1303                  unsigned char *buffer, bool forstring)
1304 {
1305   enum spell_type how = TOKEN_SPELL (token);
1306
1307   /* A named operator that is not a digraph is spelt through its
1308      identifier node, exactly like an ordinary identifier.  */
1309   if (how == SPELL_OPERATOR
1310       && (token->flags & (DIGRAPH | NAMED_OP)) == NAMED_OP)
1311     how = SPELL_IDENT;
1312
1313   switch (how)
1314     {
1315     case SPELL_OPERATOR:
1316       {
1317         const unsigned char *spelling = TOKEN_NAME (token);
1318         if (token->flags & DIGRAPH)
1319           spelling = digraph_spellings[(int) token->type
1320                                        - (int) CPP_FIRST_DIGRAPH];
1321         for (; *spelling != '\0'; spelling++)
1322           *buffer++ = *spelling;
1323       }
1324       break;
1325
1326     case SPELL_IDENT:
1327       {
1328         const unsigned char *name = NODE_NAME (token->val.node);
1329         size_t name_len = NODE_LEN (token->val.node);
1330         size_t pos = 0;
1331
1332         if (forstring)
1333           {
1334             memcpy (buffer, name, name_len);
1335             buffer += name_len;
1336             break;
1337           }
1338         while (pos < name_len)
1339           if (name[pos] & ~0x7F)
1340             {
1341               /* Non-ASCII: the sequence becomes one 10-byte \U escape.  */
1342               pos += utf8_to_ucn (buffer, name + pos);
1343               buffer += 10;
1344             }
1345           else
1346             *buffer++ = name[pos++];
1347       }
1348       break;
1349
1350     case SPELL_LITERAL:
1351       memcpy (buffer, token->val.str.text, token->val.str.len);
1352       buffer += token->val.str.len;
1353       break;
1354
1355     case SPELL_NONE:
1356       cpp_error (pfile, CPP_DL_ICE, "unspellable token %s", TOKEN_NAME (token));
1357       break;
1358     }
1359
1360   return buffer;
1361 }
1362
1363 /* Returns TOKEN spelt as a NUL-terminated string.  The string is
1364    freed when the reader is destroyed.  Handy for diagnostics.  */
1365 unsigned char *
1366 cpp_token_as_text (cpp_reader *pfile, const cpp_token *token)
1367 {
1368   /* One byte beyond the spelling bound, for the terminator.  */
1369   unsigned int room = cpp_token_len (token) + 1;
1370   unsigned char *text = _cpp_unaligned_alloc (pfile, room);
1371   unsigned char *tail = cpp_spell_token (pfile, token, text, false);
1372
1373   *tail = '\0';
1374   return text;
1375 }
1376
1377 /* Return the name recorded in token_spellings for TYPE.  Used by C
1378    front ends, which really should move to cpp_token_as_text.  */
1379 const char *
1380 cpp_type2name (enum cpp_ttype type)
1381 {
1382   return (const char *) (token_spellings + type)->name;
1383 }
1384
1385 /* Write the spelling of TOKEN to FP with no preceding space.  Kept
1386    separate from cpp_spell_token for efficiency - to avoid stdio
1387    double-buffering.  */
1388 void
1389 cpp_output_token (const cpp_token *token, FILE *fp)
1390 {
1391   enum spell_type how = TOKEN_SPELL (token);
1392
1393   /* A named operator that is not a digraph is written out through
1394      its identifier node, just like an ordinary identifier.  */
1395   if (how == SPELL_OPERATOR
1396       && (token->flags & (DIGRAPH | NAMED_OP)) == NAMED_OP)
1397     how = SPELL_IDENT;
1398
1399   switch (how)
1400     {
1401     case SPELL_OPERATOR:
1402       {
1403         const unsigned char *spelling = TOKEN_NAME (token);
1404         if (token->flags & DIGRAPH)
1405           spelling = digraph_spellings[(int) token->type
1406                                        - (int) CPP_FIRST_DIGRAPH];
1407         /* The first character is written unconditionally.  */
1408         do
1409           putc (*spelling, fp);
1410         while (*++spelling != '\0');
1411       }
1412       break;
1413
1414     case SPELL_IDENT:
1415       {
1416         const unsigned char *name = NODE_NAME (token->val.node);
1417         size_t pos = 0;
1418
1419         while (pos < NODE_LEN (token->val.node))
1420           if (name[pos] & ~0x7F)
1421             {
1422               unsigned char ucn[10];
1423               pos += utf8_to_ucn (ucn, name + pos);
1424               fwrite (ucn, 1, 10, fp);
1425             }
1426           else
1427             fputc (name[pos++], fp);
1428       }
1429       break;
1430
1431     case SPELL_LITERAL:
1432       fwrite (token->val.str.text, 1, token->val.str.len, fp);
1433       break;
1434
1435     case SPELL_NONE:
1436       /* An error, most probably.  */
1437       break;
1438     }
1439 }
1440
1441 /* Return nonzero if tokens A and B have the same type and flags and,
1442    as applicable, the same node, string or macro argument number.  */
1443 int
1444 _cpp_equiv_tokens (const cpp_token *a, const cpp_token *b)
1445 {
1446   enum spell_type spell;
1447
1448   if (a->type != b->type || a->flags != b->flags)
1449     return 0;
1450
1451   spell = TOKEN_SPELL (a);
1452   if (spell == SPELL_IDENT)
1453     return a->val.node == b->val.node;
1454   if (spell == SPELL_LITERAL)
1455     return (a->val.str.len == b->val.str.len
1456             && memcmp (a->val.str.text, b->val.str.text,
1457                        a->val.str.len) == 0);
1458   if (spell == SPELL_NONE)
1459     return (a->type != CPP_MACRO_ARG || a->val.arg_no == b->val.arg_no);
1460
1461   return 1;  /* Operators carry nothing further to compare.  */
1462 }
1463
1464 /* Returns nonzero if a space should be inserted between TOKEN1 and
1465    TOKEN2 to avoid an accidental token paste for output.  For
1466    simplicity, it is conservative, and occasionally advises a space
1467    where one is not needed, e.g. "." and ".2".  */
1468 int
1469 cpp_avoid_paste (cpp_reader *pfile, const cpp_token *token1,
1470                  const cpp_token *token2)
1471 {
1472   /* Named operators are compared as if they were identifiers.  */
1473   enum cpp_ttype lhs = (token1->flags & NAMED_OP) ? CPP_NAME : token1->type;
1474   enum cpp_ttype rhs = (token2->flags & NAMED_OP) ? CPP_NAME : token2->type;
1475
1476   /* First character of TOKEN2's spelling when it is a digraph or an
1477      operator; EOF otherwise.  */
1478   cppchar_t ch = EOF;
1479
1480   if (token2->flags & DIGRAPH)
1481     ch = digraph_spellings[(int) rhs - (int) CPP_FIRST_DIGRAPH][0];
1482   else if (token_spellings[rhs].category == SPELL_OPERATOR)
1483     ch = token_spellings[rhs].name[0];
1484
1485   /* Quickly get everything that can paste with an '='.  */
1486   if (ch == '=' && (int) lhs <= (int) CPP_LAST_EQ)
1487     return 1;
1488
1489   /* Everything else depends on the particular left-hand token.  */
1490   switch (lhs)
1491     {
1492     case CPP_GREATER:   return ch == '>';
1493     case CPP_LESS:      return ch == '<' || ch == '%' || ch == ':';
1494     case CPP_PLUS:      return ch == '+';
1495     case CPP_MINUS:     return ch == '-' || ch == '>';
1496     case CPP_DIV:       return ch == '/' || ch == '*'; /* Comments.  */
1497     case CPP_MOD:       return ch == ':' || ch == '>';
1498     case CPP_AND:       return ch == '&';
1499     case CPP_OR:        return ch == '|';
1500     case CPP_COLON:     return ch == ':' || ch == '>';
1501     case CPP_DEREF:     return ch == '*';
1502     case CPP_DOT:       return ch == '.' || ch == '%' || rhs == CPP_NUMBER;
1503     case CPP_HASH:      return ch == '#' || ch == '%'; /* Digraph form.  */
1504     case CPP_NAME:      return ((rhs == CPP_NUMBER
1505                                  && name_p (pfile, &token2->val.str))
1506                                 || rhs == CPP_NAME
1507                                 || rhs == CPP_CHAR || rhs == CPP_STRING); /* L */
1508     case CPP_NUMBER:    return (rhs == CPP_NUMBER || rhs == CPP_NAME
1509                                 || ch == '.' || ch == '+' || ch == '-');
1510                                       /* UCNs */
1511     case CPP_OTHER:     return ((token1->val.str.text[0] == '\\'
1512                                  && rhs == CPP_NAME)
1513                                 || (CPP_OPTION (pfile, objc)
1514                                     && token1->val.str.text[0] == '@'
1515                                     && (rhs == CPP_NAME || rhs == CPP_STRING)));
1516     default:            break;
1517     }
1518
1519   return 0;
1520 }
1521
1522 /* Write every remaining token on the current line to FP, followed
1523    by a newline.  Whitespace before the first token is dropped.  If
1524    there are macros, special token padding is not performed.  */
1525 void
1526 cpp_output_line (cpp_reader *pfile, FILE *fp)
1527 {
1528   const cpp_token *tok;
1529
1530   for (tok = cpp_get_token (pfile); tok->type != CPP_EOF; )
1531     {
1532       cpp_output_token (tok, fp);
1533       /* Whether a space follows is decided by the next token.  */
1534       tok = cpp_get_token (pfile);
1535       if (tok->flags & PREV_WHITE)
1536         putc (' ', fp);
1537     }
1538
1539   putc ('\n', fp);
1540 }
1541
1542 /* Spell all the remaining tokens on the current line into a freshly
1543    xmalloc'd, NUL-terminated string, which the caller must free.  If
1544    DIR_NAME is non-null the string begins with "#DIR_NAME ".  */
1545 unsigned char *
1546 cpp_output_line_to_string (cpp_reader *pfile, const unsigned char *dir_name)
1547 {
1548   const cpp_token *tok;
1549   unsigned int used = 0;
1550   unsigned int capacity;
1551   unsigned char *text;
1552
1553   if (dir_name)
1554     used = ustrlen (dir_name);
1555   capacity = 120 + used;
1556   text = (unsigned char *) xmalloc (capacity);
1557
1558   /* Without a DIR_NAME there are no initial contents.  */
1559   if (dir_name)
1560     {
1561       sprintf ((char *) text, "#%s ", dir_name);
1562       used += 2;  /* The '#' and the space.  */
1563     }
1564
1565   for (tok = cpp_get_token (pfile); tok->type != CPP_EOF; )
1566     {
1567       /* Bound on the spelling, plus a possible space and the nul.  */
1568       unsigned int need = used + cpp_token_len (tok) + 2;
1569
1570       if (need > capacity)
1571         {
1572           capacity = (capacity * 2 < need) ? need : capacity * 2;
1573           text = (unsigned char *) xrealloc (text, capacity);
1574         }
1575
1576       used = cpp_spell_token (pfile, tok, text + used, 0) - text;
1577
1578       tok = cpp_get_token (pfile);
1579       if (tok->flags & PREV_WHITE)
1580         text[used++] = ' ';
1581     }
1582
1583   text[used] = '\0';
1584   return text;
1585 }
1586
1587 /* Memory buffers.  MIN_BUFF_SIZE is the smallest size new_buff will
1588    use for a buffer.  Changing these three constants can have a
1589    dramatic effect on performance.  The values here are reasonable
1590    defaults, but might be tuned.  If you adjust them, be sure to test
1591    across a range of uses of cpplib, including heavy nested function-like
1592    macro expansion.  Also check peak memory usage (NJAMD is good here).  */
1593 #define MIN_BUFF_SIZE 8000
1594 #define BUFF_SIZE_UPPER_BOUND(MIN_SIZE) (MIN_BUFF_SIZE + (MIN_SIZE) * 3 / 2)
1595 #define EXTENDED_BUFF_SIZE(BUFF, MIN_EXTRA) \
1596         ((MIN_EXTRA) + ((BUFF)->limit - (BUFF)->cur) * 2)
1597
1598 #if MIN_BUFF_SIZE > BUFF_SIZE_UPPER_BOUND (0)
1599   #error BUFF_SIZE_UPPER_BOUND must be at least as large as MIN_BUFF_SIZE!
1600 #endif
1601
1602 /* Create a new allocation buffer.  LEN is raised to MIN_BUFF_SIZE if
1603    smaller and then passed through CPP_ALIGN.  The control block goes
1604    at the end, so that buffer overflows will cause immediate chaos.  */
1605 static _cpp_buff *
1606 new_buff (size_t len)
1607 {
1608   size_t data_size = len < MIN_BUFF_SIZE ? MIN_BUFF_SIZE : len;
1609   unsigned char *storage;
1610   _cpp_buff *ctl;
1611
1612   data_size = CPP_ALIGN (data_size);
1613   storage = XNEWVEC (unsigned char, data_size + sizeof (_cpp_buff));
1614
1615   /* The control block lives just past the (initially empty) data.  */
1616   ctl = (_cpp_buff *) (storage + data_size);
1617   ctl->base = ctl->cur = storage;
1618   ctl->limit = storage + data_size;
1619   ctl->next = NULL;
1620   return ctl;
1621 }
1622
1623 /* Push the buffer chain headed by BUFF onto PFILE's free list.  */
1624 void
1625 _cpp_release_buff (cpp_reader *pfile, _cpp_buff *buff)
1626 {
1627   _cpp_buff *tail;
1628
1629   for (tail = buff; tail->next != NULL; tail = tail->next)
1630     continue;
1631   tail->next = pfile->free_buffs;  /* Splice the old list on behind.  */
1632   pfile->free_buffs = buff;
1633 }
1634
1635 /* Return a buffer of size at least MIN_SIZE with nothing in it yet:
1636    one unlinked from PFILE's free list if any suits, else a new one.  */
1637 _cpp_buff *
1638 _cpp_get_buff (cpp_reader *pfile, size_t min_size)
1639 {
1640   _cpp_buff **link = &pfile->free_buffs;
1641   _cpp_buff *candidate;
1642
1643   while ((candidate = *link) != NULL)
1644     {
1645       size_t capacity = candidate->limit - candidate->base;
1646
1647       /* Take a buffer that's big enough, but don't waste one that's
1648          way too big.  */
1649       if (capacity >= min_size && capacity <= BUFF_SIZE_UPPER_BOUND (min_size))
1650         {
1651           *link = candidate->next;
1652           candidate->next = NULL;
1653           candidate->cur = candidate->base;
1654           return candidate;
1655         }
1656       link = &candidate->next;
1657     }
1658   return new_buff (min_size);
1659 }
1660
1661 /* Get a buffer sized by EXTENDED_BUFF_SIZE for BUFF and MIN_EXTRA,
1662    copy the BUFF_ROOM (BUFF) bytes that start at BUFF's current
1663    position to its base, chain it after BUFF, and return it.  */
1664 _cpp_buff *
1665 _cpp_append_extend_buff (cpp_reader *pfile, _cpp_buff *buff, size_t min_extra)
1666 {
1667   _cpp_buff *extension
1668     = _cpp_get_buff (pfile, EXTENDED_BUFF_SIZE (buff, min_extra));
1669
1670   /* Carry the uncommitted bytes over, then link the buffer in.  */
1671   memcpy (extension->base, buff->cur, BUFF_ROOM (buff));
1672   buff->next = extension;
1673   return extension;
1674 }
1675
1676 /* Like _cpp_append_extend_buff, except that the new buffer is
1677    chained in front of the old one: get a buffer sized by
1678    EXTENDED_BUFF_SIZE for *PBUFF and MIN_EXTRA, copy the old buffer's
1679    BUFF_ROOM bytes from its current position into the new buffer's
1680    base, link the old buffer after it, and store it in *PBUFF.  */
1681 void
1682 _cpp_extend_buff (cpp_reader *pfile, _cpp_buff **pbuff, size_t min_extra)
1683 {
1684   _cpp_buff *const previous = *pbuff;
1685   _cpp_buff *replacement
1686     = _cpp_get_buff (pfile, EXTENDED_BUFF_SIZE (previous, min_extra));
1687
1688   replacement->next = previous;
1689   memcpy (replacement->base, previous->cur, BUFF_ROOM (previous));
1690   *pbuff = replacement;
1691 }
1692
1693 /* Free every buffer in the chain that starts at BUFF.  */
1694 void
1695 _cpp_free_buff (_cpp_buff *buff)
1696 {
1697   while (buff != NULL)
1698     {
1699       /* Fetch the link before releasing the storage.  */
1700       _cpp_buff *following = buff->next;
1701       free (buff->base);
1702       buff = following;
1703     }
1704 }
1705
1706 /* Allocate permanent, unaligned storage of length LEN from u_buff.  */
1707 unsigned char *
1708 _cpp_unaligned_alloc (cpp_reader *pfile, size_t len)
1709 {
1710   _cpp_buff *pool = pfile->u_buff;
1711   unsigned char *chunk;
1712
1713   if ((size_t) (pool->limit - pool->cur) < len)
1714     {
1715       /* No room left: make a new buffer the head of the chain.  */
1716       pool = _cpp_get_buff (pfile, len);
1717       pool->next = pfile->u_buff;
1718       pfile->u_buff = pool;
1719     }
1720   chunk = pool->cur;
1721   pool->cur += len;
1722   return chunk;
1723 }
1724
1725 /* Allocate permanent storage of length LEN from a_buff; LEN is used
1726    as given, with no rounding applied here.  That buffer is used for
1727    growing allocations when saving macro replacement lists in a
1728    #define, and when parsing an answer to an assertion in #assert,
1729    #unassert or #if (and therefore possibly whilst expanding macros).
1730    It therefore must not be used by any code that they might call:
1731    specifically the lexer and the guts of the macro expander.
1732
1733    All existing other uses clearly fit this restriction: storing
1734    registered pragmas during initialization.  */
1735 unsigned char *
1736 _cpp_aligned_alloc (cpp_reader *pfile, size_t len)
1737 {
1738   _cpp_buff *pool = pfile->a_buff;
1739   unsigned char *chunk;
1740
1741   if ((size_t) (pool->limit - pool->cur) < len)
1742     {
1743       /* No room left: make a new buffer the head of the chain.  */
1744       pool = _cpp_get_buff (pfile, len);
1745       pool->next = pfile->a_buff;
1746       pfile->a_buff = pool;
1747     }
1748   chunk = pool->cur;
1749   pool->cur += len;
1750   return chunk;
1751 }
1752
1753 /* Say which field of TOK's value is in use, judging by the token's
1754    spelling category and, for unspellable tokens, its type.  */
1755 enum cpp_token_fld_kind
1756 cpp_token_val_index (cpp_token *tok)
1757 {
1758   enum spell_type spell = TOKEN_SPELL (tok);
1759
1760   if (spell == SPELL_IDENT)
1761     return CPP_TOKEN_FLD_NODE;
1762   if (spell == SPELL_LITERAL)
1763     return CPP_TOKEN_FLD_STR;
1764
1765   if (spell == SPELL_NONE)
1766     switch (tok->type)
1767       {
1768       case CPP_MACRO_ARG: return CPP_TOKEN_FLD_ARG_NO;
1769       case CPP_PADDING:   return CPP_TOKEN_FLD_SOURCE;
1770       case CPP_PRAGMA:    return CPP_TOKEN_FLD_PRAGMA;
1771       default:            break;
1772       }
1773
1774   return CPP_TOKEN_FLD_NONE;
1775 }