libcpp/lex.c

   1 /* CPP Library - lexical analysis.
   2    Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005, 2007, 2008, 2009
   3    Free Software Foundation, Inc.
   4    Contributed by Per Bothner, 1994-95.
   5    Based on CCCP program by Paul Rubin, June 1986
   6    Adapted to ANSI C, Richard Stallman, Jan 1987
   7    Broken out to separate file, Zack Weinberg, Mar 2000
   8
   9 This program is free software; you can redistribute it and/or modify it
  10 under the terms of the GNU General Public License as published by the
  11 Free Software Foundation; either version 3, or (at your option) any
  12 later version.
  13
  14 This program is distributed in the hope that it will be useful,
  15 but WITHOUT ANY WARRANTY; without even the implied warranty of
  16 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  17 GNU General Public License for more details.
  18
  19 You should have received a copy of the GNU General Public License
  20 along with this program; see the file COPYING3.  If not see
  21 <http://www.gnu.org/licenses/>.  */
  22
  23 #include "config.h"
  24 #include "system.h"
  25 #include "cpplib.h"
  26 #include "internal.h"
  27 /* Categories of token spelling, recorded per token type in token_spellings.  */
  28 enum spell_type
  29 {
  30   SPELL_OPERATOR = 0,
  31   SPELL_IDENT,
  32   SPELL_LITERAL,
  33   SPELL_NONE
  34 };
  35 /* One token_spellings entry: a token type's spelling category and name.  */
  36 struct token_spelling
  37 {
  38   enum spell_type category;
  39   const unsigned char *name;
  40 };
  41 /* Digraph spellings.  NOTE(review): how this is indexed is not visible here.  */
  42 static const unsigned char *const digraph_spellings[] =
  43 { UC"%:", UC"%:%:", UC"<:", UC":>", UC"<%", UC"%>" };
  44 /* OP and TK shape the entries that TTYPE_TABLE (defined elsewhere) expands to.  */
  45 #define OP(e, s) { SPELL_OPERATOR, UC s  },
  46 #define TK(e, s) { SPELL_ ## s,    UC #e },
  47 static const struct token_spelling token_spellings[N_TTYPES] = { TTYPE_TABLE };
  48 #undef OP
  49 #undef TK
  50 /* Spelling category and name of TOKEN, looked up by its type.  */
  51 #define TOKEN_SPELL(token) (token_spellings[(token)->type].category)
  52 #define TOKEN_NAME(token) (token_spellings[(token)->type].name)
  53 /* Forward declarations of helpers that are static to this file.  */
  54 static void add_line_note (cpp_buffer *, const uchar *, unsigned int);
  55 static int skip_line_comment (cpp_reader *);
  56 static void skip_whitespace (cpp_reader *, cppchar_t);
  57 static void lex_string (cpp_reader *, cpp_token *, const uchar *);
  58 static void save_comment (cpp_reader *, cpp_token *, const uchar *, cppchar_t);
  59 static void store_comment (cpp_reader *, cpp_token *);
  60 static void create_literal (cpp_reader *, cpp_token *, const uchar *,
  61                             unsigned int, enum cpp_ttype);
  62 static bool warn_in_comment (cpp_reader *, _cpp_line_note *);
  63 static int name_p (cpp_reader *, const cpp_string *);
  64 static tokenrun *next_tokenrun (tokenrun *);
  65
  66 static _cpp_buff *new_buff (size_t);
  67
  68
  69 /* Utility routine:
  70
  71    Compares the token TOKEN to the NUL-terminated string STRING.
  72    Returns 1 if TOKEN is a CPP_NAME spelled STRING, 0 otherwise.  */
  73 int
  74 cpp_ideq (const cpp_token *token, const char *string)
  75 {
  76   /* A token that is not a name never compares equal.  */
  77   if (token->type == CPP_NAME)
  78     return ustrcmp (NODE_NAME (token->val.node), (const uchar *) string) == 0;
  79   return 0;
  80 }
  81
  82 /* Append a note of kind TYPE, located at POS, to BUFFER's note array.
  83    A full array is first grown to twice its capacity plus 200.  */
  84 static void
  85 add_line_note (cpp_buffer *buffer, const uchar *pos, unsigned int type)
  86 {
  87   _cpp_line_note *slot;
  88   if (buffer->notes_cap == buffer->notes_used)
  89     {
  90       buffer->notes_cap = 2 * buffer->notes_cap + 200;
  91       buffer->notes = XRESIZEVEC (_cpp_line_note, buffer->notes,
  92                                   buffer->notes_cap);
  93     }
  94   slot = &buffer->notes[buffer->notes_used++];
  95   slot->pos = pos;
  96   slot->type = type;
  97 }
  98
  99 /* Prepare the next logical line in place: splice escaped newlines, convert
 100    trigraphs if enabled, and note each for later.  Time-critical inner loop.  */
 101 void
 102 _cpp_clean_line (cpp_reader *pfile)
 103 {
 104   cpp_buffer *buffer;
 105   const uchar *s;
 106   uchar c, *d, *p;
 107
 108   buffer = pfile->buffer;
 109   buffer->cur_note = buffer->notes_used = 0;  /* Drop the previous line's notes.  */
 110   buffer->cur = buffer->line_base = buffer->next_line;
 111   buffer->need_line = false;
 112   s = buffer->next_line - 1;  /* The loops below advance with *++s.  */
 113
 114   if (!buffer->from_stage3)
 115     {
 116       const uchar *pbackslash = NULL;
 117
 118       /* Short circuit for the common case of an un-escaped line with
 119          no trigraphs.  The primary win here is by not writing any
 120          data back to memory until we have to.  */
 121       for (;;)
 122         {
 123           c = *++s;
 124           if (__builtin_expect (c == '\n', false)
 125               || __builtin_expect (c == '\r', false))
 126             {
 127               d = (uchar *) s;  /* Candidate spot for the terminating '\n'.  */
 128
 129               if (__builtin_expect (s == buffer->rlimit, false))
 130                 goto done;
 131
 132               /* DOS line ending? */
 133               if (__builtin_expect (c == '\r', false)
 134                   && s[1] == '\n')
 135                 {
 136                   s++;
 137                   if (s == buffer->rlimit)
 138                     goto done;
 139                 }
 140
 141               if (__builtin_expect (pbackslash == NULL, true))
 142                 goto done;
 143
 144               /* Check for escaped newline.  */
 145               p = d;
 146               while (is_nvspace (p[-1]))
 147                 p--;
 148               if (p - 1 != pbackslash)
 149                 goto done;
 150
 151               /* Have an escaped newline; process it and proceed to
 152                  the slow path.  */
 153               add_line_note (buffer, p - 1, p != d ? ' ' : '\\');
 154               d = p - 2;  /* The slow loop stores at ++d, i.e. over the backslash.  */
 155               buffer->next_line = p - 1;  /* Floor for the slow loop's backward scan.  */
 156               break;
 157             }
 158           if (__builtin_expect (c == '\\', false))
 159             pbackslash = s;
 160           else if (__builtin_expect (c == '?', false)
 161                    && __builtin_expect (s[1] == '?', false)
 162                    && _cpp_trigraph_map[s[2]])
 163             {
 164               /* Have a trigraph.  We may or may not have to convert
 165                  it.  Add a line note regardless, for -Wtrigraphs.  */
 166               add_line_note (buffer, s, s[2]);
 167               if (CPP_OPTION (pfile, trigraphs))
 168                 {
 169                   /* We do, and that means we have to switch to the
 170                      slow path.  */
 171                   d = (uchar *) s;
 172                   *d = _cpp_trigraph_map[s[2]];
 173                   s += 2;
 174                   break;
 175                 }
 176             }
 177         }
 178
 179       /* Slow path: copy each character down to D, splicing as we go.  */
 180       for (;;)
 181         {
 182           c = *++s;
 183           *++d = c;
 184
 185           if (c == '\n' || c == '\r')
 186             {
 187                   /* Handle DOS line endings.  */
 188               if (c == '\r' && s != buffer->rlimit && s[1] == '\n')
 189                 s++;
 190               if (s == buffer->rlimit)
 191                 break;
 192
 193               /* Escaped?  */
 194               p = d;
 195               while (p != buffer->next_line && is_nvspace (p[-1]))
 196                 p--;
 197               if (p == buffer->next_line || p[-1] != '\\')
 198                 break;
 199
 200               add_line_note (buffer, p - 1, p != d ? ' ': '\\');
 201               d = p - 2;
 202               buffer->next_line = p - 1;
 203             }
 204           else if (c == '?' && s[1] == '?' && _cpp_trigraph_map[s[2]])
 205             {
 206               /* Add a note regardless, for the benefit of -Wtrigraphs.  */
 207               add_line_note (buffer, d, s[2]);
 208               if (CPP_OPTION (pfile, trigraphs))
 209                 {
 210                   *d = _cpp_trigraph_map[s[2]];
 211                   s += 2;
 212                 }
 213             }
 214         }
 215     }
 216   else  /* from_stage3: only find the line end; no trigraphs or splices.  */
 217     {
 218       do
 219         s++;
 220       while (*s != '\n' && *s != '\r');
 221       d = (uchar *) s;
 222
 223       /* Handle DOS line endings.  */
 224       if (*s == '\r' && s != buffer->rlimit && s[1] == '\n')
 225         s++;
 226     }
 227
 228  done:
 229   *d = '\n';  /* The cleaned line always ends in a plain '\n'.  */
 230   /* A sentinel note that should never be processed.  */
 231   add_line_note (buffer, d + 1, '\n');
 232   buffer->next_line = s + 1;  /* Just past the last character consumed.  */
 233 }
 234
 235 /* Decide whether the trigraph recorded in NOTE deserves a warning even
 236    though it lies inside a comment.  */
 237 static bool
 238 warn_in_comment (cpp_reader *pfile, _cpp_line_note *note)
 239 {
 240   const _cpp_line_note *following = note + 1;
 241   const uchar *scan;
 242
 243   /* Only ??/ matters inside a comment: it can form an escaped
 244      newline, which may change behavior.  Other trigraphs are not
 245      warned about there.  */
 246   if (note->type == '/')
 247     {
 248       /* With -trigraphs, this was an escaped newline iff the
 249          following note is coincident.  */
 250       if (CPP_OPTION (pfile, trigraphs))
 251         return following->pos == note->pos;
 252
 253       /* Otherwise skip the trigraph and any non-vertical space, then look
 254          for a newline.  Escaped newlines may lie in between, hence the
 255          bound by the following note's position.  */
 256       for (scan = note->pos + 3; is_nvspace (*scan); scan++)
 257         continue;
 258       return *scan == '\n' && scan < following->pos;
 259     }
 260   return false;
 261 }
 262
 263 /* Process the notes created by add_line_note as far as the current
 264    location.  IN_COMMENT is nonzero when that location is in a comment.  */
 265 void
 266 _cpp_process_line_notes (cpp_reader *pfile, int in_comment)
 267 {
 268   cpp_buffer *buffer = pfile->buffer;
 269   _cpp_line_note *note;
 270   unsigned int col;
 271
 272   while ((note = &buffer->notes[buffer->cur_note])->pos <= buffer->cur)
 273     {
 274       buffer->cur_note++;
 275       col = CPP_BUF_COLUMN (buffer, note->pos + 1);
 276
 277       if (note->type != '\\' && note->type != ' ')
 278         {
 279           /* Not an escaped newline, so it has to be a trigraph.  */
 280           if (!_cpp_trigraph_map[note->type])
 281             abort ();
 282
 283           if (!CPP_OPTION (pfile, warn_trigraphs)
 284               || (in_comment && !warn_in_comment (pfile, note)))
 285             continue;
 286
 287           if (CPP_OPTION (pfile, trigraphs))
 288             cpp_error_with_line (pfile, CPP_DL_WARNING,
 289                                  pfile->line_table->highest_line, col,
 290                                  "trigraph ??%c converted to %c",
 291                                  note->type,
 292                                  (int) _cpp_trigraph_map[note->type]);
 293           else
 294             cpp_error_with_line (pfile, CPP_DL_WARNING,
 295                                  pfile->line_table->highest_line, col,
 296                                  "trigraph ??%c ignored, use -trigraphs to enable",
 297                                  note->type);
 298           continue;
 299         }
 300
 301       /* An escaped newline; ' ' marks one with space before the newline.  */
 302       if (note->type == ' ' && !in_comment)
 303         cpp_error_with_line (pfile, CPP_DL_WARNING,
 304                              pfile->line_table->highest_line, col,
 305                              "backslash and newline separated by space");
 306
 307       if (buffer->next_line > buffer->rlimit)
 308         {
 309           cpp_error_with_line (pfile, CPP_DL_PEDWARN,
 310                                pfile->line_table->highest_line, col,
 311                                "backslash-newline at end of file");
 312           /* Prevent "no newline at end of file" warning.  */
 313           buffer->next_line = buffer->rlimit;
 314         }
 315
 316       /* Rebase at the splice and count the new physical line.  */
 317       buffer->line_base = note->pos;
 318       CPP_INCREMENT_LINE (pfile, 0);
 319     }
 320 }
 321
 322 /* Skip a C-style block comment.  We find the end of the comment by
 323    seeing if an asterisk is before every '/' we encounter.  Returns
 324    nonzero if comment terminated by EOF, zero otherwise.
 325    On entry buffer->cur points to the initial asterisk of the comment;
 326    on a zero return it points just past the closing '/'.  */
 327 bool
 328 _cpp_skip_block_comment (cpp_reader *pfile)
 329 {
 330   cpp_buffer *buffer = pfile->buffer;
 331   const uchar *cur = buffer->cur;
 332   uchar c;
 333
 334   cur++;  /* Step over the opening '*'.  */
 335   if (*cur == '/')  /* A '/' right after the opening '*' cannot close the comment.  */
 336     cur++;
 337
 338   for (;;)
 339     {
 340       /* People like decorating comments with '*', so check for '/'
 341          instead for efficiency.  */
 342       c = *cur++;
 343
 344       if (c == '/')
 345         {
 346           if (cur[-2] == '*')  /* CUR is past the '/'; cur[-2] precedes it.  */
 347             break;
 348
 349           /* Warn about potential nested comments, but not if the '/'
 350              comes immediately before the true comment delimiter.
 351              Don't bother to get it right across escaped newlines.  */
 352           if (CPP_OPTION (pfile, warn_comments)
 353               && cur[0] == '*' && cur[1] != '/')
 354             {
 355               buffer->cur = cur;
 356               cpp_error_with_line (pfile, CPP_DL_WARNING,
 357                                    pfile->line_table->highest_line, CPP_BUF_COL (buffer),
 358                                    "\"/*\" within comment");
 359             }
 360         }
 361       else if (c == '\n')
 362         {
 363           unsigned int cols;
 364           buffer->cur = cur - 1;  /* Point at the newline before processing notes.  */
 365           _cpp_process_line_notes (pfile, true);
 366           if (buffer->next_line >= buffer->rlimit)
 367             return true;  /* No further line: the comment is unterminated.  */
 368           _cpp_clean_line (pfile);
 369
 370           cols = buffer->next_line - buffer->line_base;
 371           CPP_INCREMENT_LINE (pfile, cols);
 372
 373           cur = buffer->cur;  /* _cpp_clean_line moved buffer->cur to the new line.  */
 374         }
 375     }
 376
 377   buffer->cur = cur;
 378   _cpp_process_line_notes (pfile, true);
 379   return false;
 380 }
 381
 382 /* Advance buffer->cur over the rest of a C++ line comment, stopping on
 383    the newline that ends it.  The line notes passed over are processed
 384    as comment text.  Returns nonzero if that moved highest_line.  */
 385 static int
 386 skip_line_comment (cpp_reader *pfile)
 387 {
 388   const source_location start_line = pfile->line_table->highest_line;
 389   const uchar *scan = pfile->buffer->cur;
 390
 391   while (*scan != '\n')
 392     scan++;
 393   pfile->buffer->cur = scan;
 394   _cpp_process_line_notes (pfile, true);
 395   return pfile->line_table->highest_line != start_line;
 396 }
 397
 398 /* Skip non-vertical whitespace starting with C (already read); on return
 399    buffer->cur points at the first character that is not such whitespace.  */
 400 static void
 401 skip_whitespace (cpp_reader *pfile, cppchar_t c)
 402 {
 403   cpp_buffer *buffer = pfile->buffer;
 404   bool nul_seen = false;
 405
 406   /* We only want non-vertical space, i.e. ' ' \t \f \v \0.  */
 407   for (;;)
 408     {
 409       if (c == '\0')
 410         nul_seen = true;
 411       else if (c != ' ' && c != '\t'
 412                && pfile->state.in_directive && CPP_PEDANTIC (pfile))
 413         /* Just \f or \v left; horizontal space is always OK.  */
 414         cpp_error_with_line (pfile, CPP_DL_PEDWARN,
 415                              pfile->line_table->highest_line,
 416                              CPP_BUF_COL (buffer),
 417                              "%s in preprocessing directive",
 418                              c == '\f' ? "form feed" : "vertical tab");
 419
 420       c = *buffer->cur++;
 421       if (!is_nvspace (c))
 422         break;
 423     }
 424
 425   if (nul_seen)
 426     cpp_error (pfile, CPP_DL_WARNING, "null character(s) ignored");
 427   buffer->cur--;
 428 }
 429
 430 /* Return 1 if every character of STRING satisfies is_idchar, so the
 431    spelling could serve as a name (no '.', '+' or '-'); 0 otherwise.  */
 432 static int
 433 name_p (cpp_reader *pfile, const cpp_string *string)
 434 {
 435   const unsigned char *ch = string->text;
 436   const unsigned char *const end = ch + string->len;
 437
 438   while (ch < end && is_idchar (*ch))
 439     ch++;
 440
 441   return ch == end;
 442 }
 443
 444 /* After parsing an identifier or other sequence, produce a warning about
 445    sequences not in NFC/NFKC.  Nothing is reported while skipping.  */
 446 static void
 447 warn_about_normalization (cpp_reader *pfile,
 448                           const cpp_token *token,
 449                           const struct normalize_state *s)
 450 {
 451   if (CPP_OPTION (pfile, warn_normalize) < NORMALIZE_STATE_RESULT (s)
 452       && !pfile->state.skipping)
 453     {
 454       /* Make sure that the token is printed using UCNs, even
 455          if we'd otherwise happily print UTF-8.  */
 456       unsigned char *buf = XNEWVEC (unsigned char, cpp_token_len (token));
 457       size_t sz = cpp_spell_token (pfile, token, buf, false) - buf;
 458
 459       if (NORMALIZE_STATE_RESULT (s) == normalized_C)
 460         cpp_error_with_line (pfile, CPP_DL_WARNING, token->src_loc, 0,
 461                              "`%.*s' is not in NFKC", (int) sz, buf);
 462       else
 463         cpp_error_with_line (pfile, CPP_DL_WARNING, token->src_loc, 0,
 464                              "`%.*s' is not in NFC", (int) sz, buf);
 465       free (buf);  /* The spelling was only needed for the message.  */
 466     }
 467 }
 468
 469 /* Returns TRUE if the sequence starting at buffer->cur is valid in an
 470    identifier: an accepted '$', which is consumed, or a UCN accepted by
 471    _cpp_valid_ucn.  FIRST is TRUE if this starts an identifier.  */
 472 static bool
 473 forms_identifier_p (cpp_reader *pfile, int first,
 474                     struct normalize_state *state)
 475 {
 476   cpp_buffer *buffer = pfile->buffer;
 477   const uchar lead = *buffer->cur;
 478
 479   if (lead == '$' && CPP_OPTION (pfile, dollars_in_ident))
 480     {
 481       buffer->cur++;
 482       /* Complain about '$' at most once: the option is then cleared.  */
 483       if (CPP_OPTION (pfile, warn_dollars) && !pfile->state.skipping)
 484         {
 485           CPP_OPTION (pfile, warn_dollars) = 0;
 486           cpp_error (pfile, CPP_DL_PEDWARN, "'$' in identifier or number");
 487         }
 488
 489       return true;
 490     }
 491
 492   /* Is this a syntactically valid UCN?  */
 493   if (lead == '\\'
 494       && CPP_OPTION (pfile, extended_identifiers)
 495       && (buffer->cur[1] == 'u' || buffer->cur[1] == 'U'))
 496     {
 497       buffer->cur += 2;
 498       if (_cpp_valid_ucn (pfile, &buffer->cur, buffer->rlimit, 1 + !first,
 499                           state))
 500         return true;
 501       /* Not accepted: step back over the two characters skipped above.  */
 502       buffer->cur -= 2;
 503     }
 504   return false;
 505 }
 506
 507 /* Lex the identifier starting at BASE and return its node; STARTS_UCN skips the fast scan.  */
 508 static cpp_hashnode *
 509 lex_identifier (cpp_reader *pfile, const uchar *base, bool starts_ucn,
 510                 struct normalize_state *nst)
 511 {
 512   cpp_hashnode *result;
 513   const uchar *cur;
 514   unsigned int len;
 515   unsigned int hash = HT_HASHSTEP (0, *base);  /* Seeded with the first character.  */
 516
 517   cur = pfile->buffer->cur;
 518   if (! starts_ucn)  /* Fast scan: hash plain identifier characters as we go.  */
 519     while (ISIDNUM (*cur))
 520       {
 521         hash = HT_HASHSTEP (hash, *cur);
 522         cur++;
 523       }
 524   pfile->buffer->cur = cur;  /* forms_identifier_p examines buffer->cur.  */
 525   if (starts_ucn || forms_identifier_p (pfile, false, nst))
 526     {
 527       /* Slower version for identifiers containing UCNs (or $).  */
 528       do {
 529         while (ISIDNUM (*pfile->buffer->cur))
 530           {
 531             pfile->buffer->cur++;
 532             NORMALIZE_STATE_UPDATE_IDNUM (nst);
 533           }
 534       } while (forms_identifier_p (pfile, false, nst));
 535       result = _cpp_interpret_identifier (pfile, base,
 536                                           pfile->buffer->cur - base);
 537     }
 538   else
 539     {
 540       len = cur - base;  /* Plain identifier: the incremental hash is usable.  */
 541       hash = HT_HASHFINISH (hash, len);
 542
 543       result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
 544                                                   base, len, hash, HT_ALLOC));
 545     }
 546
 547   /* Rarely, identifiers require diagnostics when lexed.  */
 548   if (__builtin_expect ((result->flags & NODE_DIAGNOSTIC)
 549                         && !pfile->state.skipping, 0))
 550     {
 551       /* It is allowed to poison the same identifier twice.  */
 552       if ((result->flags & NODE_POISONED) && !pfile->state.poisoned_ok)
 553         cpp_error (pfile, CPP_DL_ERROR, "attempt to use poisoned \"%s\"",
 554                    NODE_NAME (result));
 555
 556       /* Constraint 6.10.3.5: __VA_ARGS__ should only appear in the
 557          replacement list of a variadic macro.  */
 558       if (result == pfile->spec_nodes.n__VA_ARGS__
 559           && !pfile->state.va_args_ok)
 560         cpp_error (pfile, CPP_DL_PEDWARN,
 561                    "__VA_ARGS__ can only appear in the expansion"
 562                    " of a C99 variadic macro");
 563     }
 564
 565   return result;
 566 }
 567
 568 /* Lex a preprocessing number into NUMBER.  Its first character has
 569    already been consumed, so the spelling starts at BUFFER->CUR - 1.  */
 570 static void
 571 lex_number (cpp_reader *pfile, cpp_string *number,
 572             struct normalize_state *nst)
 573 {
 574   cpp_buffer *const buffer = pfile->buffer;
 575   const uchar *const start = buffer->cur - 1;
 576   const uchar *end;
 577   uchar *copy;
 578
 579   do
 580     {
 581       /* N.B. ISIDNUM does not include $; that and UCNs are left to
 582          forms_identifier_p below.  */
 583       for (end = buffer->cur;
 584            ISIDNUM (*end) || *end == '.' || VALID_SIGN (*end, end[-1]);
 585            end++)
 586         NORMALIZE_STATE_UPDATE_IDNUM (nst);
 587
 588       buffer->cur = end;
 589     }
 590   while (forms_identifier_p (pfile, false, nst));
 591
 592   /* Keep a NUL-terminated copy of the spelling.  */
 593   number->len = end - start;
 594   copy = _cpp_unaligned_alloc (pfile, number->len + 1);
 595   memcpy (copy, start, number->len);
 596   copy[number->len] = '\0';
 597   number->text = copy;
 598 }
 599
 600 /* Make TOKEN a TYPE token spelled by a NUL-terminated copy of BASE[0..LEN).  */
 601 static void
 602 create_literal (cpp_reader *pfile, cpp_token *token, const uchar *base,
 603                 unsigned int len, enum cpp_ttype type)
 604 {
 605   uchar *spelling;
 606
 607   token->type = type;
 608   token->val.str.len = len;
 609   token->val.str.text = spelling = _cpp_unaligned_alloc (pfile, len + 1);
 610   memcpy (spelling, base, len);
 611   spelling[len] = '\0';
 612 }
 613
 614 /* Lexes a string, character constant, or angle-bracketed header file
 615    name starting at BASE.  The stored string contains the spelling,
 616    including opening quote and any leading 'L', 'u' or 'U'.  TOKEN
 617    receives the type of the literal, or CPP_OTHER if it was not properly
 618    terminated, or CPP_LESS (with nothing else stored and the buffer
 619    position untouched) for an unterminated header name, which must be
 620    relexed as normal tokens.
 621
 622    The spelling is NUL-terminated, but embedded NULs are preserved.  */
 623 static void
 624 lex_string (cpp_reader *pfile, cpp_token *token, const uchar *base)
 625 {
 626   const uchar *cur = base;
 627   cppchar_t terminator = *cur++;
 628   enum cpp_ttype type;
 629   bool saw_NUL = false;
 630
 631   if (terminator == 'L' || terminator == 'u' || terminator == 'U')
 632     terminator = *cur++;
 633
 634   if (terminator == '\"' || terminator == '\'')
 635     {
 636       const bool str = terminator == '\"';
 637
 638       switch (*base)
 639         {
 640         case 'L': type = str ? CPP_WSTRING : CPP_WCHAR; break;
 641         case 'U': type = str ? CPP_STRING32 : CPP_CHAR32; break;
 642         case 'u': type = str ? CPP_STRING16 : CPP_CHAR16; break;
 643         default: type = str ? CPP_STRING : CPP_CHAR; break;
 644         }
 645     }
 646   else
 647     terminator = '>', type = CPP_HEADER_NAME;
 648
 649   for (;;)
 650     {
 651       cppchar_t c = *cur++;
 652
 653       /* In #include-style directives, terminators are not escapable.  */
 654       if (c == '\\' && !pfile->state.angled_headers && *cur != '\n')
 655         cur++;
 656       else if (c == terminator)
 657         break;
 658       else if (c == '\0')
 659         saw_NUL = true;
 660       else if (c == '\n')
 661         {
 662           /* Leave the newline unconsumed.  An unterminated header name
 663              may actually be a legitimate sequence of tokens.  */
 664           cur--;
 665           if (terminator == '>')
 666             {
 667               token->type = CPP_LESS;
 668               return;
 669             }
 670           type = CPP_OTHER;
 671           break;
 672         }
 673     }
 674
 675   if (saw_NUL && !pfile->state.skipping)
 676     cpp_error (pfile, CPP_DL_WARNING,
 677                "null character(s) preserved in literal");
 678   if (type == CPP_OTHER && CPP_OPTION (pfile, lang) != CLK_ASM)
 679     cpp_error (pfile, CPP_DL_PEDWARN, "missing terminating %c character",
 680                (int) terminator);
 681   pfile->buffer->cur = cur;
 682   create_literal (pfile, token, base, cur - base, type);
 683 }
 684
 685 /* Give clients PFILE's comment table; its ordering is unspecified.  */
 686 cpp_comment_table *
 687 cpp_get_comments (cpp_reader *pfile)
 688 {
 689   cpp_comment_table *table = &pfile->comments;
 690   return table;
 691 }
 692
 693 /* Append a private, NUL-terminated copy of comment TOKEN's text,
 694    together with its source location, to PFILE's comment table.  */
 695 static void
 696 store_comment (cpp_reader *pfile, cpp_token *token)
 697 {
 698   cpp_comment_table *table = &pfile->comments;
 699   cpp_comment *entry;
 700   int len = token->val.str.len;
 701
 702   /* First use: start with room for 256 comments.  */
 703   if (table->allocated == 0)
 704     {
 705       table->allocated = 256;
 706       table->entries = XNEWVEC (cpp_comment, table->allocated);
 707     }
 708
 709   /* Table full: double it.  */
 710   if (table->count == table->allocated)
 711     {
 712       table->allocated *= 2;
 713       table->entries = XRESIZEVEC (cpp_comment, table->entries,
 714                                    table->allocated);
 715     }
 716
 717   entry = &table->entries[table->count];
 718
 719   /* The token's text is copied by length; it may not be
 720      NUL-terminated.  */
 721   entry->comment = XNEWVEC (char, len + 1);
 722   memcpy (entry->comment, token->val.str.text, len);
 723   entry->comment[len] = '\0';
 724   entry->sloc = token->src_loc;
 725
 726   /* One more entry is now in use.  */
 727   table->count++;
 728 }
 729
 730 /* Store in TOKEN, and in the comment table, the comment whose text after
 731    its initial '/' starts at FROM and runs to the current buffer position.
 732    TYPE is '/' for a C++ comment.  The stored comment includes the
 733    comment start and any terminator.  */
 734 static void
 735 save_comment (cpp_reader *pfile, cpp_token *token, const unsigned char *from,
 736               cppchar_t type)
 737 {
 738   /* A C++ comment met in a directive is stored as a C comment.  The
 739      only time we encounter a directive here is when we are saving
 740      comments in a "#define".  */
 741   const bool as_c_comment = pfile->state.in_directive && type == '/';
 742   unsigned int len, clen;
 743   unsigned char *text;
 744
 745   /* + 1 for the initial '/', which lies just before FROM.  */
 746   len = pfile->buffer->cur - from + 1;
 747
 748   /* C++ comments probably (not definitely) have moved past a new
 749      line, which we don't want to save in the comment.  */
 750   if (is_vspace (pfile->buffer->cur[-1]))
 751     len--;
 752
 753   /* The conversion appends a two-character terminator.  */
 754   clen = as_c_comment ? len + 2 : len;
 755   text = _cpp_unaligned_alloc (pfile, clen);
 756
 757   text[0] = '/';
 758   memcpy (text + 1, from, len - 1);
 759   if (as_c_comment)
 760     {
 761       text[1] = '*';
 762       text[clen - 2] = '*';
 763       text[clen - 1] = '/';
 764     }
 765
 766   token->type = CPP_COMMENT;
 767   token->val.str.len = clen;
 768   token->val.str.text = text;
 769
 770   /* Finally store this comment for use by clients of libcpp. */
 771   store_comment (pfile, token);
 772 }
 773
 774 /* Give RUN fresh storage for COUNT tokens and no successor run.  */
 775 void
 776 _cpp_init_tokenrun (tokenrun *run, unsigned int count)
 777 {
 778   run->next = NULL;
 779   run->base = XNEWVEC (cpp_token, count);
 780   run->limit = &run->base[count];
 781 }
 782
 783 /* Return the run after RUN, first creating a 250-token one if needed.  */
 784 static tokenrun *
 785 next_tokenrun (tokenrun *run)
 786 {
 787   tokenrun *succ = run->next;
 788   if (succ == NULL)
 789     {
 790       succ = run->next = XNEW (tokenrun);
 791       succ->prev = run;
 792       _cpp_init_tokenrun (succ, 250);
 793     }
 794   return succ;
 795 }
 796
 797 /* Look ahead in the input stream, INDEX tokens past the next one; consumes nothing.  */
 798 const cpp_token *
 799 cpp_peek_token (cpp_reader *pfile, int index)
 800 {
 801   cpp_context *context = pfile->context;
 802   const cpp_token *peektok;
 803   int count;
 804
 805   /* First, scan through any pending cpp_context objects.  */
 806   while (context->prev)
 807     {
 808       ptrdiff_t sz = (context->direct_p
 809                       ? LAST (context).token - FIRST (context).token
 810                       : LAST (context).ptoken - FIRST (context).ptoken);
 811
 812       if (index < (int) sz)
 813         return (context->direct_p
 814                 ? FIRST (context).token + index
 815                 : *(FIRST (context).ptoken + index));
 816       index -= (int) sz;
 817       context = context->prev;
 818     }
 819
 820   /* We will have to read some new tokens after all (and do so
 821      without invalidating preceding tokens).  */
 822   count = index;
 823   pfile->keep_tokens++;
 824   do
 825     {
 826       peektok = _cpp_lex_token (pfile);
 827       if (peektok->type == CPP_EOF)
 828         {
 829           index--;  /* Keep COUNT - INDEX equal to the number of tokens lexed.  */
 830           break;
 831         }
 832     }
 833   while (index--);
 834   /* Push back everything lexed, the EOF included, and undo keep_tokens.  */
 835   _cpp_backup_tokens_direct (pfile, count - index);
 836   pfile->keep_tokens--;
 837   return peektok;
 838 }
 839
 840 /* Allocate a single token that is invalidated at the same time as the
 841    rest of the tokens on the line.  Its src_loc is copied from the
 842    last lexed token, so that diagnostics appear in the right place.
 843    Lookahead tokens already stored are shifted up one slot to make room.  */
 844 cpp_token *
 845 _cpp_temp_token (cpp_reader *pfile)
 846 {
 847   cpp_token *old, *result;
 848   ptrdiff_t sz = pfile->cur_run->limit - pfile->cur_token;  /* Slots left in this run.  */
 849   ptrdiff_t la = (ptrdiff_t) pfile->lookaheads;
 850
 851   old = pfile->cur_token - 1;  /* Supplies the source location below.  */
 852   /* Any pre-existing lookaheads must not be clobbered.  */
 853   if (la)
 854     {
 855       if (sz <= la)  /* Shifting by one slot spills into the next run.  */
 856         {
 857           tokenrun *next = next_tokenrun (pfile->cur_run);
 858
 859           if (sz < la)
 860             memmove (next->base + 1, next->base,
 861                      (la - sz) * sizeof (cpp_token));
 862
 863           next->base[0] = pfile->cur_run->limit[-1];  /* Carry this run's last slot over.  */
 864         }
 865
 866       if (sz > 1)
 867         memmove (pfile->cur_token + 1, pfile->cur_token,
 868                  MIN (la, sz - 1) * sizeof (cpp_token));
 869     }
 870
 871   if (!sz && pfile->cur_token == pfile->cur_run->limit)  /* No room left in this run.  */
 872     {
 873       pfile->cur_run = next_tokenrun (pfile->cur_run);
 874       pfile->cur_token = pfile->cur_run->base;
 875     }
 876
 877   result = pfile->cur_token++;
 878   result->src_loc = old->src_loc;
 879   return result;
 880 }
 881
 882 /* Return the next token: a pending lookahead if there is one, else a
 883    freshly lexed token (external interface).  Takes care of directive
 884    handling, multiple include optimization and skipping.  */
 885 const cpp_token *
 886 _cpp_lex_token (cpp_reader *pfile)
 887 {
 888   cpp_token *result;
 889
 890   for (;;)
 891     {
 892       if (pfile->cur_token == pfile->cur_run->limit)  /* Run exhausted: move to the next.  */
 893         {
 894           pfile->cur_run = next_tokenrun (pfile->cur_run);
 895           pfile->cur_token = pfile->cur_run->base;
 896         }
 897       /* We assume that the current token is somewhere in the current
 898          run.  */
 899       if (pfile->cur_token < pfile->cur_run->base
 900           || pfile->cur_token >= pfile->cur_run->limit)
 901         abort ();
 902
 903       if (pfile->lookaheads)
 904         {
 905           pfile->lookaheads--;  /* Re-deliver a token that was lexed earlier.  */
 906           result = pfile->cur_token++;
 907         }
 908       else
 909         result = _cpp_lex_direct (pfile);
 910
 911       if (result->flags & BOL)
 912         {
 913           /* Is this a directive.  If _cpp_handle_directive returns
 914              false, it is an assembler #.  */
 915           if (result->type == CPP_HASH
 916               /* 6.10.3 p 11: Directives in a list of macro arguments
 917                  gives undefined behavior.  This implementation
 918                  handles the directive as normal.  */
 919               && pfile->state.parsing_args != 1)
 920             {
 921               if (_cpp_handle_directive (pfile, result->flags & PREV_WHITE))
 922                 {
 923                   if (pfile->directive_result.type == CPP_PADDING)
 924                     continue;  /* The directive left nothing to return; lex again.  */
 925                   result = &pfile->directive_result;
 926                 }
 927             }
 928           else if (pfile->state.in_deferred_pragma)
 929             result = &pfile->directive_result;
 930
 931           if (pfile->cb.line_change && !pfile->state.skipping)
 932             pfile->cb.line_change (pfile, result, pfile->state.parsing_args);
 933         }
 934
 935       /* We don't skip tokens in directives.  */
 936       if (pfile->state.in_directive || pfile->state.in_deferred_pragma)
 937         break;
 938
 939       /* Outside a directive, invalidate controlling macros.  At file
 940          EOF, _cpp_lex_direct takes care of popping the buffer, so we never
 941          get here and MI optimization works.  */
 942       pfile->mi_valid = false;
 943
 944       if (!pfile->state.skipping || result->type == CPP_EOF)  /* While skipping, only EOF gets out.  */
 945         break;
 946     }
 947
 948   return result;
 949 }
 950
 951 /* Returns true if a fresh line has been loaded.  */
 952 bool
 953 _cpp_get_fresh_line (cpp_reader *pfile)
 954 {
 955   int return_at_eof;
 956
 957   /* We can't get a new line until we leave the current directive.  */
 958   if (pfile->state.in_directive)
 959     return false;
 960
 961   for (;;)
 962     {
 963       cpp_buffer *buffer = pfile->buffer;
 964
 965       if (!buffer->need_line)
 966         return true;
 967
 968       if (buffer->next_line < buffer->rlimit)
 969         {
 970           _cpp_clean_line (pfile);
 971           return true;
 972         }
 973
 974       /* First, get out of parsing arguments state.  */
 975       if (pfile->state.parsing_args)
 976         return false;
 977
 978       /* End of buffer.  Non-empty files should end in a newline.  */
 979       if (buffer->buf != buffer->rlimit
 980           && buffer->next_line > buffer->rlimit
 981           && !buffer->from_stage3)
 982         {
 983           /* Clip to buffer size.  */
 984           buffer->next_line = buffer->rlimit;
 985         }
 986
 987       return_at_eof = buffer->return_at_eof;
 988       _cpp_pop_buffer (pfile);
 989       if (pfile->buffer == NULL || return_at_eof)
 990         return false;
 991     }
 992 }
 993
 994 #define IF_NEXT_IS(CHAR, THEN_TYPE, ELSE_TYPE)          \
 995   do                                                    \
 996     {                                                   \
 997       result->type = ELSE_TYPE;                         \
 998       if (*buffer->cur == CHAR)                         \
 999         buffer->cur++, result->type = THEN_TYPE;        \
1000     }                                                   \
1001   while (0)
1002
/* Lex a token into pfile->cur_token, which is also incremented, to
   get diagnostics pointing to the correct location.

   Does not handle issues such as token lookahead, multiple-include
   optimization, directives, skipping etc.  This function is only
   suitable for use by _cpp_lex_token, and in special cases like
   lex_expansion_token which doesn't care for any of these issues.

   When meeting a newline, returns CPP_EOF if parsing a directive,
   otherwise returns to the start of the token buffer if permissible.
   Returns a pointer to the lexed token.  */
cpp_token *
_cpp_lex_direct (cpp_reader *pfile)
{
  cppchar_t c;
  cpp_buffer *buffer;
  const unsigned char *comment_start;
  cpp_token *result = pfile->cur_token++;

  /* Entered at the start and again after every newline: the token's
     flags are reset and the buffer pointer is re-read.  */
 fresh_line:
  result->flags = 0;
  buffer = pfile->buffer;
  if (buffer->need_line)
    {
      /* The line that carried a deferred pragma has ended: emit the
         end-of-pragma marker and leave deferred-pragma state.  */
      if (pfile->state.in_deferred_pragma)
        {
          result->type = CPP_PRAGMA_EOL;
          pfile->state.in_deferred_pragma = false;
          if (!pfile->state.pragma_allow_expansion)
            pfile->state.prevent_expansion--;
          return result;
        }
      /* No further line can be supplied: report CPP_EOF.  */
      if (!_cpp_get_fresh_line (pfile))
        {
          result->type = CPP_EOF;
          if (!pfile->state.in_directive)
            {
              /* Tell the compiler the line number of the EOF token.  */
              result->src_loc = pfile->line_table->highest_line;
              result->flags = BOL;
            }
          return result;
        }
      /* Unless tokens are being kept, restart at the first slot of
         the base token run and lex into that slot instead.  */
      if (!pfile->keep_tokens)
        {
          pfile->cur_run = &pfile->base_run;
          result = pfile->base_run.base;
          pfile->cur_token = result + 1;
        }
      /* The token about to be lexed is the first one on its line.  */
      result->flags = BOL;
      if (pfile->state.parsing_args == 2)
        result->flags |= PREV_WHITE;
    }
  /* _cpp_get_fresh_line may have popped buffers, so re-read it.  */
  buffer = pfile->buffer;
  /* Re-entered after a skipped comment to refresh the token's line.  */
 update_tokens_line:
  result->src_loc = pfile->line_table->highest_line;

  /* Re-entered after horizontal whitespace has been skipped.  */
 skipped_white:
  /* Process pending line notes once the read position has reached
     the next note, unless an overlaid buffer is in use.  */
  if (buffer->cur >= buffer->notes[buffer->cur_note].pos
      && !pfile->overlaid_buffer)
    {
      _cpp_process_line_notes (pfile, false);
      result->src_loc = pfile->line_table->highest_line;
    }
  c = *buffer->cur++;

  LINEMAP_POSITION_FOR_COLUMN (result->src_loc, pfile->line_table,
                               CPP_BUF_COLUMN (buffer, buffer->cur));

  /* Dispatch on the first character of the token.  buffer->cur now
     points just past that character.  */
  switch (c)
    {
    case ' ': case '\t': case '\f': case '\v': case '\0':
      result->flags |= PREV_WHITE;
      skip_whitespace (pfile, c);
      goto skipped_white;

    case '\n':
      /* Only bump the line number if more input follows in this
         buffer; then request a new line and start over.  */
      if (buffer->cur < buffer->rlimit)
        CPP_INCREMENT_LINE (pfile, 0);
      buffer->need_line = true;
      goto fresh_line;

    case '0': case '1': case '2': case '3': case '4':
    case '5': case '6': case '7': case '8': case '9':
      {
        struct normalize_state nst = INITIAL_NORMALIZE_STATE;
        result->type = CPP_NUMBER;
        lex_number (pfile, &result->val.str, &nst);
        warn_about_normalization (pfile, result, &nst);
        break;
      }

    case 'L':
    case 'u':
    case 'U':
      /* 'L', 'u' or 'U' may introduce wide characters or strings.
         'u' and 'U' only do so when the uliterals option is set.  */
      if (c == 'L' || CPP_OPTION (pfile, uliterals))
        {
          if (*buffer->cur == '\'' || *buffer->cur == '"')
            {
              lex_string (pfile, result, buffer->cur - 1);
              break;
            }
        }
      /* Fall through.  */

    case '_':
    case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
    case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
    case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
    case 's': case 't':           case 'v': case 'w': case 'x':
    case 'y': case 'z':
    case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
    case 'G': case 'H': case 'I': case 'J': case 'K':
    case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
    case 'S': case 'T':           case 'V': case 'W': case 'X':
    case 'Y': case 'Z':
      result->type = CPP_NAME;
      {
        struct normalize_state nst = INITIAL_NORMALIZE_STATE;
        result->val.node = lex_identifier (pfile, buffer->cur - 1, false,
                                           &nst);
        warn_about_normalization (pfile, result, &nst);
      }

      /* Convert named operators to their proper types.  */
      if (result->val.node->flags & NODE_OPERATOR)
        {
          result->flags |= NAMED_OP;
          result->type = (enum cpp_ttype) result->val.node->directive_index;
        }
      break;

    case '\'':
    case '"':
      lex_string (pfile, result, buffer->cur - 1);
      break;

    case '/':
      /* A potential block or line comment.  */
      comment_start = buffer->cur;
      c = *buffer->cur;

      if (c == '*')
        {
          if (_cpp_skip_block_comment (pfile))
            cpp_error (pfile, CPP_DL_ERROR, "unterminated comment");
        }
      else if (c == '/' && (CPP_OPTION (pfile, cplusplus_comments)
                            || cpp_in_system_header (pfile)))
        {
          /* Warn about comments only if pedantically GNUC89, and not
             in system headers.  */
          if (CPP_OPTION (pfile, lang) == CLK_GNUC89 && CPP_PEDANTIC (pfile)
              && ! buffer->warned_cplusplus_comments)
            {
              cpp_error (pfile, CPP_DL_PEDWARN,
                         "C++ style comments are not allowed in ISO C90");
              cpp_error (pfile, CPP_DL_PEDWARN,
                         "(this will be reported only once per input file)");
              buffer->warned_cplusplus_comments = 1;
            }

          if (skip_line_comment (pfile) && CPP_OPTION (pfile, warn_comments))
            cpp_error (pfile, CPP_DL_WARNING, "multi-line comment");
        }
      else if (c == '=')
        {
          buffer->cur++;
          result->type = CPP_DIV_EQ;
          break;
        }
      else
        {
          result->type = CPP_DIV;
          break;
        }

      /* Only the two comment branches reach this point.  A comment
         that is not being saved counts as whitespace before the next
         token, which is lexed into the same slot.  */
      if (!pfile->state.save_comments)
        {
          result->flags |= PREV_WHITE;
          goto update_tokens_line;
        }

      /* Save the comment as a token in its own right.  */
      save_comment (pfile, result, comment_start, c);
      break;

    case '<':
      /* When angled header names are expected, let lex_string try
         first; if it leaves the type as CPP_LESS, lex the '<' as an
         ordinary operator below.  */
      if (pfile->state.angled_headers)
        {
          lex_string (pfile, result, buffer->cur - 1);
          if (result->type != CPP_LESS)
            break;
        }

      result->type = CPP_LESS;
      if (*buffer->cur == '=')
        buffer->cur++, result->type = CPP_LESS_EQ;
      else if (*buffer->cur == '<')
        {
          buffer->cur++;
          IF_NEXT_IS ('=', CPP_LSHIFT_EQ, CPP_LSHIFT);
        }
      else if (CPP_OPTION (pfile, digraphs))
        {
          /* Digraphs "<:" and "<%".  */
          if (*buffer->cur == ':')
            {
              buffer->cur++;
              result->flags |= DIGRAPH;
              result->type = CPP_OPEN_SQUARE;
            }
          else if (*buffer->cur == '%')
            {
              buffer->cur++;
              result->flags |= DIGRAPH;
              result->type = CPP_OPEN_BRACE;
            }
        }
      break;

    case '>':
      result->type = CPP_GREATER;
      if (*buffer->cur == '=')
        buffer->cur++, result->type = CPP_GREATER_EQ;
      else if (*buffer->cur == '>')
        {
          buffer->cur++;
          IF_NEXT_IS ('=', CPP_RSHIFT_EQ, CPP_RSHIFT);
        }
      break;

    case '%':
      result->type = CPP_MOD;
      if (*buffer->cur == '=')
        buffer->cur++, result->type = CPP_MOD_EQ;
      else if (CPP_OPTION (pfile, digraphs))
        {
          /* Digraphs "%:", "%:%:" and "%>".  */
          if (*buffer->cur == ':')
            {
              buffer->cur++;
              result->flags |= DIGRAPH;
              result->type = CPP_HASH;
              if (*buffer->cur == '%' && buffer->cur[1] == ':')
                buffer->cur += 2, result->type = CPP_PASTE, result->val.arg_no = 0;
            }
          else if (*buffer->cur == '>')
            {
              buffer->cur++;
              result->flags |= DIGRAPH;
              result->type = CPP_CLOSE_BRACE;
            }
        }
      break;

    case '.':
      /* A '.' followed by a digit starts a number; otherwise check
         for "..." and, with the cplusplus option, ".*".  */
      result->type = CPP_DOT;
      if (ISDIGIT (*buffer->cur))
        {
          struct normalize_state nst = INITIAL_NORMALIZE_STATE;
          result->type = CPP_NUMBER;
          lex_number (pfile, &result->val.str, &nst);
          warn_about_normalization (pfile, result, &nst);
        }
      else if (*buffer->cur == '.' && buffer->cur[1] == '.')
        buffer->cur += 2, result->type = CPP_ELLIPSIS;
      else if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
        buffer->cur++, result->type = CPP_DOT_STAR;
      break;

    case '+':
      result->type = CPP_PLUS;
      if (*buffer->cur == '+')
        buffer->cur++, result->type = CPP_PLUS_PLUS;
      else if (*buffer->cur == '=')
        buffer->cur++, result->type = CPP_PLUS_EQ;
      break;

    case '-':
      result->type = CPP_MINUS;
      if (*buffer->cur == '>')
        {
          buffer->cur++;
          result->type = CPP_DEREF;
          if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
            buffer->cur++, result->type = CPP_DEREF_STAR;
        }
      else if (*buffer->cur == '-')
        buffer->cur++, result->type = CPP_MINUS_MINUS;
      else if (*buffer->cur == '=')
        buffer->cur++, result->type = CPP_MINUS_EQ;
      break;

    case '&':
      result->type = CPP_AND;
      if (*buffer->cur == '&')
        buffer->cur++, result->type = CPP_AND_AND;
      else if (*buffer->cur == '=')
        buffer->cur++, result->type = CPP_AND_EQ;
      break;

    case '|':
      result->type = CPP_OR;
      if (*buffer->cur == '|')
        buffer->cur++, result->type = CPP_OR_OR;
      else if (*buffer->cur == '=')
        buffer->cur++, result->type = CPP_OR_EQ;
      break;

    case ':':
      result->type = CPP_COLON;
      if (*buffer->cur == ':' && CPP_OPTION (pfile, cplusplus))
        buffer->cur++, result->type = CPP_SCOPE;
      else if (*buffer->cur == '>' && CPP_OPTION (pfile, digraphs))
        {
          buffer->cur++;
          result->flags |= DIGRAPH;
          result->type = CPP_CLOSE_SQUARE;
        }
      break;

    case '*': IF_NEXT_IS ('=', CPP_MULT_EQ, CPP_MULT); break;
    case '=': IF_NEXT_IS ('=', CPP_EQ_EQ, CPP_EQ); break;
    case '!': IF_NEXT_IS ('=', CPP_NOT_EQ, CPP_NOT); break;
    case '^': IF_NEXT_IS ('=', CPP_XOR_EQ, CPP_XOR); break;
    case '#': IF_NEXT_IS ('#', CPP_PASTE, CPP_HASH); result->val.arg_no = 0; break;

    case '?': result->type = CPP_QUERY; break;
    case '~': result->type = CPP_COMPL; break;
    case ',': result->type = CPP_COMMA; break;
    case '(': result->type = CPP_OPEN_PAREN; break;
    case ')': result->type = CPP_CLOSE_PAREN; break;
    case '[': result->type = CPP_OPEN_SQUARE; break;
    case ']': result->type = CPP_CLOSE_SQUARE; break;
    case '{': result->type = CPP_OPEN_BRACE; break;
    case '}': result->type = CPP_CLOSE_BRACE; break;
    case ';': result->type = CPP_SEMICOLON; break;

      /* @ is a punctuator in Objective-C.  */
    case '@': result->type = CPP_ATSIGN; break;

    case '$':
    case '\\':
      {
        /* Step back onto the character so that forms_identifier_p
           sees it.  If it does not start an identifier, step forward
           again and treat it as a stray character below.  */
        const uchar *base = --buffer->cur;
        struct normalize_state nst = INITIAL_NORMALIZE_STATE;

        if (forms_identifier_p (pfile, true, &nst))
          {
            result->type = CPP_NAME;
            result->val.node = lex_identifier (pfile, base, true, &nst);
            warn_about_normalization (pfile, result, &nst);
            break;
          }
        buffer->cur++;
      }
      /* Fall through.  */

    default:
      /* Anything else becomes a one-character CPP_OTHER token.  */
      create_literal (pfile, result, buffer->cur - 1, 1, CPP_OTHER);
      break;
    }

  return result;
}
1367
1368 /* An upper bound on the number of bytes needed to spell TOKEN.
1369    Does not include preceding whitespace.  */
1370 unsigned int
1371 cpp_token_len (const cpp_token *token)
1372 {
1373   unsigned int len;
1374
1375   switch (TOKEN_SPELL (token))
1376     {
1377     default:            len = 6;                                break;
1378     case SPELL_LITERAL: len = token->val.str.len;               break;
1379     case SPELL_IDENT:   len = NODE_LEN (token->val.node) * 10;  break;
1380     }
1381
1382   return len;
1383 }
1384
/* Decode the UTF-8 sequence that starts at NAME and write it to
   BUFFER as a \U escape followed by eight lowercase hexadecimal
   digits (ten bytes in total, no terminating NUL).  Returns the
   number of bytes of NAME that made up the sequence.  Aborts when a
   trailing byte is not of the form 10xxxxxx.  */

static size_t
utf8_to_ucn (unsigned char *buffer, const unsigned char *name)
{
  static const char hex_digits[] = "0123456789abcdef";
  unsigned long code_point;
  unsigned lead;
  int seq_len = 0;
  int k;
  int shift;

  /* The number of leading one bits in the first byte gives the
     length of the sequence.  */
  for (lead = *name; (lead & 0x80) != 0; lead <<= 1)
    seq_len++;

  /* The remaining low bits of the first byte are payload.  */
  code_point = *name & (0x7F >> seq_len);

  for (k = 1; k < seq_len; k++)
    {
      const unsigned char trail = *++name;

      /* Ill-formed UTF-8.  */
      if ((trail & ~0x3F) != 0x80)
        abort ();

      code_point = (code_point << 6) | (trail & 0x3F);
    }

  *buffer++ = '\\';
  *buffer++ = 'U';
  for (shift = 28; shift >= 0; shift -= 4)
    *buffer++ = hex_digits[(code_point >> shift) & 0xF];

  return seq_len;
}
1418
1419
1420 /* Write the spelling of a token TOKEN to BUFFER.  The buffer must
1421    already contain the enough space to hold the token's spelling.
1422    Returns a pointer to the character after the last character written.
1423    FORSTRING is true if this is to be the spelling after translation
1424    phase 1 (this is different for UCNs).
1425    FIXME: Would be nice if we didn't need the PFILE argument.  */
1426 unsigned char *
1427 cpp_spell_token (cpp_reader *pfile, const cpp_token *token,
1428                  unsigned char *buffer, bool forstring)
1429 {
1430   switch (TOKEN_SPELL (token))
1431     {
1432     case SPELL_OPERATOR:
1433       {
1434         const unsigned char *spelling;
1435         unsigned char c;
1436
1437         if (token->flags & DIGRAPH)
1438           spelling
1439             = digraph_spellings[(int) token->type - (int) CPP_FIRST_DIGRAPH];
1440         else if (token->flags & NAMED_OP)
1441           goto spell_ident;
1442         else
1443           spelling = TOKEN_NAME (token);
1444
1445         while ((c = *spelling++) != '\0')
1446           *buffer++ = c;
1447       }
1448       break;
1449
1450     spell_ident:
1451     case SPELL_IDENT:
1452       if (forstring)
1453         {
1454           memcpy (buffer, NODE_NAME (token->val.node),
1455                   NODE_LEN (token->val.node));
1456           buffer += NODE_LEN (token->val.node);
1457         }
1458       else
1459         {
1460           size_t i;
1461           const unsigned char * name = NODE_NAME (token->val.node);
1462
1463           for (i = 0; i < NODE_LEN (token->val.node); i++)
1464             if (name[i] & ~0x7F)
1465               {
1466                 i += utf8_to_ucn (buffer, name + i) - 1;
1467                 buffer += 10;
1468               }
1469             else
1470               *buffer++ = NODE_NAME (token->val.node)[i];
1471         }
1472       break;
1473
1474     case SPELL_LITERAL:
1475       memcpy (buffer, token->val.str.text, token->val.str.len);
1476       buffer += token->val.str.len;
1477       break;
1478
1479     case SPELL_NONE:
1480       cpp_error (pfile, CPP_DL_ICE,
1481                  "unspellable token %s", TOKEN_NAME (token));
1482       break;
1483     }
1484
1485   return buffer;
1486 }
1487
1488 /* Returns TOKEN spelt as a null-terminated string.  The string is
1489    freed when the reader is destroyed.  Useful for diagnostics.  */
1490 unsigned char *
1491 cpp_token_as_text (cpp_reader *pfile, const cpp_token *token)
1492 {
1493   unsigned int len = cpp_token_len (token) + 1;
1494   unsigned char *start = _cpp_unaligned_alloc (pfile, len), *end;
1495
1496   end = cpp_spell_token (pfile, token, start, false);
1497   end[0] = '\0';
1498
1499   return start;
1500 }
1501
1502 /* Used by C front ends, which really should move to using
1503    cpp_token_as_text.  */
1504 const char *
1505 cpp_type2name (enum cpp_ttype type)
1506 {
1507   return (const char *) token_spellings[type].name;
1508 }
1509
1510 /* Writes the spelling of token to FP, without any preceding space.
1511    Separated from cpp_spell_token for efficiency - to avoid stdio
1512    double-buffering.  */
1513 void
1514 cpp_output_token (const cpp_token *token, FILE *fp)
1515 {
1516   switch (TOKEN_SPELL (token))
1517     {
1518     case SPELL_OPERATOR:
1519       {
1520         const unsigned char *spelling;
1521         int c;
1522
1523         if (token->flags & DIGRAPH)
1524           spelling
1525             = digraph_spellings[(int) token->type - (int) CPP_FIRST_DIGRAPH];
1526         else if (token->flags & NAMED_OP)
1527           goto spell_ident;
1528         else
1529           spelling = TOKEN_NAME (token);
1530
1531         c = *spelling;
1532         do
1533           putc (c, fp);
1534         while ((c = *++spelling) != '\0');
1535       }
1536       break;
1537
1538     spell_ident:
1539     case SPELL_IDENT:
1540       {
1541         size_t i;
1542         const unsigned char * name = NODE_NAME (token->val.node);
1543
1544         for (i = 0; i < NODE_LEN (token->val.node); i++)
1545           if (name[i] & ~0x7F)
1546             {
1547               unsigned char buffer[10];
1548               i += utf8_to_ucn (buffer, name + i) - 1;
1549               fwrite (buffer, 1, 10, fp);
1550             }
1551           else
1552             fputc (NODE_NAME (token->val.node)[i], fp);
1553       }
1554       break;
1555
1556     case SPELL_LITERAL:
1557       fwrite (token->val.str.text, 1, token->val.str.len, fp);
1558       break;
1559
1560     case SPELL_NONE:
1561       /* An error, most probably.  */
1562       break;
1563     }
1564 }
1565
1566 /* Compare two tokens.  */
1567 int
1568 _cpp_equiv_tokens (const cpp_token *a, const cpp_token *b)
1569 {
1570   if (a->type == b->type && a->flags == b->flags)
1571     switch (TOKEN_SPELL (a))
1572       {
1573       default:                  /* Keep compiler happy.  */
1574       case SPELL_OPERATOR:
1575         /* arg_no is used to track where multiple consecutive ##
1576            tokens were originally located.  */
1577         return (a->type != CPP_PASTE || a->val.arg_no == b->val.arg_no);
1578       case SPELL_NONE:
1579         return (a->type != CPP_MACRO_ARG || a->val.arg_no == b->val.arg_no);
1580       case SPELL_IDENT:
1581         return a->val.node == b->val.node;
1582       case SPELL_LITERAL:
1583         return (a->val.str.len == b->val.str.len
1584                 && !memcmp (a->val.str.text, b->val.str.text,
1585                             a->val.str.len));
1586       }
1587
1588   return 0;
1589 }
1590
1591 /* Returns nonzero if a space should be inserted to avoid an
1592    accidental token paste for output.  For simplicity, it is
1593    conservative, and occasionally advises a space where one is not
1594    needed, e.g. "." and ".2".  */
1595 int
1596 cpp_avoid_paste (cpp_reader *pfile, const cpp_token *token1,
1597                  const cpp_token *token2)
1598 {
1599   enum cpp_ttype a = token1->type, b = token2->type;
1600   cppchar_t c;
1601
1602   if (token1->flags & NAMED_OP)
1603     a = CPP_NAME;
1604   if (token2->flags & NAMED_OP)
1605     b = CPP_NAME;
1606
1607   c = EOF;
1608   if (token2->flags & DIGRAPH)
1609     c = digraph_spellings[(int) b - (int) CPP_FIRST_DIGRAPH][0];
1610   else if (token_spellings[b].category == SPELL_OPERATOR)
1611     c = token_spellings[b].name[0];
1612
1613   /* Quickly get everything that can paste with an '='.  */
1614   if ((int) a <= (int) CPP_LAST_EQ && c == '=')
1615     return 1;
1616
1617   switch (a)
1618     {
1619     case CPP_GREATER:   return c == '>';
1620     case CPP_LESS:      return c == '<' || c == '%' || c == ':';
1621     case CPP_PLUS:      return c == '+';
1622     case CPP_MINUS:     return c == '-' || c == '>';
1623     case CPP_DIV:       return c == '/' || c == '*'; /* Comments.  */
1624     case CPP_MOD:       return c == ':' || c == '>';
1625     case CPP_AND:       return c == '&';
1626     case CPP_OR:        return c == '|';
1627     case CPP_COLON:     return c == ':' || c == '>';
1628     case CPP_DEREF:     return c == '*';
1629     case CPP_DOT:       return c == '.' || c == '%' || b == CPP_NUMBER;
1630     case CPP_HASH:      return c == '#' || c == '%'; /* Digraph form.  */
1631     case CPP_NAME:      return ((b == CPP_NUMBER
1632                                  && name_p (pfile, &token2->val.str))
1633                                 || b == CPP_NAME
1634                                 || b == CPP_CHAR || b == CPP_STRING); /* L */
1635     case CPP_NUMBER:    return (b == CPP_NUMBER || b == CPP_NAME
1636                                 || c == '.' || c == '+' || c == '-');
1637                                       /* UCNs */
1638     case CPP_OTHER:     return ((token1->val.str.text[0] == '\\'
1639                                  && b == CPP_NAME)
1640                                 || (CPP_OPTION (pfile, objc)
1641                                     && token1->val.str.text[0] == '@'
1642                                     && (b == CPP_NAME || b == CPP_STRING)));
1643     default:            break;
1644     }
1645
1646   return 0;
1647 }
1648
1649 /* Output all the remaining tokens on the current line, and a newline
1650    character, to FP.  Leading whitespace is removed.  If there are
1651    macros, special token padding is not performed.  */
1652 void
1653 cpp_output_line (cpp_reader *pfile, FILE *fp)
1654 {
1655   const cpp_token *token;
1656
1657   token = cpp_get_token (pfile);
1658   while (token->type != CPP_EOF)
1659     {
1660       cpp_output_token (token, fp);
1661       token = cpp_get_token (pfile);
1662       if (token->flags & PREV_WHITE)
1663         putc (' ', fp);
1664     }
1665
1666   putc ('\n', fp);
1667 }
1668
1669 /* Return a string representation of all the remaining tokens on the
1670    current line.  The result is allocated using xmalloc and must be
1671    freed by the caller.  */
1672 unsigned char *
1673 cpp_output_line_to_string (cpp_reader *pfile, const unsigned char *dir_name)
1674 {
1675   const cpp_token *token;
1676   unsigned int out = dir_name ? ustrlen (dir_name) : 0;
1677   unsigned int alloced = 120 + out;
1678   unsigned char *result = (unsigned char *) xmalloc (alloced);
1679
1680   /* If DIR_NAME is empty, there are no initial contents.  */
1681   if (dir_name)
1682     {
1683       sprintf ((char *) result, "#%s ", dir_name);
1684       out += 2;
1685     }
1686
1687   token = cpp_get_token (pfile);
1688   while (token->type != CPP_EOF)
1689     {
1690       unsigned char *last;
1691       /* Include room for a possible space and the terminating nul.  */
1692       unsigned int len = cpp_token_len (token) + 2;
1693
1694       if (out + len > alloced)
1695         {
1696           alloced *= 2;
1697           if (out + len > alloced)
1698             alloced = out + len;
1699           result = (unsigned char *) xrealloc (result, alloced);
1700         }
1701
1702       last = cpp_spell_token (pfile, token, &result[out], 0);
1703       out = last - result;
1704
1705       token = cpp_get_token (pfile);
1706       if (token->flags & PREV_WHITE)
1707         result[out++] = ' ';
1708     }
1709
1710   result[out] = '\0';
1711   return result;
1712 }
1713
/* Memory buffers.  Changing these three constants can have a dramatic
   effect on performance.  The values here are reasonable defaults,
   but might be tuned.  If you adjust them, be sure to test across a
   range of uses of cpplib, including heavy nested function-like macro
   expansion.  Also check the change in peak memory usage (NJAMD is a
   good tool for this).

   MIN_BUFF_SIZE is the smallest buffer new_buff will create.
   BUFF_SIZE_UPPER_BOUND (MIN_SIZE) is the largest free buffer that
   _cpp_get_buff will hand out for a request of MIN_SIZE bytes.
   EXTENDED_BUFF_SIZE (BUFF, MIN_EXTRA) is the size requested when
   BUFF is extended: MIN_EXTRA plus twice the room between BUFF's cur
   and limit pointers.  All macro arguments are parenthesized so that
   compound expressions expand with the intended precedence.  */
#define MIN_BUFF_SIZE 8000
#define BUFF_SIZE_UPPER_BOUND(MIN_SIZE) (MIN_BUFF_SIZE + (MIN_SIZE) * 3 / 2)
#define EXTENDED_BUFF_SIZE(BUFF, MIN_EXTRA) \
        ((MIN_EXTRA) + ((BUFF)->limit - (BUFF)->cur) * 2)

#if MIN_BUFF_SIZE > BUFF_SIZE_UPPER_BOUND (0)
  #error BUFF_SIZE_UPPER_BOUND must be at least as large as MIN_BUFF_SIZE!
#endif
1728
1729 /* Create a new allocation buffer.  Place the control block at the end
1730    of the buffer, so that buffer overflows will cause immediate chaos.  */
1731 static _cpp_buff *
1732 new_buff (size_t len)
1733 {
1734   _cpp_buff *result;
1735   unsigned char *base;
1736
1737   if (len < MIN_BUFF_SIZE)
1738     len = MIN_BUFF_SIZE;
1739   len = CPP_ALIGN (len);
1740
1741   base = XNEWVEC (unsigned char, len + sizeof (_cpp_buff));
1742   result = (_cpp_buff *) (base + len);
1743   result->base = base;
1744   result->cur = base;
1745   result->limit = base + len;
1746   result->next = NULL;
1747   return result;
1748 }
1749
1750 /* Place a chain of unwanted allocation buffers on the free list.  */
1751 void
1752 _cpp_release_buff (cpp_reader *pfile, _cpp_buff *buff)
1753 {
1754   _cpp_buff *end = buff;
1755
1756   while (end->next)
1757     end = end->next;
1758   end->next = pfile->free_buffs;
1759   pfile->free_buffs = buff;
1760 }
1761
1762 /* Return a free buffer of size at least MIN_SIZE.  */
1763 _cpp_buff *
1764 _cpp_get_buff (cpp_reader *pfile, size_t min_size)
1765 {
1766   _cpp_buff *result, **p;
1767
1768   for (p = &pfile->free_buffs;; p = &(*p)->next)
1769     {
1770       size_t size;
1771
1772       if (*p == NULL)
1773         return new_buff (min_size);
1774       result = *p;
1775       size = result->limit - result->base;
1776       /* Return a buffer that's big enough, but don't waste one that's
1777          way too big.  */
1778       if (size >= min_size && size <= BUFF_SIZE_UPPER_BOUND (min_size))
1779         break;
1780     }
1781
1782   *p = result->next;
1783   result->next = NULL;
1784   result->cur = result->base;
1785   return result;
1786 }
1787
1788 /* Creates a new buffer with enough space to hold the uncommitted
1789    remaining bytes of BUFF, and at least MIN_EXTRA more bytes.  Copies
1790    the excess bytes to the new buffer.  Chains the new buffer after
1791    BUFF, and returns the new buffer.  */
1792 _cpp_buff *
1793 _cpp_append_extend_buff (cpp_reader *pfile, _cpp_buff *buff, size_t min_extra)
1794 {
1795   size_t size = EXTENDED_BUFF_SIZE (buff, min_extra);
1796   _cpp_buff *new_buff = _cpp_get_buff (pfile, size);
1797
1798   buff->next = new_buff;
1799   memcpy (new_buff->base, buff->cur, BUFF_ROOM (buff));
1800   return new_buff;
1801 }
1802
1803 /* Creates a new buffer with enough space to hold the uncommitted
1804    remaining bytes of the buffer pointed to by BUFF, and at least
1805    MIN_EXTRA more bytes.  Copies the excess bytes to the new buffer.
1806    Chains the new buffer before the buffer pointed to by BUFF, and
1807    updates the pointer to point to the new buffer.  */
1808 void
1809 _cpp_extend_buff (cpp_reader *pfile, _cpp_buff **pbuff, size_t min_extra)
1810 {
1811   _cpp_buff *new_buff, *old_buff = *pbuff;
1812   size_t size = EXTENDED_BUFF_SIZE (old_buff, min_extra);
1813
1814   new_buff = _cpp_get_buff (pfile, size);
1815   memcpy (new_buff->base, old_buff->cur, BUFF_ROOM (old_buff));
1816   new_buff->next = old_buff;
1817   *pbuff = new_buff;
1818 }
1819
1820 /* Free a chain of buffers starting at BUFF.  */
1821 void
1822 _cpp_free_buff (_cpp_buff *buff)
1823 {
1824   _cpp_buff *next;
1825
1826   for (; buff; buff = next)
1827     {
1828       next = buff->next;
1829       free (buff->base);
1830     }
1831 }
1832
1833 /* Allocate permanent, unaligned storage of length LEN.  */
1834 unsigned char *
1835 _cpp_unaligned_alloc (cpp_reader *pfile, size_t len)
1836 {
1837   _cpp_buff *buff = pfile->u_buff;
1838   unsigned char *result = buff->cur;
1839
1840   if (len > (size_t) (buff->limit - result))
1841     {
1842       buff = _cpp_get_buff (pfile, len);
1843       buff->next = pfile->u_buff;
1844       pfile->u_buff = buff;
1845       result = buff->cur;
1846     }
1847
1848   buff->cur = result + len;
1849   return result;
1850 }
1851
1852 /* Allocate permanent, unaligned storage of length LEN from a_buff.
1853    That buffer is used for growing allocations when saving macro
1854    replacement lists in a #define, and when parsing an answer to an
1855    assertion in #assert, #unassert or #if (and therefore possibly
1856    whilst expanding macros).  It therefore must not be used by any
1857    code that they might call: specifically the lexer and the guts of
1858    the macro expander.
1859
1860    All existing other uses clearly fit this restriction: storing
1861    registered pragmas during initialization.  */
1862 unsigned char *
1863 _cpp_aligned_alloc (cpp_reader *pfile, size_t len)
1864 {
1865   _cpp_buff *buff = pfile->a_buff;
1866   unsigned char *result = buff->cur;
1867
1868   if (len > (size_t) (buff->limit - result))
1869     {
1870       buff = _cpp_get_buff (pfile, len);
1871       buff->next = pfile->a_buff;
1872       pfile->a_buff = buff;
1873       result = buff->cur;
1874     }
1875
1876   buff->cur = result + len;
1877   return result;
1878 }
1879
1880 /* Say which field of TOK is in use.  */
1881
1882 enum cpp_token_fld_kind
1883 cpp_token_val_index (cpp_token *tok)
1884 {
1885   switch (TOKEN_SPELL (tok))
1886     {
1887     case SPELL_IDENT:
1888       return CPP_TOKEN_FLD_NODE;
1889     case SPELL_LITERAL:
1890       return CPP_TOKEN_FLD_STR;
1891     case SPELL_OPERATOR:
1892       if (tok->type == CPP_PASTE)
1893         return CPP_TOKEN_FLD_ARG_NO;
1894       else
1895         return CPP_TOKEN_FLD_NONE;
1896     case SPELL_NONE:
1897       if (tok->type == CPP_MACRO_ARG)
1898         return CPP_TOKEN_FLD_ARG_NO;
1899       else if (tok->type == CPP_PADDING)
1900         return CPP_TOKEN_FLD_SOURCE;
1901       else if (tok->type == CPP_PRAGMA)
1902         return CPP_TOKEN_FLD_PRAGMA;
1903       /* else fall through */
1904     default:
1905       return CPP_TOKEN_FLD_NONE;
1906     }
1907 }