libcpp/lex.c

   1 /* CPP Library - lexical analysis.
   2    Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005, 2007, 2008, 2009, 2010,
   3    2011, 2012 Free Software Foundation, Inc.
   4    Contributed by Per Bothner, 1994-95.
   5    Based on CCCP program by Paul Rubin, June 1986
   6    Adapted to ANSI C, Richard Stallman, Jan 1987
   7    Broken out to separate file, Zack Weinberg, Mar 2000
   8
   9 This program is free software; you can redistribute it and/or modify it
  10 under the terms of the GNU General Public License as published by the
  11 Free Software Foundation; either version 3, or (at your option) any
  12 later version.
  13
  14 This program is distributed in the hope that it will be useful,
  15 but WITHOUT ANY WARRANTY; without even the implied warranty of
  16 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  17 GNU General Public License for more details.
  18
  19 You should have received a copy of the GNU General Public License
  20 along with this program; see the file COPYING3.  If not see
  21 <http://www.gnu.org/licenses/>.  */
  22
  23 #include "config.h"
  24 #include "system.h"
  25 #include "cpplib.h"
  26 #include "internal.h"
  27
     /* Spelling category recorded for each token type in the
        token_spellings table.  Rows generated by the OP() macro always
        use SPELL_OPERATOR; rows generated by TK() name their category
        explicitly.
        NOTE(review): the precise meaning of SPELL_IDENT, SPELL_LITERAL
        and SPELL_NONE is decided by code outside this section -- confirm
        there before relying on the names alone.  */
  28 enum spell_type
  29 {
  30   SPELL_OPERATOR = 0,
  31   SPELL_IDENT,
  32   SPELL_LITERAL,
  33   SPELL_NONE
  34 };
  35
     /* One row of the token_spellings table: the spelling category of a
        token type plus a string associated with it.  For OP() rows the
        string is the operator's text; for TK() rows it is the stringized
        enumerator name.  */
  36 struct token_spelling
  37 {
  38   enum spell_type category;
  39   const unsigned char *name;
  40 };
  41
     /* Spellings of the digraphs, as unsigned-character strings.  */
  42 static const unsigned char *const digraph_spellings[] =
  43 { UC"%:", UC"%:%:", UC"<:", UC":>", UC"<%", UC"%>" };
  44
     /* Build token_spellings by expanding TTYPE_TABLE with two temporary
        helper macros.  OP(e, s) yields a SPELL_OPERATOR row holding the
        string S; TK(e, s) yields a row whose category is SPELL_<s> and
        whose string is the stringized enumerator E.  Both helpers are
        undefined again as soon as the table has been built.  */
  45 #define OP(e, s) { SPELL_OPERATOR, UC s  },
  46 #define TK(e, s) { SPELL_ ## s,    UC #e },
  47 static const struct token_spelling token_spellings[N_TTYPES] = { TTYPE_TABLE };
  48 #undef OP
  49 #undef TK
  50
     /* Fetch the spelling category, respectively the stored string, for
        TOKEN by indexing the table with the token's type.  */
  51 #define TOKEN_SPELL(token) (token_spellings[(token)->type].category)
  52 #define TOKEN_NAME(token) (token_spellings[(token)->type].name)
  53
     /* Forward declarations of this file's static helper functions.  */
  54 static void add_line_note (cpp_buffer *, const uchar *, unsigned int);
  55 static int skip_line_comment (cpp_reader *);
  56 static void skip_whitespace (cpp_reader *, cppchar_t);
  57 static void lex_string (cpp_reader *, cpp_token *, const uchar *);
  58 static void save_comment (cpp_reader *, cpp_token *, const uchar *, cppchar_t);
  59 static void store_comment (cpp_reader *, cpp_token *);
  60 static void create_literal (cpp_reader *, cpp_token *, const uchar *,
  61                             unsigned int, enum cpp_ttype);
  62 static bool warn_in_comment (cpp_reader *, _cpp_line_note *);
  63 static int name_p (cpp_reader *, const cpp_string *);
  64 static tokenrun *next_tokenrun (tokenrun *);
  65
  66 static _cpp_buff *new_buff (size_t);
  67
  68
  69 /* Utility routine:
  70
  71    Compares, the token TOKEN to the NUL-terminated string STRING.
  72    TOKEN must be a CPP_NAME.  Returns 1 for equal, 0 for unequal.  */
  73 int
  74 cpp_ideq (const cpp_token *token, const char *string)
  75 {
  76   if (token->type != CPP_NAME)
  77     return 0;
  78
  79   return !ustrcmp (NODE_NAME (token->val.node.node), (const uchar *) string);
  80 }
  81
  82 /* Record a note TYPE at byte POS into the current cleaned logical
  83    line.  */
  84 static void
  85 add_line_note (cpp_buffer *buffer, const uchar *pos, unsigned int type)
  86 {
  87   if (buffer->notes_used == buffer->notes_cap)
  88     {
  89       buffer->notes_cap = buffer->notes_cap * 2 + 200;
  90       buffer->notes = XRESIZEVEC (_cpp_line_note, buffer->notes,
  91                                   buffer->notes_cap);
  92     }
  93
  94   buffer->notes[buffer->notes_used].pos = pos;
  95   buffer->notes[buffer->notes_used].type = type;
  96   buffer->notes_used++;
  97 }
  98
  99 \f
 100 /* Fast path to find line special characters using optimized character
 101    scanning algorithms.  Anything complicated falls back to the slow
 102    path below.  Since this loop is very hot it's worth doing these kinds
 103    of optimizations.
 104
 105    One of the paths through the ifdefs should provide
 106
 107      const uchar *search_line_fast (const uchar *s, const uchar *end);
 108
 109    Between S and END, search for \n, \r, \\, ?.  Return a pointer to
 110    the found character.
 111
 112    Note that the last character of the buffer is *always* a newline,
 113    as forced by _cpp_convert_input.  This fact can be used to avoid
 114    explicitly looking for the end of the buffer.  */
 115
 116 /* Configure gives us an ifdef test.  Give the macro the value 0 when it
        is not defined at all, so that it can be tested with an ordinary
        `if' in the helpers below.  */
 117 #ifndef WORDS_BIGENDIAN
 118 #define WORDS_BIGENDIAN 0
 119 #endif
 120
 121 /* We'd like the largest integer that fits into a register.  There's nothing
 122    in <stdint.h> that gives us that.  For most hosts this is unsigned long,
 123    but MS decided on an LLP64 model.  Thankfully when building with GCC we
 124    can get the "real" word size.  */
 125 #ifdef __GNUC__
 126 typedef unsigned int word_type __attribute__((__mode__(__word__)));
 127 #else
 128 typedef unsigned long word_type;
 129 #endif
 130
 131 /* The code below is only expecting sizes 4 or 8.
 132    Die at compile-time if this expectation is violated: the array
        bound evaluates to 1 when the size is acceptable and to -1, which
        is an invalid array size, when it is not.  */
 133 typedef char check_word_type_size
 134   [(sizeof(word_type) == 8 || sizeof(word_type) == 4) * 2 - 1];
 135
 136 /* Return X with the first N bytes forced to values that won't match one
 137    of the interesting characters.  Note that NUL is not interesting.  */
 138
 139 static inline word_type
 140 acc_char_mask_misalign (word_type val, unsigned int n)
 141 {
 142   word_type mask = -1;
 143   if (WORDS_BIGENDIAN)
 144     mask >>= n * 8;
 145   else
 146     mask <<= n * 8;
 147   return val & mask;
 148 }
 149
 150 /* Return X replicated to all byte positions within WORD_TYPE.  */
 151
 152 static inline word_type
 153 acc_char_replicate (uchar x)
 154 {
 155   word_type ret;
 156
 157   ret = (x << 24) | (x << 16) | (x << 8) | x;
 158   if (sizeof(word_type) == 8)
 159     ret = (ret << 16 << 16) | ret;
 160   return ret;
 161 }
 162
 163 /* Return non-zero if some byte of VAL is (probably) C.  */
 164
 165 static inline word_type
 166 acc_char_cmp (word_type val, word_type c)
 167 {
 168 #if defined(__GNUC__) && defined(__alpha__)
 169   /* We can get exact results using a compare-bytes instruction.
 170      Get (val == c) via (0 >= (val ^ c)).  */
 171   return __builtin_alpha_cmpbge (0, val ^ c);
 172 #else
 173   word_type magic = 0x7efefefeU;
 174   if (sizeof(word_type) == 8)
 175     magic = (magic << 16 << 16) | 0xfefefefeU;
 176   magic |= 1;
 177
 178   val ^= c;
 179   return ((val + magic) ^ ~val) & ~magic;
 180 #endif
 181 }
 182
 183 /* Given the result of acc_char_cmp is non-zero, return the index of
 184    the found character.  If this was a false positive, return -1.  */
 185
 186 static inline int
 187 acc_char_index (word_type cmp ATTRIBUTE_UNUSED,
 188                 word_type val ATTRIBUTE_UNUSED)
 189 {
 190 #if defined(__GNUC__) && defined(__alpha__) && !WORDS_BIGENDIAN
 191   /* The cmpbge instruction sets *bits* of the result corresponding to
 192      matches in the bytes with no false positives.  */
 193   return __builtin_ctzl (cmp);
 194 #else
 195   unsigned int i;
 196
 197   /* ??? It would be nice to force unrolling here,
 198      and have all of these constants folded.  */
 199   for (i = 0; i < sizeof(word_type); ++i)
 200     {
 201       uchar c;
 202       if (WORDS_BIGENDIAN)
 203         c = (val >> (sizeof(word_type) - i - 1) * 8) & 0xff;
 204       else
 205         c = (val >> i * 8) & 0xff;
 206
 207       if (c == '\n' || c == '\r' || c == '\\' || c == '?')
 208         return i;
 209     }
 210
 211   return -1;
 212 #endif
 213 }
 214
 215 /* A version of the fast scanner using bit fiddling techniques.
 216
 217    For 32-bit words, one would normally perform 16 comparisons and
 218    16 branches.  With this algorithm one performs 24 arithmetic
 219    operations and one branch.  Whether this is faster with a 32-bit
 220    word size is going to be somewhat system dependent.
 221
 222    For 64-bit words, we eliminate twice the number of comparisons
 223    and branches without increasing the number of arithmetic operations.
 224    It's almost certainly going to be a win with 64-bit word size.  */
 225
 226 static const uchar * search_line_acc_char (const uchar *, const uchar *)
 227   ATTRIBUTE_UNUSED;
 228
 229 static const uchar *
 230 search_line_acc_char (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 231 {
 232   const word_type repl_nl = acc_char_replicate ('\n');
 233   const word_type repl_cr = acc_char_replicate ('\r');
 234   const word_type repl_bs = acc_char_replicate ('\\');
 235   const word_type repl_qm = acc_char_replicate ('?');
 236
 237   unsigned int misalign;
 238   const word_type *p;
 239   word_type val, t;
 240
 241   /* Align the buffer.  Mask out any bytes from before the beginning.  */
 242   p = (word_type *)((uintptr_t)s & -sizeof(word_type));
 243   val = *p;
 244   misalign = (uintptr_t)s & (sizeof(word_type) - 1);
 245   if (misalign)
 246     val = acc_char_mask_misalign (val, misalign);
 247
 248   /* Main loop.  */
 249   while (1)
 250     {
 251       t  = acc_char_cmp (val, repl_nl);
 252       t |= acc_char_cmp (val, repl_cr);
 253       t |= acc_char_cmp (val, repl_bs);
 254       t |= acc_char_cmp (val, repl_qm);
 255
 256       if (__builtin_expect (t != 0, 0))
 257         {
 258           int i = acc_char_index (t, val);
 259           if (i >= 0)
 260             return (const uchar *)p + i;
 261         }
 262
 263       val = *++p;
 264     }
 265 }
 266
 267 /* Disable on Solaris 2/x86 until the following problems can be properly
 268    autoconfed:
 269
 270    The Solaris 9 assembler cannot assemble SSE4.2 insns.
 271    Before Solaris 9 Update 6, SSE insns cannot be executed.
 272    The Solaris 10+ assembler tags objects with the instruction set
 273    extensions used, so SSE4.2 executables cannot run on machines that
 274    don't support that extension.  */
 275
 276 #if (GCC_VERSION >= 4005) && (defined(__i386__) || defined(__x86_64__)) && !(defined(__sun__) && defined(__svr4__))
 277
 278 /* Replicated character data to be shared between implementations.
 279    Recall that outside of a context with vector support we can't
 280    define compatible vector types, therefore these are all defined
 281    in terms of raw characters.

        Row 0 holds '\n', row 1 '\r', row 2 '\\' and row 3 '?'.  The MMX
        and SSE2 scanners below load these rows through 8- and 16-byte
        vector pointers; presumably that is why the array is aligned to
        16 bytes.  */
 282 static const char repl_chars[4][16] __attribute__((aligned(16))) = {
 283   { '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
 284     '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n' },
 285   { '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
 286     '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r' },
 287   { '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
 288     '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\' },
 289   { '?', '?', '?', '?', '?', '?', '?', '?',
 290     '?', '?', '?', '?', '?', '?', '?', '?' },
 291 };
 292
 293 /* A version of the fast scanner using MMX vectorized byte compare insns.
 294
 295    This uses the PMOVMSKB instruction which was introduced with "MMX2",
 296    which was packaged into SSE1; it is also present in the AMD MMX
 297    extension.  Mark the function as using "sse" so that we emit a real
 298    "emms" instruction, rather than the 3dNOW "femms" instruction.  */
 299
 300 static const uchar *
 301 #ifndef __SSE__
 302 __attribute__((__target__("sse")))
 303 #endif
 304 search_line_mmx (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 305 {
 306   typedef char v8qi __attribute__ ((__vector_size__ (8)));
 307   typedef int __m64 __attribute__ ((__vector_size__ (8), __may_alias__));
 308
 309   const v8qi repl_nl = *(const v8qi *)repl_chars[0];
 310   const v8qi repl_cr = *(const v8qi *)repl_chars[1];
 311   const v8qi repl_bs = *(const v8qi *)repl_chars[2];
 312   const v8qi repl_qm = *(const v8qi *)repl_chars[3];
 313
 314   unsigned int misalign, found, mask;
 315   const v8qi *p;
 316   v8qi data, t, c;
 317
 318   /* Align the source pointer.  While MMX doesn't generate unaligned data
 319      faults, this allows us to safely scan to the end of the buffer without
 320      reading beyond the end of the last page.  */
 321   misalign = (uintptr_t)s & 7;
 322   p = (const v8qi *)((uintptr_t)s & -8);
 323   data = *p;
 324
 325   /* Create a mask for the bytes that are valid within the first
 326      16-byte block.  The Idea here is that the AND with the mask
 327      within the loop is "free", since we need some AND or TEST
 328      insn in order to set the flags for the branch anyway.  */
 329   mask = -1u << misalign;
 330
 331   /* Main loop processing 8 bytes at a time.  */
 332   goto start;
 333   do
 334     {
 335       data = *++p;
 336       mask = -1;
 337
 338     start:
 339       t = __builtin_ia32_pcmpeqb(data, repl_nl);
 340       c = __builtin_ia32_pcmpeqb(data, repl_cr);
 341       t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
 342       c = __builtin_ia32_pcmpeqb(data, repl_bs);
 343       t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
 344       c = __builtin_ia32_pcmpeqb(data, repl_qm);
 345       t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
 346       found = __builtin_ia32_pmovmskb (t);
 347       found &= mask;
 348     }
 349   while (!found);
 350
 351   __builtin_ia32_emms ();
 352
 353   /* FOUND contains 1 in bits for which we matched a relevant
 354      character.  Conversion to the byte index is trivial.  */
 355   found = __builtin_ctz(found);
 356   return (const uchar *)p + found;
 357 }
 358
 359 /* A version of the fast scanner using SSE2 vectorized byte compare insns.  */
 360
 361 static const uchar *
 362 #ifndef __SSE2__
 363 __attribute__((__target__("sse2")))
 364 #endif
 365 search_line_sse2 (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 366 {
 367   typedef char v16qi __attribute__ ((__vector_size__ (16)));
 368
 369   const v16qi repl_nl = *(const v16qi *)repl_chars[0];
 370   const v16qi repl_cr = *(const v16qi *)repl_chars[1];
 371   const v16qi repl_bs = *(const v16qi *)repl_chars[2];
 372   const v16qi repl_qm = *(const v16qi *)repl_chars[3];
 373
 374   unsigned int misalign, found, mask;
 375   const v16qi *p;
 376   v16qi data, t;
 377
 378   /* Align the source pointer.  */
 379   misalign = (uintptr_t)s & 15;
 380   p = (const v16qi *)((uintptr_t)s & -16);
 381   data = *p;
 382
 383   /* Create a mask for the bytes that are valid within the first
 384      16-byte block.  The Idea here is that the AND with the mask
 385      within the loop is "free", since we need some AND or TEST
 386      insn in order to set the flags for the branch anyway.  */
 387   mask = -1u << misalign;
 388
 389   /* Main loop processing 16 bytes at a time.  */
 390   goto start;
 391   do
 392     {
 393       data = *++p;
 394       mask = -1;
 395
 396     start:
 397       t  = __builtin_ia32_pcmpeqb128(data, repl_nl);
 398       t |= __builtin_ia32_pcmpeqb128(data, repl_cr);
 399       t |= __builtin_ia32_pcmpeqb128(data, repl_bs);
 400       t |= __builtin_ia32_pcmpeqb128(data, repl_qm);
 401       found = __builtin_ia32_pmovmskb128 (t);
 402       found &= mask;
 403     }
 404   while (!found);
 405
 406   /* FOUND contains 1 in bits for which we matched a relevant
 407      character.  Conversion to the byte index is trivial.  */
 408   found = __builtin_ctz(found);
 409   return (const uchar *)p + found;
 410 }
 411
 412 #ifdef HAVE_SSE4
 413 /* A version of the fast scanner using SSE 4.2 vectorized string insns.

        Starting at S, return a pointer to the first \n, \r, ? or \\.
        END is used only to decide whether an unaligned 16-byte read at S
        is safe.  When HAVE_SSE4 is not defined, the name is instead
        mapped onto search_line_sse2 (see the #else branch).  */
 414
 415 static const uchar *
 416 #ifndef __SSE4_2__
 417 __attribute__((__target__("sse4.2")))
 418 #endif
 419 search_line_sse42 (const uchar *s, const uchar *end)
 420 {
 421   typedef char v16qi __attribute__ ((__vector_size__ (16)));
       /* The set of characters to search for.  Only the first four
          elements are initialized; the "a"(4) operands of the asm
          statements below pass that count as the explicit length.  */
 422   static const v16qi search = { '\n', '\r', '?', '\\' };
 423
 424   uintptr_t si = (uintptr_t)s;
 425   uintptr_t index;
 426
 427   /* Check for unaligned input.  */
 428   if (si & 15)
 429     {
 430       if (__builtin_expect (end - s < 16, 0)
 431           && __builtin_expect ((si & 0xfff) > 0xff0, 0))
 432         {
 433           /* There are less than 16 bytes left in the buffer, and less
 434              than 16 bytes left on the page.  Reading 16 bytes at this
 435              point might generate a spurious page fault.  Defer to the
 436              SSE2 implementation, which already handles alignment.  */
 437           return search_line_sse2 (s, end);
 438         }
 439
 440       /* ??? The builtin doesn't understand that the PCMPESTRI read from
 441          memory need not be aligned.

              Operands: the result index comes back in ECX ("=c"), while
              EAX ("a") = 4 and EDX ("d") = 16 give the explicit lengths
              of SEARCH and of the data at S.
              NOTE(review): an immediate of 0 is understood to select
              unsigned-byte "equal any" matching returning the lowest
              matching index, with 16 meaning "no match" -- confirm
              against the instruction reference.  */
 442       __asm ("%vpcmpestri $0, (%1), %2"
 443              : "=c"(index) : "r"(s), "x"(search), "a"(4), "d"(16));
 444       if (__builtin_expect (index < 16, 0))
 445         goto found;
 446
 447       /* Advance the pointer to an aligned address.  We will re-scan a
 448          few bytes, but we no longer need care for reading past the
 449          end of a page, since we're guaranteed a match.  */
 450       s = (const uchar *)((si + 16) & -16);
 451     }
 452
 453   /* Main loop, processing 16 bytes at a time.  By doing the whole loop
 454      in inline assembly, we can make proper use of the flags set.

          S is decremented by 16 up front because the loop body adds 16
          before each compare.  The loop repeats while the carry flag is
          clear (jnc); on exit S addresses the block that matched and
          INDEX is the offset of the match within it.  */
 455   __asm (      "sub $16, %1\n"
 456         "       .balign 16\n"
 457         "0:     add $16, %1\n"
 458         "       %vpcmpestri $0, (%1), %2\n"
 459         "       jnc 0b"
 460         : "=&c"(index), "+r"(s)
 461         : "x"(search), "a"(4), "d"(16));
 462
 463  found:
 464   return s + index;
 465 }
 466
 467 #else
 468 /* Work around out-dated assemblers without sse4 support.  */
 469 #define search_line_sse42 search_line_sse2
 470 #endif
 471
 472 /* Check the CPU capabilities.  */
 473
 474 #include "../gcc/config/i386/cpuid.h"
 475
 476 typedef const uchar * (*search_line_fast_type) (const uchar *, const uchar *);
 477 static search_line_fast_type search_line_fast;
 478
 479 #define HAVE_init_vectorized_lexer 1
 480 static inline void
 481 init_vectorized_lexer (void)
 482 {
 483   unsigned dummy, ecx = 0, edx = 0;
 484   search_line_fast_type impl = search_line_acc_char;
 485   int minimum = 0;
 486
 487 #if defined(__SSE4_2__)
 488   minimum = 3;
 489 #elif defined(__SSE2__)
 490   minimum = 2;
 491 #elif defined(__SSE__)
 492   minimum = 1;
 493 #endif
 494
 495   if (minimum == 3)
 496     impl = search_line_sse42;
 497   else if (__get_cpuid (1, &dummy, &dummy, &ecx, &edx) || minimum == 2)
 498     {
 499       if (minimum == 3 || (ecx & bit_SSE4_2))
 500         impl = search_line_sse42;
 501       else if (minimum == 2 || (edx & bit_SSE2))
 502         impl = search_line_sse2;
 503       else if (minimum == 1 || (edx & bit_SSE))
 504         impl = search_line_mmx;
 505     }
 506   else if (__get_cpuid (0x80000001, &dummy, &dummy, &dummy, &edx))
 507     {
 508       if (minimum == 1
 509           || (edx & (bit_MMXEXT | bit_CMOV)) == (bit_MMXEXT | bit_CMOV))
 510         impl = search_line_mmx;
 511     }
 512
 513   search_line_fast = impl;
 514 }
 515
 516 #elif (GCC_VERSION >= 4005) && defined(__ALTIVEC__)
 517
 518 /* A version of the fast scanner using AltiVec vectorized byte compares.

        Starting at S, return a pointer to the first \n, \r, \\ or ?.
        END is not consulted.  */
 519 /* ??? Unfortunately, attribute(target("altivec")) is not yet supported,
 520    so we can't compile this function without -maltivec on the command line
 521    (or implied by some other switch).  */
 522
 523 static const uchar *
 524 search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 525 {
 526   typedef __attribute__((altivec(vector))) unsigned char vc;
 527
 528   const vc repl_nl = {
 529     '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
 530     '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n'
 531   };
 532   const vc repl_cr = {
 533     '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
 534     '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r'
 535   };
 536   const vc repl_bs = {
 537     '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
 538     '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\'
 539   };
 540   const vc repl_qm = {
 541     '?', '?', '?', '?', '?', '?', '?', '?',
 542     '?', '?', '?', '?', '?', '?', '?', '?',
 543   };
 544   const vc ones = {
 545     -1, -1, -1, -1, -1, -1, -1, -1,
 546     -1, -1, -1, -1, -1, -1, -1, -1,
 547   };
 548   const vc zero = { 0 };
 549
 550   vc data, mask, t;
 551
 552   /* Altivec loads automatically mask addresses with -16.  This lets us
 553      issue the first load as early as possible.  */
 554   data = __builtin_vec_ld(0, (const vc *)s);
 555
 556   /* Discard bytes before the beginning of the buffer.  Do this by
 557      beginning with all ones and shifting in zeros according to the
 558      mis-alignment.  The LVSR instruction pulls the exact shift we
 559      want from the address.  */
 560   mask = __builtin_vec_lvsr(0, s);
 561   mask = __builtin_vec_perm(zero, ones, mask);
 562   data &= mask;
 563
 564   /* While altivec loads mask addresses, we still need to align S so
 565      that the offset we compute at the end is correct.  */
 566   s = (const uchar *)((uintptr_t)s & -16);
 567
 568   /* Main loop processing 16 bytes at a time.  The first iteration is
          entered at `start', because the first block has already been
          loaded and masked above.  */
 569   goto start;
 570   do
 571     {
 572       vc m_nl, m_cr, m_bs, m_qm;
 573
 574       s += 16;
 575       data = __builtin_vec_ld(0, (const vc *)s);
 576
 577     start:
 578       m_nl = (vc) __builtin_vec_cmpeq(data, repl_nl);
 579       m_cr = (vc) __builtin_vec_cmpeq(data, repl_cr);
 580       m_bs = (vc) __builtin_vec_cmpeq(data, repl_bs);
 581       m_qm = (vc) __builtin_vec_cmpeq(data, repl_qm);
 582       t = (m_nl | m_cr) | (m_bs | m_qm);
 583
 584       /* T now contains 0xff in bytes for which we matched one of the relevant
 585          characters.  We want to exit the loop if any byte in T is non-zero.
 586          Below is the expansion of vec_any_ne(t, zero).  */
 587     }
 588   while (!__builtin_vec_vcmpeq_p(/*__CR6_LT_REV*/3, t, zero));
 589
 590   {
       /* Number of `long' words in one vector: 2 or 4, enforced by the
          check_count typedef below.  */
 591 #define N  (sizeof(vc) / sizeof(long))
 592
 593     typedef char check_count[(N == 2 || N == 4) * 2 - 1];
 594     union {
 595       vc v;
 596       unsigned long l[N];
 597     } u;
 598     unsigned long l, i = 0;
 599
 600     u.v = t;
 601
 602     /* Find the first word of T that is non-zero, advancing S past each
            all-zero word.  The last word is not tested: the loop above
            only exits once some byte of T is non-zero.  */
 603     switch (N)
 604       {
 605       case 4:
 606         l = u.l[i++];
 607         if (l != 0)
 608           break;
 609         s += sizeof(unsigned long);
 610         l = u.l[i++];
 611         if (l != 0)
 612           break;
 613         s += sizeof(unsigned long);
             /* Fall through: the remaining two words are handled exactly
                as in the two-word case.  */
 614       case 2:
 615         l = u.l[i++];
 616         if (l != 0)
 617           break;
 618         s += sizeof(unsigned long);
 619         l = u.l[i];
 620       }
 621
 622     /* L now contains 0xff in bytes for which we matched one of the
 623        relevant characters.  We can find the byte index by finding
 624        its bit index and dividing by 8.
            NOTE(review): counting leading zeros yields the
            lowest-addressed matching byte only if the most significant
            byte of L comes first in memory (big-endian) -- confirm for
            the targets this is built on.  */
 625     l = __builtin_clzl(l) >> 3;
 626     return s + l;
 627
 628 #undef N
 629   }
 630 }
 631
 632 #else
 633
 634 /* We only have one accelerated alternative.  Use a direct call so that
 635    we encourage inlining.  */
 636
 637 #define search_line_fast  search_line_acc_char
 638
 639 #endif
 640
 /* One-time lexer set-up.  Where a choice of vectorized line scanners
    exists (HAVE_init_vectorized_lexer is defined), pick the one to
    use; otherwise there is nothing to do.  */

 void
 _cpp_init_lexer (void)
 {
 #if defined (HAVE_init_vectorized_lexer)
   init_vectorized_lexer ();
 #endif
 }
 650
 651 /* Returns with a logical line that contains no escaped newlines or
 652    trigraphs.  This is a time-critical inner loop.

        The line starting at buffer->next_line is cleaned in place.  On
        return buffer->cur and buffer->line_base point at its start, the
        cleaned line is terminated by a '\n', and buffer->next_line
        points just past the physical line ending that was consumed.
        A note is recorded for every escaped newline (type '\\', or ' '
        when white space separated the backslash from the newline) and
        for every trigraph (type = its third character), followed by a
        sentinel note of type '\n'.  */
 653 void
 654 _cpp_clean_line (cpp_reader *pfile)
 655 {
 656   cpp_buffer *buffer;
 657   const uchar *s;
 658   uchar c, *d, *p;
 659
       /* S is the read position and D the write position; they only
          diverge once the slow path has been entered.  */
 660   buffer = pfile->buffer;
 661   buffer->cur_note = buffer->notes_used = 0;
 662   buffer->cur = buffer->line_base = buffer->next_line;
 663   buffer->need_line = false;
 664   s = buffer->next_line;
 665
 666   if (!buffer->from_stage3)
 667     {
 668       const uchar *pbackslash = NULL;
 669
 670       /* Fast path.  This is the common case of an un-escaped line with
 671          no trigraphs.  The primary win here is by not writing any
 672          data back to memory until we have to.  */
 673       while (1)
 674         {
 675           /* Perform an optimized search for \n, \r, \\, ?.  */
 676           s = search_line_fast (s, buffer->rlimit);
 677
 678           c = *s;
 679           if (c == '\\')
 680             {
 681               /* Record the location of the backslash and continue.  */
 682               pbackslash = s++;
 683             }
 684           else if (__builtin_expect (c == '?', 0))
 685             {
 686               if (__builtin_expect (s[1] == '?', false)
 687                    && _cpp_trigraph_map[s[2]])
 688                 {
 689                   /* Have a trigraph.  We may or may not have to convert
 690                      it.  Add a line note regardless, for -Wtrigraphs.  */
 691                   add_line_note (buffer, s, s[2]);
 692                   if (CPP_OPTION (pfile, trigraphs))
 693                     {
 694                       /* We do, and that means we have to switch to the
 695                          slow path.  */
 696                       d = (uchar *) s;
 697                       *d = _cpp_trigraph_map[s[2]];
 698                       s += 2;
 699                       goto slow_path;
 700                     }
 701                 }
 702               /* Not a trigraph.  Continue on fast-path.  */
 703               s++;
 704             }
 705           else
 706             break;
 707         }
 708
 709       /* This must be \r or \n.  We're either done, or we'll be forced
 710          to write back to the buffer and continue on the slow path.  */
 711       d = (uchar *) s;
 712
 713       if (__builtin_expect (s == buffer->rlimit, false))
 714         goto done;
 715
 716       /* DOS line ending? */
 717       if (__builtin_expect (c == '\r', false) && s[1] == '\n')
 718         {
 719           s++;
 720           if (s == buffer->rlimit)
 721             goto done;
 722         }
 723
           /* No backslash was seen on this line, so the newline cannot
              be escaped.  */
 724       if (__builtin_expect (pbackslash == NULL, true))
 725         goto done;
 726
 727       /* Check for escaped newline: skip back over white space and see
              whether the character before it is the last backslash seen.  */
 728       p = d;
 729       while (is_nvspace (p[-1]))
 730         p--;
 731       if (p - 1 != pbackslash)
 732         goto done;
 733
 734       /* Have an escaped newline; process it and proceed to
 735          the slow path.  D is set up so that the next store in the
              slow path overwrites the backslash.  From here until `done',
              buffer->next_line marks the start of the current spliced
              segment; it is the lower bound of the backwards scan in the
              slow path.  */
 736       add_line_note (buffer, p - 1, p != d ? ' ' : '\\');
 737       d = p - 2;
 738       buffer->next_line = p - 1;
 739
 740     slow_path:
           /* Copy the rest of the logical line down one character at a
              time, splicing escaped newlines and converting trigraphs.  */
 741       while (1)
 742         {
 743           c = *++s;
 744           *++d = c;
 745
 746           if (c == '\n' || c == '\r')
 747             {
 748               /* Handle DOS line endings.  */
 749               if (c == '\r' && s != buffer->rlimit && s[1] == '\n')
 750                 s++;
 751               if (s == buffer->rlimit)
 752                 break;
 753
 754               /* Escaped?  */
 755               p = d;
 756               while (p != buffer->next_line && is_nvspace (p[-1]))
 757                 p--;
 758               if (p == buffer->next_line || p[-1] != '\\')
 759                 break;
 760
 761               add_line_note (buffer, p - 1, p != d ? ' ': '\\');
 762               d = p - 2;
 763               buffer->next_line = p - 1;
 764             }
 765           else if (c == '?' && s[1] == '?' && _cpp_trigraph_map[s[2]])
 766             {
 767               /* Add a note regardless, for the benefit of -Wtrigraphs.  */
 768               add_line_note (buffer, d, s[2]);
 769               if (CPP_OPTION (pfile, trigraphs))
 770                 {
 771                   *d = _cpp_trigraph_map[s[2]];
 772                   s += 2;
 773                 }
 774             }
 775         }
 776     }
 777   else
 778     {
           /* For a from_stage3 buffer only the end of the line is
              located; backslashes and trigraphs are left alone.  */
 779       while (*s != '\n' && *s != '\r')
 780         s++;
 781       d = (uchar *) s;
 782
 783       /* Handle DOS line endings.  */
 784       if (*s == '\r' && s != buffer->rlimit && s[1] == '\n')
 785         s++;
 786     }
 787
 788  done:
       /* Terminate the cleaned line; this also turns a '\r' line ending
          into '\n'.  */
 789   *d = '\n';
 790   /* A sentinel note that should never be processed.  */
 791   add_line_note (buffer, d + 1, '\n');
 792   buffer->next_line = s + 1;
 793 }
 794
 795 /* Return true if the trigraph indicated by NOTE should be warned
 796    about in a comment.  */
 797 static bool
 798 warn_in_comment (cpp_reader *pfile, _cpp_line_note *note)
 799 {
 800   const uchar *p;
 801
 802   /* Within comments we don't warn about trigraphs, unless the
 803      trigraph forms an escaped newline, as that may change
 804      behavior.  */
 805   if (note->type != '/')
 806     return false;
 807
 808   /* If -trigraphs, then this was an escaped newline iff the next note
 809      is coincident.  */
 810   if (CPP_OPTION (pfile, trigraphs))
 811     return note[1].pos == note->pos;
 812
 813   /* Otherwise, see if this forms an escaped newline.  */
 814   p = note->pos + 3;
 815   while (is_nvspace (*p))
 816     p++;
 817
 818   /* There might have been escaped newlines between the trigraph and the
 819      newline we found.  Hence the position test.  */
 820   return (*p == '\n' && p < note[1].pos);
 821 }
 822
 823 /* Process the notes created by add_line_note as far as the current
 824    location.  */
 825 void
 826 _cpp_process_line_notes (cpp_reader *pfile, int in_comment)
 827 {
 828   cpp_buffer *buffer = pfile->buffer;
 829
 830   for (;;)
 831     {
 832       _cpp_line_note *note = &buffer->notes[buffer->cur_note];
 833       unsigned int col;
 834
 835       if (note->pos > buffer->cur)
 836         break;
 837
 838       buffer->cur_note++;
 839       col = CPP_BUF_COLUMN (buffer, note->pos + 1);
 840
 841       if (note->type == '\\' || note->type == ' ')
 842         {
 843           if (note->type == ' ' && !in_comment)
 844             cpp_error_with_line (pfile, CPP_DL_WARNING, pfile->line_table->highest_line, col,
 845                                  "backslash and newline separated by space");
 846
 847           if (buffer->next_line > buffer->rlimit)
 848             {
 849               cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line, col,
 850                                    "backslash-newline at end of file");
 851               /* Prevent "no newline at end of file" warning.  */
 852               buffer->next_line = buffer->rlimit;
 853             }
 854
 855           buffer->line_base = note->pos;
 856           CPP_INCREMENT_LINE (pfile, 0);
 857         }
 858       else if (_cpp_trigraph_map[note->type])
 859         {
 860           if (CPP_OPTION (pfile, warn_trigraphs)
 861               && (!in_comment || warn_in_comment (pfile, note)))
 862             {
 863               if (CPP_OPTION (pfile, trigraphs))
 864                 cpp_warning_with_line (pfile, CPP_W_TRIGRAPHS,
 865                                        pfile->line_table->highest_line, col,
 866                                        "trigraph ??%c converted to %c",
 867                                        note->type,
 868                                        (int) _cpp_trigraph_map[note->type]);
 869               else
 870                 {
 871                   cpp_warning_with_line
 872                     (pfile, CPP_W_TRIGRAPHS,
 873                      pfile->line_table->highest_line, col,
 874                      "trigraph ??%c ignored, use -trigraphs to enable",
 875                      note->type);
 876                 }
 877             }
 878         }
 879       else if (note->type == 0)
 880         /* Already processed in lex_raw_string.  */;
 881       else
 882         abort ();
 883     }
 884 }
 885
 886 /* Skip a C-style block comment.  We find the end of the comment by
 887    seeing if an asterisk is before every '/' we encounter.  Returns
 888    nonzero if comment terminated by EOF, zero otherwise.
 889
 890    Buffer->cur points to the initial asterisk of the comment.  */
 891 bool
 892 _cpp_skip_block_comment (cpp_reader *pfile)
 893 {
 894   cpp_buffer *buffer = pfile->buffer;
 895   const uchar *cur = buffer->cur;
 896   uchar c;
 897
 898   cur++;
 899   if (*cur == '/')
 900     cur++;
 901
 902   for (;;)
 903     {
 904       /* People like decorating comments with '*', so check for '/'
 905          instead for efficiency.  */
 906       c = *cur++;
 907
 908       if (c == '/')
 909         {
 910           if (cur[-2] == '*')
 911             break;
 912
 913           /* Warn about potential nested comments, but not if the '/'
 914              comes immediately before the true comment delimiter.
 915              Don't bother to get it right across escaped newlines.  */
 916           if (CPP_OPTION (pfile, warn_comments)
 917               && cur[0] == '*' && cur[1] != '/')
 918             {
 919               buffer->cur = cur;
 920               cpp_warning_with_line (pfile, CPP_W_COMMENTS,
 921                                      pfile->line_table->highest_line,
 922                                      CPP_BUF_COL (buffer),
 923                                      "\"/*\" within comment");
 924             }
 925         }
 926       else if (c == '\n')
 927         {
 928           unsigned int cols;
 929           buffer->cur = cur - 1;
 930           _cpp_process_line_notes (pfile, true);
 931           if (buffer->next_line >= buffer->rlimit)
 932             return true;
 933           _cpp_clean_line (pfile);
 934
 935           cols = buffer->next_line - buffer->line_base;
 936           CPP_INCREMENT_LINE (pfile, cols);
 937
 938           cur = buffer->cur;
 939         }
 940     }
 941
 942   buffer->cur = cur;
 943   _cpp_process_line_notes (pfile, true);
 944   return false;
 945 }
 946
 947 /* Skip a C++ line comment, leaving buffer->cur pointing to the
 948    terminating newline.  Handles escaped newlines.  Returns nonzero
 949    if a multiline comment.  */
 950 static int
 951 skip_line_comment (cpp_reader *pfile)
 952 {
 953   cpp_buffer *buffer = pfile->buffer;
 954   source_location orig_line = pfile->line_table->highest_line;
 955
 956   while (*buffer->cur != '\n')
 957     buffer->cur++;
 958
 959   _cpp_process_line_notes (pfile, true);
 960   return orig_line != pfile->line_table->highest_line;
 961 }
 962
 963 /* Skips whitespace, saving the next non-whitespace character.  */
 964 static void
 965 skip_whitespace (cpp_reader *pfile, cppchar_t c)
 966 {
 967   cpp_buffer *buffer = pfile->buffer;
 968   bool saw_NUL = false;
 969
 970   do
 971     {
 972       /* Horizontal space always OK.  */
 973       if (c == ' ' || c == '\t')
 974         ;
 975       /* Just \f \v or \0 left.  */
 976       else if (c == '\0')
 977         saw_NUL = true;
 978       else if (pfile->state.in_directive && CPP_PEDANTIC (pfile))
 979         cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line,
 980                              CPP_BUF_COL (buffer),
 981                              "%s in preprocessing directive",
 982                              c == '\f' ? "form feed" : "vertical tab");
 983
 984       c = *buffer->cur++;
 985     }
 986   /* We only want non-vertical space, i.e. ' ' \t \f \v \0.  */
 987   while (is_nvspace (c));
 988
 989   if (saw_NUL)
 990     cpp_error (pfile, CPP_DL_WARNING, "null character(s) ignored");
 991
 992   buffer->cur--;
 993 }
 994
 995 /* See if the characters of a number token are valid in a name (no
 996    '.', '+' or '-').  */
 997 static int
 998 name_p (cpp_reader *pfile, const cpp_string *string)
 999 {
1000   unsigned int i;
1001
1002   for (i = 0; i < string->len; i++)
1003     if (!is_idchar (string->text[i]))
1004       return 0;
1005
1006   return 1;
1007 }
1008
1009 /* After parsing an identifier or other sequence, produce a warning about
1010    sequences not in NFC/NFKC.  */
1011 static void
1012 warn_about_normalization (cpp_reader *pfile,
1013                           const cpp_token *token,
1014                           const struct normalize_state *s)
1015 {
1016   if (CPP_OPTION (pfile, warn_normalize) < NORMALIZE_STATE_RESULT (s)
1017       && !pfile->state.skipping)
1018     {
1019       /* Make sure that the token is printed using UCNs, even
1020          if we'd otherwise happily print UTF-8.  */
1021       unsigned char *buf = XNEWVEC (unsigned char, cpp_token_len (token));
1022       size_t sz;
1023
1024       sz = cpp_spell_token (pfile, token, buf, false) - buf;
1025       if (NORMALIZE_STATE_RESULT (s) == normalized_C)
1026         cpp_warning_with_line (pfile, CPP_W_NORMALIZE, token->src_loc, 0,
1027                                "`%.*s' is not in NFKC", (int) sz, buf);
1028       else
1029         cpp_warning_with_line (pfile, CPP_W_NORMALIZE, token->src_loc, 0,
1030                                "`%.*s' is not in NFC", (int) sz, buf);
1031     }
1032 }
1033
1034 /* Returns TRUE if the sequence starting at buffer->cur is invalid in
1035    an identifier.  FIRST is TRUE if this starts an identifier.  */
1036 static bool
1037 forms_identifier_p (cpp_reader *pfile, int first,
1038                     struct normalize_state *state)
1039 {
1040   cpp_buffer *buffer = pfile->buffer;
1041
1042   if (*buffer->cur == '$')
1043     {
1044       if (!CPP_OPTION (pfile, dollars_in_ident))
1045         return false;
1046
1047       buffer->cur++;
1048       if (CPP_OPTION (pfile, warn_dollars) && !pfile->state.skipping)
1049         {
1050           CPP_OPTION (pfile, warn_dollars) = 0;
1051           cpp_error (pfile, CPP_DL_PEDWARN, "'$' in identifier or number");
1052         }
1053
1054       return true;
1055     }
1056
1057   /* Is this a syntactically valid UCN?  */
1058   if (CPP_OPTION (pfile, extended_identifiers)
1059       && *buffer->cur == '\\'
1060       && (buffer->cur[1] == 'u' || buffer->cur[1] == 'U'))
1061     {
1062       buffer->cur += 2;
1063       if (_cpp_valid_ucn (pfile, &buffer->cur, buffer->rlimit, 1 + !first,
1064                           state))
1065         return true;
1066       buffer->cur -= 2;
1067     }
1068
1069   return false;
1070 }
1071
1072 /* Helper function to get the cpp_hashnode of the identifier BASE.  */
1073 static cpp_hashnode *
1074 lex_identifier_intern (cpp_reader *pfile, const uchar *base)
1075 {
1076   cpp_hashnode *result;
1077   const uchar *cur;
1078   unsigned int len;
1079   unsigned int hash = HT_HASHSTEP (0, *base);
1080
1081   cur = base + 1;
1082   while (ISIDNUM (*cur))
1083     {
1084       hash = HT_HASHSTEP (hash, *cur);
1085       cur++;
1086     }
1087   len = cur - base;
1088   hash = HT_HASHFINISH (hash, len);
1089   result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
1090                                               base, len, hash, HT_ALLOC));
1091
1092   /* Rarely, identifiers require diagnostics when lexed.  */
1093   if (__builtin_expect ((result->flags & NODE_DIAGNOSTIC)
1094                         && !pfile->state.skipping, 0))
1095     {
1096       /* It is allowed to poison the same identifier twice.  */
1097       if ((result->flags & NODE_POISONED) && !pfile->state.poisoned_ok)
1098         cpp_error (pfile, CPP_DL_ERROR, "attempt to use poisoned \"%s\"",
1099                    NODE_NAME (result));
1100
1101       /* Constraint 6.10.3.5: __VA_ARGS__ should only appear in the
1102          replacement list of a variadic macro.  */
1103       if (result == pfile->spec_nodes.n__VA_ARGS__
1104           && !pfile->state.va_args_ok)
1105         cpp_error (pfile, CPP_DL_PEDWARN,
1106                    "__VA_ARGS__ can only appear in the expansion"
1107                    " of a C99 variadic macro");
1108
1109       /* For -Wc++-compat, warn about use of C++ named operators.  */
1110       if (result->flags & NODE_WARN_OPERATOR)
1111         cpp_warning (pfile, CPP_W_CXX_OPERATOR_NAMES,
1112                      "identifier \"%s\" is a special operator name in C++",
1113                      NODE_NAME (result));
1114     }
1115
1116   return result;
1117 }
1118
1119 /* Get the cpp_hashnode of an identifier specified by NAME in
1120    the current cpp_reader object.  If none is found, NULL is returned.  */
1121 cpp_hashnode *
1122 _cpp_lex_identifier (cpp_reader *pfile, const char *name)
1123 {
1124   cpp_hashnode *result;
1125   result = lex_identifier_intern (pfile, (uchar *) name);
1126   return result;
1127 }
1128
1129 /* Lex an identifier starting at BUFFER->CUR - 1.  */
1130 static cpp_hashnode *
1131 lex_identifier (cpp_reader *pfile, const uchar *base, bool starts_ucn,
1132                 struct normalize_state *nst)
1133 {
1134   cpp_hashnode *result;
1135   const uchar *cur;
1136   unsigned int len;
1137   unsigned int hash = HT_HASHSTEP (0, *base);
1138
1139   cur = pfile->buffer->cur;
1140   if (! starts_ucn)
1141     while (ISIDNUM (*cur))
1142       {
1143         hash = HT_HASHSTEP (hash, *cur);
1144         cur++;
1145       }
1146   pfile->buffer->cur = cur;
1147   if (starts_ucn || forms_identifier_p (pfile, false, nst))
1148     {
1149       /* Slower version for identifiers containing UCNs (or $).  */
1150       do {
1151         while (ISIDNUM (*pfile->buffer->cur))
1152           {
1153             pfile->buffer->cur++;
1154             NORMALIZE_STATE_UPDATE_IDNUM (nst);
1155           }
1156       } while (forms_identifier_p (pfile, false, nst));
1157       result = _cpp_interpret_identifier (pfile, base,
1158                                           pfile->buffer->cur - base);
1159     }
1160   else
1161     {
1162       len = cur - base;
1163       hash = HT_HASHFINISH (hash, len);
1164
1165       result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
1166                                                   base, len, hash, HT_ALLOC));
1167     }
1168
1169   /* Rarely, identifiers require diagnostics when lexed.  */
1170   if (__builtin_expect ((result->flags & NODE_DIAGNOSTIC)
1171                         && !pfile->state.skipping, 0))
1172     {
1173       /* It is allowed to poison the same identifier twice.  */
1174       if ((result->flags & NODE_POISONED) && !pfile->state.poisoned_ok)
1175         cpp_error (pfile, CPP_DL_ERROR, "attempt to use poisoned \"%s\"",
1176                    NODE_NAME (result));
1177
1178       /* Constraint 6.10.3.5: __VA_ARGS__ should only appear in the
1179          replacement list of a variadic macro.  */
1180       if (result == pfile->spec_nodes.n__VA_ARGS__
1181           && !pfile->state.va_args_ok)
1182         cpp_error (pfile, CPP_DL_PEDWARN,
1183                    "__VA_ARGS__ can only appear in the expansion"
1184                    " of a C99 variadic macro");
1185
1186       /* For -Wc++-compat, warn about use of C++ named operators.  */
1187       if (result->flags & NODE_WARN_OPERATOR)
1188         cpp_warning (pfile, CPP_W_CXX_OPERATOR_NAMES,
1189                      "identifier \"%s\" is a special operator name in C++",
1190                      NODE_NAME (result));
1191     }
1192
1193   return result;
1194 }
1195
1196 /* Lex a number to NUMBER starting at BUFFER->CUR - 1.  */
1197 static void
1198 lex_number (cpp_reader *pfile, cpp_string *number,
1199             struct normalize_state *nst)
1200 {
1201   const uchar *cur;
1202   const uchar *base;
1203   uchar *dest;
1204
1205   base = pfile->buffer->cur - 1;
1206   do
1207     {
1208       cur = pfile->buffer->cur;
1209
1210       /* N.B. ISIDNUM does not include $.  */
1211       while (ISIDNUM (*cur) || *cur == '.' || VALID_SIGN (*cur, cur[-1]))
1212         {
1213           cur++;
1214           NORMALIZE_STATE_UPDATE_IDNUM (nst);
1215         }
1216
1217       pfile->buffer->cur = cur;
1218     }
1219   while (forms_identifier_p (pfile, false, nst));
1220
1221   number->len = cur - base;
1222   dest = _cpp_unaligned_alloc (pfile, number->len + 1);
1223   memcpy (dest, base, number->len);
1224   dest[number->len] = '\0';
1225   number->text = dest;
1226 }
1227
1228 /* Create a token of type TYPE with a literal spelling.  */
1229 static void
1230 create_literal (cpp_reader *pfile, cpp_token *token, const uchar *base,
1231                 unsigned int len, enum cpp_ttype type)
1232 {
1233   uchar *dest = _cpp_unaligned_alloc (pfile, len + 1);
1234
1235   memcpy (dest, base, len);
1236   dest[len] = '\0';
1237   token->type = type;
1238   token->val.str.len = len;
1239   token->val.str.text = dest;
1240 }
1241
1242 /* Subroutine of lex_raw_string: Append LEN chars from BASE to the buffer
1243    sequence from *FIRST_BUFF_P to LAST_BUFF_P.  */
1244
1245 static void
1246 bufring_append (cpp_reader *pfile, const uchar *base, size_t len,
1247                 _cpp_buff **first_buff_p, _cpp_buff **last_buff_p)
1248 {
1249   _cpp_buff *first_buff = *first_buff_p;
1250   _cpp_buff *last_buff = *last_buff_p;
1251
1252   if (first_buff == NULL)
1253     first_buff = last_buff = _cpp_get_buff (pfile, len);
1254   else if (len > BUFF_ROOM (last_buff))
1255     {
1256       size_t room = BUFF_ROOM (last_buff);
1257       memcpy (BUFF_FRONT (last_buff), base, room);
1258       BUFF_FRONT (last_buff) += room;
1259       base += room;
1260       len -= room;
1261       last_buff = _cpp_append_extend_buff (pfile, last_buff, len);
1262     }
1263
1264   memcpy (BUFF_FRONT (last_buff), base, len);
1265   BUFF_FRONT (last_buff) += len;
1266
1267   *first_buff_p = first_buff;
1268   *last_buff_p = last_buff;
1269 }
1270
1271 /* Lexes a raw string.  The stored string contains the spelling, including
1272    double quotes, delimiter string, '(' and ')', any leading
1273    'L', 'u', 'U' or 'u8' and 'R' modifier.  It returns the type of the
1274    literal, or CPP_OTHER if it was not properly terminated.
1275
1276    The spelling is NUL-terminated, but it is not guaranteed that this
1277    is the first NUL since embedded NULs are preserved.  */
1278
1279 static void
1280 lex_raw_string (cpp_reader *pfile, cpp_token *token, const uchar *base,
1281                 const uchar *cur)
1282 {
1283   const uchar *raw_prefix;
1284   unsigned int raw_prefix_len = 0;
1285   enum cpp_ttype type;
1286   size_t total_len = 0;
1287   _cpp_buff *first_buff = NULL, *last_buff = NULL;
1288   _cpp_line_note *note = &pfile->buffer->notes[pfile->buffer->cur_note];
1289
1290   type = (*base == 'L' ? CPP_WSTRING :
1291           *base == 'U' ? CPP_STRING32 :
1292           *base == 'u' ? (base[1] == '8' ? CPP_UTF8STRING : CPP_STRING16)
1293           : CPP_STRING);
1294
1295   raw_prefix = cur + 1;
1296   while (raw_prefix_len < 16)
1297     {
1298       switch (raw_prefix[raw_prefix_len])
1299         {
1300         case ' ': case '(': case ')': case '\\': case '\t':
1301         case '\v': case '\f': case '\n': default:
1302           break;
1303         /* Basic source charset except the above chars.  */
1304         case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
1305         case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
1306         case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
1307         case 's': case 't': case 'u': case 'v': case 'w': case 'x':
1308         case 'y': case 'z':
1309         case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
1310         case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
1311         case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
1312         case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
1313         case 'Y': case 'Z':
1314         case '0': case '1': case '2': case '3': case '4': case '5':
1315         case '6': case '7': case '8': case '9':
1316         case '_': case '{': case '}': case '#': case '[': case ']':
1317         case '<': case '>': case '%': case ':': case ';': case '.':
1318         case '?': case '*': case '+': case '-': case '/': case '^':
1319         case '&': case '|': case '~': case '!': case '=': case ',':
1320         case '"': case '\'':
1321           raw_prefix_len++;
1322           continue;
1323         }
1324       break;
1325     }
1326
1327   if (raw_prefix[raw_prefix_len] != '(')
1328     {
1329       int col = CPP_BUF_COLUMN (pfile->buffer, raw_prefix + raw_prefix_len)
1330                 + 1;
1331       if (raw_prefix_len == 16)
1332         cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc, col,
1333                              "raw string delimiter longer than 16 characters");
1334       else
1335         cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc, col,
1336                              "invalid character '%c' in raw string delimiter",
1337                              (int) raw_prefix[raw_prefix_len]);
1338       pfile->buffer->cur = raw_prefix - 1;
1339       create_literal (pfile, token, base, raw_prefix - 1 - base, CPP_OTHER);
1340       return;
1341     }
1342
1343   cur = raw_prefix + raw_prefix_len + 1;
1344   for (;;)
1345     {
1346 #define BUF_APPEND(STR,LEN)                                     \
1347       do {                                                      \
1348         bufring_append (pfile, (const uchar *)(STR), (LEN),     \
1349                         &first_buff, &last_buff);               \
1350         total_len += (LEN);                                     \
1351       } while (0);
1352
1353       cppchar_t c;
1354
1355       /* If we previously performed any trigraph or line splicing
1356          transformations, undo them within the body of the raw string.  */
1357       while (note->pos < cur)
1358         ++note;
1359       for (; note->pos == cur; ++note)
1360         {
1361           switch (note->type)
1362             {
1363             case '\\':
1364             case ' ':
1365               /* Restore backslash followed by newline.  */
1366               BUF_APPEND (base, cur - base);
1367               base = cur;
1368               BUF_APPEND ("\\", 1);
1369             after_backslash:
1370               if (note->type == ' ')
1371                 {
1372                   /* GNU backslash whitespace newline extension.  FIXME
1373                      could be any sequence of non-vertical space.  When we
1374                      can properly restore any such sequence, we should mark
1375                      this note as handled so _cpp_process_line_notes
1376                      doesn't warn.  */
1377                   BUF_APPEND (" ", 1);
1378                 }
1379
1380               BUF_APPEND ("\n", 1);
1381               break;
1382
1383             case 0:
1384               /* Already handled.  */
1385               break;
1386
1387             default:
1388               if (_cpp_trigraph_map[note->type])
1389                 {
1390                   /* Don't warn about this trigraph in
1391                      _cpp_process_line_notes, since trigraphs show up as
1392                      trigraphs in raw strings.  */
1393                   uchar type = note->type;
1394                   note->type = 0;
1395
1396                   if (!CPP_OPTION (pfile, trigraphs))
1397                     /* If we didn't convert the trigraph in the first
1398                        place, don't do anything now either.  */
1399                     break;
1400
1401                   BUF_APPEND (base, cur - base);
1402                   base = cur;
1403                   BUF_APPEND ("??", 2);
1404
1405                   /* ??/ followed by newline gets two line notes, one for
1406                      the trigraph and one for the backslash/newline.  */
1407                   if (type == '/' && note[1].pos == cur)
1408                     {
1409                       if (note[1].type != '\\'
1410                           && note[1].type != ' ')
1411                         abort ();
1412                       BUF_APPEND ("/", 1);
1413                       ++note;
1414                       goto after_backslash;
1415                     }
1416                   /* The ) from ??) could be part of the suffix.  */
1417                   else if (type == ')'
1418                            && strncmp ((const char *) cur+1,
1419                                        (const char *) raw_prefix,
1420                                        raw_prefix_len) == 0
1421                            && cur[raw_prefix_len+1] == '"')
1422                     {
1423                       BUF_APPEND (")", 1);
1424                       base++;
1425                       cur += raw_prefix_len + 2;
1426                       goto break_outer_loop;
1427                     }
1428                   else
1429                     {
1430                       /* Skip the replacement character.  */
1431                       base = ++cur;
1432                       BUF_APPEND (&type, 1);
1433                     }
1434                 }
1435               else
1436                 abort ();
1437               break;
1438             }
1439         }
1440       c = *cur++;
1441
1442       if (c == ')'
1443           && strncmp ((const char *) cur, (const char *) raw_prefix,
1444                       raw_prefix_len) == 0
1445           && cur[raw_prefix_len] == '"')
1446         {
1447           cur += raw_prefix_len + 1;
1448           break;
1449         }
1450       else if (c == '\n')
1451         {
1452           if (pfile->state.in_directive
1453               || pfile->state.parsing_args
1454               || pfile->state.in_deferred_pragma)
1455             {
1456               cur--;
1457               type = CPP_OTHER;
1458               cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc, 0,
1459                                    "unterminated raw string");
1460               break;
1461             }
1462
1463           BUF_APPEND (base, cur - base);
1464
1465           if (pfile->buffer->cur < pfile->buffer->rlimit)
1466             CPP_INCREMENT_LINE (pfile, 0);
1467           pfile->buffer->need_line = true;
1468
1469           pfile->buffer->cur = cur-1;
1470           _cpp_process_line_notes (pfile, false);
1471           if (!_cpp_get_fresh_line (pfile))
1472             {
1473               source_location src_loc = token->src_loc;
1474               token->type = CPP_EOF;
1475               /* Tell the compiler the line number of the EOF token.  */
1476               token->src_loc = pfile->line_table->highest_line;
1477               token->flags = BOL;
1478               if (first_buff != NULL)
1479                 _cpp_release_buff (pfile, first_buff);
1480               cpp_error_with_line (pfile, CPP_DL_ERROR, src_loc, 0,
1481                                    "unterminated raw string");
1482               return;
1483             }
1484
1485           cur = base = pfile->buffer->cur;
1486           note = &pfile->buffer->notes[pfile->buffer->cur_note];
1487         }
1488     }
1489  break_outer_loop:
1490
1491   if (CPP_OPTION (pfile, user_literals))
1492     {
1493       /* Grab user defined literal suffix.  */
1494       if (ISIDST (*cur))
1495         {
1496           type = cpp_userdef_string_add_type (type);
1497           ++cur;
1498         }
1499       while (ISIDNUM (*cur))
1500         ++cur;
1501     }
1502
1503   pfile->buffer->cur = cur;
1504   if (first_buff == NULL)
1505     create_literal (pfile, token, base, cur - base, type);
1506   else
1507     {
1508       uchar *dest = _cpp_unaligned_alloc (pfile, total_len + (cur - base) + 1);
1509
1510       token->type = type;
1511       token->val.str.len = total_len + (cur - base);
1512       token->val.str.text = dest;
1513       last_buff = first_buff;
1514       while (last_buff != NULL)
1515         {
1516           memcpy (dest, last_buff->base,
1517                   BUFF_FRONT (last_buff) - last_buff->base);
1518           dest += BUFF_FRONT (last_buff) - last_buff->base;
1519           last_buff = last_buff->next;
1520         }
1521       _cpp_release_buff (pfile, first_buff);
1522       memcpy (dest, base, cur - base);
1523       dest[cur - base] = '\0';
1524     }
1525 }
1526
1527 /* Lexes a string, character constant, or angle-bracketed header file
1528    name.  The stored string contains the spelling, including opening
1529    quote and any leading 'L', 'u', 'U' or 'u8' and optional
1530    'R' modifier.  It returns the type of the literal, or CPP_OTHER
1531    if it was not properly terminated, or CPP_LESS for an unterminated
1532    header name which must be relexed as normal tokens.
1533
1534    The spelling is NUL-terminated, but it is not guaranteed that this
1535    is the first NUL since embedded NULs are preserved.  */
1536 static void
1537 lex_string (cpp_reader *pfile, cpp_token *token, const uchar *base)
1538 {
1539   bool saw_NUL = false;
1540   const uchar *cur;
1541   cppchar_t terminator;
1542   enum cpp_ttype type;
1543
1544   cur = base;
1545   terminator = *cur++;
1546   if (terminator == 'L' || terminator == 'U')
1547     terminator = *cur++;
1548   else if (terminator == 'u')
1549     {
1550       terminator = *cur++;
1551       if (terminator == '8')
1552         terminator = *cur++;
1553     }
1554   if (terminator == 'R')
1555     {
1556       lex_raw_string (pfile, token, base, cur);
1557       return;
1558     }
1559   if (terminator == '"')
1560     type = (*base == 'L' ? CPP_WSTRING :
1561             *base == 'U' ? CPP_STRING32 :
1562             *base == 'u' ? (base[1] == '8' ? CPP_UTF8STRING : CPP_STRING16)
1563                          : CPP_STRING);
1564   else if (terminator == '\'')
1565     type = (*base == 'L' ? CPP_WCHAR :
1566             *base == 'U' ? CPP_CHAR32 :
1567             *base == 'u' ? CPP_CHAR16 : CPP_CHAR);
1568   else
1569     terminator = '>', type = CPP_HEADER_NAME;
1570
1571   for (;;)
1572     {
1573       cppchar_t c = *cur++;
1574
1575       /* In #include-style directives, terminators are not escapable.  */
1576       if (c == '\\' && !pfile->state.angled_headers && *cur != '\n')
1577         cur++;
1578       else if (c == terminator)
1579         break;
1580       else if (c == '\n')
1581         {
1582           cur--;
1583           /* Unmatched quotes always yield undefined behavior, but
1584              greedy lexing means that what appears to be an unterminated
1585              header name may actually be a legitimate sequence of tokens.  */
1586           if (terminator == '>')
1587             {
1588               token->type = CPP_LESS;
1589               return;
1590             }
1591           type = CPP_OTHER;
1592           break;
1593         }
1594       else if (c == '\0')
1595         saw_NUL = true;
1596     }
1597
1598   if (saw_NUL && !pfile->state.skipping)
1599     cpp_error (pfile, CPP_DL_WARNING,
1600                "null character(s) preserved in literal");
1601
1602   if (type == CPP_OTHER && CPP_OPTION (pfile, lang) != CLK_ASM)
1603     cpp_error (pfile, CPP_DL_PEDWARN, "missing terminating %c character",
1604                (int) terminator);
1605
1606   if (CPP_OPTION (pfile, user_literals))
1607     {
1608       /* Grab user defined literal suffix.  */
1609       if (ISIDST (*cur))
1610         {
1611           type = cpp_userdef_char_add_type (type);
1612           type = cpp_userdef_string_add_type (type);
1613           ++cur;
1614         }
1615       while (ISIDNUM (*cur))
1616         ++cur;
1617     }
1618
1619   pfile->buffer->cur = cur;
1620   create_literal (pfile, token, base, cur - base, type);
1621 }
1622
1623 /* Return the comment table. The client may not make any assumption
1624    about the ordering of the table.  */
1625 cpp_comment_table *
1626 cpp_get_comments (cpp_reader *pfile)
1627 {
1628   return &pfile->comments;
1629 }
1630
1631 /* Append a comment to the end of the comment table. */
1632 static void
1633 store_comment (cpp_reader *pfile, cpp_token *token)
1634 {
1635   int len;
1636
1637   if (pfile->comments.allocated == 0)
1638     {
1639       pfile->comments.allocated = 256;
1640       pfile->comments.entries = (cpp_comment *) xmalloc
1641         (pfile->comments.allocated * sizeof (cpp_comment));
1642     }
1643
1644   if (pfile->comments.count == pfile->comments.allocated)
1645     {
1646       pfile->comments.allocated *= 2;
1647       pfile->comments.entries = (cpp_comment *) xrealloc
1648         (pfile->comments.entries,
1649          pfile->comments.allocated * sizeof (cpp_comment));
1650     }
1651
1652   len = token->val.str.len;
1653
1654   /* Copy comment. Note, token may not be NULL terminated. */
1655   pfile->comments.entries[pfile->comments.count].comment =
1656     (char *) xmalloc (sizeof (char) * (len + 1));
1657   memcpy (pfile->comments.entries[pfile->comments.count].comment,
1658           token->val.str.text, len);
1659   pfile->comments.entries[pfile->comments.count].comment[len] = '\0';
1660
1661   /* Set source location. */
1662   pfile->comments.entries[pfile->comments.count].sloc = token->src_loc;
1663
1664   /* Increment the count of entries in the comment table. */
1665   pfile->comments.count++;
1666 }
1667
1668 /* The stored comment includes the comment start and any terminator.  */
1669 static void
1670 save_comment (cpp_reader *pfile, cpp_token *token, const unsigned char *from,
1671               cppchar_t type)
1672 {
1673   unsigned char *buffer;
1674   unsigned int len, clen, i;
1675
1676   len = pfile->buffer->cur - from + 1; /* + 1 for the initial '/'.  */
1677
1678   /* C++ comments probably (not definitely) have moved past a new
1679      line, which we don't want to save in the comment.  */
1680   if (is_vspace (pfile->buffer->cur[-1]))
1681     len--;
1682
1683   /* If we are currently in a directive or in argument parsing, then
1684      we need to store all C++ comments as C comments internally, and
1685      so we need to allocate a little extra space in that case.
1686
1687      Note that the only time we encounter a directive here is
1688      when we are saving comments in a "#define".  */
1689   clen = ((pfile->state.in_directive || pfile->state.parsing_args)
1690           && type == '/') ? len + 2 : len;
1691
1692   buffer = _cpp_unaligned_alloc (pfile, clen);
1693
1694   token->type = CPP_COMMENT;
1695   token->val.str.len = clen;
1696   token->val.str.text = buffer;
1697
1698   buffer[0] = '/';
1699   memcpy (buffer + 1, from, len - 1);
1700
1701   /* Finish conversion to a C comment, if necessary.  */
1702   if ((pfile->state.in_directive || pfile->state.parsing_args) && type == '/')
1703     {
1704       buffer[1] = '*';
1705       buffer[clen - 2] = '*';
1706       buffer[clen - 1] = '/';
1707       /* As there can be in a C++ comments illegal sequences for C comments
1708          we need to filter them out.  */
1709       for (i = 2; i < (clen - 2); i++)
1710         if (buffer[i] == '/' && (buffer[i - 1] == '*' || buffer[i + 1] == '*'))
1711           buffer[i] = '|';
1712     }
1713
1714   /* Finally store this comment for use by clients of libcpp. */
1715   store_comment (pfile, token);
1716 }
1717
1718 /* Allocate COUNT tokens for RUN.  */
1719 void
1720 _cpp_init_tokenrun (tokenrun *run, unsigned int count)
1721 {
1722   run->base = XNEWVEC (cpp_token, count);
1723   run->limit = run->base + count;
1724   run->next = NULL;
1725 }
1726
1727 /* Returns the next tokenrun, or creates one if there is none.  */
1728 static tokenrun *
1729 next_tokenrun (tokenrun *run)
1730 {
1731   if (run->next == NULL)
1732     {
1733       run->next = XNEW (tokenrun);
1734       run->next->prev = run;
1735       _cpp_init_tokenrun (run->next, 250);
1736     }
1737
1738   return run->next;
1739 }
1740
1741 /* Return the number of not yet processed token in a given
1742    context.  */
1743 int
1744 _cpp_remaining_tokens_num_in_context (cpp_context *context)
1745 {
1746   if (context->tokens_kind == TOKENS_KIND_DIRECT)
1747     return (LAST (context).token - FIRST (context).token);
1748   else if (context->tokens_kind == TOKENS_KIND_INDIRECT
1749            || context->tokens_kind == TOKENS_KIND_EXTENDED)
1750     return (LAST (context).ptoken - FIRST (context).ptoken);
1751   else
1752       abort ();
1753 }
1754
1755 /* Returns the token present at index INDEX in a given context.  If
1756    INDEX is zero, the next token to be processed is returned.  */
1757 static const cpp_token*
1758 _cpp_token_from_context_at (cpp_context *context, int index)
1759 {
1760   if (context->tokens_kind == TOKENS_KIND_DIRECT)
1761     return &(FIRST (context).token[index]);
1762   else if (context->tokens_kind == TOKENS_KIND_INDIRECT
1763            || context->tokens_kind == TOKENS_KIND_EXTENDED)
1764     return FIRST (context).ptoken[index];
1765  else
1766    abort ();
1767 }
1768
1769 /* Look ahead in the input stream.  */
1770 const cpp_token *
1771 cpp_peek_token (cpp_reader *pfile, int index)
1772 {
1773   cpp_context *context = pfile->context;
1774   const cpp_token *peektok;
1775   int count;
1776
1777   /* First, scan through any pending cpp_context objects.  */
1778   while (context->prev)
1779     {
1780       ptrdiff_t sz = _cpp_remaining_tokens_num_in_context (context);
1781
1782       if (index < (int) sz)
1783         return _cpp_token_from_context_at (context, index);
1784       index -= (int) sz;
1785       context = context->prev;
1786     }
1787
1788   /* We will have to read some new tokens after all (and do so
1789      without invalidating preceding tokens).  */
1790   count = index;
1791   pfile->keep_tokens++;
1792
1793   do
1794     {
1795       peektok = _cpp_lex_token (pfile);
1796       if (peektok->type == CPP_EOF)
1797         return peektok;
1798     }
1799   while (index--);
1800
1801   _cpp_backup_tokens_direct (pfile, count + 1);
1802   pfile->keep_tokens--;
1803
1804   return peektok;
1805 }
1806
1807 /* Allocate a single token that is invalidated at the same time as the
1808    rest of the tokens on the line.  Has its line and col set to the
1809    same as the last lexed token, so that diagnostics appear in the
1810    right place.  */
1811 cpp_token *
1812 _cpp_temp_token (cpp_reader *pfile)
1813 {
1814   cpp_token *old, *result;
1815   ptrdiff_t sz = pfile->cur_run->limit - pfile->cur_token;
1816   ptrdiff_t la = (ptrdiff_t) pfile->lookaheads;
1817
1818   old = pfile->cur_token - 1;
1819   /* Any pre-existing lookaheads must not be clobbered.  */
1820   if (la)
1821     {
1822       if (sz <= la)
1823         {
1824           tokenrun *next = next_tokenrun (pfile->cur_run);
1825
1826           if (sz < la)
1827             memmove (next->base + 1, next->base,
1828                      (la - sz) * sizeof (cpp_token));
1829
1830           next->base[0] = pfile->cur_run->limit[-1];
1831         }
1832
1833       if (sz > 1)
1834         memmove (pfile->cur_token + 1, pfile->cur_token,
1835                  MIN (la, sz - 1) * sizeof (cpp_token));
1836     }
1837
1838   if (!sz && pfile->cur_token == pfile->cur_run->limit)
1839     {
1840       pfile->cur_run = next_tokenrun (pfile->cur_run);
1841       pfile->cur_token = pfile->cur_run->base;
1842     }
1843
1844   result = pfile->cur_token++;
1845   result->src_loc = old->src_loc;
1846   return result;
1847 }
1848
1849 /* Lex a token into RESULT (external interface).  Takes care of issues
1850    like directive handling, token lookahead, multiple include
1851    optimization and skipping.  */
1852 const cpp_token *
1853 _cpp_lex_token (cpp_reader *pfile)
1854 {
1855   cpp_token *result;
1856
1857   for (;;)
1858     {
1859       if (pfile->cur_token == pfile->cur_run->limit)
1860         {
1861           pfile->cur_run = next_tokenrun (pfile->cur_run);
1862           pfile->cur_token = pfile->cur_run->base;
1863         }
1864       /* We assume that the current token is somewhere in the current
1865          run.  */
1866       if (pfile->cur_token < pfile->cur_run->base
1867           || pfile->cur_token >= pfile->cur_run->limit)
1868         abort ();
1869
1870       if (pfile->lookaheads)
1871         {
1872           pfile->lookaheads--;
1873           result = pfile->cur_token++;
1874         }
1875       else
1876         result = _cpp_lex_direct (pfile);
1877
1878       if (result->flags & BOL)
1879         {
1880           /* Is this a directive.  If _cpp_handle_directive returns
1881              false, it is an assembler #.  */
1882           if (result->type == CPP_HASH
1883               /* 6.10.3 p 11: Directives in a list of macro arguments
1884                  gives undefined behavior.  This implementation
1885                  handles the directive as normal.  */
1886               && pfile->state.parsing_args != 1)
1887             {
1888               if (_cpp_handle_directive (pfile, result->flags & PREV_WHITE))
1889                 {
1890                   if (pfile->directive_result.type == CPP_PADDING)
1891                     continue;
1892                   result = &pfile->directive_result;
1893                 }
1894             }
1895           else if (pfile->state.in_deferred_pragma)
1896             result = &pfile->directive_result;
1897
1898           if (pfile->cb.line_change && !pfile->state.skipping)
1899             pfile->cb.line_change (pfile, result, pfile->state.parsing_args);
1900         }
1901
1902       /* We don't skip tokens in directives.  */
1903       if (pfile->state.in_directive || pfile->state.in_deferred_pragma)
1904         break;
1905
1906       /* Outside a directive, invalidate controlling macros.  At file
1907          EOF, _cpp_lex_direct takes care of popping the buffer, so we never
1908          get here and MI optimization works.  */
1909       pfile->mi_valid = false;
1910
1911       if (!pfile->state.skipping || result->type == CPP_EOF)
1912         break;
1913     }
1914
1915   return result;
1916 }
1917
1918 /* Returns true if a fresh line has been loaded.  */
1919 bool
1920 _cpp_get_fresh_line (cpp_reader *pfile)
1921 {
1922   int return_at_eof;
1923
1924   /* We can't get a new line until we leave the current directive.  */
1925   if (pfile->state.in_directive)
1926     return false;
1927
1928   for (;;)
1929     {
1930       cpp_buffer *buffer = pfile->buffer;
1931
1932       if (!buffer->need_line)
1933         return true;
1934
1935       if (buffer->next_line < buffer->rlimit)
1936         {
1937           _cpp_clean_line (pfile);
1938           return true;
1939         }
1940
1941       /* First, get out of parsing arguments state.  */
1942       if (pfile->state.parsing_args)
1943         return false;
1944
1945       /* End of buffer.  Non-empty files should end in a newline.  */
1946       if (buffer->buf != buffer->rlimit
1947           && buffer->next_line > buffer->rlimit
1948           && !buffer->from_stage3)
1949         {
1950           /* Clip to buffer size.  */
1951           buffer->next_line = buffer->rlimit;
1952         }
1953
1954       return_at_eof = buffer->return_at_eof;
1955       _cpp_pop_buffer (pfile);
1956       if (pfile->buffer == NULL || return_at_eof)
1957         return false;
1958     }
1959 }
1960
/* Helper for the lexer: if the next input character is CHAR, consume
   it and give the token type THEN_TYPE; otherwise leave the input
   alone and give it ELSE_TYPE.  Expects `buffer' and `result' to be
   in scope at the point of use.  */
#define IF_NEXT_IS(CHAR, THEN_TYPE, ELSE_TYPE)          \
  do                                                    \
    {                                                   \
      if (*buffer->cur == CHAR)                         \
        {                                               \
          buffer->cur++;                                \
          result->type = THEN_TYPE;                     \
        }                                               \
      else                                              \
        result->type = ELSE_TYPE;                       \
    }                                                   \
  while (0)
1969
/* Lex a token into pfile->cur_token, which is also incremented, to
   get diagnostics pointing to the correct location.

   Does not handle issues such as token lookahead, multiple-include
   optimization, directives, skipping etc.  This function is only
   suitable for use by _cpp_lex_token, and in special cases like
   lex_expansion_token which doesn't care for any of these issues.

   When meeting a newline, returns CPP_EOF if parsing a directive,
   otherwise returns to the start of the token buffer if permissible.
   Returns a pointer to the lexed token.

   The body is organized around three labels:
     fresh_line          - (re)start, possibly fetching a new line;
     update_tokens_line  - refresh the token's location after a
                           discarded comment;
     skipped_white       - read the next character after whitespace
                           has been skipped.  */
cpp_token *
_cpp_lex_direct (cpp_reader *pfile)
{
  cppchar_t c;
  cpp_buffer *buffer;
  const unsigned char *comment_start;
  /* Claim the next slot of the current token run for the result.  */
  cpp_token *result = pfile->cur_token++;

 fresh_line:
  result->flags = 0;
  buffer = pfile->buffer;
  if (buffer->need_line)
    {
      if (pfile->state.in_deferred_pragma)
        {
          /* The line of a deferred pragma is over: report that with a
             CPP_PRAGMA_EOL token and leave deferred-pragma state.  */
          result->type = CPP_PRAGMA_EOL;
          pfile->state.in_deferred_pragma = false;
          if (!pfile->state.pragma_allow_expansion)
            pfile->state.prevent_expansion--;
          return result;
        }
      if (!_cpp_get_fresh_line (pfile))
        {
          /* No further line can be supplied right now.  */
          result->type = CPP_EOF;
          if (!pfile->state.in_directive)
            {
              /* Tell the compiler the line number of the EOF token.  */
              result->src_loc = pfile->line_table->highest_line;
              result->flags = BOL;
            }
          return result;
        }
      if (!pfile->keep_tokens)
        {
          /* Nobody asked for earlier tokens to be kept, so restart at
             the first slot of the base run and lex into that.  */
          pfile->cur_run = &pfile->base_run;
          result = pfile->base_run.base;
          pfile->cur_token = result + 1;
        }
      /* This token is the first one on its line.  */
      result->flags = BOL;
      if (pfile->state.parsing_args == 2)
        result->flags |= PREV_WHITE;
    }
  /* _cpp_get_fresh_line may have changed pfile->buffer; reload it.  */
  buffer = pfile->buffer;
 update_tokens_line:
  result->src_loc = pfile->line_table->highest_line;

 skipped_white:
  /* Handle any line notes that apply at or before the current
     position, unless an overlaid buffer is in use.  */
  if (buffer->cur >= buffer->notes[buffer->cur_note].pos
      && !pfile->overlaid_buffer)
    {
      _cpp_process_line_notes (pfile, false);
      result->src_loc = pfile->line_table->highest_line;
    }
  c = *buffer->cur++;

  /* Record where the token starts: either the forced location, or
     the column computed from the buffer position just past C.  */
  if (pfile->forced_token_location_p)
    result->src_loc = *pfile->forced_token_location_p;
  else
    result->src_loc = linemap_position_for_column (pfile->line_table,
                                          CPP_BUF_COLUMN (buffer, buffer->cur));

  switch (c)
    {
    case ' ': case '\t': case '\f': case '\v': case '\0':
      /* Horizontal whitespace (and NUL): note it on the token, skip
         it, and read the next character.  */
      result->flags |= PREV_WHITE;
      skip_whitespace (pfile, c);
      goto skipped_white;

    case '\n':
      /* End of line.  The line counter is only incremented when more
         input follows in this buffer; then start over, which will
         request a new line.  */
      if (buffer->cur < buffer->rlimit)
        CPP_INCREMENT_LINE (pfile, 0);
      buffer->need_line = true;
      goto fresh_line;

    case '0': case '1': case '2': case '3': case '4':
    case '5': case '6': case '7': case '8': case '9':
      {
        struct normalize_state nst = INITIAL_NORMALIZE_STATE;
        result->type = CPP_NUMBER;
        lex_number (pfile, &result->val.str, &nst);
        warn_about_normalization (pfile, result, &nst);
        break;
      }

    case 'L':
    case 'u':
    case 'U':
    case 'R':
      /* 'L', 'u', 'U', 'u8' or 'R' may introduce wide characters,
         wide strings or raw strings.  */
      if (c == 'L' || CPP_OPTION (pfile, rliterals)
          || (c != 'R' && CPP_OPTION (pfile, uliterals)))
        {
          /* Accept: a quote directly after the prefix (a character
             quote is not allowed after 'R'); an 'R"' after a
             non-'R' prefix when raw literals are enabled; or, after
             'u', an '8' followed by '"' or by 'R"' (the latter only
             when raw literals are enabled).  */
          if ((*buffer->cur == '\'' && c != 'R')
              || *buffer->cur == '"'
              || (*buffer->cur == 'R'
                  && c != 'R'
                  && buffer->cur[1] == '"'
                  && CPP_OPTION (pfile, rliterals))
              || (*buffer->cur == '8'
                  && c == 'u'
                  && (buffer->cur[1] == '"'
                      || (buffer->cur[1] == 'R' && buffer->cur[2] == '"'
                          && CPP_OPTION (pfile, rliterals)))))
            {
              lex_string (pfile, result, buffer->cur - 1);
              break;
            }
        }
      /* Not a literal prefix after all: lex it as an identifier.  */
      /* Fall through.  */

    case '_':
    case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
    case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
    case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
    case 's': case 't':           case 'v': case 'w': case 'x':
    case 'y': case 'z':
    case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
    case 'G': case 'H': case 'I': case 'J': case 'K':
    case 'M': case 'N': case 'O': case 'P': case 'Q':
    case 'S': case 'T':           case 'V': case 'W': case 'X':
    case 'Y': case 'Z':
      /* An identifier.  The letters missing from the labels above
         ('u', 'L', 'R', 'U') are handled by the previous case.  */
      result->type = CPP_NAME;
      {
        struct normalize_state nst = INITIAL_NORMALIZE_STATE;
        result->val.node.node = lex_identifier (pfile, buffer->cur - 1, false,
                                                &nst);
        warn_about_normalization (pfile, result, &nst);
      }

      /* Convert named operators to their proper types.  */
      if (result->val.node.node->flags & NODE_OPERATOR)
        {
          result->flags |= NAMED_OP;
          result->type = (enum cpp_ttype) result->val.node.node->directive_index;
        }
      break;

    case '\'':
    case '"':
      lex_string (pfile, result, buffer->cur - 1);
      break;

    case '/':
      /* A potential block or line comment.  */
      comment_start = buffer->cur;
      c = *buffer->cur;

      if (c == '*')
        {
          if (_cpp_skip_block_comment (pfile))
            cpp_error (pfile, CPP_DL_ERROR, "unterminated comment");
        }
      else if (c == '/' && (CPP_OPTION (pfile, cplusplus_comments)
                            || cpp_in_system_header (pfile)))
        {
          /* Warn about comments only if pedantically GNUC89, and not
             in system headers.  */
          if (CPP_OPTION (pfile, lang) == CLK_GNUC89 && CPP_PEDANTIC (pfile)
              && ! buffer->warned_cplusplus_comments)
            {
              cpp_error (pfile, CPP_DL_PEDWARN,
                         "C++ style comments are not allowed in ISO C90");
              cpp_error (pfile, CPP_DL_PEDWARN,
                         "(this will be reported only once per input file)");
              buffer->warned_cplusplus_comments = 1;
            }

          if (skip_line_comment (pfile) && CPP_OPTION (pfile, warn_comments))
            cpp_warning (pfile, CPP_W_COMMENTS, "multi-line comment");
        }
      else if (c == '=')
        {
          buffer->cur++;
          result->type = CPP_DIV_EQ;
          break;
        }
      else
        {
          result->type = CPP_DIV;
          break;
        }

      /* Only the two comment branches above reach this point.  */
      if (!pfile->state.save_comments)
        {
          /* Discard the comment; it counts as whitespace before the
             token that follows.  */
          result->flags |= PREV_WHITE;
          goto update_tokens_line;
        }

      /* Save the comment as a token in its own right.  */
      save_comment (pfile, result, comment_start, c);
      break;

    case '<':
      if (pfile->state.angled_headers)
        {
          /* Try to lex a <header-name>.  If lex_string leaves the
             type as CPP_LESS, continue with the operator handling
             below.  */
          lex_string (pfile, result, buffer->cur - 1);
          if (result->type != CPP_LESS)
            break;
        }

      result->type = CPP_LESS;
      if (*buffer->cur == '=')
        buffer->cur++, result->type = CPP_LESS_EQ;
      else if (*buffer->cur == '<')
        {
          buffer->cur++;
          IF_NEXT_IS ('=', CPP_LSHIFT_EQ, CPP_LSHIFT);
        }
      else if (CPP_OPTION (pfile, digraphs))
        {
          /* Digraphs "<:" and "<%".  */
          if (*buffer->cur == ':')
            {
              buffer->cur++;
              result->flags |= DIGRAPH;
              result->type = CPP_OPEN_SQUARE;
            }
          else if (*buffer->cur == '%')
            {
              buffer->cur++;
              result->flags |= DIGRAPH;
              result->type = CPP_OPEN_BRACE;
            }
        }
      break;

    case '>':
      result->type = CPP_GREATER;
      if (*buffer->cur == '=')
        buffer->cur++, result->type = CPP_GREATER_EQ;
      else if (*buffer->cur == '>')
        {
          buffer->cur++;
          IF_NEXT_IS ('=', CPP_RSHIFT_EQ, CPP_RSHIFT);
        }
      break;

    case '%':
      result->type = CPP_MOD;
      if (*buffer->cur == '=')
        buffer->cur++, result->type = CPP_MOD_EQ;
      else if (CPP_OPTION (pfile, digraphs))
        {
          /* Digraphs "%:" (with "%:%:" for the paste operator) and
             "%>".  */
          if (*buffer->cur == ':')
            {
              buffer->cur++;
              result->flags |= DIGRAPH;
              result->type = CPP_HASH;
              if (*buffer->cur == '%' && buffer->cur[1] == ':')
                buffer->cur += 2, result->type = CPP_PASTE, result->val.token_no = 0;
            }
          else if (*buffer->cur == '>')
            {
              buffer->cur++;
              result->flags |= DIGRAPH;
              result->type = CPP_CLOSE_BRACE;
            }
        }
      break;

    case '.':
      result->type = CPP_DOT;
      if (ISDIGIT (*buffer->cur))
        {
          /* A number that starts with a decimal point.  */
          struct normalize_state nst = INITIAL_NORMALIZE_STATE;
          result->type = CPP_NUMBER;
          lex_number (pfile, &result->val.str, &nst);
          warn_about_normalization (pfile, result, &nst);
        }
      else if (*buffer->cur == '.' && buffer->cur[1] == '.')
        buffer->cur += 2, result->type = CPP_ELLIPSIS;
      else if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
        buffer->cur++, result->type = CPP_DOT_STAR;
      break;

    case '+':
      result->type = CPP_PLUS;
      if (*buffer->cur == '+')
        buffer->cur++, result->type = CPP_PLUS_PLUS;
      else if (*buffer->cur == '=')
        buffer->cur++, result->type = CPP_PLUS_EQ;
      break;

    case '-':
      result->type = CPP_MINUS;
      if (*buffer->cur == '>')
        {
          buffer->cur++;
          result->type = CPP_DEREF;
          if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
            buffer->cur++, result->type = CPP_DEREF_STAR;
        }
      else if (*buffer->cur == '-')
        buffer->cur++, result->type = CPP_MINUS_MINUS;
      else if (*buffer->cur == '=')
        buffer->cur++, result->type = CPP_MINUS_EQ;
      break;

    case '&':
      result->type = CPP_AND;
      if (*buffer->cur == '&')
        buffer->cur++, result->type = CPP_AND_AND;
      else if (*buffer->cur == '=')
        buffer->cur++, result->type = CPP_AND_EQ;
      break;

    case '|':
      result->type = CPP_OR;
      if (*buffer->cur == '|')
        buffer->cur++, result->type = CPP_OR_OR;
      else if (*buffer->cur == '=')
        buffer->cur++, result->type = CPP_OR_EQ;
      break;

    case ':':
      result->type = CPP_COLON;
      if (*buffer->cur == ':' && CPP_OPTION (pfile, cplusplus))
        buffer->cur++, result->type = CPP_SCOPE;
      else if (*buffer->cur == '>' && CPP_OPTION (pfile, digraphs))
        {
          /* Digraph ":>".  */
          buffer->cur++;
          result->flags |= DIGRAPH;
          result->type = CPP_CLOSE_SQUARE;
        }
      break;

    case '*': IF_NEXT_IS ('=', CPP_MULT_EQ, CPP_MULT); break;
    case '=': IF_NEXT_IS ('=', CPP_EQ_EQ, CPP_EQ); break;
    case '!': IF_NEXT_IS ('=', CPP_NOT_EQ, CPP_NOT); break;
    case '^': IF_NEXT_IS ('=', CPP_XOR_EQ, CPP_XOR); break;
    case '#': IF_NEXT_IS ('#', CPP_PASTE, CPP_HASH); result->val.token_no = 0; break;

    case '?': result->type = CPP_QUERY; break;
    case '~': result->type = CPP_COMPL; break;
    case ',': result->type = CPP_COMMA; break;
    case '(': result->type = CPP_OPEN_PAREN; break;
    case ')': result->type = CPP_CLOSE_PAREN; break;
    case '[': result->type = CPP_OPEN_SQUARE; break;
    case ']': result->type = CPP_CLOSE_SQUARE; break;
    case '{': result->type = CPP_OPEN_BRACE; break;
    case '}': result->type = CPP_CLOSE_BRACE; break;
    case ';': result->type = CPP_SEMICOLON; break;

      /* @ is a punctuator in Objective-C.  */
    case '@': result->type = CPP_ATSIGN; break;

    case '$':
    case '\\':
      {
        /* Step back onto the character so that forms_identifier_p
           can examine it.  */
        const uchar *base = --buffer->cur;
        struct normalize_state nst = INITIAL_NORMALIZE_STATE;

        if (forms_identifier_p (pfile, true, &nst))
          {
            result->type = CPP_NAME;
            result->val.node.node = lex_identifier (pfile, base, true, &nst);
            warn_about_normalization (pfile, result, &nst);
            break;
          }
        /* Not the start of an identifier: step forward again and
           treat the character as a stray one below.  */
        buffer->cur++;
      }
      /* Fall through.  */

    default:
      /* Any other character becomes a one-character CPP_OTHER
         token.  */
      create_literal (pfile, result, buffer->cur - 1, 1, CPP_OTHER);
      break;
    }

  return result;
}
2350
2351 /* An upper bound on the number of bytes needed to spell TOKEN.
2352    Does not include preceding whitespace.  */
2353 unsigned int
2354 cpp_token_len (const cpp_token *token)
2355 {
2356   unsigned int len;
2357
2358   switch (TOKEN_SPELL (token))
2359     {
2360     default:            len = 6;                                break;
2361     case SPELL_LITERAL: len = token->val.str.len;               break;
2362     case SPELL_IDENT:   len = NODE_LEN (token->val.node.node) * 10;     break;
2363     }
2364
2365   return len;
2366 }
2367
/* Decode the UTF-8 sequence that starts at NAME and store the
   matching universal character name, "\U" followed by eight
   lower-case hex digits, in BUFFER.  Exactly 10 bytes are written and
   no terminating NUL is added.  Returns the number of bytes of NAME
   that made up the sequence.  Aborts if a continuation byte is
   ill-formed.  */

static size_t
utf8_to_ucn (unsigned char *buffer, const unsigned char *name)
{
  static const char hex_digits[] = "0123456789abcdef";
  unsigned lead = *name;
  unsigned long code_point;
  int seq_len = 0;
  int k;

  /* The number of leading one bits in the first byte gives the
     length of the sequence.  */
  while (lead & 0x80)
    {
      seq_len++;
      lead <<= 1;
    }

  /* The remaining low bits of the first byte are payload.  */
  code_point = *name & (0x7F >> seq_len);

  /* Every further byte contributes six payload bits.  */
  for (k = 1; k < seq_len; k++)
    {
      unsigned char cont = name[k];

      code_point = (code_point << 6) | (cont & 0x3F);

      /* Ill-formed UTF-8: a continuation byte must look like
         10xxxxxx.  */
      if ((cont & 0xC0) != 0x80)
        abort ();
    }

  buffer[0] = '\\';
  buffer[1] = 'U';
  /* Emit the eight hex digits, most significant nibble first.  */
  for (k = 0; k < 8; k++)
    buffer[2 + k] = hex_digits[(code_point >> (4 * (7 - k))) & 0xF];

  return seq_len;
}
2401
2402 /* Given a token TYPE corresponding to a digraph, return a pointer to
2403    the spelling of the digraph.  */
2404 static const unsigned char *
2405 cpp_digraph2name (enum cpp_ttype type)
2406 {
2407   return digraph_spellings[(int) type - (int) CPP_FIRST_DIGRAPH];
2408 }
2409
2410 /* Write the spelling of a token TOKEN to BUFFER.  The buffer must
2411    already contain the enough space to hold the token's spelling.
2412    Returns a pointer to the character after the last character written.
2413    FORSTRING is true if this is to be the spelling after translation
2414    phase 1 (this is different for UCNs).
2415    FIXME: Would be nice if we didn't need the PFILE argument.  */
2416 unsigned char *
2417 cpp_spell_token (cpp_reader *pfile, const cpp_token *token,
2418                  unsigned char *buffer, bool forstring)
2419 {
2420   switch (TOKEN_SPELL (token))
2421     {
2422     case SPELL_OPERATOR:
2423       {
2424         const unsigned char *spelling;
2425         unsigned char c;
2426
2427         if (token->flags & DIGRAPH)
2428           spelling = cpp_digraph2name (token->type);
2429         else if (token->flags & NAMED_OP)
2430           goto spell_ident;
2431         else
2432           spelling = TOKEN_NAME (token);
2433
2434         while ((c = *spelling++) != '\0')
2435           *buffer++ = c;
2436       }
2437       break;
2438
2439     spell_ident:
2440     case SPELL_IDENT:
2441       if (forstring)
2442         {
2443           memcpy (buffer, NODE_NAME (token->val.node.node),
2444                   NODE_LEN (token->val.node.node));
2445           buffer += NODE_LEN (token->val.node.node);
2446         }
2447       else
2448         {
2449           size_t i;
2450           const unsigned char * name = NODE_NAME (token->val.node.node);
2451
2452           for (i = 0; i < NODE_LEN (token->val.node.node); i++)
2453             if (name[i] & ~0x7F)
2454               {
2455                 i += utf8_to_ucn (buffer, name + i) - 1;
2456                 buffer += 10;
2457               }
2458             else
2459               *buffer++ = NODE_NAME (token->val.node.node)[i];
2460         }
2461       break;
2462
2463     case SPELL_LITERAL:
2464       memcpy (buffer, token->val.str.text, token->val.str.len);
2465       buffer += token->val.str.len;
2466       break;
2467
2468     case SPELL_NONE:
2469       cpp_error (pfile, CPP_DL_ICE,
2470                  "unspellable token %s", TOKEN_NAME (token));
2471       break;
2472     }
2473
2474   return buffer;
2475 }
2476
2477 /* Returns TOKEN spelt as a null-terminated string.  The string is
2478    freed when the reader is destroyed.  Useful for diagnostics.  */
2479 unsigned char *
2480 cpp_token_as_text (cpp_reader *pfile, const cpp_token *token)
2481 {
2482   unsigned int len = cpp_token_len (token) + 1;
2483   unsigned char *start = _cpp_unaligned_alloc (pfile, len), *end;
2484
2485   end = cpp_spell_token (pfile, token, start, false);
2486   end[0] = '\0';
2487
2488   return start;
2489 }
2490
2491 /* Returns a pointer to a string which spells the token defined by
2492    TYPE and FLAGS.  Used by C front ends, which really should move to
2493    using cpp_token_as_text.  */
2494 const char *
2495 cpp_type2name (enum cpp_ttype type, unsigned char flags)
2496 {
2497   if (flags & DIGRAPH)
2498     return (const char *) cpp_digraph2name (type);
2499   else if (flags & NAMED_OP)
2500     return cpp_named_operator2name (type);
2501
2502   return (const char *) token_spellings[type].name;
2503 }
2504
2505 /* Writes the spelling of token to FP, without any preceding space.
2506    Separated from cpp_spell_token for efficiency - to avoid stdio
2507    double-buffering.  */
2508 void
2509 cpp_output_token (const cpp_token *token, FILE *fp)
2510 {
2511   switch (TOKEN_SPELL (token))
2512     {
2513     case SPELL_OPERATOR:
2514       {
2515         const unsigned char *spelling;
2516         int c;
2517
2518         if (token->flags & DIGRAPH)
2519           spelling = cpp_digraph2name (token->type);
2520         else if (token->flags & NAMED_OP)
2521           goto spell_ident;
2522         else
2523           spelling = TOKEN_NAME (token);
2524
2525         c = *spelling;
2526         do
2527           putc (c, fp);
2528         while ((c = *++spelling) != '\0');
2529       }
2530       break;
2531
2532     spell_ident:
2533     case SPELL_IDENT:
2534       {
2535         size_t i;
2536         const unsigned char * name = NODE_NAME (token->val.node.node);
2537
2538         for (i = 0; i < NODE_LEN (token->val.node.node); i++)
2539           if (name[i] & ~0x7F)
2540             {
2541               unsigned char buffer[10];
2542               i += utf8_to_ucn (buffer, name + i) - 1;
2543               fwrite (buffer, 1, 10, fp);
2544             }
2545           else
2546             fputc (NODE_NAME (token->val.node.node)[i], fp);
2547       }
2548       break;
2549
2550     case SPELL_LITERAL:
2551       fwrite (token->val.str.text, 1, token->val.str.len, fp);
2552       break;
2553
2554     case SPELL_NONE:
2555       /* An error, most probably.  */
2556       break;
2557     }
2558 }
2559
2560 /* Compare two tokens.  */
2561 int
2562 _cpp_equiv_tokens (const cpp_token *a, const cpp_token *b)
2563 {
2564   if (a->type == b->type && a->flags == b->flags)
2565     switch (TOKEN_SPELL (a))
2566       {
2567       default:                  /* Keep compiler happy.  */
2568       case SPELL_OPERATOR:
2569         /* token_no is used to track where multiple consecutive ##
2570            tokens were originally located.  */
2571         return (a->type != CPP_PASTE || a->val.token_no == b->val.token_no);
2572       case SPELL_NONE:
2573         return (a->type != CPP_MACRO_ARG
2574                 || a->val.macro_arg.arg_no == b->val.macro_arg.arg_no);
2575       case SPELL_IDENT:
2576         return a->val.node.node == b->val.node.node;
2577       case SPELL_LITERAL:
2578         return (a->val.str.len == b->val.str.len
2579                 && !memcmp (a->val.str.text, b->val.str.text,
2580                             a->val.str.len));
2581       }
2582
2583   return 0;
2584 }
2585
2586 /* Returns nonzero if a space should be inserted to avoid an
2587    accidental token paste for output.  For simplicity, it is
2588    conservative, and occasionally advises a space where one is not
2589    needed, e.g. "." and ".2".  */
2590 int
2591 cpp_avoid_paste (cpp_reader *pfile, const cpp_token *token1,
2592                  const cpp_token *token2)
2593 {
2594   enum cpp_ttype a = token1->type, b = token2->type;
2595   cppchar_t c;
2596
2597   if (token1->flags & NAMED_OP)
2598     a = CPP_NAME;
2599   if (token2->flags & NAMED_OP)
2600     b = CPP_NAME;
2601
2602   c = EOF;
2603   if (token2->flags & DIGRAPH)
2604     c = digraph_spellings[(int) b - (int) CPP_FIRST_DIGRAPH][0];
2605   else if (token_spellings[b].category == SPELL_OPERATOR)
2606     c = token_spellings[b].name[0];
2607
2608   /* Quickly get everything that can paste with an '='.  */
2609   if ((int) a <= (int) CPP_LAST_EQ && c == '=')
2610     return 1;
2611
2612   switch (a)
2613     {
2614     case CPP_GREATER:   return c == '>';
2615     case CPP_LESS:      return c == '<' || c == '%' || c == ':';
2616     case CPP_PLUS:      return c == '+';
2617     case CPP_MINUS:     return c == '-' || c == '>';
2618     case CPP_DIV:       return c == '/' || c == '*'; /* Comments.  */
2619     case CPP_MOD:       return c == ':' || c == '>';
2620     case CPP_AND:       return c == '&';
2621     case CPP_OR:        return c == '|';
2622     case CPP_COLON:     return c == ':' || c == '>';
2623     case CPP_DEREF:     return c == '*';
2624     case CPP_DOT:       return c == '.' || c == '%' || b == CPP_NUMBER;
2625     case CPP_HASH:      return c == '#' || c == '%'; /* Digraph form.  */
2626     case CPP_NAME:      return ((b == CPP_NUMBER
2627                                  && name_p (pfile, &token2->val.str))
2628                                 || b == CPP_NAME
2629                                 || b == CPP_CHAR || b == CPP_STRING); /* L */
2630     case CPP_NUMBER:    return (b == CPP_NUMBER || b == CPP_NAME
2631                                 || c == '.' || c == '+' || c == '-');
2632                                       /* UCNs */
2633     case CPP_OTHER:     return ((token1->val.str.text[0] == '\\'
2634                                  && b == CPP_NAME)
2635                                 || (CPP_OPTION (pfile, objc)
2636                                     && token1->val.str.text[0] == '@'
2637                                     && (b == CPP_NAME || b == CPP_STRING)));
2638     default:            break;
2639     }
2640
2641   return 0;
2642 }
2643
2644 /* Output all the remaining tokens on the current line, and a newline
2645    character, to FP.  Leading whitespace is removed.  If there are
2646    macros, special token padding is not performed.  */
2647 void
2648 cpp_output_line (cpp_reader *pfile, FILE *fp)
2649 {
2650   const cpp_token *token;
2651
2652   token = cpp_get_token (pfile);
2653   while (token->type != CPP_EOF)
2654     {
2655       cpp_output_token (token, fp);
2656       token = cpp_get_token (pfile);
2657       if (token->flags & PREV_WHITE)
2658         putc (' ', fp);
2659     }
2660
2661   putc ('\n', fp);
2662 }
2663
2664 /* Return a string representation of all the remaining tokens on the
2665    current line.  The result is allocated using xmalloc and must be
2666    freed by the caller.  */
2667 unsigned char *
2668 cpp_output_line_to_string (cpp_reader *pfile, const unsigned char *dir_name)
2669 {
2670   const cpp_token *token;
2671   unsigned int out = dir_name ? ustrlen (dir_name) : 0;
2672   unsigned int alloced = 120 + out;
2673   unsigned char *result = (unsigned char *) xmalloc (alloced);
2674
2675   /* If DIR_NAME is empty, there are no initial contents.  */
2676   if (dir_name)
2677     {
2678       sprintf ((char *) result, "#%s ", dir_name);
2679       out += 2;
2680     }
2681
2682   token = cpp_get_token (pfile);
2683   while (token->type != CPP_EOF)
2684     {
2685       unsigned char *last;
2686       /* Include room for a possible space and the terminating nul.  */
2687       unsigned int len = cpp_token_len (token) + 2;
2688
2689       if (out + len > alloced)
2690         {
2691           alloced *= 2;
2692           if (out + len > alloced)
2693             alloced = out + len;
2694           result = (unsigned char *) xrealloc (result, alloced);
2695         }
2696
2697       last = cpp_spell_token (pfile, token, &result[out], 0);
2698       out = last - result;
2699
2700       token = cpp_get_token (pfile);
2701       if (token->flags & PREV_WHITE)
2702         result[out++] = ' ';
2703     }
2704
2705   result[out] = '\0';
2706   return result;
2707 }
2708
/* Memory buffers.  Changing these three constants can have a dramatic
   effect on performance.  The values here are reasonable defaults,
   but might be tuned.  If you adjust them, be sure to test across a
   range of uses of cpplib, including heavy nested function-like macro
   expansion.  Also check the change in peak memory usage (NJAMD is a
   good tool for this).

   MIN_BUFF_SIZE is the smallest length new_buff will allocate.

   BUFF_SIZE_UPPER_BOUND (MIN_SIZE) is the largest free-list buffer
   that _cpp_get_buff will hand out for a request of MIN_SIZE bytes.

   EXTENDED_BUFF_SIZE (BUFF, MIN_EXTRA) is the size requested when
   extending BUFF: MIN_EXTRA plus twice the room still left between
   BUFF's cur and limit.  Every macro parameter is parenthesized so
   that an argument containing a low-precedence operator (for example
   a conditional expression) is evaluated as a unit.  BUFF is
   evaluated twice, so it must not have side effects.  */
#define MIN_BUFF_SIZE 8000
#define BUFF_SIZE_UPPER_BOUND(MIN_SIZE) (MIN_BUFF_SIZE + (MIN_SIZE) * 3 / 2)
#define EXTENDED_BUFF_SIZE(BUFF, MIN_EXTRA) \
        ((MIN_EXTRA) + ((BUFF)->limit - (BUFF)->cur) * 2)

/* A buffer of the minimum size must itself be acceptable to
   _cpp_get_buff's upper-bound test.  */
#if MIN_BUFF_SIZE > BUFF_SIZE_UPPER_BOUND (0)
  #error BUFF_SIZE_UPPER_BOUND must be at least as large as MIN_BUFF_SIZE!
#endif
2723
2724 /* Create a new allocation buffer.  Place the control block at the end
2725    of the buffer, so that buffer overflows will cause immediate chaos.  */
2726 static _cpp_buff *
2727 new_buff (size_t len)
2728 {
2729   _cpp_buff *result;
2730   unsigned char *base;
2731
2732   if (len < MIN_BUFF_SIZE)
2733     len = MIN_BUFF_SIZE;
2734   len = CPP_ALIGN (len);
2735
2736   base = XNEWVEC (unsigned char, len + sizeof (_cpp_buff));
2737   result = (_cpp_buff *) (base + len);
2738   result->base = base;
2739   result->cur = base;
2740   result->limit = base + len;
2741   result->next = NULL;
2742   return result;
2743 }
2744
2745 /* Place a chain of unwanted allocation buffers on the free list.  */
2746 void
2747 _cpp_release_buff (cpp_reader *pfile, _cpp_buff *buff)
2748 {
2749   _cpp_buff *end = buff;
2750
2751   while (end->next)
2752     end = end->next;
2753   end->next = pfile->free_buffs;
2754   pfile->free_buffs = buff;
2755 }
2756
2757 /* Return a free buffer of size at least MIN_SIZE.  */
2758 _cpp_buff *
2759 _cpp_get_buff (cpp_reader *pfile, size_t min_size)
2760 {
2761   _cpp_buff *result, **p;
2762
2763   for (p = &pfile->free_buffs;; p = &(*p)->next)
2764     {
2765       size_t size;
2766
2767       if (*p == NULL)
2768         return new_buff (min_size);
2769       result = *p;
2770       size = result->limit - result->base;
2771       /* Return a buffer that's big enough, but don't waste one that's
2772          way too big.  */
2773       if (size >= min_size && size <= BUFF_SIZE_UPPER_BOUND (min_size))
2774         break;
2775     }
2776
2777   *p = result->next;
2778   result->next = NULL;
2779   result->cur = result->base;
2780   return result;
2781 }
2782
2783 /* Creates a new buffer with enough space to hold the uncommitted
2784    remaining bytes of BUFF, and at least MIN_EXTRA more bytes.  Copies
2785    the excess bytes to the new buffer.  Chains the new buffer after
2786    BUFF, and returns the new buffer.  */
2787 _cpp_buff *
2788 _cpp_append_extend_buff (cpp_reader *pfile, _cpp_buff *buff, size_t min_extra)
2789 {
2790   size_t size = EXTENDED_BUFF_SIZE (buff, min_extra);
2791   _cpp_buff *new_buff = _cpp_get_buff (pfile, size);
2792
2793   buff->next = new_buff;
2794   memcpy (new_buff->base, buff->cur, BUFF_ROOM (buff));
2795   return new_buff;
2796 }
2797
2798 /* Creates a new buffer with enough space to hold the uncommitted
2799    remaining bytes of the buffer pointed to by BUFF, and at least
2800    MIN_EXTRA more bytes.  Copies the excess bytes to the new buffer.
2801    Chains the new buffer before the buffer pointed to by BUFF, and
2802    updates the pointer to point to the new buffer.  */
2803 void
2804 _cpp_extend_buff (cpp_reader *pfile, _cpp_buff **pbuff, size_t min_extra)
2805 {
2806   _cpp_buff *new_buff, *old_buff = *pbuff;
2807   size_t size = EXTENDED_BUFF_SIZE (old_buff, min_extra);
2808
2809   new_buff = _cpp_get_buff (pfile, size);
2810   memcpy (new_buff->base, old_buff->cur, BUFF_ROOM (old_buff));
2811   new_buff->next = old_buff;
2812   *pbuff = new_buff;
2813 }
2814
2815 /* Free a chain of buffers starting at BUFF.  */
2816 void
2817 _cpp_free_buff (_cpp_buff *buff)
2818 {
2819   _cpp_buff *next;
2820
2821   for (; buff; buff = next)
2822     {
2823       next = buff->next;
2824       free (buff->base);
2825     }
2826 }
2827
2828 /* Allocate permanent, unaligned storage of length LEN.  */
2829 unsigned char *
2830 _cpp_unaligned_alloc (cpp_reader *pfile, size_t len)
2831 {
2832   _cpp_buff *buff = pfile->u_buff;
2833   unsigned char *result = buff->cur;
2834
2835   if (len > (size_t) (buff->limit - result))
2836     {
2837       buff = _cpp_get_buff (pfile, len);
2838       buff->next = pfile->u_buff;
2839       pfile->u_buff = buff;
2840       result = buff->cur;
2841     }
2842
2843   buff->cur = result + len;
2844   return result;
2845 }
2846
2847 /* Allocate permanent, unaligned storage of length LEN from a_buff.
2848    That buffer is used for growing allocations when saving macro
2849    replacement lists in a #define, and when parsing an answer to an
2850    assertion in #assert, #unassert or #if (and therefore possibly
2851    whilst expanding macros).  It therefore must not be used by any
2852    code that they might call: specifically the lexer and the guts of
2853    the macro expander.
2854
2855    All existing other uses clearly fit this restriction: storing
2856    registered pragmas during initialization.  */
2857 unsigned char *
2858 _cpp_aligned_alloc (cpp_reader *pfile, size_t len)
2859 {
2860   _cpp_buff *buff = pfile->a_buff;
2861   unsigned char *result = buff->cur;
2862
2863   if (len > (size_t) (buff->limit - result))
2864     {
2865       buff = _cpp_get_buff (pfile, len);
2866       buff->next = pfile->a_buff;
2867       pfile->a_buff = buff;
2868       result = buff->cur;
2869     }
2870
2871   buff->cur = result + len;
2872   return result;
2873 }
2874
2875 /* Say which field of TOK is in use.  */
2876
2877 enum cpp_token_fld_kind
2878 cpp_token_val_index (cpp_token *tok)
2879 {
2880   switch (TOKEN_SPELL (tok))
2881     {
2882     case SPELL_IDENT:
2883       return CPP_TOKEN_FLD_NODE;
2884     case SPELL_LITERAL:
2885       return CPP_TOKEN_FLD_STR;
2886     case SPELL_OPERATOR:
2887       if (tok->type == CPP_PASTE)
2888         return CPP_TOKEN_FLD_TOKEN_NO;
2889       else
2890         return CPP_TOKEN_FLD_NONE;
2891     case SPELL_NONE:
2892       if (tok->type == CPP_MACRO_ARG)
2893         return CPP_TOKEN_FLD_ARG_NO;
2894       else if (tok->type == CPP_PADDING)
2895         return CPP_TOKEN_FLD_SOURCE;
2896       else if (tok->type == CPP_PRAGMA)
2897         return CPP_TOKEN_FLD_PRAGMA;
2898       /* else fall through */
2899     default:
2900       return CPP_TOKEN_FLD_NONE;
2901     }
2902 }
2903
2904 /* All tokens lexed in R after calling this function will be forced to have
2905    their source_location the same as the location referenced by P, until
2906    cpp_stop_forcing_token_locations is called for R.  */
2907
2908 void
2909 cpp_force_token_locations (cpp_reader *r, source_location *p)
2910 {
2911   r->forced_token_location_p = p;
2912 }
2913
2914 /* Go back to assigning locations naturally for lexed tokens.  */
2915
2916 void
2917 cpp_stop_forcing_token_locations (cpp_reader *r)
2918 {
2919   r->forced_token_location_p = NULL;
2920 }