gcc-4_7-mobile/libcpp/lex.c

   1 /* CPP Library - lexical analysis.
   2    Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005, 2007, 2008, 2009, 2010,
   3    2011 Free Software Foundation, Inc.
   4    Contributed by Per Bothner, 1994-95.
   5    Based on CCCP program by Paul Rubin, June 1986
   6    Adapted to ANSI C, Richard Stallman, Jan 1987
   7    Broken out to separate file, Zack Weinberg, Mar 2000
   8
   9 This program is free software; you can redistribute it and/or modify it
  10 under the terms of the GNU General Public License as published by the
  11 Free Software Foundation; either version 3, or (at your option) any
  12 later version.
  13
  14 This program is distributed in the hope that it will be useful,
  15 but WITHOUT ANY WARRANTY; without even the implied warranty of
  16 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  17 GNU General Public License for more details.
  18
  19 You should have received a copy of the GNU General Public License
  20 along with this program; see the file COPYING3.  If not see
  21 <http://www.gnu.org/licenses/>.  */
  22
  23 #include "config.h"
  24 #include "system.h"
  25 #include "cpplib.h"
  26 #include "internal.h"
  27
  28 enum spell_type  /* Spelling category recorded for each token type.  */
  29 {
  30   SPELL_OPERATOR = 0,  /* Category given to every OP() entry of TTYPE_TABLE.  */
  31   SPELL_IDENT,
  32   SPELL_LITERAL,
  33   SPELL_NONE
  34 };
  35
     /* One entry of the token_spellings table: the spelling category of a
        token type together with a name string for it.  */
  36 struct token_spelling
  37 {
  38   enum spell_type category;
  39   const unsigned char *name;
  40 };
  41
     /* Spellings of the digraphs.  NOTE(review): the index order presumably
        has to match the digraph numbering used by the code that consults
        this table, which is outside this excerpt -- confirm there.  */
  42 static const unsigned char *const digraph_spellings[] =
  43 { UC"%:", UC"%:%:", UC"<:", UC":>", UC"<%", UC"%>" };
  44
     /* Expand TTYPE_TABLE into the spelling table.  An OP entry becomes an
        operator whose name is the string S; a TK entry takes its category
        from S (pasted onto SPELL_) and uses the stringified enumerator E
        as its name.  */
  45 #define OP(e, s) { SPELL_OPERATOR, UC s  },
  46 #define TK(e, s) { SPELL_ ## s,    UC #e },
  47 static const struct token_spelling token_spellings[N_TTYPES] = { TTYPE_TABLE };
  48 #undef OP
  49 #undef TK
  50
     /* Look up the spelling category, respectively the name, that
        token_spellings records for the type of TOKEN.  */
  51 #define TOKEN_SPELL(token) (token_spellings[(token)->type].category)
  52 #define TOKEN_NAME(token) (token_spellings[(token)->type].name)
  53
  54 static void add_line_note (cpp_buffer *, const uchar *, unsigned int);
  55 static int skip_line_comment (cpp_reader *);
  56 static void skip_whitespace (cpp_reader *, cppchar_t);
  57 static void lex_string (cpp_reader *, cpp_token *, const uchar *);
  58 static void save_comment (cpp_reader *, cpp_token *, const uchar *, cppchar_t);
  59 static void store_comment (cpp_reader *, cpp_token *);
  60 static void create_literal (cpp_reader *, cpp_token *, const uchar *,
  61                             unsigned int, enum cpp_ttype);
  62 static bool warn_in_comment (cpp_reader *, _cpp_line_note *);
  63 static int name_p (cpp_reader *, const cpp_string *);
  64 static tokenrun *next_tokenrun (tokenrun *);
  65
  66 static _cpp_buff *new_buff (size_t);
  67
  68
  69 /* Utility routine:
  70
  71    Compares, the token TOKEN to the NUL-terminated string STRING.
  72    TOKEN must be a CPP_NAME.  Returns 1 for equal, 0 for unequal.  */
  73 int
  74 cpp_ideq (const cpp_token *token, const char *string)
  75 {
  76   if (token->type != CPP_NAME)
  77     return 0;
  78
  79   return !ustrcmp (NODE_NAME (token->val.node.node), (const uchar *) string);
  80 }
  81
  82 /* Record a note TYPE at byte POS into the current cleaned logical
  83    line.  */
  84 static void
  85 add_line_note (cpp_buffer *buffer, const uchar *pos, unsigned int type)
  86 {
  87   if (buffer->notes_used == buffer->notes_cap)
  88     {
  89       buffer->notes_cap = buffer->notes_cap * 2 + 200;
  90       buffer->notes = XRESIZEVEC (_cpp_line_note, buffer->notes,
  91                                   buffer->notes_cap);
  92     }
  93
  94   buffer->notes[buffer->notes_used].pos = pos;
  95   buffer->notes[buffer->notes_used].type = type;
  96   buffer->notes_used++;
  97 }
  98
  99 \f
 100 /* Fast path to find line special characters using optimized character
 101    scanning algorithms.  Anything complicated falls back to the slow
 102    path below.  Since this loop is very hot it's worth doing these kinds
 103    of optimizations.
 104
 105    One of the paths through the ifdefs should provide
 106
 107      const uchar *search_line_fast (const uchar *s, const uchar *end);
 108
 109    Between S and END, search for \n, \r, \\, ?.  Return a pointer to
 110    the found character.
 111
 112    Note that the last character of the buffer is *always* a newline,
 113    as forced by _cpp_convert_input.  This fact can be used to avoid
 114    explicitly looking for the end of the buffer.  */
 115
 116 /* Configure gives us an ifdef test.  The code below tests
        WORDS_BIGENDIAN in ordinary `if' conditions and in #if
        expressions, so make sure it always has a value.  */
 117 #ifndef WORDS_BIGENDIAN
 118 #define WORDS_BIGENDIAN 0
 119 #endif
 120
 121 /* We'd like the largest integer that fits into a register.  There's nothing
 122    in <stdint.h> that gives us that.  For most hosts this is unsigned long,
 123    but MS decided on an LLP64 model.  Thankfully when building with GCC we
 124    can get the "real" word size.  */
 125 #ifdef __GNUC__
 126 typedef unsigned int word_type __attribute__((__mode__(__word__)));
 127 #else
 128 typedef unsigned long word_type;
 129 #endif
 130
 131 /* The code below is only expecting sizes 4 or 8.
 132    Die at compile-time if this expectation is violated: the array
        bound below evaluates to 1 when the size is acceptable and to -1,
        which is not a valid array size, otherwise.  */
 133 typedef char check_word_type_size
 134   [(sizeof(word_type) == 8 || sizeof(word_type) == 4) * 2 - 1];
 135
 136 /* Return X with the first N bytes forced to values that won't match one
 137    of the interesting characters.  Note that NUL is not interesting.  */
 138
 139 static inline word_type
 140 acc_char_mask_misalign (word_type val, unsigned int n)
 141 {
 142   word_type mask = -1;
 143   if (WORDS_BIGENDIAN)
 144     mask >>= n * 8;
 145   else
 146     mask <<= n * 8;
 147   return val & mask;
 148 }
 149
 150 /* Return X replicated to all byte positions within WORD_TYPE.  */
 151
 152 static inline word_type
 153 acc_char_replicate (uchar x)
 154 {
 155   word_type ret;
 156
 157   ret = (x << 24) | (x << 16) | (x << 8) | x;
 158   if (sizeof(word_type) == 8)
 159     ret = (ret << 16 << 16) | ret;
 160   return ret;
 161 }
 162
 163 /* Return non-zero if some byte of VAL is (probably) C.  C must hold the
        sought character in every byte position; search_line_acc_char
        passes values built by acc_char_replicate.  Outside the Alpha path
        a non-zero result may be a false positive, which the caller
        filters out with acc_char_index.  */
 164
 165 static inline word_type
 166 acc_char_cmp (word_type val, word_type c)
 167 {
 168 #if defined(__GNUC__) && defined(__alpha__)
 169   /* We can get exact results using a compare-bytes instruction.
 170      Get (val == c) via (0 >= (val ^ c)).  */
 171   return __builtin_alpha_cmpbge (0, val ^ c);
 172 #else
        /* MAGIC ends up as 0x7efefeff, or 0x7efefefefefefeff for 8-byte
           words: every bit is set except the lowest bit of each byte
           other than the first, and the top bit of the word.  */
 173   word_type magic = 0x7efefefeU;
 174   if (sizeof(word_type) == 8)
 175     magic = (magic << 16 << 16) | 0xfefefefeU;
 176   magic |= 1;
 177
        /* After the XOR, a byte of VAL that equalled the corresponding
           byte of C is zero.  */
 178   val ^= c;
        /* At a bit position that is clear in MAGIC, the sum differs from
           VAL exactly when a carry arrived from below.  The expression
           therefore has such a bit set when no carry reached it, which is
           what a zero byte below that position causes.  */
 179   return ((val + magic) ^ ~val) & ~magic;
 180 #endif
 181 }
 182
 183 /* Given the result of acc_char_cmp is non-zero, return the index of
 184    the found character.  If this was a false positive, return -1.  */
 185
 186 static inline int
 187 acc_char_index (word_type cmp ATTRIBUTE_UNUSED,
 188                 word_type val ATTRIBUTE_UNUSED)
 189 {
 190 #if defined(__GNUC__) && defined(__alpha__) && !WORDS_BIGENDIAN
 191   /* The cmpbge instruction sets *bits* of the result corresponding to
 192      matches in the bytes with no false positives.  */
 193   return __builtin_ctzl (cmp);
 194 #else
 195   unsigned int i;
 196
 197   /* ??? It would be nice to force unrolling here,
 198      and have all of these constants folded.  */
 199   for (i = 0; i < sizeof(word_type); ++i)
 200     {
 201       uchar c;
 202       if (WORDS_BIGENDIAN)
 203         c = (val >> (sizeof(word_type) - i - 1) * 8) & 0xff;
 204       else
 205         c = (val >> i * 8) & 0xff;
 206
 207       if (c == '\n' || c == '\r' || c == '\\' || c == '?')
 208         return i;
 209     }
 210
 211   return -1;
 212 #endif
 213 }
 214
 215 /* A version of the fast scanner using bit fiddling techniques.
 216
 217    For 32-bit words, one would normally perform 16 comparisons and
 218    16 branches.  With this algorithm one performs 24 arithmetic
 219    operations and one branch.  Whether this is faster with a 32-bit
 220    word size is going to be somewhat system dependent.
 221
 222    For 64-bit words, we eliminate twice the number of comparisons
 223    and branches without increasing the number of arithmetic operations.
 224    It's almost certainly going to be a win with 64-bit word size.  */
 225
 226 static const uchar * search_line_acc_char (const uchar *, const uchar *)
 227   ATTRIBUTE_UNUSED;
 228
      /* Return a pointer to the first '\n', '\r', '\\' or '?' at or after S.
         END is not used: the loop has no bounds check and only stops at a
         match.  Memory is read one aligned word at a time, so the first
         load may include bytes located before S; those are masked out.  */
 229 static const uchar *
 230 search_line_acc_char (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 231 {
 232   const word_type repl_nl = acc_char_replicate ('\n');
 233   const word_type repl_cr = acc_char_replicate ('\r');
 234   const word_type repl_bs = acc_char_replicate ('\\');
 235   const word_type repl_qm = acc_char_replicate ('?');
 236
 237   unsigned int misalign;
 238   const word_type *p;
 239   word_type val, t;
 240
 241   /* Align the buffer.  Mask out any bytes from before the beginning.  */
 242   p = (word_type *)((uintptr_t)s & -sizeof(word_type));
 243   val = *p;
 244   misalign = (uintptr_t)s & (sizeof(word_type) - 1);
 245   if (misalign)
 246     val = acc_char_mask_misalign (val, misalign);
 247
 248   /* Main loop.  */
 249   while (1)
 250     {
 251       t  = acc_char_cmp (val, repl_nl);
 252       t |= acc_char_cmp (val, repl_cr);
 253       t |= acc_char_cmp (val, repl_bs);
 254       t |= acc_char_cmp (val, repl_qm);
 255
 256       if (__builtin_expect (t != 0, 0))
 257         {
                /* T may be a false positive; acc_char_index returns -1 in
                   that case and the scan simply continues.  */
 258           int i = acc_char_index (t, val);
 259           if (i >= 0)
 260             return (const uchar *)p + i;
 261         }
 262
 263       val = *++p;
 264     }
 265 }
 266
 267 /* Disable on Solaris 2/x86 until the following problems can be properly
 268    autoconfed:
 269
 270    The Solaris 8 assembler cannot assemble SSE2/SSE4.2 insns.
 271    The Solaris 9 assembler cannot assemble SSE4.2 insns.
 272    Before Solaris 9 Update 6, SSE insns cannot be executed.
 273    The Solaris 10+ assembler tags objects with the instruction set
 274    extensions used, so SSE4.2 executables cannot run on machines that
 275    don't support that extension.  */
 276
 277 #if (GCC_VERSION >= 4005) && (defined(__i386__) || defined(__x86_64__)) && !(defined(__sun__) && defined(__svr4__))
 278
 279 /* Replicated character data to be shared between implementations.
 280    Recall that outside of a context with vector support we can't
 281    define compatible vector types, therefore these are all defined
 282    in terms of raw characters.  Row 0 holds '\n', row 1 '\r', row 2
        '\\' and row 3 '?', each repeated 16 times.  The MMX scanner loads
        the first 8 bytes of a row, the SSE2 scanner all 16.  */
 283 static const char repl_chars[4][16] __attribute__((aligned(16))) = {
 284   { '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
 285     '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n' },
 286   { '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
 287     '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r' },
 288   { '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
 289     '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\' },
 290   { '?', '?', '?', '?', '?', '?', '?', '?',
 291     '?', '?', '?', '?', '?', '?', '?', '?' },
 292 };
 293
 294 /* A version of the fast scanner using MMX vectorized byte compare insns.
 295
 296    This uses the PMOVMSKB instruction which was introduced with "MMX2",
 297    which was packaged into SSE1; it is also present in the AMD MMX
 298    extension.  Mark the function as using "sse" so that we emit a real
 299    "emms" instruction, rather than the 3dNOW "femms" instruction.
        
        Returns a pointer to the first '\n', '\r', '\\' or '?' at or after
        S.  END is not used; the loop only stops at a match.  */
 300
 301 static const uchar *
 302 #ifndef __SSE__
 303 __attribute__((__target__("sse")))
 304 #endif
 305 search_line_mmx (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 306 {
 307   typedef char v8qi __attribute__ ((__vector_size__ (8)));
 308   typedef int __m64 __attribute__ ((__vector_size__ (8), __may_alias__));
 309
 310   const v8qi repl_nl = *(const v8qi *)repl_chars[0];
 311   const v8qi repl_cr = *(const v8qi *)repl_chars[1];
 312   const v8qi repl_bs = *(const v8qi *)repl_chars[2];
 313   const v8qi repl_qm = *(const v8qi *)repl_chars[3];
 314
 315   unsigned int misalign, found, mask;
 316   const v8qi *p;
 317   v8qi data, t, c;
 318
 319   /* Align the source pointer.  While MMX doesn't generate unaligned data
 320      faults, this allows us to safely scan to the end of the buffer without
 321      reading beyond the end of the last page.  */
 322   misalign = (uintptr_t)s & 7;
 323   p = (const v8qi *)((uintptr_t)s & -8);
 324   data = *p;
 325
 326   /* Create a mask for the bytes that are valid within the first
 327      8-byte block.  The idea here is that the AND with the mask
 328      within the loop is "free", since we need some AND or TEST
 329      insn in order to set the flags for the branch anyway.  */
 330   mask = -1u << misalign;
 331
 332   /* Main loop processing 8 bytes at a time.  The first iteration is
          entered at START so that the block loaded above is examined with
          the misalignment mask; later blocks use an all-ones mask.  */
 333   goto start;
 334   do
 335     {
 336       data = *++p;
 337       mask = -1;
 338
 339     start:
 340       t = __builtin_ia32_pcmpeqb(data, repl_nl);
 341       c = __builtin_ia32_pcmpeqb(data, repl_cr);
 342       t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
 343       c = __builtin_ia32_pcmpeqb(data, repl_bs);
 344       t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
 345       c = __builtin_ia32_pcmpeqb(data, repl_qm);
 346       t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
 347       found = __builtin_ia32_pmovmskb (t);
 348       found &= mask;
 349     }
 350   while (!found);
 351
 352   __builtin_ia32_emms ();
 353
 354   /* FOUND contains 1 in bits for which we matched a relevant
 355      character.  Conversion to the byte index is trivial.  */
 356   found = __builtin_ctz(found);
 357   return (const uchar *)p + found;
 358 }
 359
 360 /* A version of the fast scanner using SSE2 vectorized byte compare insns.
        Returns a pointer to the first '\n', '\r', '\\' or '?' at or after
        S.  END is not used; the loop only stops at a match.  */
 361
 362 static const uchar *
 363 #ifndef __SSE2__
 364 __attribute__((__target__("sse2")))
 365 #endif
 366 search_line_sse2 (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 367 {
 368   typedef char v16qi __attribute__ ((__vector_size__ (16)));
 369
 370   const v16qi repl_nl = *(const v16qi *)repl_chars[0];
 371   const v16qi repl_cr = *(const v16qi *)repl_chars[1];
 372   const v16qi repl_bs = *(const v16qi *)repl_chars[2];
 373   const v16qi repl_qm = *(const v16qi *)repl_chars[3];
 374
 375   unsigned int misalign, found, mask;
 376   const v16qi *p;
 377   v16qi data, t;
 378
 379   /* Align the source pointer.  The first load therefore starts at or
          before S; bytes before S are excluded by MASK below.  */
 380   misalign = (uintptr_t)s & 15;
 381   p = (const v16qi *)((uintptr_t)s & -16);
 382   data = *p;
 383
 384   /* Create a mask for the bytes that are valid within the first
 385      16-byte block.  The idea here is that the AND with the mask
 386      within the loop is "free", since we need some AND or TEST
 387      insn in order to set the flags for the branch anyway.  */
 388   mask = -1u << misalign;
 389
 390   /* Main loop processing 16 bytes at a time.  The first iteration is
          entered at START so that the block loaded above is examined with
          the misalignment mask; later blocks use an all-ones mask.  */
 391   goto start;
 392   do
 393     {
 394       data = *++p;
 395       mask = -1;
 396
 397     start:
 398       t  = __builtin_ia32_pcmpeqb128(data, repl_nl);
 399       t |= __builtin_ia32_pcmpeqb128(data, repl_cr);
 400       t |= __builtin_ia32_pcmpeqb128(data, repl_bs);
 401       t |= __builtin_ia32_pcmpeqb128(data, repl_qm);
 402       found = __builtin_ia32_pmovmskb128 (t);
 403       found &= mask;
 404     }
 405   while (!found);
 406
 407   /* FOUND contains 1 in bits for which we matched a relevant
 408      character.  Conversion to the byte index is trivial.  */
 409   found = __builtin_ctz(found);
 410   return (const uchar *)p + found;
 411 }
 412
 413 #ifdef HAVE_SSE4
 414 /* A version of the fast scanner using SSE 4.2 vectorized string insns.
        Returns a pointer to the first '\n', '\r', '?' or '\\' at or after
        S.  END is only consulted to decide whether an unaligned 16-byte
        read at S could run off the end of the page.  */
 415
 416 static const uchar *
 417 #ifndef __SSE4_2__
 418 __attribute__((__target__("sse4.2")))
 419 #endif
 420 search_line_sse42 (const uchar *s, const uchar *end)
 421 {
 422   typedef char v16qi __attribute__ ((__vector_size__ (16)));
        /* The four characters searched for; the remaining elements of the
           vector are zero.  */
 423   static const v16qi search = { '\n', '\r', '?', '\\' };
 424
 425   uintptr_t si = (uintptr_t)s;
 426   uintptr_t index;
 427
 428   /* Check for unaligned input.  */
 429   if (si & 15)
 430     {
 431       if (__builtin_expect (end - s < 16, 0)
 432           && __builtin_expect ((si & 0xfff) > 0xff0, 0))
 433         {
 434           /* There are less than 16 bytes left in the buffer, and less
 435              than 16 bytes left on the page.  Reading 16 bytes at this
 436              point might generate a spurious page fault.  Defer to the
 437              SSE2 implementation, which already handles alignment.  */
 438           return search_line_sse2 (s, end);
 439         }
 440
 441       /* ??? The builtin doesn't understand that the PCMPESTRI read from
 442          memory need not be aligned.  The "a" and "d" operands pass 4,
              the number of characters in SEARCH, and 16, the number of
              bytes examined at S.  An INDEX below 16 is treated as the
              offset of a match.  */
 443       __asm ("%vpcmpestri $0, (%1), %2"
 444              : "=c"(index) : "r"(s), "x"(search), "a"(4), "d"(16));
 445       if (__builtin_expect (index < 16, 0))
 446         goto found;
 447
 448       /* Advance the pointer to an aligned address.  We will re-scan a
 449          few bytes, but we no longer need care for reading past the
 450          end of a page, since we're guaranteed a match.  */
 451       s = (const uchar *)((si + 16) & -16);
 452     }
 453
 454   /* Main loop, processing 16 bytes at a time.  By doing the whole loop
 455      in inline assembly, we can make proper use of the flags set.  The
          initial SUB cancels the first ADD, so that S points at the block
          containing the match when the loop exits.  */
 456   __asm (      "sub $16, %1\n"
 457         "       .balign 16\n"
 458         "0:     add $16, %1\n"
 459         "       %vpcmpestri $0, (%1), %2\n"
 460         "       jnc 0b"
 461         : "=&c"(index), "+r"(s)
 462         : "x"(search), "a"(4), "d"(16));
 463
 464  found:
 465   return s + index;
 466 }
 467
 468 #else
 469 /* Work around out-dated assemblers without sse4 support.  */
 470 #define search_line_sse42 search_line_sse2
 471 #endif
 472
 473 /* Check the CPU capabilities.  */
 474
 475 #include "../gcc/config/i386/cpuid.h"
 476
      /* Signature shared by all line scanners, and the scanner selected by
         init_vectorized_lexer.  search_line_fast has no initializer here,
         so it is only usable after init_vectorized_lexer has run.  */
 477 typedef const uchar * (*search_line_fast_type) (const uchar *, const uchar *);
 478 static search_line_fast_type search_line_fast;
 479
      /* Pick the scanner implementation and store it in search_line_fast.
         HAVE_init_vectorized_lexer tells _cpp_init_lexer that this
         function exists.  */
 480 #define HAVE_init_vectorized_lexer 1
 481 static inline void
 482 init_vectorized_lexer (void)
 483 {
 484   unsigned dummy, ecx = 0, edx = 0;
 485   search_line_fast_type impl = search_line_acc_char;
 486   int minimum = 0;
 487
        /* MINIMUM is the instruction-set level this file is being compiled
           for: 3 for SSE4.2, 2 for SSE2, 1 for SSE, 0 otherwise.  */
 488 #if defined(__SSE4_2__)
 489   minimum = 3;
 490 #elif defined(__SSE2__)
 491   minimum = 2;
 492 #elif defined(__SSE__)
 493   minimum = 1;
 494 #endif
 495
        /* With SSE4.2 enabled at compile time no run-time check is made.
           Otherwise query CPUID leaf 1 and take the best implementation
           whose feature bit is set or which MINIMUM already implies.  */
 496   if (minimum == 3)
 497     impl = search_line_sse42;
 498   else if (__get_cpuid (1, &dummy, &dummy, &ecx, &edx) || minimum == 2)
 499     {
 500       if (minimum == 3 || (ecx & bit_SSE4_2))
 501         impl = search_line_sse42;
 502       else if (minimum == 2 || (edx & bit_SSE2))
 503         impl = search_line_sse2;
 504       else if (minimum == 1 || (edx & bit_SSE))
 505         impl = search_line_mmx;
 506     }
        /* Leaf 1 was unavailable: fall back to extended leaf 0x80000001,
           where the MMX scanner is chosen if both the MMXEXT and CMOV bits
           are set.  */
 507   else if (__get_cpuid (0x80000001, &dummy, &dummy, &dummy, &edx))
 508     {
 509       if (minimum == 1
 510           || (edx & (bit_MMXEXT | bit_CMOV)) == (bit_MMXEXT | bit_CMOV))
 511         impl = search_line_mmx;
 512     }
 513
 514   search_line_fast = impl;
 515 }
 516
 517 #elif (GCC_VERSION >= 4005) && defined(__ALTIVEC__)
 518
 519 /* A version of the fast scanner using AltiVec vectorized byte compares.
        Returns a pointer to the first '\n', '\r', '\\' or '?' at or after
        S.  END is not used; the loop only stops at a match.  */
 520 /* ??? Unfortunately, attribute(target("altivec")) is not yet supported,
 521    so we can't compile this function without -maltivec on the command line
 522    (or implied by some other switch).  */
 523
 524 static const uchar *
 525 search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 526 {
 527   typedef __attribute__((altivec(vector))) unsigned char vc;
 528
 529   const vc repl_nl = {
 530     '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
 531     '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n'
 532   };
 533   const vc repl_cr = {
 534     '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
 535     '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r'
 536   };
 537   const vc repl_bs = {
 538     '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
 539     '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\'
 540   };
 541   const vc repl_qm = {
 542     '?', '?', '?', '?', '?', '?', '?', '?',
 543     '?', '?', '?', '?', '?', '?', '?', '?',
 544   };
 545   const vc ones = {
 546     -1, -1, -1, -1, -1, -1, -1, -1,
 547     -1, -1, -1, -1, -1, -1, -1, -1,
 548   };
 549   const vc zero = { 0 };
 550
 551   vc data, mask, t;
 552
 553   /* Altivec loads automatically mask addresses with -16.  This lets us
 554      issue the first load as early as possible.  */
 555   data = __builtin_vec_ld(0, (const vc *)s);
 556
 557   /* Discard bytes before the beginning of the buffer.  Do this by
 558      beginning with all ones and shifting in zeros according to the
 559      mis-alignment.  The LVSR instruction pulls the exact shift we
 560      want from the address.  */
 561   mask = __builtin_vec_lvsr(0, s);
 562   mask = __builtin_vec_perm(zero, ones, mask);
 563   data &= mask;
 564
 565   /* While altivec loads mask addresses, we still need to align S so
 566      that the offset we compute at the end is correct.  */
 567   s = (const uchar *)((uintptr_t)s & -16);
 568
 569   /* Main loop processing 16 bytes at a time.  The first iteration is
          entered at START so that the masked block loaded above is
          examined before any further load.  */
 570   goto start;
 571   do
 572     {
 573       vc m_nl, m_cr, m_bs, m_qm;
 574
 575       s += 16;
 576       data = __builtin_vec_ld(0, (const vc *)s);
 577
 578     start:
 579       m_nl = (vc) __builtin_vec_cmpeq(data, repl_nl);
 580       m_cr = (vc) __builtin_vec_cmpeq(data, repl_cr);
 581       m_bs = (vc) __builtin_vec_cmpeq(data, repl_bs);
 582       m_qm = (vc) __builtin_vec_cmpeq(data, repl_qm);
 583       t = (m_nl | m_cr) | (m_bs | m_qm);
 584
 585       /* T now contains 0xff in bytes for which we matched one of the relevant
 586          characters.  We want to exit the loop if any byte in T is non-zero.
 587          Below is the expansion of vec_any_ne(t, zero).  */
 588     }
 589   while (!__builtin_vec_vcmpeq_p(/*__CR6_LT_REV*/3, t, zero));
 590
 591   {
        /* Number of unsigned longs that make up one vector: 4 when long
           is 4 bytes wide, 2 when it is 8.  */
 592 #define N  (sizeof(vc) / sizeof(long))
 593
        /* Compile-time check that N is 2 or 4: the array bound is -1
           otherwise.  */
 594     typedef char check_count[(N == 2 || N == 4) * 2 - 1];
 595     union {
 596       vc v;
 597       unsigned long l[N];
 598     } u;
 599     unsigned long l, i = 0;
 600
 601     u.v = t;
 602
 603     /* Find the first word of T that is non-zero.  S is advanced past
            every all-zero word that gets skipped.  */
 604     switch (N)
 605       {
 606       case 4:
 607         l = u.l[i++];
 608         if (l != 0)
 609           break;
 610         s += sizeof(unsigned long);
 611         l = u.l[i++];
 612         if (l != 0)
 613           break;
 614         s += sizeof(unsigned long);
              /* Fall through: the remaining two words are handled exactly
                 like the N == 2 case.  */
 615       case 2:
 616         l = u.l[i++];
 617         if (l != 0)
 618           break;
 619         s += sizeof(unsigned long);
 620         l = u.l[i];
 621       }
 622
 623     /* L now contains 0xff in bytes for which we matched one of the
 624        relevant characters.  We can find the byte index by finding
 625        its bit index and dividing by 8.  */
 626     l = __builtin_clzl(l) >> 3;
 627     return s + l;
 628
 629 #undef N
 630   }
 631 }
 632
 633 #else
 634
 635 /* We only have one accelerated alternative.  Use a direct call so that
 636    we encourage inlining.  */
 637
 638 #define search_line_fast  search_line_acc_char
 639
 640 #endif
 641
 /* Initialize the lexer if needed.  When this configuration defines
    HAVE_init_vectorized_lexer, that means calling
    init_vectorized_lexer; otherwise there is nothing to do.  */

 void
 _cpp_init_lexer (void)
 {
 #if defined (HAVE_init_vectorized_lexer)
   init_vectorized_lexer ();
 #endif
 }
 651
 652 /* Returns with a logical line that contains no escaped newlines or
 653    trigraphs.  This is a time-critical inner loop.
        
        The line starts at PFILE->buffer->next_line.  On return buffer->cur
        and buffer->line_base point at its start, the line is terminated
        by a '\n' stored at its end, buffer->next_line points just past
        the input consumed, and the note array holds one entry per escaped
        newline and per trigraph seen, followed by a sentinel.  Escaped
        newlines, and trigraphs when the trigraphs option is on, are
        removed by rewriting the buffer in place.  */
 654 void
 655 _cpp_clean_line (cpp_reader *pfile)
 656 {
 657   cpp_buffer *buffer;
 658   const uchar *s;
 659   uchar c, *d, *p;
 660
 661   buffer = pfile->buffer;
 662   buffer->cur_note = buffer->notes_used = 0;
 663   buffer->cur = buffer->line_base = buffer->next_line;
 664   buffer->need_line = false;
 665   s = buffer->next_line;
 666
 667   if (!buffer->from_stage3)
 668     {
 669       const uchar *pbackslash = NULL;
 670
 671       /* Fast path.  This is the common case of an un-escaped line with
 672          no trigraphs.  The primary win here is by not writing any
 673          data back to memory until we have to.  */
 674       while (1)
 675         {
 676           /* Perform an optimized search for \n, \r, \\, ?.  */
 677           s = search_line_fast (s, buffer->rlimit);
 678
 679           c = *s;
 680           if (c == '\\')
 681             {
 682               /* Record the location of the backslash and continue.  */
 683               pbackslash = s++;
 684             }
 685           else if (__builtin_expect (c == '?', 0))
 686             {
 687               if (__builtin_expect (s[1] == '?', false)
 688                    && _cpp_trigraph_map[s[2]])
 689                 {
 690                   /* Have a trigraph.  We may or may not have to convert
 691                      it.  Add a line note regardless, for -Wtrigraphs.  */
 692                   add_line_note (buffer, s, s[2]);
 693                   if (CPP_OPTION (pfile, trigraphs))
 694                     {
 695                       /* We do, and that means we have to switch to the
 696                          slow path.  */
 697                       d = (uchar *) s;
 698                       *d = _cpp_trigraph_map[s[2]];
 699                       s += 2;
 700                       goto slow_path;
 701                     }
 702                 }
 703               /* Not a trigraph.  Continue on fast-path.  */
 704               s++;
 705             }
 706           else
 707             break;
 708         }
 709
 710       /* This must be \r or \n.  We're either done, or we'll be forced
 711          to write back to the buffer and continue on the slow path.  */
 712       d = (uchar *) s;
 713
 714       if (__builtin_expect (s == buffer->rlimit, false))
 715         goto done;
 716
 717       /* DOS line ending? */
 718       if (__builtin_expect (c == '\r', false) && s[1] == '\n')
 719         {
 720           s++;
 721           if (s == buffer->rlimit)
 722             goto done;
 723         }
 724
 725       if (__builtin_expect (pbackslash == NULL, true))
 726         goto done;
 727
 728       /* Check for escaped newline.  The backward scan needs no bounds
              check: PBACKSLASH is non-null here, so a backslash, which is
              not skipped as white space, precedes D on this line.  */
 729       p = d;
 730       while (is_nvspace (p[-1]))
 731         p--;
 732       if (p - 1 != pbackslash)
 733         goto done;
 734
 735       /* Have an escaped newline; process it and proceed to
 736          the slow path.  The slow path pre-increments D before each
              store, so D = P - 2 makes the next character overwrite the
              backslash at P - 1.  NEXT_LINE is reused below as the start
              of the current physical line.  */
 737       add_line_note (buffer, p - 1, p != d ? ' ' : '\\');
 738       d = p - 2;
 739       buffer->next_line = p - 1;
 740
 741     slow_path:
            /* Copy characters from S down to D, dropping escaped newlines
               and, if enabled, converting trigraphs as we go.  */
 742       while (1)
 743         {
 744           c = *++s;
 745           *++d = c;
 746
 747           if (c == '\n' || c == '\r')
 748             {
 749               /* Handle DOS line endings.  */
 750               if (c == '\r' && s != buffer->rlimit && s[1] == '\n')
 751                 s++;
 752               if (s == buffer->rlimit)
 753                 break;
 754
 755               /* Escaped?  */
 756               p = d;
 757               while (p != buffer->next_line && is_nvspace (p[-1]))
 758                 p--;
 759               if (p == buffer->next_line || p[-1] != '\\')
 760                 break;
 761
 762               add_line_note (buffer, p - 1, p != d ? ' ': '\\');
 763               d = p - 2;
 764               buffer->next_line = p - 1;
 765             }
 766           else if (c == '?' && s[1] == '?' && _cpp_trigraph_map[s[2]])
 767             {
 768               /* Add a note regardless, for the benefit of -Wtrigraphs.  */
 769               add_line_note (buffer, d, s[2]);
 770               if (CPP_OPTION (pfile, trigraphs))
 771                 {
 772                   *d = _cpp_trigraph_map[s[2]];
 773                   s += 2;
 774                 }
 775             }
 776         }
 777     }
 778   else
 779     {
            /* With from_stage3 set no escaped newlines or trigraphs are
               looked for; only the end of the line is located.  */
 780       while (*s != '\n' && *s != '\r')
 781         s++;
 782       d = (uchar *) s;
 783
 784       /* Handle DOS line endings.  */
 785       if (*s == '\r' && s != buffer->rlimit && s[1] == '\n')
 786         s++;
 787     }
 788
 789  done:
        /* D is the end of the cleaned line, S the last input character
           consumed.  */
 790   *d = '\n';
 791   /* A sentinel note that should never be processed.  */
 792   add_line_note (buffer, d + 1, '\n');
 793   buffer->next_line = s + 1;
 794 }
 795
 796 /* Return true if the trigraph indicated by NOTE should be warned
 797    about in a comment.  */
 798 static bool
 799 warn_in_comment (cpp_reader *pfile, _cpp_line_note *note)
 800 {
 801   const uchar *p;
 802
 803   /* Within comments we don't warn about trigraphs, unless the
 804      trigraph forms an escaped newline, as that may change
 805      behavior.  */
 806   if (note->type != '/')
 807     return false;
 808
 809   /* If -trigraphs, then this was an escaped newline iff the next note
 810      is coincident.  */
 811   if (CPP_OPTION (pfile, trigraphs))
 812     return note[1].pos == note->pos;
 813
 814   /* Otherwise, see if this forms an escaped newline.  */
 815   p = note->pos + 3;
 816   while (is_nvspace (*p))
 817     p++;
 818
 819   /* There might have been escaped newlines between the trigraph and the
 820      newline we found.  Hence the position test.  */
 821   return (*p == '\n' && p < note[1].pos);
 822 }
 823
 824 /* Process the notes created by add_line_note as far as the current
 825    location.  */
 826 void
 827 _cpp_process_line_notes (cpp_reader *pfile, int in_comment)
 828 {
 829   cpp_buffer *buffer = pfile->buffer;
 830
 831   for (;;)
 832     {
 833       _cpp_line_note *note = &buffer->notes[buffer->cur_note];
 834       unsigned int col;
 835
 836       if (note->pos > buffer->cur)
 837         break;
 838
 839       buffer->cur_note++;
 840       col = CPP_BUF_COLUMN (buffer, note->pos + 1);
 841
 842       if (note->type == '\\' || note->type == ' ')
 843         {
 844           if (note->type == ' ' && !in_comment)
 845             cpp_error_with_line (pfile, CPP_DL_WARNING, pfile->line_table->highest_line, col,
 846                                  "backslash and newline separated by space");
 847
 848           if (buffer->next_line > buffer->rlimit)
 849             {
 850               cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line, col,
 851                                    "backslash-newline at end of file");
 852               /* Prevent "no newline at end of file" warning.  */
 853               buffer->next_line = buffer->rlimit;
 854             }
 855
 856           buffer->line_base = note->pos;
 857           CPP_INCREMENT_LINE (pfile, 0);
 858         }
 859       else if (_cpp_trigraph_map[note->type])
 860         {
 861           if (CPP_OPTION (pfile, warn_trigraphs)
 862               && (!in_comment || warn_in_comment (pfile, note)))
 863             {
 864               if (CPP_OPTION (pfile, trigraphs))
 865                 cpp_warning_with_line (pfile, CPP_W_TRIGRAPHS,
 866                                        pfile->line_table->highest_line, col,
 867                                        "trigraph ??%c converted to %c",
 868                                        note->type,
 869                                        (int) _cpp_trigraph_map[note->type]);
 870               else
 871                 {
 872                   cpp_warning_with_line
 873                     (pfile, CPP_W_TRIGRAPHS,
 874                      pfile->line_table->highest_line, col,
 875                      "trigraph ??%c ignored, use -trigraphs to enable",
 876                      note->type);
 877                 }
 878             }
 879         }
 880       else if (note->type == 0)
 881         /* Already processed in lex_raw_string.  */;
 882       else
 883         abort ();
 884     }
 885 }
 886
 887 /* Skip a C-style block comment.  We find the end of the comment by
 888    seeing if an asterisk is before every '/' we encounter.  Returns
 889    nonzero if comment terminated by EOF, zero otherwise.
 890
 891    Buffer->cur points to the initial asterisk of the comment.  */
 892 bool
 893 _cpp_skip_block_comment (cpp_reader *pfile)
 894 {
 895   cpp_buffer *buffer = pfile->buffer;
 896   const uchar *cur = buffer->cur;
 897   uchar c;
 898
 899   cur++;
 900   if (*cur == '/')
 901     cur++;
 902
 903   for (;;)
 904     {
 905       /* People like decorating comments with '*', so check for '/'
 906          instead for efficiency.  */
 907       c = *cur++;
 908
 909       if (c == '/')
 910         {
 911           if (cur[-2] == '*')
 912             break;
 913
 914           /* Warn about potential nested comments, but not if the '/'
 915              comes immediately before the true comment delimiter.
 916              Don't bother to get it right across escaped newlines.  */
 917           if (CPP_OPTION (pfile, warn_comments)
 918               && cur[0] == '*' && cur[1] != '/')
 919             {
 920               buffer->cur = cur;
 921               cpp_warning_with_line (pfile, CPP_W_COMMENTS,
 922                                      pfile->line_table->highest_line,
 923                                      CPP_BUF_COL (buffer),
 924                                      "\"/*\" within comment");
 925             }
 926         }
 927       else if (c == '\n')
 928         {
 929           unsigned int cols;
 930           buffer->cur = cur - 1;
 931           _cpp_process_line_notes (pfile, true);
 932           if (buffer->next_line >= buffer->rlimit)
 933             return true;
 934           _cpp_clean_line (pfile);
 935
 936           cols = buffer->next_line - buffer->line_base;
 937           CPP_INCREMENT_LINE (pfile, cols);
 938
 939           cur = buffer->cur;
 940         }
 941     }
 942
 943   buffer->cur = cur;
 944   _cpp_process_line_notes (pfile, true);
 945   return false;
 946 }
 947
 948 /* Skip a C++ line comment, leaving buffer->cur pointing to the
 949    terminating newline.  Handles escaped newlines.  Returns nonzero
 950    if a multiline comment.  */
 951 static int
 952 skip_line_comment (cpp_reader *pfile)
 953 {
 954   cpp_buffer *buffer = pfile->buffer;
 955   source_location orig_line = pfile->line_table->highest_line;
 956
 957   while (*buffer->cur != '\n')
 958     buffer->cur++;
 959
 960   _cpp_process_line_notes (pfile, true);
 961   return orig_line != pfile->line_table->highest_line;
 962 }
 963
 964 /* Skips whitespace, saving the next non-whitespace character.  */
 965 static void
 966 skip_whitespace (cpp_reader *pfile, cppchar_t c)
 967 {
 968   cpp_buffer *buffer = pfile->buffer;
 969   bool saw_NUL = false;
 970
 971   do
 972     {
 973       /* Horizontal space always OK.  */
 974       if (c == ' ' || c == '\t')
 975         ;
 976       /* Just \f \v or \0 left.  */
 977       else if (c == '\0')
 978         saw_NUL = true;
 979       else if (pfile->state.in_directive && CPP_PEDANTIC (pfile))
 980         cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line,
 981                              CPP_BUF_COL (buffer),
 982                              "%s in preprocessing directive",
 983                              c == '\f' ? "form feed" : "vertical tab");
 984
 985       c = *buffer->cur++;
 986     }
 987   /* We only want non-vertical space, i.e. ' ' \t \f \v \0.  */
 988   while (is_nvspace (c));
 989
 990   if (saw_NUL)
 991     cpp_error (pfile, CPP_DL_WARNING, "null character(s) ignored");
 992
 993   buffer->cur--;
 994 }
 995
 996 /* See if the characters of a number token are valid in a name (no
 997    '.', '+' or '-').  */
 998 static int
 999 name_p (cpp_reader *pfile, const cpp_string *string)
1000 {
1001   unsigned int i;
1002
1003   for (i = 0; i < string->len; i++)
1004     if (!is_idchar (string->text[i]))
1005       return 0;
1006
1007   return 1;
1008 }
1009
1010 /* After parsing an identifier or other sequence, produce a warning about
1011    sequences not in NFC/NFKC.  */
1012 static void
1013 warn_about_normalization (cpp_reader *pfile,
1014                           const cpp_token *token,
1015                           const struct normalize_state *s)
1016 {
1017   if (CPP_OPTION (pfile, warn_normalize) < NORMALIZE_STATE_RESULT (s)
1018       && !pfile->state.skipping)
1019     {
1020       /* Make sure that the token is printed using UCNs, even
1021          if we'd otherwise happily print UTF-8.  */
1022       unsigned char *buf = XNEWVEC (unsigned char, cpp_token_len (token));
1023       size_t sz;
1024
1025       sz = cpp_spell_token (pfile, token, buf, false) - buf;
1026       if (NORMALIZE_STATE_RESULT (s) == normalized_C)
1027         cpp_warning_with_line (pfile, CPP_W_NORMALIZE, token->src_loc, 0,
1028                                "`%.*s' is not in NFKC", (int) sz, buf);
1029       else
1030         cpp_warning_with_line (pfile, CPP_W_NORMALIZE, token->src_loc, 0,
1031                                "`%.*s' is not in NFC", (int) sz, buf);
1032     }
1033 }
1034
1035 /* Returns TRUE if the sequence starting at buffer->cur is invalid in
1036    an identifier.  FIRST is TRUE if this starts an identifier.  */
1037 static bool
1038 forms_identifier_p (cpp_reader *pfile, int first,
1039                     struct normalize_state *state)
1040 {
1041   cpp_buffer *buffer = pfile->buffer;
1042
1043   if (*buffer->cur == '$')
1044     {
1045       if (!CPP_OPTION (pfile, dollars_in_ident))
1046         return false;
1047
1048       buffer->cur++;
1049       if (CPP_OPTION (pfile, warn_dollars) && !pfile->state.skipping)
1050         {
1051           CPP_OPTION (pfile, warn_dollars) = 0;
1052           cpp_error (pfile, CPP_DL_PEDWARN, "'$' in identifier or number");
1053         }
1054
1055       return true;
1056     }
1057
1058   /* Is this a syntactically valid UCN?  */
1059   if (CPP_OPTION (pfile, extended_identifiers)
1060       && *buffer->cur == '\\'
1061       && (buffer->cur[1] == 'u' || buffer->cur[1] == 'U'))
1062     {
1063       buffer->cur += 2;
1064       if (_cpp_valid_ucn (pfile, &buffer->cur, buffer->rlimit, 1 + !first,
1065                           state))
1066         return true;
1067       buffer->cur -= 2;
1068     }
1069
1070   return false;
1071 }
1072
1073 /* Helper function to get the cpp_hashnode of the identifier BASE.  */
1074 static cpp_hashnode *
1075 lex_identifier_intern (cpp_reader *pfile, const uchar *base)
1076 {
1077   cpp_hashnode *result;
1078   const uchar *cur;
1079   unsigned int len;
1080   unsigned int hash = HT_HASHSTEP (0, *base);
1081
1082   cur = base + 1;
1083   while (ISIDNUM (*cur))
1084     {
1085       hash = HT_HASHSTEP (hash, *cur);
1086       cur++;
1087     }
1088   len = cur - base;
1089   hash = HT_HASHFINISH (hash, len);
1090   result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
1091                                               base, len, hash, HT_ALLOC));
1092
1093   /* Rarely, identifiers require diagnostics when lexed.  */
1094   if (__builtin_expect ((result->flags & NODE_DIAGNOSTIC)
1095                         && !pfile->state.skipping, 0))
1096     {
1097       /* It is allowed to poison the same identifier twice.  */
1098       if ((result->flags & NODE_POISONED) && !pfile->state.poisoned_ok)
1099         cpp_error (pfile, CPP_DL_ERROR, "attempt to use poisoned \"%s\"",
1100                    NODE_NAME (result));
1101
1102       /* Constraint 6.10.3.5: __VA_ARGS__ should only appear in the
1103          replacement list of a variadic macro.  */
1104       if (result == pfile->spec_nodes.n__VA_ARGS__
1105           && !pfile->state.va_args_ok)
1106         cpp_error (pfile, CPP_DL_PEDWARN,
1107                    "__VA_ARGS__ can only appear in the expansion"
1108                    " of a C99 variadic macro");
1109
1110       /* For -Wc++-compat, warn about use of C++ named operators.  */
1111       if (result->flags & NODE_WARN_OPERATOR)
1112         cpp_warning (pfile, CPP_W_CXX_OPERATOR_NAMES,
1113                      "identifier \"%s\" is a special operator name in C++",
1114                      NODE_NAME (result));
1115     }
1116
1117   return result;
1118 }
1119
1120 /* Get the cpp_hashnode of an identifier specified by NAME in
1121    the current cpp_reader object.  If none is found, NULL is returned.  */
1122 cpp_hashnode *
1123 _cpp_lex_identifier (cpp_reader *pfile, const char *name)
1124 {
1125   cpp_hashnode *result;
1126   result = lex_identifier_intern (pfile, (uchar *) name);
1127   return result;
1128 }
1129
1130 /* Lex an identifier starting at BUFFER->CUR - 1.  */
1131 static cpp_hashnode *
1132 lex_identifier (cpp_reader *pfile, const uchar *base, bool starts_ucn,
1133                 struct normalize_state *nst)
1134 {
1135   cpp_hashnode *result;
1136   const uchar *cur;
1137   unsigned int len;
1138   unsigned int hash = HT_HASHSTEP (0, *base);
1139
1140   cur = pfile->buffer->cur;
1141   if (! starts_ucn)
1142     while (ISIDNUM (*cur))
1143       {
1144         hash = HT_HASHSTEP (hash, *cur);
1145         cur++;
1146       }
1147   pfile->buffer->cur = cur;
1148   if (starts_ucn || forms_identifier_p (pfile, false, nst))
1149     {
1150       /* Slower version for identifiers containing UCNs (or $).  */
1151       do {
1152         while (ISIDNUM (*pfile->buffer->cur))
1153           {
1154             pfile->buffer->cur++;
1155             NORMALIZE_STATE_UPDATE_IDNUM (nst);
1156           }
1157       } while (forms_identifier_p (pfile, false, nst));
1158       result = _cpp_interpret_identifier (pfile, base,
1159                                           pfile->buffer->cur - base);
1160     }
1161   else
1162     {
1163       len = cur - base;
1164       hash = HT_HASHFINISH (hash, len);
1165
1166       result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
1167                                                   base, len, hash, HT_ALLOC));
1168     }
1169
1170   /* Rarely, identifiers require diagnostics when lexed.  */
1171   if (__builtin_expect ((result->flags & NODE_DIAGNOSTIC)
1172                         && !pfile->state.skipping, 0))
1173     {
1174       /* It is allowed to poison the same identifier twice.  */
1175       if ((result->flags & NODE_POISONED) && !pfile->state.poisoned_ok)
1176         cpp_error (pfile, CPP_DL_ERROR, "attempt to use poisoned \"%s\"",
1177                    NODE_NAME (result));
1178
1179       /* Constraint 6.10.3.5: __VA_ARGS__ should only appear in the
1180          replacement list of a variadic macro.  */
1181       if (result == pfile->spec_nodes.n__VA_ARGS__
1182           && !pfile->state.va_args_ok)
1183         cpp_error (pfile, CPP_DL_PEDWARN,
1184                    "__VA_ARGS__ can only appear in the expansion"
1185                    " of a C99 variadic macro");
1186
1187       /* For -Wc++-compat, warn about use of C++ named operators.  */
1188       if (result->flags & NODE_WARN_OPERATOR)
1189         cpp_warning (pfile, CPP_W_CXX_OPERATOR_NAMES,
1190                      "identifier \"%s\" is a special operator name in C++",
1191                      NODE_NAME (result));
1192     }
1193
1194   return result;
1195 }
1196
1197 /* Lex a number to NUMBER starting at BUFFER->CUR - 1.  */
1198 static void
1199 lex_number (cpp_reader *pfile, cpp_string *number,
1200             struct normalize_state *nst)
1201 {
1202   const uchar *cur;
1203   const uchar *base;
1204   uchar *dest;
1205
1206   base = pfile->buffer->cur - 1;
1207   do
1208     {
1209       cur = pfile->buffer->cur;
1210
1211       /* N.B. ISIDNUM does not include $.  */
1212       while (ISIDNUM (*cur) || *cur == '.' || VALID_SIGN (*cur, cur[-1]))
1213         {
1214           cur++;
1215           NORMALIZE_STATE_UPDATE_IDNUM (nst);
1216         }
1217
1218       pfile->buffer->cur = cur;
1219     }
1220   while (forms_identifier_p (pfile, false, nst));
1221
1222   number->len = cur - base;
1223   dest = _cpp_unaligned_alloc (pfile, number->len + 1);
1224   memcpy (dest, base, number->len);
1225   dest[number->len] = '\0';
1226   number->text = dest;
1227 }
1228
1229 /* Create a token of type TYPE with a literal spelling.  */
1230 static void
1231 create_literal (cpp_reader *pfile, cpp_token *token, const uchar *base,
1232                 unsigned int len, enum cpp_ttype type)
1233 {
1234   uchar *dest = _cpp_unaligned_alloc (pfile, len + 1);
1235
1236   memcpy (dest, base, len);
1237   dest[len] = '\0';
1238   token->type = type;
1239   token->val.str.len = len;
1240   token->val.str.text = dest;
1241 }
1242
1243 /* Subroutine of lex_raw_string: Append LEN chars from BASE to the buffer
1244    sequence from *FIRST_BUFF_P to LAST_BUFF_P.  */
1245
1246 static void
1247 bufring_append (cpp_reader *pfile, const uchar *base, size_t len,
1248                 _cpp_buff **first_buff_p, _cpp_buff **last_buff_p)
1249 {
1250   _cpp_buff *first_buff = *first_buff_p;
1251   _cpp_buff *last_buff = *last_buff_p;
1252
1253   if (first_buff == NULL)
1254     first_buff = last_buff = _cpp_get_buff (pfile, len);
1255   else if (len > BUFF_ROOM (last_buff))
1256     {
1257       size_t room = BUFF_ROOM (last_buff);
1258       memcpy (BUFF_FRONT (last_buff), base, room);
1259       BUFF_FRONT (last_buff) += room;
1260       base += room;
1261       len -= room;
1262       last_buff = _cpp_append_extend_buff (pfile, last_buff, len);
1263     }
1264
1265   memcpy (BUFF_FRONT (last_buff), base, len);
1266   BUFF_FRONT (last_buff) += len;
1267
1268   *first_buff_p = first_buff;
1269   *last_buff_p = last_buff;
1270 }
1271
1272 /* Lexes a raw string.  The stored string contains the spelling, including
1273    double quotes, delimiter string, '(' and ')', any leading
1274    'L', 'u', 'U' or 'u8' and 'R' modifier.  It returns the type of the
1275    literal, or CPP_OTHER if it was not properly terminated.
1276
1277    The spelling is NUL-terminated, but it is not guaranteed that this
1278    is the first NUL since embedded NULs are preserved.  */
1279
1280 static void
1281 lex_raw_string (cpp_reader *pfile, cpp_token *token, const uchar *base,
1282                 const uchar *cur)
1283 {
1284   const uchar *raw_prefix;
1285   unsigned int raw_prefix_len = 0;
1286   enum cpp_ttype type;
1287   size_t total_len = 0;
1288   _cpp_buff *first_buff = NULL, *last_buff = NULL;
1289   _cpp_line_note *note = &pfile->buffer->notes[pfile->buffer->cur_note];
1290
1291   type = (*base == 'L' ? CPP_WSTRING :
1292           *base == 'U' ? CPP_STRING32 :
1293           *base == 'u' ? (base[1] == '8' ? CPP_UTF8STRING : CPP_STRING16)
1294           : CPP_STRING);
1295
1296   raw_prefix = cur + 1;
1297   while (raw_prefix_len < 16)
1298     {
1299       switch (raw_prefix[raw_prefix_len])
1300         {
1301         case ' ': case '(': case ')': case '\\': case '\t':
1302         case '\v': case '\f': case '\n': default:
1303           break;
1304         /* Basic source charset except the above chars.  */
1305         case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
1306         case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
1307         case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
1308         case 's': case 't': case 'u': case 'v': case 'w': case 'x':
1309         case 'y': case 'z':
1310         case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
1311         case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
1312         case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
1313         case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
1314         case 'Y': case 'Z':
1315         case '0': case '1': case '2': case '3': case '4': case '5':
1316         case '6': case '7': case '8': case '9':
1317         case '_': case '{': case '}': case '#': case '[': case ']':
1318         case '<': case '>': case '%': case ':': case ';': case '.':
1319         case '?': case '*': case '+': case '-': case '/': case '^':
1320         case '&': case '|': case '~': case '!': case '=': case ',':
1321         case '"': case '\'':
1322           raw_prefix_len++;
1323           continue;
1324         }
1325       break;
1326     }
1327
1328   if (raw_prefix[raw_prefix_len] != '(')
1329     {
1330       int col = CPP_BUF_COLUMN (pfile->buffer, raw_prefix + raw_prefix_len)
1331                 + 1;
1332       if (raw_prefix_len == 16)
1333         cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc, col,
1334                              "raw string delimiter longer than 16 characters");
1335       else
1336         cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc, col,
1337                              "invalid character '%c' in raw string delimiter",
1338                              (int) raw_prefix[raw_prefix_len]);
1339       pfile->buffer->cur = raw_prefix - 1;
1340       create_literal (pfile, token, base, raw_prefix - 1 - base, CPP_OTHER);
1341       return;
1342     }
1343
1344   cur = raw_prefix + raw_prefix_len + 1;
1345   for (;;)
1346     {
1347 #define BUF_APPEND(STR,LEN)                                     \
1348       do {                                                      \
1349         bufring_append (pfile, (const uchar *)(STR), (LEN),     \
1350                         &first_buff, &last_buff);               \
1351         total_len += (LEN);                                     \
1352       } while (0);
1353
1354       cppchar_t c;
1355
1356       /* If we previously performed any trigraph or line splicing
1357          transformations, undo them within the body of the raw string.  */
1358       while (note->pos < cur)
1359         ++note;
1360       for (; note->pos == cur; ++note)
1361         {
1362           switch (note->type)
1363             {
1364             case '\\':
1365             case ' ':
1366               /* Restore backslash followed by newline.  */
1367               BUF_APPEND (base, cur - base);
1368               base = cur;
1369               BUF_APPEND ("\\", 1);
1370             after_backslash:
1371               if (note->type == ' ')
1372                 {
1373                   /* GNU backslash whitespace newline extension.  FIXME
1374                      could be any sequence of non-vertical space.  When we
1375                      can properly restore any such sequence, we should mark
1376                      this note as handled so _cpp_process_line_notes
1377                      doesn't warn.  */
1378                   BUF_APPEND (" ", 1);
1379                 }
1380
1381               BUF_APPEND ("\n", 1);
1382               break;
1383
1384             case 0:
1385               /* Already handled.  */
1386               break;
1387
1388             default:
1389               if (_cpp_trigraph_map[note->type])
1390                 {
1391                   /* Don't warn about this trigraph in
1392                      _cpp_process_line_notes, since trigraphs show up as
1393                      trigraphs in raw strings.  */
1394                   uchar type = note->type;
1395                   note->type = 0;
1396
1397                   if (!CPP_OPTION (pfile, trigraphs))
1398                     /* If we didn't convert the trigraph in the first
1399                        place, don't do anything now either.  */
1400                     break;
1401
1402                   BUF_APPEND (base, cur - base);
1403                   base = cur;
1404                   BUF_APPEND ("??", 2);
1405
1406                   /* ??/ followed by newline gets two line notes, one for
1407                      the trigraph and one for the backslash/newline.  */
1408                   if (type == '/' && note[1].pos == cur)
1409                     {
1410                       if (note[1].type != '\\'
1411                           && note[1].type != ' ')
1412                         abort ();
1413                       BUF_APPEND ("/", 1);
1414                       ++note;
1415                       goto after_backslash;
1416                     }
1417                   /* The ) from ??) could be part of the suffix.  */
1418                   else if (type == ')'
1419                            && strncmp ((const char *) cur+1,
1420                                        (const char *) raw_prefix,
1421                                        raw_prefix_len) == 0
1422                            && cur[raw_prefix_len+1] == '"')
1423                     {
1424                       BUF_APPEND (")", 1);
1425                       base++;
1426                       cur += raw_prefix_len + 2;
1427                       goto break_outer_loop;
1428                     }
1429                   else
1430                     {
1431                       /* Skip the replacement character.  */
1432                       base = ++cur;
1433                       BUF_APPEND (&type, 1);
1434                     }
1435                 }
1436               else
1437                 abort ();
1438               break;
1439             }
1440         }
1441       c = *cur++;
1442
1443       if (c == ')'
1444           && strncmp ((const char *) cur, (const char *) raw_prefix,
1445                       raw_prefix_len) == 0
1446           && cur[raw_prefix_len] == '"')
1447         {
1448           cur += raw_prefix_len + 1;
1449           break;
1450         }
1451       else if (c == '\n')
1452         {
1453           if (pfile->state.in_directive
1454               || pfile->state.parsing_args
1455               || pfile->state.in_deferred_pragma)
1456             {
1457               cur--;
1458               type = CPP_OTHER;
1459               cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc, 0,
1460                                    "unterminated raw string");
1461               break;
1462             }
1463
1464           BUF_APPEND (base, cur - base);
1465
1466           if (pfile->buffer->cur < pfile->buffer->rlimit)
1467             CPP_INCREMENT_LINE (pfile, 0);
1468           pfile->buffer->need_line = true;
1469
1470           pfile->buffer->cur = cur-1;
1471           _cpp_process_line_notes (pfile, false);
1472           if (!_cpp_get_fresh_line (pfile))
1473             {
1474               source_location src_loc = token->src_loc;
1475               token->type = CPP_EOF;
1476               /* Tell the compiler the line number of the EOF token.  */
1477               token->src_loc = pfile->line_table->highest_line;
1478               token->flags = BOL;
1479               if (first_buff != NULL)
1480                 _cpp_release_buff (pfile, first_buff);
1481               cpp_error_with_line (pfile, CPP_DL_ERROR, src_loc, 0,
1482                                    "unterminated raw string");
1483               return;
1484             }
1485
1486           cur = base = pfile->buffer->cur;
1487           note = &pfile->buffer->notes[pfile->buffer->cur_note];
1488         }
1489     }
1490  break_outer_loop:
1491
1492   if (CPP_OPTION (pfile, user_literals))
1493     {
1494       /* According to C++11 [lex.ext]p10, a ud-suffix not starting with an
1495          underscore is ill-formed.  Since this breaks programs using macros
1496          from inttypes.h, we generate a warning and treat the ud-suffix as a
1497          separate preprocessing token.  This approach is under discussion by
1498          the standards committee, and has been adopted as a conforming
1499          extension by other front ends such as clang. */
1500       if (ISALPHA(*cur))
1501         {
1502           // Raise a warning, but do not consume subsequent tokens.
1503           if (CPP_OPTION (pfile, warn_literal_suffix))
1504             cpp_warning_with_line (pfile, CPP_W_LITERAL_SUFFIX,
1505                                    token->src_loc, 0,
1506                                    "invalid suffix on literal; C++11 requires "
1507                                    "a space between literal and identifier");
1508         }
1509       /* Grab user defined literal suffix.  */
1510       else if (*cur == '_')
1511         {
1512           type = cpp_userdef_string_add_type (type);
1513           ++cur;
1514
1515           while (ISIDNUM (*cur))
1516             ++cur;
1517         }
1518     }
1519
1520   pfile->buffer->cur = cur;
1521   if (first_buff == NULL)
1522     create_literal (pfile, token, base, cur - base, type);
1523   else
1524     {
1525       uchar *dest = _cpp_unaligned_alloc (pfile, total_len + (cur - base) + 1);
1526
1527       token->type = type;
1528       token->val.str.len = total_len + (cur - base);
1529       token->val.str.text = dest;
1530       last_buff = first_buff;
1531       while (last_buff != NULL)
1532         {
1533           memcpy (dest, last_buff->base,
1534                   BUFF_FRONT (last_buff) - last_buff->base);
1535           dest += BUFF_FRONT (last_buff) - last_buff->base;
1536           last_buff = last_buff->next;
1537         }
1538       _cpp_release_buff (pfile, first_buff);
1539       memcpy (dest, base, cur - base);
1540       dest[cur - base] = '\0';
1541     }
1542 }
1543
1544 /* Lexes a string, character constant, or angle-bracketed header file
1545    name.  The stored string contains the spelling, including opening
1546    quote and any leading 'L', 'u', 'U' or 'u8' and optional
1547    'R' modifier.  It returns the type of the literal, or CPP_OTHER
1548    if it was not properly terminated, or CPP_LESS for an unterminated
1549    header name which must be relexed as normal tokens.
1550
1551    The spelling is NUL-terminated, but it is not guaranteed that this
1552    is the first NUL since embedded NULs are preserved.  */
1553 static void
1554 lex_string (cpp_reader *pfile, cpp_token *token, const uchar *base)
1555 {
1556   bool saw_NUL = false;
1557   const uchar *cur;
1558   cppchar_t terminator;
1559   enum cpp_ttype type;
1560
1561   cur = base;
1562   terminator = *cur++;
1563   if (terminator == 'L' || terminator == 'U')
1564     terminator = *cur++;
1565   else if (terminator == 'u')
1566     {
1567       terminator = *cur++;
1568       if (terminator == '8')
1569         terminator = *cur++;
1570     }
1571   if (terminator == 'R')
1572     {
1573       lex_raw_string (pfile, token, base, cur);
1574       return;
1575     }
1576   if (terminator == '"')
1577     type = (*base == 'L' ? CPP_WSTRING :
1578             *base == 'U' ? CPP_STRING32 :
1579             *base == 'u' ? (base[1] == '8' ? CPP_UTF8STRING : CPP_STRING16)
1580                          : CPP_STRING);
1581   else if (terminator == '\'')
1582     type = (*base == 'L' ? CPP_WCHAR :
1583             *base == 'U' ? CPP_CHAR32 :
1584             *base == 'u' ? CPP_CHAR16 : CPP_CHAR);
1585   else
1586     terminator = '>', type = CPP_HEADER_NAME;
1587
1588   for (;;)
1589     {
1590       cppchar_t c = *cur++;
1591
1592       /* In #include-style directives, terminators are not escapable.  */
1593       if (c == '\\' && !pfile->state.angled_headers && *cur != '\n')
1594         cur++;
1595       else if (c == terminator)
1596         break;
1597       else if (c == '\n')
1598         {
1599           cur--;
1600           /* Unmatched quotes always yield undefined behavior, but
1601              greedy lexing means that what appears to be an unterminated
1602              header name may actually be a legitimate sequence of tokens.  */
1603           if (terminator == '>')
1604             {
1605               token->type = CPP_LESS;
1606               return;
1607             }
1608           type = CPP_OTHER;
1609           break;
1610         }
1611       else if (c == '\0')
1612         saw_NUL = true;
1613     }
1614
1615   if (saw_NUL && !pfile->state.skipping)
1616     cpp_error (pfile, CPP_DL_WARNING,
1617                "null character(s) preserved in literal");
1618
1619   if (type == CPP_OTHER && CPP_OPTION (pfile, lang) != CLK_ASM)
1620     cpp_error (pfile, CPP_DL_PEDWARN, "missing terminating %c character",
1621                (int) terminator);
1622
1623   if (CPP_OPTION (pfile, user_literals))
1624     {
1625       /* According to C++11 [lex.ext]p10, a ud-suffix not starting with an
1626          underscore is ill-formed.  Since this breaks programs using macros
1627          from inttypes.h, we generate a warning and treat the ud-suffix as a
1628          separate preprocessing token.  This approach is under discussion by
1629          the standards committee, and has been adopted as a conforming
1630          extension by other front ends such as clang. */
1631       if (ISALPHA(*cur))
1632         {
1633           // Raise a warning, but do not consume subsequent tokens.
1634           if (CPP_OPTION (pfile, warn_literal_suffix))
1635             cpp_warning_with_line (pfile, CPP_W_LITERAL_SUFFIX,
1636                                    token->src_loc, 0,
1637                                    "invalid suffix on literal; C++11 requires "
1638                                    "a space between literal and identifier");
1639         }
1640       /* Grab user defined literal suffix.  */
1641       else if (*cur == '_')
1642         {
1643           type = cpp_userdef_char_add_type (type);
1644           type = cpp_userdef_string_add_type (type);
1645           ++cur;
1646
1647           while (ISIDNUM (*cur))
1648             ++cur;
1649         }
1650     }
1651
1652   pfile->buffer->cur = cur;
1653   create_literal (pfile, token, base, cur - base, type);
1654 }
1655
1656 /* Return the comment table. The client may not make any assumption
1657    about the ordering of the table.  */
1658 cpp_comment_table *
1659 cpp_get_comments (cpp_reader *pfile)
1660 {
1661   return &pfile->comments;
1662 }
1663
1664 /* Append a comment to the end of the comment table. */
1665 static void
1666 store_comment (cpp_reader *pfile, cpp_token *token)
1667 {
1668   int len;
1669
1670   if (pfile->comments.allocated == 0)
1671     {
1672       pfile->comments.allocated = 256;
1673       pfile->comments.entries = (cpp_comment *) xmalloc
1674         (pfile->comments.allocated * sizeof (cpp_comment));
1675     }
1676
1677   if (pfile->comments.count == pfile->comments.allocated)
1678     {
1679       pfile->comments.allocated *= 2;
1680       pfile->comments.entries = (cpp_comment *) xrealloc
1681         (pfile->comments.entries,
1682          pfile->comments.allocated * sizeof (cpp_comment));
1683     }
1684
1685   len = token->val.str.len;
1686
1687   /* Copy comment. Note, token may not be NULL terminated. */
1688   pfile->comments.entries[pfile->comments.count].comment =
1689     (char *) xmalloc (sizeof (char) * (len + 1));
1690   memcpy (pfile->comments.entries[pfile->comments.count].comment,
1691           token->val.str.text, len);
1692   pfile->comments.entries[pfile->comments.count].comment[len] = '\0';
1693
1694   /* Set source location. */
1695   pfile->comments.entries[pfile->comments.count].sloc = token->src_loc;
1696
1697   /* Increment the count of entries in the comment table. */
1698   pfile->comments.count++;
1699 }
1700
1701 /* The stored comment includes the comment start and any terminator.  */
1702 static void
1703 save_comment (cpp_reader *pfile, cpp_token *token, const unsigned char *from,
1704               cppchar_t type)
1705 {
1706   unsigned char *buffer;
1707   unsigned int len, clen, i;
1708
1709   len = pfile->buffer->cur - from + 1; /* + 1 for the initial '/'.  */
1710
1711   /* C++ comments probably (not definitely) have moved past a new
1712      line, which we don't want to save in the comment.  */
1713   if (is_vspace (pfile->buffer->cur[-1]))
1714     len--;
1715
1716   /* If we are currently in a directive or in argument parsing, then
1717      we need to store all C++ comments as C comments internally, and
1718      so we need to allocate a little extra space in that case.
1719
1720      Note that the only time we encounter a directive here is
1721      when we are saving comments in a "#define".  */
1722   clen = ((pfile->state.in_directive || pfile->state.parsing_args)
1723           && type == '/') ? len + 2 : len;
1724
1725   buffer = _cpp_unaligned_alloc (pfile, clen);
1726
1727   token->type = CPP_COMMENT;
1728   token->val.str.len = clen;
1729   token->val.str.text = buffer;
1730
1731   buffer[0] = '/';
1732   memcpy (buffer + 1, from, len - 1);
1733
1734   /* Finish conversion to a C comment, if necessary.  */
1735   if ((pfile->state.in_directive || pfile->state.parsing_args) && type == '/')
1736     {
1737       buffer[1] = '*';
1738       buffer[clen - 2] = '*';
1739       buffer[clen - 1] = '/';
1740       /* As there can be in a C++ comments illegal sequences for C comments
1741          we need to filter them out.  */
1742       for (i = 2; i < (clen - 2); i++)
1743         if (buffer[i] == '/' && (buffer[i - 1] == '*' || buffer[i + 1] == '*'))
1744           buffer[i] = '|';
1745     }
1746
1747   /* Finally store this comment for use by clients of libcpp. */
1748   store_comment (pfile, token);
1749 }
1750
1751 /* Allocate COUNT tokens for RUN.  */
1752 void
1753 _cpp_init_tokenrun (tokenrun *run, unsigned int count)
1754 {
1755   run->base = XNEWVEC (cpp_token, count);
1756   run->limit = run->base + count;
1757   run->next = NULL;
1758 }
1759
1760 /* Returns the next tokenrun, or creates one if there is none.  */
1761 static tokenrun *
1762 next_tokenrun (tokenrun *run)
1763 {
1764   if (run->next == NULL)
1765     {
1766       run->next = XNEW (tokenrun);
1767       run->next->prev = run;
1768       _cpp_init_tokenrun (run->next, 250);
1769     }
1770
1771   return run->next;
1772 }
1773
1774 /* Return the number of not yet processed token in a given
1775    context.  */
1776 int
1777 _cpp_remaining_tokens_num_in_context (cpp_context *context)
1778 {
1779   if (context->tokens_kind == TOKENS_KIND_DIRECT)
1780     return (LAST (context).token - FIRST (context).token);
1781   else if (context->tokens_kind == TOKENS_KIND_INDIRECT
1782            || context->tokens_kind == TOKENS_KIND_EXTENDED)
1783     return (LAST (context).ptoken - FIRST (context).ptoken);
1784   else
1785       abort ();
1786 }
1787
1788 /* Returns the token present at index INDEX in a given context.  If
1789    INDEX is zero, the next token to be processed is returned.  */
1790 static const cpp_token*
1791 _cpp_token_from_context_at (cpp_context *context, int index)
1792 {
1793   if (context->tokens_kind == TOKENS_KIND_DIRECT)
1794     return &(FIRST (context).token[index]);
1795   else if (context->tokens_kind == TOKENS_KIND_INDIRECT
1796            || context->tokens_kind == TOKENS_KIND_EXTENDED)
1797     return FIRST (context).ptoken[index];
1798  else
1799    abort ();
1800 }
1801
1802 /* Look ahead in the input stream.  */
1803 const cpp_token *
1804 cpp_peek_token (cpp_reader *pfile, int index)
1805 {
1806   cpp_context *context = pfile->context;
1807   const cpp_token *peektok;
1808   int count;
1809
1810   /* First, scan through any pending cpp_context objects.  */
1811   while (context->prev)
1812     {
1813       ptrdiff_t sz = _cpp_remaining_tokens_num_in_context (context);
1814
1815       if (index < (int) sz)
1816         return _cpp_token_from_context_at (context, index);
1817       index -= (int) sz;
1818       context = context->prev;
1819     }
1820
1821   /* We will have to read some new tokens after all (and do so
1822      without invalidating preceding tokens).  */
1823   count = index;
1824   pfile->keep_tokens++;
1825
1826   do
1827     {
1828       peektok = _cpp_lex_token (pfile);
1829       if (peektok->type == CPP_EOF)
1830         return peektok;
1831     }
1832   while (index--);
1833
1834   _cpp_backup_tokens_direct (pfile, count + 1);
1835   pfile->keep_tokens--;
1836
1837   return peektok;
1838 }
1839
1840 /* Allocate a single token that is invalidated at the same time as the
1841    rest of the tokens on the line.  Has its line and col set to the
1842    same as the last lexed token, so that diagnostics appear in the
1843    right place.  */
1844 cpp_token *
1845 _cpp_temp_token (cpp_reader *pfile)
1846 {
1847   cpp_token *old, *result;
1848   ptrdiff_t sz = pfile->cur_run->limit - pfile->cur_token;
1849   ptrdiff_t la = (ptrdiff_t) pfile->lookaheads;
1850
1851   old = pfile->cur_token - 1;
1852   /* Any pre-existing lookaheads must not be clobbered.  */
1853   if (la)
1854     {
1855       if (sz <= la)
1856         {
1857           tokenrun *next = next_tokenrun (pfile->cur_run);
1858
1859           if (sz < la)
1860             memmove (next->base + 1, next->base,
1861                      (la - sz) * sizeof (cpp_token));
1862
1863           next->base[0] = pfile->cur_run->limit[-1];
1864         }
1865
1866       if (sz > 1)
1867         memmove (pfile->cur_token + 1, pfile->cur_token,
1868                  MIN (la, sz - 1) * sizeof (cpp_token));
1869     }
1870
1871   if (!sz && pfile->cur_token == pfile->cur_run->limit)
1872     {
1873       pfile->cur_run = next_tokenrun (pfile->cur_run);
1874       pfile->cur_token = pfile->cur_run->base;
1875     }
1876
1877   result = pfile->cur_token++;
1878   result->src_loc = old->src_loc;
1879   return result;
1880 }
1881
1882 /* Lex a token into RESULT (external interface).  Takes care of issues
1883    like directive handling, token lookahead, multiple include
1884    optimization and skipping.  */
1885 const cpp_token *
1886 _cpp_lex_token (cpp_reader *pfile)
1887 {
1888   cpp_token *result;
1889
1890   for (;;)
1891     {
1892       if (pfile->cur_token == pfile->cur_run->limit)
1893         {
1894           pfile->cur_run = next_tokenrun (pfile->cur_run);
1895           pfile->cur_token = pfile->cur_run->base;
1896         }
1897       /* We assume that the current token is somewhere in the current
1898          run.  */
1899       if (pfile->cur_token < pfile->cur_run->base
1900           || pfile->cur_token >= pfile->cur_run->limit)
1901         abort ();
1902
1903       if (pfile->lookaheads)
1904         {
1905           pfile->lookaheads--;
1906           result = pfile->cur_token++;
1907         }
1908       else
1909         result = _cpp_lex_direct (pfile);
1910
1911       if (result->flags & BOL)
1912         {
1913           /* Is this a directive.  If _cpp_handle_directive returns
1914              false, it is an assembler #.  */
1915           if (result->type == CPP_HASH
1916               /* 6.10.3 p 11: Directives in a list of macro arguments
1917                  gives undefined behavior.  This implementation
1918                  handles the directive as normal.  */
1919               && pfile->state.parsing_args != 1)
1920             {
1921               if (_cpp_handle_directive (pfile, result->flags & PREV_WHITE))
1922                 {
1923                   if (pfile->directive_result.type == CPP_PADDING)
1924                     continue;
1925                   result = &pfile->directive_result;
1926                 }
1927             }
1928           else if (pfile->state.in_deferred_pragma)
1929             result = &pfile->directive_result;
1930
1931           if (pfile->cb.line_change && !pfile->state.skipping)
1932             pfile->cb.line_change (pfile, result, pfile->state.parsing_args);
1933         }
1934
1935       /* We don't skip tokens in directives.  */
1936       if (pfile->state.in_directive || pfile->state.in_deferred_pragma)
1937         break;
1938
1939       /* Outside a directive, invalidate controlling macros.  At file
1940          EOF, _cpp_lex_direct takes care of popping the buffer, so we never
1941          get here and MI optimization works.  */
1942       pfile->mi_valid = false;
1943
1944       if (!pfile->state.skipping || result->type == CPP_EOF)
1945         break;
1946     }
1947
1948   return result;
1949 }
1950
1951 /* Returns true if a fresh line has been loaded.  */
1952 bool
1953 _cpp_get_fresh_line (cpp_reader *pfile)
1954 {
1955   int return_at_eof;
1956
1957   /* We can't get a new line until we leave the current directive.  */
1958   if (pfile->state.in_directive)
1959     return false;
1960
1961   for (;;)
1962     {
1963       cpp_buffer *buffer = pfile->buffer;
1964
1965       if (!buffer->need_line)
1966         return true;
1967
1968       if (buffer->next_line < buffer->rlimit)
1969         {
1970           _cpp_clean_line (pfile);
1971           return true;
1972         }
1973
1974       /* First, get out of parsing arguments state.  */
1975       if (pfile->state.parsing_args)
1976         return false;
1977
1978       /* End of buffer.  Non-empty files should end in a newline.  */
1979       if (buffer->buf != buffer->rlimit
1980           && buffer->next_line > buffer->rlimit
1981           && !buffer->from_stage3)
1982         {
1983           /* Clip to buffer size.  */
1984           buffer->next_line = buffer->rlimit;
1985         }
1986
1987       return_at_eof = buffer->return_at_eof;
1988       _cpp_pop_buffer (pfile);
1989       if (pfile->buffer == NULL || return_at_eof)
1990         return false;
1991     }
1992 }
1993
/* Helper for _cpp_lex_direct; relies on the locals `buffer' and
   `result' of the code that uses it.  If the next input character is
   CHAR, consume it and set the token type to THEN_TYPE; otherwise
   leave the input alone and set the type to ELSE_TYPE.  */
#define IF_NEXT_IS(CHAR, THEN_TYPE, ELSE_TYPE)          \
  do                                                    \
    {                                                   \
      if (*buffer->cur == (CHAR))                       \
        {                                               \
          buffer->cur++;                                \
          result->type = (THEN_TYPE);                   \
        }                                               \
      else                                              \
        result->type = (ELSE_TYPE);                     \
    }                                                   \
  while (0)
2002
2003 /* Lex a token into pfile->cur_token, which is also incremented, to
2004    get diagnostics pointing to the correct location.
2005
2006    Does not handle issues such as token lookahead, multiple-include
2007    optimization, directives, skipping etc.  This function is only
2008    suitable for use by _cpp_lex_token, and in special cases like
2009    lex_expansion_token which doesn't care for any of these issues.
2010
2011    When meeting a newline, returns CPP_EOF if parsing a directive,
2012    otherwise returns to the start of the token buffer if permissible.
2013    Returns the location of the lexed token.  */
2014 cpp_token *
2015 _cpp_lex_direct (cpp_reader *pfile)
2016 {
2017   cppchar_t c;
2018   cpp_buffer *buffer;
2019   const unsigned char *comment_start;
2020   cpp_token *result = pfile->cur_token++;
2021
2022  fresh_line:
2023   result->flags = 0;
2024   buffer = pfile->buffer;
2025   if (buffer->need_line)
2026     {
2027       if (pfile->state.in_deferred_pragma)
2028         {
2029           result->type = CPP_PRAGMA_EOL;
2030           pfile->state.in_deferred_pragma = false;
2031           if (!pfile->state.pragma_allow_expansion)
2032             pfile->state.prevent_expansion--;
2033           return result;
2034         }
2035       if (!_cpp_get_fresh_line (pfile))
2036         {
2037           result->type = CPP_EOF;
2038           if (!pfile->state.in_directive)
2039             {
2040               /* Tell the compiler the line number of the EOF token.  */
2041               result->src_loc = pfile->line_table->highest_line;
2042               result->flags = BOL;
2043             }
2044           return result;
2045         }
2046       if (!pfile->keep_tokens)
2047         {
2048           pfile->cur_run = &pfile->base_run;
2049           result = pfile->base_run.base;
2050           pfile->cur_token = result + 1;
2051         }
2052       result->flags = BOL;
2053       if (pfile->state.parsing_args == 2)
2054         result->flags |= PREV_WHITE;
2055     }
2056   buffer = pfile->buffer;
2057  update_tokens_line:
2058   result->src_loc = pfile->line_table->highest_line;
2059
2060  skipped_white:
2061   if (buffer->cur >= buffer->notes[buffer->cur_note].pos
2062       && !pfile->overlaid_buffer)
2063     {
2064       _cpp_process_line_notes (pfile, false);
2065       result->src_loc = pfile->line_table->highest_line;
2066     }
2067   c = *buffer->cur++;
2068
2069   if (pfile->forced_token_location_p)
2070     result->src_loc = *pfile->forced_token_location_p;
2071   else
2072     result->src_loc = linemap_position_for_column (pfile->line_table,
2073                                           CPP_BUF_COLUMN (buffer, buffer->cur));
2074
2075   switch (c)
2076     {
2077     case ' ': case '\t': case '\f': case '\v': case '\0':
2078       result->flags |= PREV_WHITE;
2079       skip_whitespace (pfile, c);
2080       goto skipped_white;
2081
2082     case '\n':
2083       if (buffer->cur < buffer->rlimit)
2084         CPP_INCREMENT_LINE (pfile, 0);
2085       buffer->need_line = true;
2086       goto fresh_line;
2087
2088     case '0': case '1': case '2': case '3': case '4':
2089     case '5': case '6': case '7': case '8': case '9':
2090       {
2091         struct normalize_state nst = INITIAL_NORMALIZE_STATE;
2092         result->type = CPP_NUMBER;
2093         lex_number (pfile, &result->val.str, &nst);
2094         warn_about_normalization (pfile, result, &nst);
2095         break;
2096       }
2097
2098     case 'L':
2099     case 'u':
2100     case 'U':
2101     case 'R':
2102       /* 'L', 'u', 'U', 'u8' or 'R' may introduce wide characters,
2103          wide strings or raw strings.  */
2104       if (c == 'L' || CPP_OPTION (pfile, rliterals)
2105           || (c != 'R' && CPP_OPTION (pfile, uliterals)))
2106         {
2107           if ((*buffer->cur == '\'' && c != 'R')
2108               || *buffer->cur == '"'
2109               || (*buffer->cur == 'R'
2110                   && c != 'R'
2111                   && buffer->cur[1] == '"'
2112                   && CPP_OPTION (pfile, rliterals))
2113               || (*buffer->cur == '8'
2114                   && c == 'u'
2115                   && (buffer->cur[1] == '"'
2116                       || (buffer->cur[1] == 'R' && buffer->cur[2] == '"'
2117                           && CPP_OPTION (pfile, rliterals)))))
2118             {
2119               lex_string (pfile, result, buffer->cur - 1);
2120               break;
2121             }
2122         }
2123       /* Fall through.  */
2124
2125     case '_':
2126     case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
2127     case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
2128     case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
2129     case 's': case 't':           case 'v': case 'w': case 'x':
2130     case 'y': case 'z':
2131     case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
2132     case 'G': case 'H': case 'I': case 'J': case 'K':
2133     case 'M': case 'N': case 'O': case 'P': case 'Q':
2134     case 'S': case 'T':           case 'V': case 'W': case 'X':
2135     case 'Y': case 'Z':
2136       result->type = CPP_NAME;
2137       {
2138         struct normalize_state nst = INITIAL_NORMALIZE_STATE;
2139         result->val.node.node = lex_identifier (pfile, buffer->cur - 1, false,
2140                                                 &nst);
2141         warn_about_normalization (pfile, result, &nst);
2142       }
2143
2144       /* Convert named operators to their proper types.  */
2145       if (result->val.node.node->flags & NODE_OPERATOR)
2146         {
2147           result->flags |= NAMED_OP;
2148           result->type = (enum cpp_ttype) result->val.node.node->directive_index;
2149         }
2150       break;
2151
2152     case '\'':
2153     case '"':
2154       lex_string (pfile, result, buffer->cur - 1);
2155       break;
2156
2157     case '/':
2158       /* A potential block or line comment.  */
2159       comment_start = buffer->cur;
2160       c = *buffer->cur;
2161
2162       if (c == '*')
2163         {
2164           if (_cpp_skip_block_comment (pfile))
2165             cpp_error (pfile, CPP_DL_ERROR, "unterminated comment");
2166         }
2167       else if (c == '/' && (CPP_OPTION (pfile, cplusplus_comments)
2168                             || cpp_in_system_header (pfile)))
2169         {
2170           /* Warn about comments only if pedantically GNUC89, and not
2171              in system headers.  */
2172           if (CPP_OPTION (pfile, lang) == CLK_GNUC89 && CPP_PEDANTIC (pfile)
2173               && ! buffer->warned_cplusplus_comments)
2174             {
2175               cpp_error (pfile, CPP_DL_PEDWARN,
2176                          "C++ style comments are not allowed in ISO C90");
2177               cpp_error (pfile, CPP_DL_PEDWARN,
2178                          "(this will be reported only once per input file)");
2179               buffer->warned_cplusplus_comments = 1;
2180             }
2181
2182           if (skip_line_comment (pfile) && CPP_OPTION (pfile, warn_comments))
2183             cpp_warning (pfile, CPP_W_COMMENTS, "multi-line comment");
2184         }
2185       else if (c == '=')
2186         {
2187           buffer->cur++;
2188           result->type = CPP_DIV_EQ;
2189           break;
2190         }
2191       else
2192         {
2193           result->type = CPP_DIV;
2194           break;
2195         }
2196
2197       if (!pfile->state.save_comments)
2198         {
2199           result->flags |= PREV_WHITE;
2200           goto update_tokens_line;
2201         }
2202
2203       /* Save the comment as a token in its own right.  */
2204       save_comment (pfile, result, comment_start, c);
2205       break;
2206
2207     case '<':
2208       if (pfile->state.angled_headers)
2209         {
2210           lex_string (pfile, result, buffer->cur - 1);
2211           if (result->type != CPP_LESS)
2212             break;
2213         }
2214
2215       result->type = CPP_LESS;
2216       if (*buffer->cur == '=')
2217         buffer->cur++, result->type = CPP_LESS_EQ;
2218       else if (*buffer->cur == '<')
2219         {
2220           buffer->cur++;
2221           IF_NEXT_IS ('=', CPP_LSHIFT_EQ, CPP_LSHIFT);
2222         }
2223       else if (CPP_OPTION (pfile, digraphs))
2224         {
2225           if (*buffer->cur == ':')
2226             {
2227               /* C++11 [2.5/3 lex.pptoken], "Otherwise, if the next
2228                  three characters are <:: and the subsequent character
2229                  is neither : nor >, the < is treated as a preprocessor
2230                  token by itself".  */
2231               if (CPP_OPTION (pfile, cplusplus)
2232                   && (CPP_OPTION (pfile, lang) == CLK_CXX11
2233                       || CPP_OPTION (pfile, lang) == CLK_GNUCXX11)
2234                   && buffer->cur[1] == ':'
2235                   && buffer->cur[2] != ':' && buffer->cur[2] != '>')
2236                 break;
2237
2238               buffer->cur++;
2239               result->flags |= DIGRAPH;
2240               result->type = CPP_OPEN_SQUARE;
2241             }
2242           else if (*buffer->cur == '%')
2243             {
2244               buffer->cur++;
2245               result->flags |= DIGRAPH;
2246               result->type = CPP_OPEN_BRACE;
2247             }
2248         }
2249       break;
2250
2251     case '>':
2252       result->type = CPP_GREATER;
2253       if (*buffer->cur == '=')
2254         buffer->cur++, result->type = CPP_GREATER_EQ;
2255       else if (*buffer->cur == '>')
2256         {
2257           buffer->cur++;
2258           IF_NEXT_IS ('=', CPP_RSHIFT_EQ, CPP_RSHIFT);
2259         }
2260       break;
2261
2262     case '%':
2263       result->type = CPP_MOD;
2264       if (*buffer->cur == '=')
2265         buffer->cur++, result->type = CPP_MOD_EQ;
2266       else if (CPP_OPTION (pfile, digraphs))
2267         {
2268           if (*buffer->cur == ':')
2269             {
2270               buffer->cur++;
2271               result->flags |= DIGRAPH;
2272               result->type = CPP_HASH;
2273               if (*buffer->cur == '%' && buffer->cur[1] == ':')
2274                 buffer->cur += 2, result->type = CPP_PASTE, result->val.token_no = 0;
2275             }
2276           else if (*buffer->cur == '>')
2277             {
2278               buffer->cur++;
2279               result->flags |= DIGRAPH;
2280               result->type = CPP_CLOSE_BRACE;
2281             }
2282         }
2283       break;
2284
2285     case '.':
2286       result->type = CPP_DOT;
2287       if (ISDIGIT (*buffer->cur))
2288         {
2289           struct normalize_state nst = INITIAL_NORMALIZE_STATE;
2290           result->type = CPP_NUMBER;
2291           lex_number (pfile, &result->val.str, &nst);
2292           warn_about_normalization (pfile, result, &nst);
2293         }
2294       else if (*buffer->cur == '.' && buffer->cur[1] == '.')
2295         buffer->cur += 2, result->type = CPP_ELLIPSIS;
2296       else if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
2297         buffer->cur++, result->type = CPP_DOT_STAR;
2298       break;
2299
2300     case '+':
2301       result->type = CPP_PLUS;
2302       if (*buffer->cur == '+')
2303         buffer->cur++, result->type = CPP_PLUS_PLUS;
2304       else if (*buffer->cur == '=')
2305         buffer->cur++, result->type = CPP_PLUS_EQ;
2306       break;
2307
2308     case '-':
2309       result->type = CPP_MINUS;
2310       if (*buffer->cur == '>')
2311         {
2312           buffer->cur++;
2313           result->type = CPP_DEREF;
2314           if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
2315             buffer->cur++, result->type = CPP_DEREF_STAR;
2316         }
2317       else if (*buffer->cur == '-')
2318         buffer->cur++, result->type = CPP_MINUS_MINUS;
2319       else if (*buffer->cur == '=')
2320         buffer->cur++, result->type = CPP_MINUS_EQ;
2321       break;
2322
2323     case '&':
2324       result->type = CPP_AND;
2325       if (*buffer->cur == '&')
2326         buffer->cur++, result->type = CPP_AND_AND;
2327       else if (*buffer->cur == '=')
2328         buffer->cur++, result->type = CPP_AND_EQ;
2329       break;
2330
2331     case '|':
2332       result->type = CPP_OR;
2333       if (*buffer->cur == '|')
2334         buffer->cur++, result->type = CPP_OR_OR;
2335       else if (*buffer->cur == '=')
2336         buffer->cur++, result->type = CPP_OR_EQ;
2337       break;
2338
2339     case ':':
2340       result->type = CPP_COLON;
2341       if (*buffer->cur == ':' && CPP_OPTION (pfile, cplusplus))
2342         buffer->cur++, result->type = CPP_SCOPE;
2343       else if (*buffer->cur == '>' && CPP_OPTION (pfile, digraphs))
2344         {
2345           buffer->cur++;
2346           result->flags |= DIGRAPH;
2347           result->type = CPP_CLOSE_SQUARE;
2348         }
2349       break;
2350
2351     case '*': IF_NEXT_IS ('=', CPP_MULT_EQ, CPP_MULT); break;
2352     case '=': IF_NEXT_IS ('=', CPP_EQ_EQ, CPP_EQ); break;
2353     case '!': IF_NEXT_IS ('=', CPP_NOT_EQ, CPP_NOT); break;
2354     case '^': IF_NEXT_IS ('=', CPP_XOR_EQ, CPP_XOR); break;
2355     case '#': IF_NEXT_IS ('#', CPP_PASTE, CPP_HASH); result->val.token_no = 0; break;
2356
2357     case '?': result->type = CPP_QUERY; break;
2358     case '~': result->type = CPP_COMPL; break;
2359     case ',': result->type = CPP_COMMA; break;
2360     case '(': result->type = CPP_OPEN_PAREN; break;
2361     case ')': result->type = CPP_CLOSE_PAREN; break;
2362     case '[': result->type = CPP_OPEN_SQUARE; break;
2363     case ']': result->type = CPP_CLOSE_SQUARE; break;
2364     case '{': result->type = CPP_OPEN_BRACE; break;
2365     case '}': result->type = CPP_CLOSE_BRACE; break;
2366     case ';': result->type = CPP_SEMICOLON; break;
2367
2368       /* @ is a punctuator in Objective-C.  */
2369     case '@': result->type = CPP_ATSIGN; break;
2370
2371     case '$':
2372     case '\\':
2373       {
2374         const uchar *base = --buffer->cur;
2375         struct normalize_state nst = INITIAL_NORMALIZE_STATE;
2376
2377         if (forms_identifier_p (pfile, true, &nst))
2378           {
2379             result->type = CPP_NAME;
2380             result->val.node.node = lex_identifier (pfile, base, true, &nst);
2381             warn_about_normalization (pfile, result, &nst);
2382             break;
2383           }
2384         buffer->cur++;
2385       }
2386
2387     default:
2388       create_literal (pfile, result, buffer->cur - 1, 1, CPP_OTHER);
2389       break;
2390     }
2391
2392   return result;
2393 }
2394
2395 /* An upper bound on the number of bytes needed to spell TOKEN.
2396    Does not include preceding whitespace.  */
2397 unsigned int
2398 cpp_token_len (const cpp_token *token)
2399 {
2400   unsigned int len;
2401
2402   switch (TOKEN_SPELL (token))
2403     {
2404     default:            len = 6;                                break;
2405     case SPELL_LITERAL: len = token->val.str.len;               break;
2406     case SPELL_IDENT:   len = NODE_LEN (token->val.node.node) * 10;     break;
2407     }
2408
2409   return len;
2410 }
2411
/* Decode the UTF-8 sequence that starts at NAME and write it to
   BUFFER as a \U escape with eight lowercase hex digits.  Exactly 10
   bytes are written to BUFFER, with no terminating NUL.  Returns the
   number of bytes consumed from NAME.  Aborts if a continuation byte
   is ill-formed.  */

static size_t
utf8_to_ucn (unsigned char *buffer, const unsigned char *name)
{
  static const char hex_digits[] = "0123456789abcdef";
  const unsigned char *p = name;
  unsigned long code_point;
  unsigned lead;
  int seq_len = 0;
  int shift;
  int k;

  /* The number of leading one bits in the first byte gives the
     length of the sequence.  */
  for (lead = *p; lead & 0x80; lead <<= 1)
    seq_len++;

  /* The remaining low bits of the first byte carry payload.  */
  code_point = *p & (0x7F >> seq_len);

  /* Every following byte contributes six more bits.  */
  for (k = 1; k < seq_len; k++)
    {
      p++;
      code_point = (code_point << 6) | (*p & 0x3F);

      /* Ill-formed UTF-8: a continuation byte must look like
         10xxxxxx.  */
      if ((*p & 0xC0) != 0x80)
        abort ();
    }

  buffer[0] = '\\';
  buffer[1] = 'U';
  for (shift = 28, k = 2; shift >= 0; shift -= 4, k++)
    buffer[k] = hex_digits[(code_point >> shift) & 0xF];

  return seq_len;
}
2445
2446 /* Given a token TYPE corresponding to a digraph, return a pointer to
2447    the spelling of the digraph.  */
2448 static const unsigned char *
2449 cpp_digraph2name (enum cpp_ttype type)
2450 {
2451   return digraph_spellings[(int) type - (int) CPP_FIRST_DIGRAPH];
2452 }
2453
2454 /* Write the spelling of a token TOKEN to BUFFER.  The buffer must
2455    already contain the enough space to hold the token's spelling.
2456    Returns a pointer to the character after the last character written.
2457    FORSTRING is true if this is to be the spelling after translation
2458    phase 1 (this is different for UCNs).
2459    FIXME: Would be nice if we didn't need the PFILE argument.  */
2460 unsigned char *
2461 cpp_spell_token (cpp_reader *pfile, const cpp_token *token,
2462                  unsigned char *buffer, bool forstring)
2463 {
2464   switch (TOKEN_SPELL (token))
2465     {
2466     case SPELL_OPERATOR:
2467       {
2468         const unsigned char *spelling;
2469         unsigned char c;
2470
2471         if (token->flags & DIGRAPH)
2472           spelling = cpp_digraph2name (token->type);
2473         else if (token->flags & NAMED_OP)
2474           goto spell_ident;
2475         else
2476           spelling = TOKEN_NAME (token);
2477
2478         while ((c = *spelling++) != '\0')
2479           *buffer++ = c;
2480       }
2481       break;
2482
2483     spell_ident:
2484     case SPELL_IDENT:
2485       if (forstring)
2486         {
2487           memcpy (buffer, NODE_NAME (token->val.node.node),
2488                   NODE_LEN (token->val.node.node));
2489           buffer += NODE_LEN (token->val.node.node);
2490         }
2491       else
2492         {
2493           size_t i;
2494           const unsigned char * name = NODE_NAME (token->val.node.node);
2495
2496           for (i = 0; i < NODE_LEN (token->val.node.node); i++)
2497             if (name[i] & ~0x7F)
2498               {
2499                 i += utf8_to_ucn (buffer, name + i) - 1;
2500                 buffer += 10;
2501               }
2502             else
2503               *buffer++ = NODE_NAME (token->val.node.node)[i];
2504         }
2505       break;
2506
2507     case SPELL_LITERAL:
2508       memcpy (buffer, token->val.str.text, token->val.str.len);
2509       buffer += token->val.str.len;
2510       break;
2511
2512     case SPELL_NONE:
2513       cpp_error (pfile, CPP_DL_ICE,
2514                  "unspellable token %s", TOKEN_NAME (token));
2515       break;
2516     }
2517
2518   return buffer;
2519 }
2520
2521 /* Returns TOKEN spelt as a null-terminated string.  The string is
2522    freed when the reader is destroyed.  Useful for diagnostics.  */
2523 unsigned char *
2524 cpp_token_as_text (cpp_reader *pfile, const cpp_token *token)
2525 {
2526   unsigned int len = cpp_token_len (token) + 1;
2527   unsigned char *start = _cpp_unaligned_alloc (pfile, len), *end;
2528
2529   end = cpp_spell_token (pfile, token, start, false);
2530   end[0] = '\0';
2531
2532   return start;
2533 }
2534
2535 /* Returns a pointer to a string which spells the token defined by
2536    TYPE and FLAGS.  Used by C front ends, which really should move to
2537    using cpp_token_as_text.  */
2538 const char *
2539 cpp_type2name (enum cpp_ttype type, unsigned char flags)
2540 {
2541   if (flags & DIGRAPH)
2542     return (const char *) cpp_digraph2name (type);
2543   else if (flags & NAMED_OP)
2544     return cpp_named_operator2name (type);
2545
2546   return (const char *) token_spellings[type].name;
2547 }
2548
2549 /* Writes the spelling of token to FP, without any preceding space.
2550    Separated from cpp_spell_token for efficiency - to avoid stdio
2551    double-buffering.  */
2552 void
2553 cpp_output_token (const cpp_token *token, FILE *fp)
2554 {
2555   switch (TOKEN_SPELL (token))
2556     {
2557     case SPELL_OPERATOR:
2558       {
2559         const unsigned char *spelling;
2560         int c;
2561
2562         if (token->flags & DIGRAPH)
2563           spelling = cpp_digraph2name (token->type);
2564         else if (token->flags & NAMED_OP)
2565           goto spell_ident;
2566         else
2567           spelling = TOKEN_NAME (token);
2568
2569         c = *spelling;
2570         do
2571           putc (c, fp);
2572         while ((c = *++spelling) != '\0');
2573       }
2574       break;
2575
2576     spell_ident:
2577     case SPELL_IDENT:
2578       {
2579         size_t i;
2580         const unsigned char * name = NODE_NAME (token->val.node.node);
2581
2582         for (i = 0; i < NODE_LEN (token->val.node.node); i++)
2583           if (name[i] & ~0x7F)
2584             {
2585               unsigned char buffer[10];
2586               i += utf8_to_ucn (buffer, name + i) - 1;
2587               fwrite (buffer, 1, 10, fp);
2588             }
2589           else
2590             fputc (NODE_NAME (token->val.node.node)[i], fp);
2591       }
2592       break;
2593
2594     case SPELL_LITERAL:
2595       fwrite (token->val.str.text, 1, token->val.str.len, fp);
2596       break;
2597
2598     case SPELL_NONE:
2599       /* An error, most probably.  */
2600       break;
2601     }
2602 }
2603
2604 /* Compare two tokens.  */
2605 int
2606 _cpp_equiv_tokens (const cpp_token *a, const cpp_token *b)
2607 {
2608   if (a->type == b->type && a->flags == b->flags)
2609     switch (TOKEN_SPELL (a))
2610       {
2611       default:                  /* Keep compiler happy.  */
2612       case SPELL_OPERATOR:
2613         /* token_no is used to track where multiple consecutive ##
2614            tokens were originally located.  */
2615         return (a->type != CPP_PASTE || a->val.token_no == b->val.token_no);
2616       case SPELL_NONE:
2617         return (a->type != CPP_MACRO_ARG
2618                 || a->val.macro_arg.arg_no == b->val.macro_arg.arg_no);
2619       case SPELL_IDENT:
2620         return a->val.node.node == b->val.node.node;
2621       case SPELL_LITERAL:
2622         return (a->val.str.len == b->val.str.len
2623                 && !memcmp (a->val.str.text, b->val.str.text,
2624                             a->val.str.len));
2625       }
2626
2627   return 0;
2628 }
2629
2630 /* Returns nonzero if a space should be inserted to avoid an
2631    accidental token paste for output.  For simplicity, it is
2632    conservative, and occasionally advises a space where one is not
2633    needed, e.g. "." and ".2".  */
2634 int
2635 cpp_avoid_paste (cpp_reader *pfile, const cpp_token *token1,
2636                  const cpp_token *token2)
2637 {
2638   enum cpp_ttype a = token1->type, b = token2->type;
2639   cppchar_t c;
2640
2641   if (token1->flags & NAMED_OP)
2642     a = CPP_NAME;
2643   if (token2->flags & NAMED_OP)
2644     b = CPP_NAME;
2645
2646   c = EOF;
2647   if (token2->flags & DIGRAPH)
2648     c = digraph_spellings[(int) b - (int) CPP_FIRST_DIGRAPH][0];
2649   else if (token_spellings[b].category == SPELL_OPERATOR)
2650     c = token_spellings[b].name[0];
2651
2652   /* Quickly get everything that can paste with an '='.  */
2653   if ((int) a <= (int) CPP_LAST_EQ && c == '=')
2654     return 1;
2655
2656   switch (a)
2657     {
2658     case CPP_GREATER:   return c == '>';
2659     case CPP_LESS:      return c == '<' || c == '%' || c == ':';
2660     case CPP_PLUS:      return c == '+';
2661     case CPP_MINUS:     return c == '-' || c == '>';
2662     case CPP_DIV:       return c == '/' || c == '*'; /* Comments.  */
2663     case CPP_MOD:       return c == ':' || c == '>';
2664     case CPP_AND:       return c == '&';
2665     case CPP_OR:        return c == '|';
2666     case CPP_COLON:     return c == ':' || c == '>';
2667     case CPP_DEREF:     return c == '*';
2668     case CPP_DOT:       return c == '.' || c == '%' || b == CPP_NUMBER;
2669     case CPP_HASH:      return c == '#' || c == '%'; /* Digraph form.  */
2670     case CPP_NAME:      return ((b == CPP_NUMBER
2671                                  && name_p (pfile, &token2->val.str))
2672                                 || b == CPP_NAME
2673                                 || b == CPP_CHAR || b == CPP_STRING); /* L */
2674     case CPP_NUMBER:    return (b == CPP_NUMBER || b == CPP_NAME
2675                                 || c == '.' || c == '+' || c == '-');
2676                                       /* UCNs */
2677     case CPP_OTHER:     return ((token1->val.str.text[0] == '\\'
2678                                  && b == CPP_NAME)
2679                                 || (CPP_OPTION (pfile, objc)
2680                                     && token1->val.str.text[0] == '@'
2681                                     && (b == CPP_NAME || b == CPP_STRING)));
2682     default:            break;
2683     }
2684
2685   return 0;
2686 }
2687
2688 /* Output all the remaining tokens on the current line, and a newline
2689    character, to FP.  Leading whitespace is removed.  If there are
2690    macros, special token padding is not performed.  */
2691 void
2692 cpp_output_line (cpp_reader *pfile, FILE *fp)
2693 {
2694   const cpp_token *token;
2695
2696   token = cpp_get_token (pfile);
2697   while (token->type != CPP_EOF)
2698     {
2699       cpp_output_token (token, fp);
2700       token = cpp_get_token (pfile);
2701       if (token->flags & PREV_WHITE)
2702         putc (' ', fp);
2703     }
2704
2705   putc ('\n', fp);
2706 }
2707
2708 /* Return a string representation of all the remaining tokens on the
2709    current line.  The result is allocated using xmalloc and must be
2710    freed by the caller.  */
2711 unsigned char *
2712 cpp_output_line_to_string (cpp_reader *pfile, const unsigned char *dir_name)
2713 {
2714   const cpp_token *token;
2715   unsigned int out = dir_name ? ustrlen (dir_name) : 0;
2716   unsigned int alloced = 120 + out;
2717   unsigned char *result = (unsigned char *) xmalloc (alloced);
2718
2719   /* If DIR_NAME is empty, there are no initial contents.  */
2720   if (dir_name)
2721     {
2722       sprintf ((char *) result, "#%s ", dir_name);
2723       out += 2;
2724     }
2725
2726   token = cpp_get_token (pfile);
2727   while (token->type != CPP_EOF)
2728     {
2729       unsigned char *last;
2730       /* Include room for a possible space and the terminating nul.  */
2731       unsigned int len = cpp_token_len (token) + 2;
2732
2733       if (out + len > alloced)
2734         {
2735           alloced *= 2;
2736           if (out + len > alloced)
2737             alloced = out + len;
2738           result = (unsigned char *) xrealloc (result, alloced);
2739         }
2740
2741       last = cpp_spell_token (pfile, token, &result[out], 0);
2742       out = last - result;
2743
2744       token = cpp_get_token (pfile);
2745       if (token->flags & PREV_WHITE)
2746         result[out++] = ' ';
2747     }
2748
2749   result[out] = '\0';
2750   return result;
2751 }
2752
/* Memory buffers.  Changing these three definitions can have a
   dramatic effect on performance.  The values here are reasonable
   defaults, but might be tuned.  If you adjust them, be sure to test
   across a range of uses of cpplib, including heavy nested
   function-like macro expansion.  Also check the change in peak memory
   usage (NJAMD is a good tool for this).

   MIN_BUFF_SIZE is the smallest buffer size ever allocated.
   BUFF_SIZE_UPPER_BOUND (MIN_SIZE) is the largest free buffer worth
   handing out for a request of MIN_SIZE bytes.
   EXTENDED_BUFF_SIZE (BUFF, MIN_EXTRA) is the size to request when
   extending BUFF: MIN_EXTRA plus twice the room left in BUFF.

   Every macro argument is parenthesized so that an argument containing
   a low-precedence operator still expands to the intended value.  */
#define MIN_BUFF_SIZE 8000
#define BUFF_SIZE_UPPER_BOUND(MIN_SIZE) (MIN_BUFF_SIZE + (MIN_SIZE) * 3 / 2)
#define EXTENDED_BUFF_SIZE(BUFF, MIN_EXTRA) \
        ((MIN_EXTRA) + ((BUFF)->limit - (BUFF)->cur) * 2)

#if MIN_BUFF_SIZE > BUFF_SIZE_UPPER_BOUND (0)
  #error BUFF_SIZE_UPPER_BOUND must be at least as large as MIN_BUFF_SIZE!
#endif
2767
2768 /* Create a new allocation buffer.  Place the control block at the end
2769    of the buffer, so that buffer overflows will cause immediate chaos.  */
2770 static _cpp_buff *
2771 new_buff (size_t len)
2772 {
2773   _cpp_buff *result;
2774   unsigned char *base;
2775
2776   if (len < MIN_BUFF_SIZE)
2777     len = MIN_BUFF_SIZE;
2778   len = CPP_ALIGN (len);
2779
2780   base = XNEWVEC (unsigned char, len + sizeof (_cpp_buff));
2781   result = (_cpp_buff *) (base + len);
2782   result->base = base;
2783   result->cur = base;
2784   result->limit = base + len;
2785   result->next = NULL;
2786   return result;
2787 }
2788
2789 /* Place a chain of unwanted allocation buffers on the free list.  */
2790 void
2791 _cpp_release_buff (cpp_reader *pfile, _cpp_buff *buff)
2792 {
2793   _cpp_buff *end = buff;
2794
2795   while (end->next)
2796     end = end->next;
2797   end->next = pfile->free_buffs;
2798   pfile->free_buffs = buff;
2799 }
2800
2801 /* Return a free buffer of size at least MIN_SIZE.  */
2802 _cpp_buff *
2803 _cpp_get_buff (cpp_reader *pfile, size_t min_size)
2804 {
2805   _cpp_buff *result, **p;
2806
2807   for (p = &pfile->free_buffs;; p = &(*p)->next)
2808     {
2809       size_t size;
2810
2811       if (*p == NULL)
2812         return new_buff (min_size);
2813       result = *p;
2814       size = result->limit - result->base;
2815       /* Return a buffer that's big enough, but don't waste one that's
2816          way too big.  */
2817       if (size >= min_size && size <= BUFF_SIZE_UPPER_BOUND (min_size))
2818         break;
2819     }
2820
2821   *p = result->next;
2822   result->next = NULL;
2823   result->cur = result->base;
2824   return result;
2825 }
2826
2827 /* Creates a new buffer with enough space to hold the uncommitted
2828    remaining bytes of BUFF, and at least MIN_EXTRA more bytes.  Copies
2829    the excess bytes to the new buffer.  Chains the new buffer after
2830    BUFF, and returns the new buffer.  */
2831 _cpp_buff *
2832 _cpp_append_extend_buff (cpp_reader *pfile, _cpp_buff *buff, size_t min_extra)
2833 {
2834   size_t size = EXTENDED_BUFF_SIZE (buff, min_extra);
2835   _cpp_buff *new_buff = _cpp_get_buff (pfile, size);
2836
2837   buff->next = new_buff;
2838   memcpy (new_buff->base, buff->cur, BUFF_ROOM (buff));
2839   return new_buff;
2840 }
2841
2842 /* Creates a new buffer with enough space to hold the uncommitted
2843    remaining bytes of the buffer pointed to by BUFF, and at least
2844    MIN_EXTRA more bytes.  Copies the excess bytes to the new buffer.
2845    Chains the new buffer before the buffer pointed to by BUFF, and
2846    updates the pointer to point to the new buffer.  */
2847 void
2848 _cpp_extend_buff (cpp_reader *pfile, _cpp_buff **pbuff, size_t min_extra)
2849 {
2850   _cpp_buff *new_buff, *old_buff = *pbuff;
2851   size_t size = EXTENDED_BUFF_SIZE (old_buff, min_extra);
2852
2853   new_buff = _cpp_get_buff (pfile, size);
2854   memcpy (new_buff->base, old_buff->cur, BUFF_ROOM (old_buff));
2855   new_buff->next = old_buff;
2856   *pbuff = new_buff;
2857 }
2858
2859 /* Free a chain of buffers starting at BUFF.  */
2860 void
2861 _cpp_free_buff (_cpp_buff *buff)
2862 {
2863   _cpp_buff *next;
2864
2865   for (; buff; buff = next)
2866     {
2867       next = buff->next;
2868       free (buff->base);
2869     }
2870 }
2871
2872 /* Allocate permanent, unaligned storage of length LEN.  */
2873 unsigned char *
2874 _cpp_unaligned_alloc (cpp_reader *pfile, size_t len)
2875 {
2876   _cpp_buff *buff = pfile->u_buff;
2877   unsigned char *result = buff->cur;
2878
2879   if (len > (size_t) (buff->limit - result))
2880     {
2881       buff = _cpp_get_buff (pfile, len);
2882       buff->next = pfile->u_buff;
2883       pfile->u_buff = buff;
2884       result = buff->cur;
2885     }
2886
2887   buff->cur = result + len;
2888   return result;
2889 }
2890
2891 /* Allocate permanent, unaligned storage of length LEN from a_buff.
2892    That buffer is used for growing allocations when saving macro
2893    replacement lists in a #define, and when parsing an answer to an
2894    assertion in #assert, #unassert or #if (and therefore possibly
2895    whilst expanding macros).  It therefore must not be used by any
2896    code that they might call: specifically the lexer and the guts of
2897    the macro expander.
2898
2899    All existing other uses clearly fit this restriction: storing
2900    registered pragmas during initialization.  */
2901 unsigned char *
2902 _cpp_aligned_alloc (cpp_reader *pfile, size_t len)
2903 {
2904   _cpp_buff *buff = pfile->a_buff;
2905   unsigned char *result = buff->cur;
2906
2907   if (len > (size_t) (buff->limit - result))
2908     {
2909       buff = _cpp_get_buff (pfile, len);
2910       buff->next = pfile->a_buff;
2911       pfile->a_buff = buff;
2912       result = buff->cur;
2913     }
2914
2915   buff->cur = result + len;
2916   return result;
2917 }
2918
2919 /* Say which field of TOK is in use.  */
2920
2921 enum cpp_token_fld_kind
2922 cpp_token_val_index (cpp_token *tok)
2923 {
2924   switch (TOKEN_SPELL (tok))
2925     {
2926     case SPELL_IDENT:
2927       return CPP_TOKEN_FLD_NODE;
2928     case SPELL_LITERAL:
2929       return CPP_TOKEN_FLD_STR;
2930     case SPELL_OPERATOR:
2931       if (tok->type == CPP_PASTE)
2932         return CPP_TOKEN_FLD_TOKEN_NO;
2933       else
2934         return CPP_TOKEN_FLD_NONE;
2935     case SPELL_NONE:
2936       if (tok->type == CPP_MACRO_ARG)
2937         return CPP_TOKEN_FLD_ARG_NO;
2938       else if (tok->type == CPP_PADDING)
2939         return CPP_TOKEN_FLD_SOURCE;
2940       else if (tok->type == CPP_PRAGMA)
2941         return CPP_TOKEN_FLD_PRAGMA;
2942       /* else fall through */
2943     default:
2944       return CPP_TOKEN_FLD_NONE;
2945     }
2946 }
2947
2948 /* All tokens lexed in R after calling this function will be forced to have
2949    their source_location the same as the location referenced by P, until
2950    cpp_stop_forcing_token_locations is called for R.  */
2951
2952 void
2953 cpp_force_token_locations (cpp_reader *r, source_location *p)
2954 {
2955   r->forced_token_location_p = p;
2956 }
2957
2958 /* Go back to assigning locations naturally for lexed tokens.  */
2959
2960 void
2961 cpp_stop_forcing_token_locations (cpp_reader *r)
2962 {
2963   r->forced_token_location_p = NULL;
2964 }