libcpp/lex.c

   1 /* CPP Library - lexical analysis.
   2    Copyright (C) 2000-2014 Free Software Foundation, Inc.
   3    Contributed by Per Bothner, 1994-95.
   4    Based on CCCP program by Paul Rubin, June 1986
   5    Adapted to ANSI C, Richard Stallman, Jan 1987
   6    Broken out to separate file, Zack Weinberg, Mar 2000
   7
   8 This program is free software; you can redistribute it and/or modify it
   9 under the terms of the GNU General Public License as published by the
  10 Free Software Foundation; either version 3, or (at your option) any
  11 later version.
  12
  13 This program is distributed in the hope that it will be useful,
  14 but WITHOUT ANY WARRANTY; without even the implied warranty of
  15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  16 GNU General Public License for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with this program; see the file COPYING3.  If not see
  20 <http://www.gnu.org/licenses/>.  */
  21
  22 #include "config.h"
  23 #include "system.h"
  24 #include "cpplib.h"
  25 #include "internal.h"
  26
   27 enum spell_type
   28 {
        /* Spelling category stored in each token_spellings entry.  The
           category is chosen by the OP and TK table macros.  */
   29   SPELL_OPERATOR = 0,   /* Given to every OP() table entry.  */
   30   SPELL_IDENT,          /* Given to TK() entries declared as IDENT.  */
   31   SPELL_LITERAL,        /* Given to TK() entries declared as LITERAL.  */
   32   SPELL_NONE            /* Given to TK() entries declared as NONE.  */
   33 };
  34
   35 struct token_spelling
   36 {
        /* One row of the token_spellings table, which is indexed by token
           type.  */
   37   enum spell_type category;     /* Spelling category of the token type.  */
   38   const unsigned char *name;    /* Spelling given to OP(), or the
                                        stringified enumerator for TK().  */
   39 };
  40
   41 static const unsigned char *const digraph_spellings[] =
   42 { UC"%:", UC"%:%:", UC"<:", UC":>", UC"<%", UC"%>" };
      /* The table above holds the digraph spellings.  NOTE(review): the
         order presumably matches an index defined outside this chunk --
         confirm before reordering.  */
   43
      /* token_spellings is generated from TTYPE_TABLE.  An OP entry
         becomes { SPELL_OPERATOR, its spelling S }; a TK entry becomes
         { SPELL_<S>, the stringified enumerator name E }.  The helper
         macros are undefined again once the table has been built.  */
   44 #define OP(e, s) { SPELL_OPERATOR, UC s  },
   45 #define TK(e, s) { SPELL_ ## s,    UC #e },
   46 static const struct token_spelling token_spellings[N_TTYPES] = { TTYPE_TABLE };
   47 #undef OP
   48 #undef TK
   49
      /* Spelling category and spelling string for TOKEN, looked up by its
         type.  */
   50 #define TOKEN_SPELL(token) (token_spellings[(token)->type].category)
   51 #define TOKEN_NAME(token) (token_spellings[(token)->type].name)
  52
  53 static void add_line_note (cpp_buffer *, const uchar *, unsigned int);
  54 static int skip_line_comment (cpp_reader *);
  55 static void skip_whitespace (cpp_reader *, cppchar_t);
  56 static void lex_string (cpp_reader *, cpp_token *, const uchar *);
  57 static void save_comment (cpp_reader *, cpp_token *, const uchar *, cppchar_t);
  58 static void store_comment (cpp_reader *, cpp_token *);
  59 static void create_literal (cpp_reader *, cpp_token *, const uchar *,
  60                             unsigned int, enum cpp_ttype);
  61 static bool warn_in_comment (cpp_reader *, _cpp_line_note *);
  62 static int name_p (cpp_reader *, const cpp_string *);
  63 static tokenrun *next_tokenrun (tokenrun *);
  64
  65 static _cpp_buff *new_buff (size_t);
  66
  67
  68 /* Utility routine:
  69
  70    Compares, the token TOKEN to the NUL-terminated string STRING.
  71    TOKEN must be a CPP_NAME.  Returns 1 for equal, 0 for unequal.  */
  72 int
  73 cpp_ideq (const cpp_token *token, const char *string)
  74 {
  75   if (token->type != CPP_NAME)
  76     return 0;
  77
  78   return !ustrcmp (NODE_NAME (token->val.node.node), (const uchar *) string);
  79 }
  80
  81 /* Record a note TYPE at byte POS into the current cleaned logical
  82    line.  */
  83 static void
  84 add_line_note (cpp_buffer *buffer, const uchar *pos, unsigned int type)
  85 {
  86   if (buffer->notes_used == buffer->notes_cap)
  87     {
  88       buffer->notes_cap = buffer->notes_cap * 2 + 200;
  89       buffer->notes = XRESIZEVEC (_cpp_line_note, buffer->notes,
  90                                   buffer->notes_cap);
  91     }
  92
  93   buffer->notes[buffer->notes_used].pos = pos;
  94   buffer->notes[buffer->notes_used].type = type;
  95   buffer->notes_used++;
  96 }
  97
  98 \f
  99 /* Fast path to find line special characters using optimized character
 100    scanning algorithms.  Anything complicated falls back to the slow
 101    path below.  Since this loop is very hot it's worth doing these kinds
 102    of optimizations.
 103
 104    One of the paths through the ifdefs should provide
 105
 106      const uchar *search_line_fast (const uchar *s, const uchar *end);
 107
 108    Between S and END, search for \n, \r, \\, ?.  Return a pointer to
 109    the found character.
 110
 111    Note that the last character of the buffer is *always* a newline,
 112    as forced by _cpp_convert_input.  This fact can be used to avoid
 113    explicitly looking for the end of the buffer.  */
 114
  115 /* Configure gives us an ifdef test.  Give the macro a 0 value when it
         is absent so that it can be used in ordinary C conditions below.  */
  116 #ifndef WORDS_BIGENDIAN
  117 #define WORDS_BIGENDIAN 0
  118 #endif
  119
  120 /* We'd like the largest integer that fits into a register.  There's nothing
  121    in <stdint.h> that gives us that.  For most hosts this is unsigned long,
  122    but MS decided on an LLP64 model.  Thankfully when building with GCC we
  123    can get the "real" word size.  */
  124 #ifdef __GNUC__
  125 typedef unsigned int word_type __attribute__((__mode__(__word__)));
  126 #else
  127 typedef unsigned long word_type;
  128 #endif
  129
  130 /* The code below is only expecting sizes 4 or 8.
  131    Die at compile-time if this expectation is violated.  The array
         size below evaluates to 1 when the size is acceptable and to -1,
         an invalid array size, when it is not.  */
  132 typedef char check_word_type_size
  133   [(sizeof(word_type) == 8 || sizeof(word_type) == 4) * 2 - 1];
 134
 135 /* Return X with the first N bytes forced to values that won't match one
 136    of the interesting characters.  Note that NUL is not interesting.  */
 137
 138 static inline word_type
 139 acc_char_mask_misalign (word_type val, unsigned int n)
 140 {
 141   word_type mask = -1;
 142   if (WORDS_BIGENDIAN)
 143     mask >>= n * 8;
 144   else
 145     mask <<= n * 8;
 146   return val & mask;
 147 }
 148
 149 /* Return X replicated to all byte positions within WORD_TYPE.  */
 150
 151 static inline word_type
 152 acc_char_replicate (uchar x)
 153 {
 154   word_type ret;
 155
 156   ret = (x << 24) | (x << 16) | (x << 8) | x;
 157   if (sizeof(word_type) == 8)
 158     ret = (ret << 16 << 16) | ret;
 159   return ret;
 160 }
 161
 162 /* Return non-zero if some byte of VAL is (probably) C.  */
 163
 164 static inline word_type
 165 acc_char_cmp (word_type val, word_type c)
 166 {
 167 #if defined(__GNUC__) && defined(__alpha__)
 168   /* We can get exact results using a compare-bytes instruction.
 169      Get (val == c) via (0 >= (val ^ c)).  */
 170   return __builtin_alpha_cmpbge (0, val ^ c);
 171 #else
 172   word_type magic = 0x7efefefeU;
 173   if (sizeof(word_type) == 8)
 174     magic = (magic << 16 << 16) | 0xfefefefeU;
 175   magic |= 1;
 176
 177   val ^= c;
 178   return ((val + magic) ^ ~val) & ~magic;
 179 #endif
 180 }
 181
 182 /* Given the result of acc_char_cmp is non-zero, return the index of
 183    the found character.  If this was a false positive, return -1.  */
 184
 185 static inline int
 186 acc_char_index (word_type cmp ATTRIBUTE_UNUSED,
 187                 word_type val ATTRIBUTE_UNUSED)
 188 {
 189 #if defined(__GNUC__) && defined(__alpha__) && !WORDS_BIGENDIAN
 190   /* The cmpbge instruction sets *bits* of the result corresponding to
 191      matches in the bytes with no false positives.  */
 192   return __builtin_ctzl (cmp);
 193 #else
 194   unsigned int i;
 195
 196   /* ??? It would be nice to force unrolling here,
 197      and have all of these constants folded.  */
 198   for (i = 0; i < sizeof(word_type); ++i)
 199     {
 200       uchar c;
 201       if (WORDS_BIGENDIAN)
 202         c = (val >> (sizeof(word_type) - i - 1) * 8) & 0xff;
 203       else
 204         c = (val >> i * 8) & 0xff;
 205
 206       if (c == '\n' || c == '\r' || c == '\\' || c == '?')
 207         return i;
 208     }
 209
 210   return -1;
 211 #endif
 212 }
 213
 214 /* A version of the fast scanner using bit fiddling techniques.
 215
 216    For 32-bit words, one would normally perform 16 comparisons and
 217    16 branches.  With this algorithm one performs 24 arithmetic
 218    operations and one branch.  Whether this is faster with a 32-bit
 219    word size is going to be somewhat system dependent.
 220
 221    For 64-bit words, we eliminate twice the number of comparisons
 222    and branches without increasing the number of arithmetic operations.
 223    It's almost certainly going to be a win with 64-bit word size.  */
 224
 225 static const uchar * search_line_acc_char (const uchar *, const uchar *)
 226   ATTRIBUTE_UNUSED;
 227
 228 static const uchar *
 229 search_line_acc_char (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 230 {
 231   const word_type repl_nl = acc_char_replicate ('\n');
 232   const word_type repl_cr = acc_char_replicate ('\r');
 233   const word_type repl_bs = acc_char_replicate ('\\');
 234   const word_type repl_qm = acc_char_replicate ('?');
 235
 236   unsigned int misalign;
 237   const word_type *p;
 238   word_type val, t;
 239
 240   /* Align the buffer.  Mask out any bytes from before the beginning.  */
 241   p = (word_type *)((uintptr_t)s & -sizeof(word_type));
 242   val = *p;
 243   misalign = (uintptr_t)s & (sizeof(word_type) - 1);
 244   if (misalign)
 245     val = acc_char_mask_misalign (val, misalign);
 246
 247   /* Main loop.  */
 248   while (1)
 249     {
 250       t  = acc_char_cmp (val, repl_nl);
 251       t |= acc_char_cmp (val, repl_cr);
 252       t |= acc_char_cmp (val, repl_bs);
 253       t |= acc_char_cmp (val, repl_qm);
 254
 255       if (__builtin_expect (t != 0, 0))
 256         {
 257           int i = acc_char_index (t, val);
 258           if (i >= 0)
 259             return (const uchar *)p + i;
 260         }
 261
 262       val = *++p;
 263     }
 264 }
 265
 266 /* Disable on Solaris 2/x86 until the following problem can be properly
 267    autoconfed:
 268
 269    The Solaris 10+ assembler tags objects with the instruction set
 270    extensions used, so SSE4.2 executables cannot run on machines that
 271    don't support that extension.  */
 272
 273 #if (GCC_VERSION >= 4005) && (defined(__i386__) || defined(__x86_64__)) && !(defined(__sun__) && defined(__svr4__))
 274
  275 /* Replicated character data to be shared between implementations.
  276    Recall that outside of a context with vector support we can't
  277    define compatible vector types, therefore these are all defined
  278    in terms of raw characters.
         
         Row 0 holds '\n', row 1 '\r', row 2 '\\' and row 3 '?'.  Each row
         is 16 bytes long: the MMX scanner reads the first 8 bytes of a
         row as one vector and the SSE2 scanner reads all 16.  */
  279 static const char repl_chars[4][16] __attribute__((aligned(16))) = {
  280   { '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
  281     '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n' },
  282   { '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
  283     '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r' },
  284   { '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
  285     '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\' },
  286   { '?', '?', '?', '?', '?', '?', '?', '?',
  287     '?', '?', '?', '?', '?', '?', '?', '?' },
  288 };
 289
  290 /* A version of the fast scanner using MMX vectorized byte compare insns.
  291
  292    This uses the PMOVMSKB instruction which was introduced with "MMX2",
  293    which was packaged into SSE1; it is also present in the AMD MMX
  294    extension.  Mark the function as using "sse" so that we emit a real
  295    "emms" instruction, rather than the 3dNOW "femms" instruction.
         
         Returns a pointer to the first \n, \r, \\ or ? found at or after S.
         END is not consulted; the loop only stops when it finds a match.  */
  296
  297 static const uchar *
  298 #ifndef __SSE__
  299 __attribute__((__target__("sse")))
  300 #endif
  301 search_line_mmx (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
  302 {
  303   typedef char v8qi __attribute__ ((__vector_size__ (8)));
  304   typedef int __m64 __attribute__ ((__vector_size__ (8), __may_alias__));
  305
        /* Only the first 8 bytes of each 16-byte repl_chars row are used.  */
  306   const v8qi repl_nl = *(const v8qi *)repl_chars[0];
  307   const v8qi repl_cr = *(const v8qi *)repl_chars[1];
  308   const v8qi repl_bs = *(const v8qi *)repl_chars[2];
  309   const v8qi repl_qm = *(const v8qi *)repl_chars[3];
  310
  311   unsigned int misalign, found, mask;
  312   const v8qi *p;
  313   v8qi data, t, c;
  314
  315   /* Align the source pointer.  While MMX doesn't generate unaligned data
  316      faults, this allows us to safely scan to the end of the buffer without
  317      reading beyond the end of the last page.  */
  318   misalign = (uintptr_t)s & 7;
  319   p = (const v8qi *)((uintptr_t)s & -8);
  320   data = *p;
  321
  322   /* Create a mask for the bytes that are valid within the first
  323      8-byte block: the low MISALIGN bits, which stand for the bytes
         before S, are cleared.  The Idea here is that the AND with the mask
  324      within the loop is "free", since we need some AND or TEST
  325      insn in order to set the flags for the branch anyway.  */
  326   mask = -1u << misalign;
  327
  328   /* Main loop processing 8 bytes at a time.  The first block has
           already been loaded, so enter the loop body at START.  */
  329   goto start;
  330   do
  331     {
  332       data = *++p;
  333       mask = -1;        /* Every byte of a later block is valid.  */
  334
  335     start:
            /* T accumulates the per-byte matches against all four
               characters.  */
  336       t = __builtin_ia32_pcmpeqb(data, repl_nl);
  337       c = __builtin_ia32_pcmpeqb(data, repl_cr);
  338       t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
  339       c = __builtin_ia32_pcmpeqb(data, repl_bs);
  340       t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
  341       c = __builtin_ia32_pcmpeqb(data, repl_qm);
  342       t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
  343       found = __builtin_ia32_pmovmskb (t);
  344       found &= mask;
  345     }
  346   while (!found);
  347
  348   __builtin_ia32_emms ();
  349
  350   /* FOUND contains 1 in bits for which we matched a relevant
  351      character.  Conversion to the byte index is trivial.  */
  352   found = __builtin_ctz(found);
  353   return (const uchar *)p + found;
  354 }
 355
  356 /* A version of the fast scanner using SSE2 vectorized byte compare insns.
         
         Returns a pointer to the first \n, \r, \\ or ? found at or after S.
         END is not consulted; the loop only stops when it finds a match.  */
  357
  358 static const uchar *
  359 #ifndef __SSE2__
  360 __attribute__((__target__("sse2")))
  361 #endif
  362 search_line_sse2 (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
  363 {
  364   typedef char v16qi __attribute__ ((__vector_size__ (16)));
  365
  366   const v16qi repl_nl = *(const v16qi *)repl_chars[0];
  367   const v16qi repl_cr = *(const v16qi *)repl_chars[1];
  368   const v16qi repl_bs = *(const v16qi *)repl_chars[2];
  369   const v16qi repl_qm = *(const v16qi *)repl_chars[3];
  370
  371   unsigned int misalign, found, mask;
  372   const v16qi *p;
  373   v16qi data, t;
  374
  375   /* Align the source pointer.  The first load therefore starts at or
           before S, on a 16-byte boundary.  */
  376   misalign = (uintptr_t)s & 15;
  377   p = (const v16qi *)((uintptr_t)s & -16);
  378   data = *p;
  379
  380   /* Create a mask for the bytes that are valid within the first
  381      16-byte block: the low MISALIGN bits, which stand for the bytes
         before S, are cleared.  The Idea here is that the AND with the mask
  382      within the loop is "free", since we need some AND or TEST
  383      insn in order to set the flags for the branch anyway.  */
  384   mask = -1u << misalign;
  385
  386   /* Main loop processing 16 bytes at a time.  The first block has
           already been loaded, so enter the loop body at START.  */
  387   goto start;
  388   do
  389     {
  390       data = *++p;
  391       mask = -1;        /* Every byte of a later block is valid.  */
  392
  393     start:
            /* T accumulates the per-byte matches against all four
               characters.  */
  394       t  = __builtin_ia32_pcmpeqb128(data, repl_nl);
  395       t |= __builtin_ia32_pcmpeqb128(data, repl_cr);
  396       t |= __builtin_ia32_pcmpeqb128(data, repl_bs);
  397       t |= __builtin_ia32_pcmpeqb128(data, repl_qm);
  398       found = __builtin_ia32_pmovmskb128 (t);
  399       found &= mask;
  400     }
  401   while (!found);
  402
  403   /* FOUND contains 1 in bits for which we matched a relevant
  404      character.  Conversion to the byte index is trivial.  */
  405   found = __builtin_ctz(found);
  406   return (const uchar *)p + found;
  407 }
 408
  409 #ifdef HAVE_SSE4
  410 /* A version of the fast scanner using SSE 4.2 vectorized string insns.
         
         Returns a pointer to the first \n, \r, \\ or ? found at or after S.
         END is used only to decide whether an unaligned 16-byte read at S
         is safe.  */
  411
  412 static const uchar *
  413 #ifndef __SSE4_2__
  414 __attribute__((__target__("sse4.2")))
  415 #endif
  416 search_line_sse42 (const uchar *s, const uchar *end)
  417 {
  418   typedef char v16qi __attribute__ ((__vector_size__ (16)));
        /* The four characters searched for.  Only the first four elements
           are significant; the instruction is given a set length of 4.  */
  419   static const v16qi search = { '\n', '\r', '?', '\\' };
  420
  421   uintptr_t si = (uintptr_t)s;
  422   uintptr_t index;
  423
  424   /* Check for unaligned input.  */
  425   if (si & 15)
  426     {
  427       v16qi sv;
  428
  429       if (__builtin_expect (end - s < 16, 0)
  430           && __builtin_expect ((si & 0xfff) > 0xff0, 0))
  431         {
  432           /* There are less than 16 bytes left in the buffer, and less
  433              than 16 bytes left on the page.  Reading 16 bytes at this
  434              point might generate a spurious page fault.  Defer to the
  435              SSE2 implementation, which already handles alignment.  */
  436           return search_line_sse2 (s, end);
  437         }
  438
  439       /* ??? The builtin doesn't understand that the PCMPESTRI read from
  440          memory need not be aligned.  */
  441       sv = __builtin_ia32_loaddqu ((const char *) s);
  442       index = __builtin_ia32_pcmpestri128 (search, 4, sv, 16, 0);
  443
            /* An index below 16 is the offset of a match within SV.  */
  444       if (__builtin_expect (index < 16, 0))
  445         goto found;
  446
  447       /* Advance the pointer to an aligned address.  We will re-scan a
  448          few bytes, but we no longer need care for reading past the
  449          end of a page, since we're guaranteed a match.  */
  450       s = (const uchar *)((si + 16) & -16);
  451     }
  452
  453   /* Main loop, processing 16 bytes at a time.  By doing the whole loop
  454      in inline assembly, we can make proper use of the flags set.
           PCMPESTRI leaves the match index in ECX and sets the carry flag
           when a match was found, so the loop repeats while carry is clear.
           S is decremented by 16 up front because the loop adds 16 before
           each comparison.  EAX and EDX carry the lengths 4 and 16.  */
  455   __asm (      "sub $16, %1\n"
  456         "       .balign 16\n"
  457         "0:     add $16, %1\n"
  458         "       %vpcmpestri $0, (%1), %2\n"
  459         "       jnc 0b"
  460         : "=&c"(index), "+r"(s)
  461         : "x"(search), "a"(4), "d"(16));
  462
  463  found:
  464   return s + index;
  465 }
  466
  467 #else
  468 /* Work around out-dated assemblers without sse4 support: the SSE4.2
         name simply refers to the SSE2 scanner.  */
  469 #define search_line_sse42 search_line_sse2
  470 #endif
 471
 472 /* Check the CPU capabilities.  */
 473
 474 #include "../gcc/config/i386/cpuid.h"
 475
  476 typedef const uchar * (*search_line_fast_type) (const uchar *, const uchar *);
      /* The type above is the signature shared by the search_line_*
         scanners.  The pointer below holds the implementation selected by
         init_vectorized_lexer.  */
  477 static search_line_fast_type search_line_fast;
 478
 479 #define HAVE_init_vectorized_lexer 1
 480 static inline void
 481 init_vectorized_lexer (void)
 482 {
 483   unsigned dummy, ecx = 0, edx = 0;
 484   search_line_fast_type impl = search_line_acc_char;
 485   int minimum = 0;
 486
 487 #if defined(__SSE4_2__)
 488   minimum = 3;
 489 #elif defined(__SSE2__)
 490   minimum = 2;
 491 #elif defined(__SSE__)
 492   minimum = 1;
 493 #endif
 494
 495   if (minimum == 3)
 496     impl = search_line_sse42;
 497   else if (__get_cpuid (1, &dummy, &dummy, &ecx, &edx) || minimum == 2)
 498     {
 499       if (minimum == 3 || (ecx & bit_SSE4_2))
 500         impl = search_line_sse42;
 501       else if (minimum == 2 || (edx & bit_SSE2))
 502         impl = search_line_sse2;
 503       else if (minimum == 1 || (edx & bit_SSE))
 504         impl = search_line_mmx;
 505     }
 506   else if (__get_cpuid (0x80000001, &dummy, &dummy, &dummy, &edx))
 507     {
 508       if (minimum == 1
 509           || (edx & (bit_MMXEXT | bit_CMOV)) == (bit_MMXEXT | bit_CMOV))
 510         impl = search_line_mmx;
 511     }
 512
 513   search_line_fast = impl;
 514 }
 515
 516 #elif (GCC_VERSION >= 4005) && defined(__ALTIVEC__)
 517
  518 /* A version of the fast scanner using AltiVec vectorized byte compares.
         Returns a pointer to the first \n, \r, \\ or ? found at or after S.
         END is not consulted; the loop only stops when it finds a match.  */
  519 /* ??? Unfortunately, attribute(target("altivec")) is not yet supported,
  520    so we can't compile this function without -maltivec on the command line
  521    (or implied by some other switch).  */
  522
  523 static const uchar *
  524 search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
  525 {
  526   typedef __attribute__((altivec(vector))) unsigned char vc;
  527
  528   const vc repl_nl = {
  529     '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
  530     '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n'
  531   };
  532   const vc repl_cr = {
  533     '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
  534     '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r'
  535   };
  536   const vc repl_bs = {
  537     '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
  538     '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\'
  539   };
  540   const vc repl_qm = {
  541     '?', '?', '?', '?', '?', '?', '?', '?',
  542     '?', '?', '?', '?', '?', '?', '?', '?',
  543   };
  544   const vc ones = {
  545     -1, -1, -1, -1, -1, -1, -1, -1,
  546     -1, -1, -1, -1, -1, -1, -1, -1,
  547   };
  548   const vc zero = { 0 };
  549
  550   vc data, mask, t;
  551
  552   /* Altivec loads automatically mask addresses with -16.  This lets us
  553      issue the first load as early as possible.  */
  554   data = __builtin_vec_ld(0, (const vc *)s);
  555
  556   /* Discard bytes before the beginning of the buffer.  Do this by
  557      beginning with all ones and shifting in zeros according to the
  558      mis-alignment.  The LVSR instruction pulls the exact shift we
  559      want from the address.  */
  560 #ifdef __BIG_ENDIAN__
  561   mask = __builtin_vec_lvsr(0, s);
  562   mask = __builtin_vec_perm(zero, ones, mask);
  563 #else
  564   mask = __builtin_vec_lvsl(0, s);
  565   mask = __builtin_vec_perm(ones, zero, mask);
  566 #endif
  567   data &= mask;
  568
  569   /* While altivec loads mask addresses, we still need to align S so
  570      that the offset we compute at the end is correct.  */
  571   s = (const uchar *)((uintptr_t)s & -16);
  572
  573   /* Main loop processing 16 bytes at a time.  The first block has
           already been loaded and masked, so enter the loop body at START.  */
  574   goto start;
  575   do
  576     {
  577       vc m_nl, m_cr, m_bs, m_qm;
  578
  579       s += 16;
  580       data = __builtin_vec_ld(0, (const vc *)s);
  581
  582     start:
  583       m_nl = (vc) __builtin_vec_cmpeq(data, repl_nl);
  584       m_cr = (vc) __builtin_vec_cmpeq(data, repl_cr);
  585       m_bs = (vc) __builtin_vec_cmpeq(data, repl_bs);
  586       m_qm = (vc) __builtin_vec_cmpeq(data, repl_qm);
  587       t = (m_nl | m_cr) | (m_bs | m_qm);
  588
  589       /* T now contains 0xff in bytes for which we matched one of the relevant
  590          characters.  We want to exit the loop if any byte in T is non-zero.
  591          Below is the expansion of vec_any_ne(t, zero).  */
  592     }
  593   while (!__builtin_vec_vcmpeq_p(/*__CR6_LT_REV*/3, t, zero));
  594
  595   {
          /* N is the number of unsigned longs that make up one vector.  */
  596 #define N  (sizeof(vc) / sizeof(long))
  597
          /* View the match vector T as an array of N unsigned longs.  */
  598     union {
  599       vc v;
  600       /* Statically assert that N is 2 or 4.  */
  601       unsigned long l[(N == 2 || N == 4) ? N : -1];
  602     } u;
  603     unsigned long l, i = 0;
  604
  605     u.v = t;
  606
  607     /* Find the first word of T that is non-zero.  S is advanced past
             each all-zero word that is skipped, so that it ends up pointing
             at the start of the word containing the match.  */
  608     switch (N)
  609       {
  610       case 4:
  611         l = u.l[i++];
  612         if (l != 0)
  613           break;
  614         s += sizeof(unsigned long);
  615         l = u.l[i++];
  616         if (l != 0)
  617           break;
  618         s += sizeof(unsigned long);
              /* Fall through: the last two words are examined the same way
                 whether N is 2 or 4.  */
  619       case 2:
  620         l = u.l[i++];
  621         if (l != 0)
  622           break;
  623         s += sizeof(unsigned long);
  624         l = u.l[i];
  625       }
  626
  627     /* L now contains 0xff in bytes for which we matched one of the
  628        relevant characters.  We can find the byte index by finding
  629        its bit index and dividing by 8.  */
  630 #ifdef __BIG_ENDIAN__
  631     l = __builtin_clzl(l) >> 3;
  632 #else
  633     l = __builtin_ctzl(l) >> 3;
  634 #endif
  635     return s + l;
  636
  637 #undef N
  638   }
  639 }
 640
 641 #elif defined (__ARM_NEON__)
 642 #include "arm_neon.h"
 643
  644 static const uchar *
  645 search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
  646 {
        /* A version of the fast scanner using ARM NEON vector compares.
           Returns a pointer to the first \n, \r, \\ or ? found at or after
           S.  END is not consulted; the loop only stops when it finds a
           match.  */
  647   const uint8x16_t repl_nl = vdupq_n_u8 ('\n');
  648   const uint8x16_t repl_cr = vdupq_n_u8 ('\r');
  649   const uint8x16_t repl_bs = vdupq_n_u8 ('\\');
  650   const uint8x16_t repl_qm = vdupq_n_u8 ('?');
        /* XMASK gives each byte of an 8-byte half its own bit (0x01 for the
           first byte up to 0x80 for the last), so that summing the masked
           match bytes of a half yields an 8-bit bitmask.  */
  651   const uint8x16_t xmask = (uint8x16_t) vdupq_n_u64 (0x8040201008040201ULL);
  652
  653   unsigned int misalign, found, mask;
  654   const uint8_t *p;
  655   uint8x16_t data;
  656
  657   /* Align the source pointer.  */
  658   misalign = (uintptr_t)s & 15;
  659   p = (const uint8_t *)((uintptr_t)s & -16);
  660   data = vld1q_u8 (p);
  661
  662   /* Create a mask for the bytes that are valid within the first
  663      16-byte block: the low MISALIGN bits, which stand for the bytes
         before S, are cleared.  The Idea here is that the AND with the mask
  664      within the loop is "free", since we need some AND or TEST
  665      insn in order to set the flags for the branch anyway.  */
  666   mask = (-1u << misalign) & 0xffff;
  667
  668   /* Main loop, processing 16 bytes at a time.  The first block has
           already been loaded, so enter the loop body at START.  */
  669   goto start;
  670
  671   do
  672     {
  673       uint8x8_t l;
  674       uint16x4_t m;
  675       uint32x2_t n;
  676       uint8x16_t t, u, v, w;
  677
  678       p += 16;
  679       data = vld1q_u8 (p);
  680       mask = 0xffff;    /* Every byte of a later block is valid.  */
  681
  682     start:
            /* Compare against all four characters, keep one distinct bit
               per matching byte, then add pairwise until each half of the
               block has been reduced to a single 8-bit bitmask.  */
  683       t = vceqq_u8 (data, repl_nl);
  684       u = vceqq_u8 (data, repl_cr);
  685       v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
  686       w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
  687       t = vandq_u8 (vorrq_u8 (v, w), xmask);
  688       l = vpadd_u8 (vget_low_u8 (t), vget_high_u8 (t));
  689       m = vpaddl_u8 (l);
  690       n = vpaddl_u16 (m);
  691
            /* Lane 0 of N holds the bitmask of the low half and lane 1 that
               of the high half.  Shifting the 64-bit view right by 24 moves
               the high-half bitmask to bits 8-15 of lane 0.  */
  692       found = vget_lane_u32 ((uint32x2_t) vorr_u64 ((uint64x1_t) n,
  693               vshr_n_u64 ((uint64x1_t) n, 24)), 0);
  694       found &= mask;
  695     }
  696   while (!found);
  697
  698   /* FOUND contains 1 in bits for which we matched a relevant
  699      character.  Conversion to the byte index is trivial.  */
  700   found = __builtin_ctz (found);
  701   return (const uchar *)p + found;
  702 }
 703
 704 #else
 705
  706 /* We only have one accelerated alternative.  Use a direct call so that
  707    we encourage inlining.  */
  708
  709 #define search_line_fast  search_line_acc_char
 710
 711 #endif
 712
 713 /* Initialize the lexer if needed.  */
 714
 715 void
 716 _cpp_init_lexer (void)
 717 {
 718 #ifdef HAVE_init_vectorized_lexer
 719   init_vectorized_lexer ();
 720 #endif
 721 }
 722
  723 /* Returns with a logical line that contains no escaped newlines or
  724    trigraphs.  This is a time-critical inner loop.
         
         The line starts at the buffer's next_line.  Escaped newlines are
         spliced out, and trigraphs are replaced when they are enabled, by
         rewriting the buffer in place; a line note is recorded for each so
         that diagnostics can be issued later.  On return the buffer's cur
         and line_base point at the start of the line, the line's
         terminator has been overwritten with '\n', and next_line points
         just past the input that was consumed.  */
  725 void
  726 _cpp_clean_line (cpp_reader *pfile)
  727 {
  728   cpp_buffer *buffer;
  729   const uchar *s;
  730   uchar c, *d, *p;
  731
  732   buffer = pfile->buffer;
  733   buffer->cur_note = buffer->notes_used = 0;
  734   buffer->cur = buffer->line_base = buffer->next_line;
  735   buffer->need_line = false;
  736   s = buffer->next_line;
  737
  738   if (!buffer->from_stage3)
  739     {
            /* Most recent backslash seen on the fast path, if any.  */
  740       const uchar *pbackslash = NULL;
  741
  742       /* Fast path.  This is the common case of an un-escaped line with
  743          no trigraphs.  The primary win here is by not writing any
  744          data back to memory until we have to.  */
  745       while (1)
  746         {
  747           /* Perform an optimized search for \n, \r, \\, ?.  */
  748           s = search_line_fast (s, buffer->rlimit);
  749
  750           c = *s;
  751           if (c == '\\')
  752             {
  753               /* Record the location of the backslash and continue.  */
  754               pbackslash = s++;
  755             }
  756           else if (__builtin_expect (c == '?', 0))
  757             {
  758               if (__builtin_expect (s[1] == '?', false)
  759                    && _cpp_trigraph_map[s[2]])
  760                 {
  761                   /* Have a trigraph.  We may or may not have to convert
  762                      it.  Add a line note regardless, for -Wtrigraphs.  */
  763                   add_line_note (buffer, s, s[2]);
  764                   if (CPP_OPTION (pfile, trigraphs))
  765                     {
  766                       /* We do, and that means we have to switch to the
  767                          slow path.  The replacement character is
                             stored over the first '?', and S is left on the
                             last character of the trigraph.  */
  768                       d = (uchar *) s;
  769                       *d = _cpp_trigraph_map[s[2]];
  770                       s += 2;
  771                       goto slow_path;
  772                     }
  773                 }
  774               /* Not a trigraph.  Continue on fast-path.  */
  775               s++;
  776             }
  777           else
  778             break;
  779         }
  780
  781       /* This must be \r or \n.  We're either done, or we'll be forced
  782          to write back to the buffer and continue on the slow path.  */
  783       d = (uchar *) s;
  784
  785       if (__builtin_expect (s == buffer->rlimit, false))
  786         goto done;
  787
  788       /* DOS line ending? */
  789       if (__builtin_expect (c == '\r', false) && s[1] == '\n')
  790         {
  791           s++;
  792           if (s == buffer->rlimit)
  793             goto done;
  794         }
  795
  796       if (__builtin_expect (pbackslash == NULL, true))
  797         goto done;
  798
  799       /* Check for escaped newline.  Step back over any horizontal
               whitespace before the line end; the newline is escaped only
               if the character before that is the last backslash seen.  */
  800       p = d;
  801       while (is_nvspace (p[-1]))
  802         p--;
  803       if (p - 1 != pbackslash)
  804         goto done;
  805
  806       /* Have an escaped newline; process it and proceed to
  807          the slow path.  The note type records whether whitespace
               separated the backslash from the newline.  D is placed so that
               the next character copied overwrites the backslash, and
               next_line marks where the splice happened.  */
  808       add_line_note (buffer, p - 1, p != d ? ' ' : '\\');
  809       d = p - 2;
  810       buffer->next_line = p - 1;
  811
  812     slow_path:
            /* Slow path: copy characters from S down to D one at a time,
               handling further escaped newlines and trigraphs as they are
               met.  */
  813       while (1)
  814         {
  815           c = *++s;
  816           *++d = c;
  817
  818           if (c == '\n' || c == '\r')
  819             {
  820               /* Handle DOS line endings.  */
  821               if (c == '\r' && s != buffer->rlimit && s[1] == '\n')
  822                 s++;
  823               if (s == buffer->rlimit)
  824                 break;
  825
  826               /* Escaped?  The backwards scan stops at next_line, which
                       marks the most recent splice point.  */
  827               p = d;
  828               while (p != buffer->next_line && is_nvspace (p[-1]))
  829                 p--;
  830               if (p == buffer->next_line || p[-1] != '\\')
  831                 break;
  832
  833               add_line_note (buffer, p - 1, p != d ? ' ': '\\');
  834               d = p - 2;
  835               buffer->next_line = p - 1;
  836             }
  837           else if (c == '?' && s[1] == '?' && _cpp_trigraph_map[s[2]])
  838             {
  839               /* Add a note regardless, for the benefit of -Wtrigraphs.  */
  840               add_line_note (buffer, d, s[2]);
  841               if (CPP_OPTION (pfile, trigraphs))
  842                 {
  843                   *d = _cpp_trigraph_map[s[2]];
  844                   s += 2;
  845                 }
  846             }
  847         }
  848     }
  849   else
  850     {
            /* The buffer is marked from_stage3: only find the end of the
               line, without looking for escaped newlines or trigraphs.  */
  851       while (*s != '\n' && *s != '\r')
  852         s++;
  853       d = (uchar *) s;
  854
  855       /* Handle DOS line endings.  */
  856       if (*s == '\r' && s != buffer->rlimit && s[1] == '\n')
  857         s++;
  858     }
  859
  860  done:
        /* D is the end of the cleaned line and S the last input character
           consumed.  Terminate the line with a plain newline.  */
  861   *d = '\n';
  862   /* A sentinel note that should never be processed.  */
  863   add_line_note (buffer, d + 1, '\n');
  864   buffer->next_line = s + 1;
  865 }
 866
 867 /* Return true if the trigraph indicated by NOTE should be warned
 868    about in a comment.  */
 869 static bool
 870 warn_in_comment (cpp_reader *pfile, _cpp_line_note *note)
 871 {
 872   const uchar *p;
 873
 874   /* Within comments we don't warn about trigraphs, unless the
 875      trigraph forms an escaped newline, as that may change
 876      behavior.  */
 877   if (note->type != '/')
 878     return false;
 879
 880   /* If -trigraphs, then this was an escaped newline iff the next note
 881      is coincident.  */
 882   if (CPP_OPTION (pfile, trigraphs))
 883     return note[1].pos == note->pos;
 884
 885   /* Otherwise, see if this forms an escaped newline.  */
 886   p = note->pos + 3;
 887   while (is_nvspace (*p))
 888     p++;
 889
 890   /* There might have been escaped newlines between the trigraph and the
 891      newline we found.  Hence the position test.  */
 892   return (*p == '\n' && p < note[1].pos);
 893 }
 894
 895 /* Process the notes created by add_line_note as far as the current
 896    location.  */
 897 void
 898 _cpp_process_line_notes (cpp_reader *pfile, int in_comment)
 899 {
 900   cpp_buffer *buffer = pfile->buffer;
 901
 902   for (;;)
 903     {
 904       _cpp_line_note *note = &buffer->notes[buffer->cur_note];
 905       unsigned int col;
 906
 907       if (note->pos > buffer->cur)
 908         break;
 909
 910       buffer->cur_note++;
 911       col = CPP_BUF_COLUMN (buffer, note->pos + 1);
 912
 913       if (note->type == '\\' || note->type == ' ')
 914         {
 915           if (note->type == ' ' && !in_comment)
 916             cpp_error_with_line (pfile, CPP_DL_WARNING, pfile->line_table->highest_line, col,
 917                                  "backslash and newline separated by space");
 918
 919           if (buffer->next_line > buffer->rlimit)
 920             {
 921               cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line, col,
 922                                    "backslash-newline at end of file");
 923               /* Prevent "no newline at end of file" warning.  */
 924               buffer->next_line = buffer->rlimit;
 925             }
 926
 927           buffer->line_base = note->pos;
 928           CPP_INCREMENT_LINE (pfile, 0);
 929         }
 930       else if (_cpp_trigraph_map[note->type])
 931         {
 932           if (CPP_OPTION (pfile, warn_trigraphs)
 933               && (!in_comment || warn_in_comment (pfile, note)))
 934             {
 935               if (CPP_OPTION (pfile, trigraphs))
 936                 cpp_warning_with_line (pfile, CPP_W_TRIGRAPHS,
 937                                        pfile->line_table->highest_line, col,
 938                                        "trigraph ??%c converted to %c",
 939                                        note->type,
 940                                        (int) _cpp_trigraph_map[note->type]);
 941               else
 942                 {
 943                   cpp_warning_with_line
 944                     (pfile, CPP_W_TRIGRAPHS,
 945                      pfile->line_table->highest_line, col,
 946                      "trigraph ??%c ignored, use -trigraphs to enable",
 947                      note->type);
 948                 }
 949             }
 950         }
 951       else if (note->type == 0)
 952         /* Already processed in lex_raw_string.  */;
 953       else
 954         abort ();
 955     }
 956 }
 957
 958 /* Skip a C-style block comment.  We find the end of the comment by
 959    seeing if an asterisk is before every '/' we encounter.  Returns
 960    nonzero if comment terminated by EOF, zero otherwise.
 961
 962    Buffer->cur points to the initial asterisk of the comment.  */
 963 bool
 964 _cpp_skip_block_comment (cpp_reader *pfile)
 965 {
 966   cpp_buffer *buffer = pfile->buffer;
 967   const uchar *cur = buffer->cur;
 968   uchar c;
 969
 970   cur++;
 971   if (*cur == '/')
 972     cur++;
 973
 974   for (;;)
 975     {
 976       /* People like decorating comments with '*', so check for '/'
 977          instead for efficiency.  */
 978       c = *cur++;
 979
 980       if (c == '/')
 981         {
 982           if (cur[-2] == '*')
 983             break;
 984
 985           /* Warn about potential nested comments, but not if the '/'
 986              comes immediately before the true comment delimiter.
 987              Don't bother to get it right across escaped newlines.  */
 988           if (CPP_OPTION (pfile, warn_comments)
 989               && cur[0] == '*' && cur[1] != '/')
 990             {
 991               buffer->cur = cur;
 992               cpp_warning_with_line (pfile, CPP_W_COMMENTS,
 993                                      pfile->line_table->highest_line,
 994                                      CPP_BUF_COL (buffer),
 995                                      "\"/*\" within comment");
 996             }
 997         }
 998       else if (c == '\n')
 999         {
1000           unsigned int cols;
1001           buffer->cur = cur - 1;
1002           _cpp_process_line_notes (pfile, true);
1003           if (buffer->next_line >= buffer->rlimit)
1004             return true;
1005           _cpp_clean_line (pfile);
1006
1007           cols = buffer->next_line - buffer->line_base;
1008           CPP_INCREMENT_LINE (pfile, cols);
1009
1010           cur = buffer->cur;
1011         }
1012     }
1013
1014   buffer->cur = cur;
1015   _cpp_process_line_notes (pfile, true);
1016   return false;
1017 }
1018
1019 /* Skip a C++ line comment, leaving buffer->cur pointing to the
1020    terminating newline.  Handles escaped newlines.  Returns nonzero
1021    if a multiline comment.  */
1022 static int
1023 skip_line_comment (cpp_reader *pfile)
1024 {
1025   cpp_buffer *buffer = pfile->buffer;
1026   source_location orig_line = pfile->line_table->highest_line;
1027
1028   while (*buffer->cur != '\n')
1029     buffer->cur++;
1030
1031   _cpp_process_line_notes (pfile, true);
1032   return orig_line != pfile->line_table->highest_line;
1033 }
1034
1035 /* Skips whitespace, saving the next non-whitespace character.  */
1036 static void
1037 skip_whitespace (cpp_reader *pfile, cppchar_t c)
1038 {
1039   cpp_buffer *buffer = pfile->buffer;
1040   bool saw_NUL = false;
1041
1042   do
1043     {
1044       /* Horizontal space always OK.  */
1045       if (c == ' ' || c == '\t')
1046         ;
1047       /* Just \f \v or \0 left.  */
1048       else if (c == '\0')
1049         saw_NUL = true;
1050       else if (pfile->state.in_directive && CPP_PEDANTIC (pfile))
1051         cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line,
1052                              CPP_BUF_COL (buffer),
1053                              "%s in preprocessing directive",
1054                              c == '\f' ? "form feed" : "vertical tab");
1055
1056       c = *buffer->cur++;
1057     }
1058   /* We only want non-vertical space, i.e. ' ' \t \f \v \0.  */
1059   while (is_nvspace (c));
1060
1061   if (saw_NUL)
1062     cpp_error (pfile, CPP_DL_WARNING, "null character(s) ignored");
1063
1064   buffer->cur--;
1065 }
1066
1067 /* See if the characters of a number token are valid in a name (no
1068    '.', '+' or '-').  */
1069 static int
1070 name_p (cpp_reader *pfile, const cpp_string *string)
1071 {
1072   unsigned int i;
1073
1074   for (i = 0; i < string->len; i++)
1075     if (!is_idchar (string->text[i]))
1076       return 0;
1077
1078   return 1;
1079 }
1080
1081 /* After parsing an identifier or other sequence, produce a warning about
1082    sequences not in NFC/NFKC.  */
1083 static void
1084 warn_about_normalization (cpp_reader *pfile,
1085                           const cpp_token *token,
1086                           const struct normalize_state *s)
1087 {
1088   if (CPP_OPTION (pfile, warn_normalize) < NORMALIZE_STATE_RESULT (s)
1089       && !pfile->state.skipping)
1090     {
1091       /* Make sure that the token is printed using UCNs, even
1092          if we'd otherwise happily print UTF-8.  */
1093       unsigned char *buf = XNEWVEC (unsigned char, cpp_token_len (token));
1094       size_t sz;
1095
1096       sz = cpp_spell_token (pfile, token, buf, false) - buf;
1097       if (NORMALIZE_STATE_RESULT (s) == normalized_C)
1098         cpp_warning_with_line (pfile, CPP_W_NORMALIZE, token->src_loc, 0,
1099                                "`%.*s' is not in NFKC", (int) sz, buf);
1100       else
1101         cpp_warning_with_line (pfile, CPP_W_NORMALIZE, token->src_loc, 0,
1102                                "`%.*s' is not in NFC", (int) sz, buf);
1103       free (buf);
1104     }
1105 }
1106
1107 /* Returns TRUE if the sequence starting at buffer->cur is invalid in
1108    an identifier.  FIRST is TRUE if this starts an identifier.  */
1109 static bool
1110 forms_identifier_p (cpp_reader *pfile, int first,
1111                     struct normalize_state *state)
1112 {
1113   cpp_buffer *buffer = pfile->buffer;
1114
1115   if (*buffer->cur == '$')
1116     {
1117       if (!CPP_OPTION (pfile, dollars_in_ident))
1118         return false;
1119
1120       buffer->cur++;
1121       if (CPP_OPTION (pfile, warn_dollars) && !pfile->state.skipping)
1122         {
1123           CPP_OPTION (pfile, warn_dollars) = 0;
1124           cpp_error (pfile, CPP_DL_PEDWARN, "'$' in identifier or number");
1125         }
1126
1127       return true;
1128     }
1129
1130   /* Is this a syntactically valid UCN?  */
1131   if (CPP_OPTION (pfile, extended_identifiers)
1132       && *buffer->cur == '\\'
1133       && (buffer->cur[1] == 'u' || buffer->cur[1] == 'U'))
1134     {
1135       buffer->cur += 2;
1136       if (_cpp_valid_ucn (pfile, &buffer->cur, buffer->rlimit, 1 + !first,
1137                           state))
1138         return true;
1139       buffer->cur -= 2;
1140     }
1141
1142   return false;
1143 }
1144
1145 /* Helper function to get the cpp_hashnode of the identifier BASE.  */
1146 static cpp_hashnode *
1147 lex_identifier_intern (cpp_reader *pfile, const uchar *base)
1148 {
1149   cpp_hashnode *result;
1150   const uchar *cur;
1151   unsigned int len;
1152   unsigned int hash = HT_HASHSTEP (0, *base);
1153
1154   cur = base + 1;
1155   while (ISIDNUM (*cur))
1156     {
1157       hash = HT_HASHSTEP (hash, *cur);
1158       cur++;
1159     }
1160   len = cur - base;
1161   hash = HT_HASHFINISH (hash, len);
1162   result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
1163                                               base, len, hash, HT_ALLOC));
1164
1165   /* Rarely, identifiers require diagnostics when lexed.  */
1166   if (__builtin_expect ((result->flags & NODE_DIAGNOSTIC)
1167                         && !pfile->state.skipping, 0))
1168     {
1169       /* It is allowed to poison the same identifier twice.  */
1170       if ((result->flags & NODE_POISONED) && !pfile->state.poisoned_ok)
1171         cpp_error (pfile, CPP_DL_ERROR, "attempt to use poisoned \"%s\"",
1172                    NODE_NAME (result));
1173
1174       /* Constraint 6.10.3.5: __VA_ARGS__ should only appear in the
1175          replacement list of a variadic macro.  */
1176       if (result == pfile->spec_nodes.n__VA_ARGS__
1177           && !pfile->state.va_args_ok)
1178         {
1179           if (CPP_OPTION (pfile, cplusplus))
1180             cpp_error (pfile, CPP_DL_PEDWARN,
1181                        "__VA_ARGS__ can only appear in the expansion"
1182                        " of a C++11 variadic macro");
1183           else
1184             cpp_error (pfile, CPP_DL_PEDWARN,
1185                        "__VA_ARGS__ can only appear in the expansion"
1186                        " of a C99 variadic macro");
1187         }
1188
1189       /* For -Wc++-compat, warn about use of C++ named operators.  */
1190       if (result->flags & NODE_WARN_OPERATOR)
1191         cpp_warning (pfile, CPP_W_CXX_OPERATOR_NAMES,
1192                      "identifier \"%s\" is a special operator name in C++",
1193                      NODE_NAME (result));
1194     }
1195
1196   return result;
1197 }
1198
1199 /* Get the cpp_hashnode of an identifier specified by NAME in
1200    the current cpp_reader object.  If none is found, NULL is returned.  */
1201 cpp_hashnode *
1202 _cpp_lex_identifier (cpp_reader *pfile, const char *name)
1203 {
1204   cpp_hashnode *result;
1205   result = lex_identifier_intern (pfile, (uchar *) name);
1206   return result;
1207 }
1208
1209 /* Lex an identifier starting at BUFFER->CUR - 1.  */
1210 static cpp_hashnode *
1211 lex_identifier (cpp_reader *pfile, const uchar *base, bool starts_ucn,
1212                 struct normalize_state *nst)
1213 {
1214   cpp_hashnode *result;
1215   const uchar *cur;
1216   unsigned int len;
1217   unsigned int hash = HT_HASHSTEP (0, *base);
1218
1219   cur = pfile->buffer->cur;
1220   if (! starts_ucn)
1221     {
1222       while (ISIDNUM (*cur))
1223         {
1224           hash = HT_HASHSTEP (hash, *cur);
1225           cur++;
1226         }
1227       NORMALIZE_STATE_UPDATE_IDNUM (nst, *(cur - 1));
1228     }
1229   pfile->buffer->cur = cur;
1230   if (starts_ucn || forms_identifier_p (pfile, false, nst))
1231     {
1232       /* Slower version for identifiers containing UCNs (or $).  */
1233       do {
1234         while (ISIDNUM (*pfile->buffer->cur))
1235           {
1236             NORMALIZE_STATE_UPDATE_IDNUM (nst, *pfile->buffer->cur);
1237             pfile->buffer->cur++;
1238           }
1239       } while (forms_identifier_p (pfile, false, nst));
1240       result = _cpp_interpret_identifier (pfile, base,
1241                                           pfile->buffer->cur - base);
1242     }
1243   else
1244     {
1245       len = cur - base;
1246       hash = HT_HASHFINISH (hash, len);
1247
1248       result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
1249                                                   base, len, hash, HT_ALLOC));
1250     }
1251
1252   /* Rarely, identifiers require diagnostics when lexed.  */
1253   if (__builtin_expect ((result->flags & NODE_DIAGNOSTIC)
1254                         && !pfile->state.skipping, 0))
1255     {
1256       /* It is allowed to poison the same identifier twice.  */
1257       if ((result->flags & NODE_POISONED) && !pfile->state.poisoned_ok)
1258         cpp_error (pfile, CPP_DL_ERROR, "attempt to use poisoned \"%s\"",
1259                    NODE_NAME (result));
1260
1261       /* Constraint 6.10.3.5: __VA_ARGS__ should only appear in the
1262          replacement list of a variadic macro.  */
1263       if (result == pfile->spec_nodes.n__VA_ARGS__
1264           && !pfile->state.va_args_ok)
1265         {
1266           if (CPP_OPTION (pfile, cplusplus))
1267             cpp_error (pfile, CPP_DL_PEDWARN,
1268                        "__VA_ARGS__ can only appear in the expansion"
1269                        " of a C++11 variadic macro");
1270           else
1271             cpp_error (pfile, CPP_DL_PEDWARN,
1272                        "__VA_ARGS__ can only appear in the expansion"
1273                        " of a C99 variadic macro");
1274         }
1275
1276       /* For -Wc++-compat, warn about use of C++ named operators.  */
1277       if (result->flags & NODE_WARN_OPERATOR)
1278         cpp_warning (pfile, CPP_W_CXX_OPERATOR_NAMES,
1279                      "identifier \"%s\" is a special operator name in C++",
1280                      NODE_NAME (result));
1281     }
1282
1283   return result;
1284 }
1285
1286 /* Lex a number to NUMBER starting at BUFFER->CUR - 1.  */
1287 static void
1288 lex_number (cpp_reader *pfile, cpp_string *number,
1289             struct normalize_state *nst)
1290 {
1291   const uchar *cur;
1292   const uchar *base;
1293   uchar *dest;
1294
1295   base = pfile->buffer->cur - 1;
1296   do
1297     {
1298       cur = pfile->buffer->cur;
1299
1300       /* N.B. ISIDNUM does not include $.  */
1301       while (ISIDNUM (*cur) || *cur == '.' || DIGIT_SEP (*cur)
1302              || VALID_SIGN (*cur, cur[-1]))
1303         {
1304           NORMALIZE_STATE_UPDATE_IDNUM (nst, *cur);
1305           cur++;
1306         }
1307
1308       pfile->buffer->cur = cur;
1309     }
1310   while (forms_identifier_p (pfile, false, nst));
1311
1312   number->len = cur - base;
1313   dest = _cpp_unaligned_alloc (pfile, number->len + 1);
1314   memcpy (dest, base, number->len);
1315   dest[number->len] = '\0';
1316   number->text = dest;
1317 }
1318
1319 /* Create a token of type TYPE with a literal spelling.  */
1320 static void
1321 create_literal (cpp_reader *pfile, cpp_token *token, const uchar *base,
1322                 unsigned int len, enum cpp_ttype type)
1323 {
1324   uchar *dest = _cpp_unaligned_alloc (pfile, len + 1);
1325
1326   memcpy (dest, base, len);
1327   dest[len] = '\0';
1328   token->type = type;
1329   token->val.str.len = len;
1330   token->val.str.text = dest;
1331 }
1332
1333 /* Subroutine of lex_raw_string: Append LEN chars from BASE to the buffer
1334    sequence from *FIRST_BUFF_P to LAST_BUFF_P.  */
1335
1336 static void
1337 bufring_append (cpp_reader *pfile, const uchar *base, size_t len,
1338                 _cpp_buff **first_buff_p, _cpp_buff **last_buff_p)
1339 {
1340   _cpp_buff *first_buff = *first_buff_p;
1341   _cpp_buff *last_buff = *last_buff_p;
1342
1343   if (first_buff == NULL)
1344     first_buff = last_buff = _cpp_get_buff (pfile, len);
1345   else if (len > BUFF_ROOM (last_buff))
1346     {
1347       size_t room = BUFF_ROOM (last_buff);
1348       memcpy (BUFF_FRONT (last_buff), base, room);
1349       BUFF_FRONT (last_buff) += room;
1350       base += room;
1351       len -= room;
1352       last_buff = _cpp_append_extend_buff (pfile, last_buff, len);
1353     }
1354
1355   memcpy (BUFF_FRONT (last_buff), base, len);
1356   BUFF_FRONT (last_buff) += len;
1357
1358   *first_buff_p = first_buff;
1359   *last_buff_p = last_buff;
1360 }
1361
1362
1363 /* Returns true if a macro has been defined.
1364    This might not work if compile with -save-temps,
1365    or preprocess separately from compilation.  */
1366
1367 static bool
1368 is_macro(cpp_reader *pfile, const uchar *base)
1369 {
1370   const uchar *cur = base;
1371   if (! ISIDST (*cur))
1372     return false;
1373   unsigned int hash = HT_HASHSTEP (0, *cur);
1374   ++cur;
1375   while (ISIDNUM (*cur))
1376     {
1377       hash = HT_HASHSTEP (hash, *cur);
1378       ++cur;
1379     }
1380   hash = HT_HASHFINISH (hash, cur - base);
1381
1382   cpp_hashnode *result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
1383                                         base, cur - base, hash, HT_NO_INSERT));
1384
1385   return !result ? false : (result->type == NT_MACRO);
1386 }
1387
1388
1389 /* Lexes a raw string.  The stored string contains the spelling, including
1390    double quotes, delimiter string, '(' and ')', any leading
1391    'L', 'u', 'U' or 'u8' and 'R' modifier.  Nothing is returned; the
1392    result is stored in TOKEN as described under "Outcomes" below.
1393
1394    The spelling is NUL-terminated, but it is not guaranteed that this
1395    is the first NUL since embedded NULs are preserved.

        BASE points at the first character of the literal, including any
        encoding prefix.  CUR is advanced by one before scanning starts;
        presumably it points at the opening double quote on entry --
        confirm against the caller.

        Outcomes:
        - properly terminated: TOKEN gets the string type selected from the
          prefix at BASE, passed through cpp_userdef_string_add_type when a
          user-defined literal suffix is consumed;
        - invalid or over-long delimiter: an error is reported, the buffer
          position is reset to where CUR pointed on entry, and TOKEN becomes
          a CPP_OTHER literal holding only the characters before that
          position;
        - newline inside a directive, or with state.parsing_args set and no
          further line in the buffer: "unterminated raw string" is reported
          and TOKEN becomes a CPP_OTHER literal holding the text lexed so
          far;
        - no fresh line can be obtained: "unterminated raw string" is
          reported and TOKEN becomes CPP_EOF.  */
1396
1397 static void
1398 lex_raw_string (cpp_reader *pfile, cpp_token *token, const uchar *base,
1399                 const uchar *cur)
1400 {
       /* Delimiter characters collected after the opening quote.  Once the
          '(' is reached its slot is overwritten with '"', so the array then
          holds the text that must follow a ')' to end the literal.  */
1401   uchar raw_prefix[17];
       /* Characters of the prefix or suffix candidate currently being
          collected.  */
1402   uchar temp_buffer[18];
       /* BASE as passed in; BASE itself moves as text is flushed.  */
1403   const uchar *orig_base;
1404   unsigned int raw_prefix_len = 0, raw_suffix_len = 0;
1405   enum raw_str_phase { RAW_STR_PREFIX, RAW_STR, RAW_STR_SUFFIX };
1406   raw_str_phase phase = RAW_STR_PREFIX;
1407   enum cpp_ttype type;
1408   size_t total_len = 0;
1409   /* Index into temp_buffer during phases other than RAW_STR,
1410      during RAW_STR phase 17 to tell BUF_APPEND that nothing should
1411      be appended to temp_buffer.  */
1412   size_t temp_buffer_len = 0;
1413   _cpp_buff *first_buff = NULL, *last_buff = NULL;
1414   size_t raw_prefix_start;
1415   _cpp_line_note *note = &pfile->buffer->notes[pfile->buffer->cur_note];
1416
       /* Pick the string type from the encoding prefix at BASE.  */
1417   type = (*base == 'L' ? CPP_WSTRING :
1418           *base == 'U' ? CPP_STRING32 :
1419           *base == 'u' ? (base[1] == '8' ? CPP_UTF8STRING : CPP_STRING16)
1420           : CPP_STRING);
1421
     /* Append LEN bytes at STR to the buffer ring and add LEN to total_len.
        While temp_buffer_len is below 17, appends of at most two bytes that
        do not start at BASE are also copied into temp_buffer.  Note that
        the expansion already ends in a semicolon.  */
1422 #define BUF_APPEND(STR,LEN)                                     \
1423       do {                                                      \
1424         bufring_append (pfile, (const uchar *)(STR), (LEN),     \
1425                         &first_buff, &last_buff);               \
1426         total_len += (LEN);                                     \
1427         if (__builtin_expect (temp_buffer_len < 17, 0)          \
1428             && (const uchar *)(STR) != base                     \
1429             && (LEN) <= 2)                                      \
1430           {                                                     \
1431             memcpy (temp_buffer + temp_buffer_len,              \
1432                     (const uchar *)(STR), (LEN));               \
1433             temp_buffer_len += (LEN);                           \
1434           }                                                     \
1435       } while (0);
1436
       /* Step over the character CUR points at on entry and remember the
          offset from BASE of what follows it.  */
1437   orig_base = base;
1438   ++cur;
1439   raw_prefix_start = cur - base;
1440   for (;;)
1441     {
1442       cppchar_t c;
1443
1444       /* If we previously performed any trigraph or line splicing
1445          transformations, undo them in between the opening and closing
1446          double quote.  */
1447       while (note->pos < cur)
1448         ++note;
1449       for (; note->pos == cur; ++note)
1450         {
1451           switch (note->type)
1452             {
1453             case '\\':
1454             case ' ':
1455               /* Restore backslash followed by newline.  */
1456               BUF_APPEND (base, cur - base);
1457               base = cur;
1458               BUF_APPEND ("\\", 1);
1459             after_backslash:
1460               if (note->type == ' ')
1461                 {
1462                   /* GNU backslash whitespace newline extension.  FIXME
1463                      could be any sequence of non-vertical space.  When we
1464                      can properly restore any such sequence, we should mark
1465                      this note as handled so _cpp_process_line_notes
1466                      doesn't warn.  */
1467                   BUF_APPEND (" ", 1);
1468                 }
1469
1470               BUF_APPEND ("\n", 1);
1471               break;
1472
1473             case 0:
1474               /* Already handled.  */
1475               break;
1476
1477             default:
1478               if (_cpp_trigraph_map[note->type])
1479                 {
1480                   /* Don't warn about this trigraph in
1481                      _cpp_process_line_notes, since trigraphs show up as
1482                      trigraphs in raw strings.  */
1483                   uchar type = note->type;
1484                   note->type = 0;
1485
1486                   if (!CPP_OPTION (pfile, trigraphs))
1487                     /* If we didn't convert the trigraph in the first
1488                        place, don't do anything now either.  */
1489                     break;
1490
1491                   BUF_APPEND (base, cur - base);
1492                   base = cur;
1493                   BUF_APPEND ("??", 2);
1494
1495                   /* ??/ followed by newline gets two line notes, one for
1496                      the trigraph and one for the backslash/newline.  */
1497                   if (type == '/' && note[1].pos == cur)
1498                     {
1499                       if (note[1].type != '\\'
1500                           && note[1].type != ' ')
1501                         abort ();
1502                       BUF_APPEND ("/", 1);
1503                       ++note;
1504                       goto after_backslash;
1505                     }
1506                   else
1507                     {
1508                       /* Skip the replacement character.  */
1509                       base = ++cur;
1510                       BUF_APPEND (&type, 1);
1511                       c = type;
1512                       goto check_c;
1513                     }
1514                 }
1515               else
1516                 abort ();
1517               break;
1518             }
1519         }
           /* Take the next character as it stands, also recording it in
              temp_buffer while temp_buffer_len is below 17.  */
1520       c = *cur++;
1521       if (__builtin_expect (temp_buffer_len < 17, 0))
1522         temp_buffer[temp_buffer_len++] = c;
1523
1524      check_c:
1525       if (phase == RAW_STR_PREFIX)
1526         {
               /* Move newly collected characters from temp_buffer into
                  raw_prefix, checking each one.  Characters in the second
                  case group are accepted while fewer than 16 have been
                  stored; in every other situation control leaves the
                  switch and the character is examined below.  */
1527           while (raw_prefix_len < temp_buffer_len)
1528             {
1529               raw_prefix[raw_prefix_len] = temp_buffer[raw_prefix_len];
1530               switch (raw_prefix[raw_prefix_len])
1531                 {
1532                 case ' ': case '(': case ')': case '\\': case '\t':
1533                 case '\v': case '\f': case '\n': default:
1534                   break;
1535                 /* Basic source charset except the above chars.  */
1536                 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
1537                 case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
1538                 case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
1539                 case 's': case 't': case 'u': case 'v': case 'w': case 'x':
1540                 case 'y': case 'z':
1541                 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
1542                 case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
1543                 case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
1544                 case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
1545                 case 'Y': case 'Z':
1546                 case '0': case '1': case '2': case '3': case '4': case '5':
1547                 case '6': case '7': case '8': case '9':
1548                 case '_': case '{': case '}': case '#': case '[': case ']':
1549                 case '<': case '>': case '%': case ':': case ';': case '.':
1550                 case '?': case '*': case '+': case '-': case '/': case '^':
1551                 case '&': case '|': case '~': case '!': case '=': case ',':
1552                 case '"': case '\'':
1553                   if (raw_prefix_len < 16)
1554                     {
1555                       raw_prefix_len++;
1556                       continue;
1557                     }
1558                   break;
1559                 }
1560
                   /* Only '(' may end the delimiter.  Anything else is
                      reported, the buffer position is reset to where CUR
                      pointed on entry, and the characters in front of that
                      position are handed back as a CPP_OTHER literal.  */
1561               if (raw_prefix[raw_prefix_len] != '(')
1562                 {
1563                   int col = CPP_BUF_COLUMN (pfile->buffer, cur) + 1;
1564                   if (raw_prefix_len == 16)
1565                     cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc,
1566                                          col, "raw string delimiter longer "
1567                                               "than 16 characters");
1568                   else if (raw_prefix[raw_prefix_len] == '\n')
1569                     cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc,
1570                                          col, "invalid new-line in raw "
1571                                               "string delimiter");
1572                   else
1573                     cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc,
1574                                          col, "invalid character '%c' in "
1575                                               "raw string delimiter",
1576                                          (int) raw_prefix[raw_prefix_len]);
1577                   pfile->buffer->cur = orig_base + raw_prefix_start - 1;
1578                   create_literal (pfile, token, orig_base,
1579                                   raw_prefix_start - 1, CPP_OTHER);
1580                   if (first_buff)
1581                     _cpp_release_buff (pfile, first_buff);
1582                   return;
1583                 }
                   /* Replace the '(' with '"': raw_prefix now holds the
                      delimiter followed by the closing quote, which is
                      what the suffix phase compares against.  */
1584               raw_prefix[raw_prefix_len] = '"';
1585               phase = RAW_STR;
1586               /* Nothing should be appended to temp_buffer during
1587                  RAW_STR phase.  */
1588               temp_buffer_len = 17;
1589               break;
1590             }
1591           continue;
1592         }
1593       else if (phase == RAW_STR_SUFFIX)
1594         {
               /* Compare what has been collected since the last ')' with
                  raw_prefix[0 .. raw_prefix_len], i.e. the delimiter and
                  the closing quote.  */
1595           while (raw_suffix_len <= raw_prefix_len
1596                  && raw_suffix_len < temp_buffer_len
1597                  && temp_buffer[raw_suffix_len] == raw_prefix[raw_suffix_len])
1598             raw_suffix_len++;
               /* The whole closing sequence matched: the literal ends.  */
1599           if (raw_suffix_len > raw_prefix_len)
1600             break;
               /* Everything seen so far matches; more characters are
                  needed.  */
1601           if (raw_suffix_len == temp_buffer_len)
1602             continue;
               /* Mismatch: this was ordinary string content after all.  */
1603           phase = RAW_STR;
1604           /* Nothing should be appended to temp_buffer during
1605              RAW_STR phase.  */
1606           temp_buffer_len = 17;
1607         }
           /* A ')' starts (or restarts) collection of a possible closing
              sequence.  */
1608       if (c == ')')
1609         {
1610           phase = RAW_STR_SUFFIX;
1611           raw_suffix_len = 0;
1612           temp_buffer_len = 0;
1613         }
1614       else if (c == '\n')
1615         {
               /* A newline ends the literal early inside a directive, or
                  when state.parsing_args is set and the buffer has no
                  further line (next_line >= rlimit).  */
1616           if (pfile->state.in_directive
1617               || (pfile->state.parsing_args
1618                   && pfile->buffer->next_line >= pfile->buffer->rlimit))
1619             {
1620               cur--;
1621               type = CPP_OTHER;
1622               cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc, 0,
1623                                    "unterminated raw string");
1624               break;
1625             }
1626
               /* Save this line's text in the buffer ring before moving on
                  to the next line.  */
1627           BUF_APPEND (base, cur - base);
1628
1629           if (pfile->buffer->cur < pfile->buffer->rlimit)
1630             CPP_INCREMENT_LINE (pfile, 0);
1631           pfile->buffer->need_line = true;
1632
1633           pfile->buffer->cur = cur-1;
1634           _cpp_process_line_notes (pfile, false);
               /* No fresh line: turn TOKEN into CPP_EOF, release the buffer
                  ring and report the error at the literal's original
                  location, which is saved before src_loc is overwritten.  */
1635           if (!_cpp_get_fresh_line (pfile))
1636             {
1637               source_location src_loc = token->src_loc;
1638               token->type = CPP_EOF;
1639               /* Tell the compiler the line number of the EOF token.  */
1640               token->src_loc = pfile->line_table->highest_line;
1641               token->flags = BOL;
1642               if (first_buff != NULL)
1643                 _cpp_release_buff (pfile, first_buff);
1644               cpp_error_with_line (pfile, CPP_DL_ERROR, src_loc, 0,
1645                                    "unterminated raw string");
1646               return;
1647             }
1648
               /* Continue scanning at the start of the fresh line, with
                  the note cursor reset to the buffer's current note.  */
1649           cur = base = pfile->buffer->cur;
1650           note = &pfile->buffer->notes[pfile->buffer->cur_note];
1651         }
1652     }
1653
1654   if (CPP_OPTION (pfile, user_literals))
1655     {
1656       /* If a string format macro, say from inttypes.h, is placed touching
1657          a string literal it could be parsed as a C++11 user-defined string
1658          literal thus breaking the program.
1659          Try to identify macros with is_macro. A warning is issued. */
1660       if (is_macro (pfile, cur))
1661         {
1662           /* Raise a warning, but do not consume subsequent tokens.  */
1663           if (CPP_OPTION (pfile, warn_literal_suffix) && !pfile->state.skipping)
1664             cpp_warning_with_line (pfile, CPP_W_LITERAL_SUFFIX,
1665                                    token->src_loc, 0,
1666                                    "invalid suffix on literal; C++11 requires "
1667                                    "a space between literal and string macro");
1668         }
1669       /* Grab user defined literal suffix.  */
1670       else if (ISIDST (*cur))
1671         {
1672           type = cpp_userdef_string_add_type (type);
1673           ++cur;
1674
1675           while (ISIDNUM (*cur))
1676             ++cur;
1677         }
1678     }
1679
1680   pfile->buffer->cur = cur;
       /* If nothing was diverted to the buffer ring, the spelling is the
          contiguous range from BASE to CUR.  */
1681   if (first_buff == NULL)
1682     create_literal (pfile, token, base, cur - base, type);
1683   else
1684     {
           /* Otherwise concatenate the contents of every buffer in the
              ring, followed by the remaining text from BASE to CUR.  */
1685       uchar *dest = _cpp_unaligned_alloc (pfile, total_len + (cur - base) + 1);
1686
1687       token->type = type;
1688       token->val.str.len = total_len + (cur - base);
1689       token->val.str.text = dest;
1690       last_buff = first_buff;
1691       while (last_buff != NULL)
1692         {
1693           memcpy (dest, last_buff->base,
1694                   BUFF_FRONT (last_buff) - last_buff->base);
1695           dest += BUFF_FRONT (last_buff) - last_buff->base;
1696           last_buff = last_buff->next;
1697         }
1698       _cpp_release_buff (pfile, first_buff);
1699       memcpy (dest, base, cur - base);
1700       dest[cur - base] = '\0';
1701     }
1702 }
1703
1704 /* Lexes a string, character constant, or angle-bracketed header file
1705    name.  The stored string contains the spelling, including opening
1706    quote and any leading 'L', 'u', 'U' or 'u8' and optional
1707    'R' modifier.  It returns the type of the literal, or CPP_OTHER
1708    if it was not properly terminated, or CPP_LESS for an unterminated
1709    header name which must be relexed as normal tokens.
1710
1711    The spelling is NUL-terminated, but it is not guaranteed that this
1712    is the first NUL since embedded NULs are preserved.  */
1713 static void
1714 lex_string (cpp_reader *pfile, cpp_token *token, const uchar *base)
1715 {
1716   bool saw_NUL = false;
1717   const uchar *cur;
1718   cppchar_t terminator;
1719   enum cpp_ttype type;
1720
1721   cur = base;
1722   terminator = *cur++;
1723   if (terminator == 'L' || terminator == 'U')
1724     terminator = *cur++;
1725   else if (terminator == 'u')
1726     {
1727       terminator = *cur++;
1728       if (terminator == '8')
1729         terminator = *cur++;
1730     }
1731   if (terminator == 'R')
1732     {
1733       lex_raw_string (pfile, token, base, cur);
1734       return;
1735     }
1736   if (terminator == '"')
1737     type = (*base == 'L' ? CPP_WSTRING :
1738             *base == 'U' ? CPP_STRING32 :
1739             *base == 'u' ? (base[1] == '8' ? CPP_UTF8STRING : CPP_STRING16)
1740                          : CPP_STRING);
1741   else if (terminator == '\'')
1742     type = (*base == 'L' ? CPP_WCHAR :
1743             *base == 'U' ? CPP_CHAR32 :
1744             *base == 'u' ? CPP_CHAR16 : CPP_CHAR);
1745   else
1746     terminator = '>', type = CPP_HEADER_NAME;
1747
1748   for (;;)
1749     {
1750       cppchar_t c = *cur++;
1751
1752       /* In #include-style directives, terminators are not escapable.  */
1753       if (c == '\\' && !pfile->state.angled_headers && *cur != '\n')
1754         cur++;
1755       else if (c == terminator)
1756         break;
1757       else if (c == '\n')
1758         {
1759           cur--;
1760           /* Unmatched quotes always yield undefined behavior, but
1761              greedy lexing means that what appears to be an unterminated
1762              header name may actually be a legitimate sequence of tokens.  */
1763           if (terminator == '>')
1764             {
1765               token->type = CPP_LESS;
1766               return;
1767             }
1768           type = CPP_OTHER;
1769           break;
1770         }
1771       else if (c == '\0')
1772         saw_NUL = true;
1773     }
1774
1775   if (saw_NUL && !pfile->state.skipping)
1776     cpp_error (pfile, CPP_DL_WARNING,
1777                "null character(s) preserved in literal");
1778
1779   if (type == CPP_OTHER && CPP_OPTION (pfile, lang) != CLK_ASM)
1780     cpp_error (pfile, CPP_DL_PEDWARN, "missing terminating %c character",
1781                (int) terminator);
1782
1783   if (CPP_OPTION (pfile, user_literals))
1784     {
1785       /* If a string format macro, say from inttypes.h, is placed touching
1786          a string literal it could be parsed as a C++11 user-defined string
1787          literal thus breaking the program.
1788          Try to identify macros with is_macro. A warning is issued. */
1789       if (is_macro (pfile, cur))
1790         {
1791           /* Raise a warning, but do not consume subsequent tokens.  */
1792           if (CPP_OPTION (pfile, warn_literal_suffix) && !pfile->state.skipping)
1793             cpp_warning_with_line (pfile, CPP_W_LITERAL_SUFFIX,
1794                                    token->src_loc, 0,
1795                                    "invalid suffix on literal; C++11 requires "
1796                                    "a space between literal and string macro");
1797         }
1798       /* Grab user defined literal suffix.  */
1799       else if (ISIDST (*cur))
1800         {
1801           type = cpp_userdef_char_add_type (type);
1802           type = cpp_userdef_string_add_type (type);
1803           ++cur;
1804
1805           while (ISIDNUM (*cur))
1806             ++cur;
1807         }
1808     }
1809
1810   pfile->buffer->cur = cur;
1811   create_literal (pfile, token, base, cur - base, type);
1812 }
1813
1814 /* Return the comment table. The client may not make any assumption
1815    about the ordering of the table.  */
1816 cpp_comment_table *
1817 cpp_get_comments (cpp_reader *pfile)
1818 {
1819   return &pfile->comments;
1820 }
1821
1822 /* Append a comment to the end of the comment table. */
1823 static void
1824 store_comment (cpp_reader *pfile, cpp_token *token)
1825 {
1826   int len;
1827
1828   if (pfile->comments.allocated == 0)
1829     {
1830       pfile->comments.allocated = 256;
1831       pfile->comments.entries = (cpp_comment *) xmalloc
1832         (pfile->comments.allocated * sizeof (cpp_comment));
1833     }
1834
1835   if (pfile->comments.count == pfile->comments.allocated)
1836     {
1837       pfile->comments.allocated *= 2;
1838       pfile->comments.entries = (cpp_comment *) xrealloc
1839         (pfile->comments.entries,
1840          pfile->comments.allocated * sizeof (cpp_comment));
1841     }
1842
1843   len = token->val.str.len;
1844
1845   /* Copy comment. Note, token may not be NULL terminated. */
1846   pfile->comments.entries[pfile->comments.count].comment =
1847     (char *) xmalloc (sizeof (char) * (len + 1));
1848   memcpy (pfile->comments.entries[pfile->comments.count].comment,
1849           token->val.str.text, len);
1850   pfile->comments.entries[pfile->comments.count].comment[len] = '\0';
1851
1852   /* Set source location. */
1853   pfile->comments.entries[pfile->comments.count].sloc = token->src_loc;
1854
1855   /* Increment the count of entries in the comment table. */
1856   pfile->comments.count++;
1857 }
1858
1859 /* The stored comment includes the comment start and any terminator.  */
1860 static void
1861 save_comment (cpp_reader *pfile, cpp_token *token, const unsigned char *from,
1862               cppchar_t type)
1863 {
1864   unsigned char *buffer;
1865   unsigned int len, clen, i;
1866
1867   len = pfile->buffer->cur - from + 1; /* + 1 for the initial '/'.  */
1868
1869   /* C++ comments probably (not definitely) have moved past a new
1870      line, which we don't want to save in the comment.  */
1871   if (is_vspace (pfile->buffer->cur[-1]))
1872     len--;
1873
1874   /* If we are currently in a directive or in argument parsing, then
1875      we need to store all C++ comments as C comments internally, and
1876      so we need to allocate a little extra space in that case.
1877
1878      Note that the only time we encounter a directive here is
1879      when we are saving comments in a "#define".  */
1880   clen = ((pfile->state.in_directive || pfile->state.parsing_args)
1881           && type == '/') ? len + 2 : len;
1882
1883   buffer = _cpp_unaligned_alloc (pfile, clen);
1884
1885   token->type = CPP_COMMENT;
1886   token->val.str.len = clen;
1887   token->val.str.text = buffer;
1888
1889   buffer[0] = '/';
1890   memcpy (buffer + 1, from, len - 1);
1891
1892   /* Finish conversion to a C comment, if necessary.  */
1893   if ((pfile->state.in_directive || pfile->state.parsing_args) && type == '/')
1894     {
1895       buffer[1] = '*';
1896       buffer[clen - 2] = '*';
1897       buffer[clen - 1] = '/';
1898       /* As there can be in a C++ comments illegal sequences for C comments
1899          we need to filter them out.  */
1900       for (i = 2; i < (clen - 2); i++)
1901         if (buffer[i] == '/' && (buffer[i - 1] == '*' || buffer[i + 1] == '*'))
1902           buffer[i] = '|';
1903     }
1904
1905   /* Finally store this comment for use by clients of libcpp. */
1906   store_comment (pfile, token);
1907 }
1908
1909 /* Allocate COUNT tokens for RUN.  */
1910 void
1911 _cpp_init_tokenrun (tokenrun *run, unsigned int count)
1912 {
1913   run->base = XNEWVEC (cpp_token, count);
1914   run->limit = run->base + count;
1915   run->next = NULL;
1916 }
1917
1918 /* Returns the next tokenrun, or creates one if there is none.  */
1919 static tokenrun *
1920 next_tokenrun (tokenrun *run)
1921 {
1922   if (run->next == NULL)
1923     {
1924       run->next = XNEW (tokenrun);
1925       run->next->prev = run;
1926       _cpp_init_tokenrun (run->next, 250);
1927     }
1928
1929   return run->next;
1930 }
1931
1932 /* Return the number of not yet processed token in a given
1933    context.  */
1934 int
1935 _cpp_remaining_tokens_num_in_context (cpp_context *context)
1936 {
1937   if (context->tokens_kind == TOKENS_KIND_DIRECT)
1938     return (LAST (context).token - FIRST (context).token);
1939   else if (context->tokens_kind == TOKENS_KIND_INDIRECT
1940            || context->tokens_kind == TOKENS_KIND_EXTENDED)
1941     return (LAST (context).ptoken - FIRST (context).ptoken);
1942   else
1943       abort ();
1944 }
1945
1946 /* Returns the token present at index INDEX in a given context.  If
1947    INDEX is zero, the next token to be processed is returned.  */
1948 static const cpp_token*
1949 _cpp_token_from_context_at (cpp_context *context, int index)
1950 {
1951   if (context->tokens_kind == TOKENS_KIND_DIRECT)
1952     return &(FIRST (context).token[index]);
1953   else if (context->tokens_kind == TOKENS_KIND_INDIRECT
1954            || context->tokens_kind == TOKENS_KIND_EXTENDED)
1955     return FIRST (context).ptoken[index];
1956  else
1957    abort ();
1958 }
1959
1960 /* Look ahead in the input stream.  */
1961 const cpp_token *
1962 cpp_peek_token (cpp_reader *pfile, int index)
1963 {
1964   cpp_context *context = pfile->context;
1965   const cpp_token *peektok;
1966   int count;
1967
1968   /* First, scan through any pending cpp_context objects.  */
1969   while (context->prev)
1970     {
1971       ptrdiff_t sz = _cpp_remaining_tokens_num_in_context (context);
1972
1973       if (index < (int) sz)
1974         return _cpp_token_from_context_at (context, index);
1975       index -= (int) sz;
1976       context = context->prev;
1977     }
1978
1979   /* We will have to read some new tokens after all (and do so
1980      without invalidating preceding tokens).  */
1981   count = index;
1982   pfile->keep_tokens++;
1983
1984   do
1985     {
1986       peektok = _cpp_lex_token (pfile);
1987       if (peektok->type == CPP_EOF)
1988         return peektok;
1989     }
1990   while (index--);
1991
1992   _cpp_backup_tokens_direct (pfile, count + 1);
1993   pfile->keep_tokens--;
1994
1995   return peektok;
1996 }
1997
1998 /* Allocate a single token that is invalidated at the same time as the
1999    rest of the tokens on the line.  Has its line and col set to the
2000    same as the last lexed token, so that diagnostics appear in the
2001    right place.  */
2002 cpp_token *
2003 _cpp_temp_token (cpp_reader *pfile)
2004 {
2005   cpp_token *old, *result;
2006   ptrdiff_t sz = pfile->cur_run->limit - pfile->cur_token;
2007   ptrdiff_t la = (ptrdiff_t) pfile->lookaheads;
2008
2009   old = pfile->cur_token - 1;
2010   /* Any pre-existing lookaheads must not be clobbered.  */
2011   if (la)
2012     {
2013       if (sz <= la)
2014         {
2015           tokenrun *next = next_tokenrun (pfile->cur_run);
2016
2017           if (sz < la)
2018             memmove (next->base + 1, next->base,
2019                      (la - sz) * sizeof (cpp_token));
2020
2021           next->base[0] = pfile->cur_run->limit[-1];
2022         }
2023
2024       if (sz > 1)
2025         memmove (pfile->cur_token + 1, pfile->cur_token,
2026                  MIN (la, sz - 1) * sizeof (cpp_token));
2027     }
2028
2029   if (!sz && pfile->cur_token == pfile->cur_run->limit)
2030     {
2031       pfile->cur_run = next_tokenrun (pfile->cur_run);
2032       pfile->cur_token = pfile->cur_run->base;
2033     }
2034
2035   result = pfile->cur_token++;
2036   result->src_loc = old->src_loc;
2037   return result;
2038 }
2039
2040 /* Lex a token into RESULT (external interface).  Takes care of issues
2041    like directive handling, token lookahead, multiple include
2042    optimization and skipping.  */
2043 const cpp_token *
2044 _cpp_lex_token (cpp_reader *pfile)
2045 {
2046   cpp_token *result;
2047
2048   for (;;)
2049     {
2050       if (pfile->cur_token == pfile->cur_run->limit)
2051         {
2052           pfile->cur_run = next_tokenrun (pfile->cur_run);
2053           pfile->cur_token = pfile->cur_run->base;
2054         }
2055       /* We assume that the current token is somewhere in the current
2056          run.  */
2057       if (pfile->cur_token < pfile->cur_run->base
2058           || pfile->cur_token >= pfile->cur_run->limit)
2059         abort ();
2060
2061       if (pfile->lookaheads)
2062         {
2063           pfile->lookaheads--;
2064           result = pfile->cur_token++;
2065         }
2066       else
2067         result = _cpp_lex_direct (pfile);
2068
2069       if (result->flags & BOL)
2070         {
2071           /* Is this a directive.  If _cpp_handle_directive returns
2072              false, it is an assembler #.  */
2073           if (result->type == CPP_HASH
2074               /* 6.10.3 p 11: Directives in a list of macro arguments
2075                  gives undefined behavior.  This implementation
2076                  handles the directive as normal.  */
2077               && pfile->state.parsing_args != 1)
2078             {
2079               if (_cpp_handle_directive (pfile, result->flags & PREV_WHITE))
2080                 {
2081                   if (pfile->directive_result.type == CPP_PADDING)
2082                     continue;
2083                   result = &pfile->directive_result;
2084                 }
2085             }
2086           else if (pfile->state.in_deferred_pragma)
2087             result = &pfile->directive_result;
2088
2089           if (pfile->cb.line_change && !pfile->state.skipping)
2090             pfile->cb.line_change (pfile, result, pfile->state.parsing_args);
2091         }
2092
2093       /* We don't skip tokens in directives.  */
2094       if (pfile->state.in_directive || pfile->state.in_deferred_pragma)
2095         break;
2096
2097       /* Outside a directive, invalidate controlling macros.  At file
2098          EOF, _cpp_lex_direct takes care of popping the buffer, so we never
2099          get here and MI optimization works.  */
2100       pfile->mi_valid = false;
2101
2102       if (!pfile->state.skipping || result->type == CPP_EOF)
2103         break;
2104     }
2105
2106   return result;
2107 }
2108
2109 /* Returns true if a fresh line has been loaded.  */
2110 bool
2111 _cpp_get_fresh_line (cpp_reader *pfile)
2112 {
2113   int return_at_eof;
2114
2115   /* We can't get a new line until we leave the current directive.  */
2116   if (pfile->state.in_directive)
2117     return false;
2118
2119   for (;;)
2120     {
2121       cpp_buffer *buffer = pfile->buffer;
2122
2123       if (!buffer->need_line)
2124         return true;
2125
2126       if (buffer->next_line < buffer->rlimit)
2127         {
2128           _cpp_clean_line (pfile);
2129           return true;
2130         }
2131
2132       /* First, get out of parsing arguments state.  */
2133       if (pfile->state.parsing_args)
2134         return false;
2135
2136       /* End of buffer.  Non-empty files should end in a newline.  */
2137       if (buffer->buf != buffer->rlimit
2138           && buffer->next_line > buffer->rlimit
2139           && !buffer->from_stage3)
2140         {
2141           /* Clip to buffer size.  */
2142           buffer->next_line = buffer->rlimit;
2143         }
2144
2145       return_at_eof = buffer->return_at_eof;
2146       _cpp_pop_buffer (pfile);
2147       if (pfile->buffer == NULL || return_at_eof)
2148         return false;
2149     }
2150 }
2151
/* Lexing helper for two-character operators.  Expects `buffer' and
   `result' to be in scope at the point of use: sets result->type to
   ELSE_TYPE, and if the next unread character is CHAR, consumes it
   and sets result->type to THEN_TYPE instead.  Every parameter is
   parenthesized in the expansion so that an argument which is itself
   an expression (a conditional, say) is evaluated as a unit.  */
#define IF_NEXT_IS(CHAR, THEN_TYPE, ELSE_TYPE)          \
  do                                                    \
    {                                                   \
      result->type = (ELSE_TYPE);                       \
      if (*buffer->cur == (CHAR))                       \
        buffer->cur++, result->type = (THEN_TYPE);      \
    }                                                   \
  while (0)
2160
2161 /* Lex a token into pfile->cur_token, which is also incremented, to
2162    get diagnostics pointing to the correct location.
2163
2164    Does not handle issues such as token lookahead, multiple-include
2165    optimization, directives, skipping etc.  This function is only
2166    suitable for use by _cpp_lex_token, and in special cases like
2167    lex_expansion_token which doesn't care for any of these issues.
2168
2169    When meeting a newline, returns CPP_EOF if parsing a directive,
2170    otherwise returns to the start of the token buffer if permissible.
2171    Returns the location of the lexed token.  */
/* Control flow in the body is driven by three labels: fresh_line is
   re-entered after a '\n' has been consumed, update_tokens_line after
   a comment that is not being saved, and skipped_white after a run of
   whitespace.  Each of them leads back to the dispatch on the next
   character, so the function only returns once a real token (or
   CPP_EOF / CPP_PRAGMA_EOL) has been produced.  */
2172 cpp_token *
2173 _cpp_lex_direct (cpp_reader *pfile)
2174 {
2175   cppchar_t c;
2176   cpp_buffer *buffer;
2177   const unsigned char *comment_start;
       /* Claim the next token slot; cur_token now points past it.  */
2178   cpp_token *result = pfile->cur_token++;
2179
2180  fresh_line:
2181   result->flags = 0;
2182   buffer = pfile->buffer;
2183   if (buffer->need_line)
2184     {
           /* A deferred pragma ends with its line: report that with a
              CPP_PRAGMA_EOL token, leave deferred-pragma state, and
              decrement prevent_expansion unless expansion was allowed
              for this pragma.  */
2185       if (pfile->state.in_deferred_pragma)
2186         {
2187           result->type = CPP_PRAGMA_EOL;
2188           pfile->state.in_deferred_pragma = false;
2189           if (!pfile->state.pragma_allow_expansion)
2190             pfile->state.prevent_expansion--;
2191           return result;
2192         }
           /* No further line is available: hand back CPP_EOF.  */
2193       if (!_cpp_get_fresh_line (pfile))
2194         {
2195           result->type = CPP_EOF;
2196           if (!pfile->state.in_directive)
2197             {
2198               /* Tell the compiler the line number of the EOF token.  */
2199               result->src_loc = pfile->line_table->highest_line;
2200               result->flags = BOL;
2201             }
2202           return result;
2203         }
           /* Unless tokens are being kept, start over at the first
              slot of the base run and lex into that slot instead.  */
2204       if (!pfile->keep_tokens)
2205         {
2206           pfile->cur_run = &pfile->base_run;
2207           result = pfile->base_run.base;
2208           pfile->cur_token = result + 1;
2209         }
           /* The token about to be lexed is the first on its line.  */
2210       result->flags = BOL;
2211       if (pfile->state.parsing_args == 2)
2212         result->flags |= PREV_WHITE;
2213     }
       /* Re-read the buffer pointer: _cpp_get_fresh_line may have left
          pfile->buffer pointing at a different buffer.  */
2214   buffer = pfile->buffer;
2215  update_tokens_line:
2216   result->src_loc = pfile->line_table->highest_line;
2217
2218  skipped_white:
       /* Process the line notes whose position has been reached, unless
          a buffer is overlaid, and refresh the location afterwards.  */
2219   if (buffer->cur >= buffer->notes[buffer->cur_note].pos
2220       && !pfile->overlaid_buffer)
2221     {
2222       _cpp_process_line_notes (pfile, false);
2223       result->src_loc = pfile->line_table->highest_line;
2224     }
2225   c = *buffer->cur++;
2226
       /* A forced token location, when set, overrides the location
          computed from the current column.  */
2227   if (pfile->forced_token_location_p)
2228     result->src_loc = *pfile->forced_token_location_p;
2229   else
2230     result->src_loc = linemap_position_for_column (pfile->line_table,
2231                                           CPP_BUF_COLUMN (buffer, buffer->cur));
2232
       /* Dispatch on the first character of the token.  Within the
          cases buffer->cur points just past that character.  */
2233   switch (c)
2234     {
2235     case ' ': case '\t': case '\f': case '\v': case '\0':
2236       result->flags |= PREV_WHITE;
2237       skip_whitespace (pfile, c);
2238       goto skipped_white;
2239
2240     case '\n':
           /* The line is only counted while cur is still short of
              rlimit; either way a new line is needed next.  */
2241       if (buffer->cur < buffer->rlimit)
2242         CPP_INCREMENT_LINE (pfile, 0);
2243       buffer->need_line = true;
2244       goto fresh_line;
2245
2246     case '0': case '1': case '2': case '3': case '4':
2247     case '5': case '6': case '7': case '8': case '9':
2248       {
2249         struct normalize_state nst = INITIAL_NORMALIZE_STATE;
2250         result->type = CPP_NUMBER;
2251         lex_number (pfile, &result->val.str, &nst);
2252         warn_about_normalization (pfile, result, &nst);
2253         break;
2254       }
2255
2256     case 'L':
2257     case 'u':
2258     case 'U':
2259     case 'R':
2260       /* 'L', 'u', 'U', 'u8' or 'R' may introduce wide characters,
2261          wide strings or raw strings.  */
2262       if (c == 'L' || CPP_OPTION (pfile, rliterals)
2263           || (c != 'R' && CPP_OPTION (pfile, uliterals)))
2264         {
               /* Accepted continuations: a character constant (not
                  after 'R'), a string, a raw string after a prefix
                  other than 'R', or "u8" followed by a string or a
                  raw string.  Raw forms require the rliterals
                  option.  */
2265           if ((*buffer->cur == '\'' && c != 'R')
2266               || *buffer->cur == '"'
2267               || (*buffer->cur == 'R'
2268                   && c != 'R'
2269                   && buffer->cur[1] == '"'
2270                   && CPP_OPTION (pfile, rliterals))
2271               || (*buffer->cur == '8'
2272                   && c == 'u'
2273                   && (buffer->cur[1] == '"'
2274                       || (buffer->cur[1] == 'R' && buffer->cur[2] == '"'
2275                           && CPP_OPTION (pfile, rliterals)))))
2276             {
                   /* buffer->cur - 1 is the prefix character, i.e. the
                      start of the literal's spelling.  */
2277               lex_string (pfile, result, buffer->cur - 1);
2278               break;
2279             }
2280         }
2281       /* Fall through.  */
2282
         /* 'L', 'R', 'U' and 'u' are missing from the list below because
            they are handled by the case above, which falls through to
            here when they do not introduce a literal.  */
2283     case '_':
2284     case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
2285     case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
2286     case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
2287     case 's': case 't':           case 'v': case 'w': case 'x':
2288     case 'y': case 'z':
2289     case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
2290     case 'G': case 'H': case 'I': case 'J': case 'K':
2291     case 'M': case 'N': case 'O': case 'P': case 'Q':
2292     case 'S': case 'T':           case 'V': case 'W': case 'X':
2293     case 'Y': case 'Z':
2294       result->type = CPP_NAME;
2295       {
2296         struct normalize_state nst = INITIAL_NORMALIZE_STATE;
2297         result->val.node.node = lex_identifier (pfile, buffer->cur - 1, false,
2298                                                 &nst);
2299         warn_about_normalization (pfile, result, &nst);
2300       }
2301
2302       /* Convert named operators to their proper types.  */
2303       if (result->val.node.node->flags & NODE_OPERATOR)
2304         {
2305           result->flags |= NAMED_OP;
2306           result->type = (enum cpp_ttype) result->val.node.node->directive_index;
2307         }
2308       break;
2309
2310     case '\'':
2311     case '"':
2312       lex_string (pfile, result, buffer->cur - 1);
2313       break;
2314
2315     case '/':
2316       /* A potential block or line comment.  */
           /* comment_start and C both refer to the character that
              follows the '/'.  */
2317       comment_start = buffer->cur;
2318       c = *buffer->cur;
2319
2320       if (c == '*')
2321         {
2322           if (_cpp_skip_block_comment (pfile))
2323             cpp_error (pfile, CPP_DL_ERROR, "unterminated comment");
2324         }
2325       else if (c == '/' && (CPP_OPTION (pfile, cplusplus_comments)
2326                             || cpp_in_system_header (pfile)))
2327         {
2328           /* Warn about comments only if pedantically GNUC89, and not
2329              in system headers.  */
2330           if (CPP_OPTION (pfile, lang) == CLK_GNUC89 && CPP_PEDANTIC (pfile)
2331               && ! buffer->warned_cplusplus_comments)
2332             {
2333               cpp_error (pfile, CPP_DL_PEDWARN,
2334                          "C++ style comments are not allowed in ISO C90");
2335               cpp_error (pfile, CPP_DL_PEDWARN,
2336                          "(this will be reported only once per input file)");
2337               buffer->warned_cplusplus_comments = 1;
2338             }
2339
2340           if (skip_line_comment (pfile) && CPP_OPTION (pfile, warn_comments))
2341             cpp_warning (pfile, CPP_W_COMMENTS, "multi-line comment");
2342         }
2343       else if (c == '=')
2344         {
2345           buffer->cur++;
2346           result->type = CPP_DIV_EQ;
2347           break;
2348         }
2349       else
2350         {
2351           result->type = CPP_DIV;
2352           break;
2353         }
2354
           /* Only the two comment branches get here; the operator
              branches above have already left the switch.  A comment
              that is not saved counts as preceding whitespace.  */
2355       if (!pfile->state.save_comments)
2356         {
2357           result->flags |= PREV_WHITE;
2358           goto update_tokens_line;
2359         }
2360
2361       /* Save the comment as a token in its own right.  */
           /* C is still '*' or '/' here and is passed on as the
              comment's type.  */
2362       save_comment (pfile, result, comment_start, c);
2363       break;
2364
2365     case '<':
           /* Where angled header names are expected, try one first.
              lex_string sets the type to CPP_LESS for an unterminated
              header name; in that case the '<' is lexed as an
              operator below.  */
2366       if (pfile->state.angled_headers)
2367         {
2368           lex_string (pfile, result, buffer->cur - 1);
2369           if (result->type != CPP_LESS)
2370             break;
2371         }
2372
2373       result->type = CPP_LESS;
2374       if (*buffer->cur == '=')
2375         buffer->cur++, result->type = CPP_LESS_EQ;
2376       else if (*buffer->cur == '<')
2377         {
2378           buffer->cur++;
2379           IF_NEXT_IS ('=', CPP_LSHIFT_EQ, CPP_LSHIFT);
2380         }
2381       else if (CPP_OPTION (pfile, digraphs))
2382         {
2383           if (*buffer->cur == ':')
2384             {
2385               /* C++11 [2.5/3 lex.pptoken], "Otherwise, if the next
2386                  three characters are <:: and the subsequent character
2387                  is neither : nor >, the < is treated as a preprocessor
2388                  token by itself".  */
2389               if (CPP_OPTION (pfile, cplusplus)
2390                   && CPP_OPTION (pfile, lang) != CLK_CXX98
2391                   && CPP_OPTION (pfile, lang) != CLK_GNUCXX
2392                   && buffer->cur[1] == ':'
2393                   && buffer->cur[2] != ':' && buffer->cur[2] != '>')
2394                 break;
2395
2396               buffer->cur++;
2397               result->flags |= DIGRAPH;
2398               result->type = CPP_OPEN_SQUARE;
2399             }
2400           else if (*buffer->cur == '%')
2401             {
2402               buffer->cur++;
2403               result->flags |= DIGRAPH;
2404               result->type = CPP_OPEN_BRACE;
2405             }
2406         }
2407       break;
2408
2409     case '>':
2410       result->type = CPP_GREATER;
2411       if (*buffer->cur == '=')
2412         buffer->cur++, result->type = CPP_GREATER_EQ;
2413       else if (*buffer->cur == '>')
2414         {
2415           buffer->cur++;
2416           IF_NEXT_IS ('=', CPP_RSHIFT_EQ, CPP_RSHIFT);
2417         }
2418       break;
2419
2420     case '%':
2421       result->type = CPP_MOD;
2422       if (*buffer->cur == '=')
2423         buffer->cur++, result->type = CPP_MOD_EQ;
2424       else if (CPP_OPTION (pfile, digraphs))
2425         {
               /* "%:" is the digraph for '#', and "%:%:" the one for
                  "##".  */
2426           if (*buffer->cur == ':')
2427             {
2428               buffer->cur++;
2429               result->flags |= DIGRAPH;
2430               result->type = CPP_HASH;
2431               if (*buffer->cur == '%' && buffer->cur[1] == ':')
2432                 buffer->cur += 2, result->type = CPP_PASTE, result->val.token_no = 0;
2433             }
2434           else if (*buffer->cur == '>')
2435             {
2436               buffer->cur++;
2437               result->flags |= DIGRAPH;
2438               result->type = CPP_CLOSE_BRACE;
2439             }
2440         }
2441       break;
2442
2443     case '.':
2444       result->type = CPP_DOT;
           /* A '.' followed by a digit starts a number.  */
2445       if (ISDIGIT (*buffer->cur))
2446         {
2447           struct normalize_state nst = INITIAL_NORMALIZE_STATE;
2448           result->type = CPP_NUMBER;
2449           lex_number (pfile, &result->val.str, &nst);
2450           warn_about_normalization (pfile, result, &nst);
2451         }
2452       else if (*buffer->cur == '.' && buffer->cur[1] == '.')
2453         buffer->cur += 2, result->type = CPP_ELLIPSIS;
2454       else if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
2455         buffer->cur++, result->type = CPP_DOT_STAR;
2456       break;
2457
2458     case '+':
2459       result->type = CPP_PLUS;
2460       if (*buffer->cur == '+')
2461         buffer->cur++, result->type = CPP_PLUS_PLUS;
2462       else if (*buffer->cur == '=')
2463         buffer->cur++, result->type = CPP_PLUS_EQ;
2464       break;
2465
2466     case '-':
2467       result->type = CPP_MINUS;
2468       if (*buffer->cur == '>')
2469         {
2470           buffer->cur++;
2471           result->type = CPP_DEREF;
2472           if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
2473             buffer->cur++, result->type = CPP_DEREF_STAR;
2474         }
2475       else if (*buffer->cur == '-')
2476         buffer->cur++, result->type = CPP_MINUS_MINUS;
2477       else if (*buffer->cur == '=')
2478         buffer->cur++, result->type = CPP_MINUS_EQ;
2479       break;
2480
2481     case '&':
2482       result->type = CPP_AND;
2483       if (*buffer->cur == '&')
2484         buffer->cur++, result->type = CPP_AND_AND;
2485       else if (*buffer->cur == '=')
2486         buffer->cur++, result->type = CPP_AND_EQ;
2487       break;
2488
2489     case '|':
2490       result->type = CPP_OR;
2491       if (*buffer->cur == '|')
2492         buffer->cur++, result->type = CPP_OR_OR;
2493       else if (*buffer->cur == '=')
2494         buffer->cur++, result->type = CPP_OR_EQ;
2495       break;
2496
2497     case ':':
2498       result->type = CPP_COLON;
2499       if (*buffer->cur == ':' && CPP_OPTION (pfile, cplusplus))
2500         buffer->cur++, result->type = CPP_SCOPE;
2501       else if (*buffer->cur == '>' && CPP_OPTION (pfile, digraphs))
2502         {
2503           buffer->cur++;
2504           result->flags |= DIGRAPH;
2505           result->type = CPP_CLOSE_SQUARE;
2506         }
2507       break;
2508
2509     case '*': IF_NEXT_IS ('=', CPP_MULT_EQ, CPP_MULT); break;
2510     case '=': IF_NEXT_IS ('=', CPP_EQ_EQ, CPP_EQ); break;
2511     case '!': IF_NEXT_IS ('=', CPP_NOT_EQ, CPP_NOT); break;
2512     case '^': IF_NEXT_IS ('=', CPP_XOR_EQ, CPP_XOR); break;
2513     case '#': IF_NEXT_IS ('#', CPP_PASTE, CPP_HASH); result->val.token_no = 0; break;
2514
2515     case '?': result->type = CPP_QUERY; break;
2516     case '~': result->type = CPP_COMPL; break;
2517     case ',': result->type = CPP_COMMA; break;
2518     case '(': result->type = CPP_OPEN_PAREN; break;
2519     case ')': result->type = CPP_CLOSE_PAREN; break;
2520     case '[': result->type = CPP_OPEN_SQUARE; break;
2521     case ']': result->type = CPP_CLOSE_SQUARE; break;
2522     case '{': result->type = CPP_OPEN_BRACE; break;
2523     case '}': result->type = CPP_CLOSE_BRACE; break;
2524     case ';': result->type = CPP_SEMICOLON; break;
2525
2526       /* @ is a punctuator in Objective-C.  */
2527     case '@': result->type = CPP_ATSIGN; break;
2528
2529     case '$':
2530     case '\\':
2531       {
             /* Step back so that forms_identifier_p looks at the '$'
                or '\\' itself.  */
2532         const uchar *base = --buffer->cur;
2533         struct normalize_state nst = INITIAL_NORMALIZE_STATE;
2534
2535         if (forms_identifier_p (pfile, true, &nst))
2536           {
2537             result->type = CPP_NAME;
2538             result->val.node.node = lex_identifier (pfile, base, true, &nst);
2539             warn_about_normalization (pfile, result, &nst);
2540             break;
2541           }
             /* Not the start of an identifier: consume the character
                again and fall through, so that it is emitted as a
                one-character CPP_OTHER token.  */
2542         buffer->cur++;
2543       }
2544
2545     default:
2546       create_literal (pfile, result, buffer->cur - 1, 1, CPP_OTHER);
2547       break;
2548     }
2549
2550   return result;
2551 }
2552
2553 /* An upper bound on the number of bytes needed to spell TOKEN.
2554    Does not include preceding whitespace.  */
2555 unsigned int
2556 cpp_token_len (const cpp_token *token)
2557 {
2558   unsigned int len;
2559
2560   switch (TOKEN_SPELL (token))
2561     {
2562     default:            len = 6;                                break;
2563     case SPELL_LITERAL: len = token->val.str.len;               break;
2564     case SPELL_IDENT:   len = NODE_LEN (token->val.node.node) * 10;     break;
2565     }
2566
2567   return len;
2568 }
2569
/* Decode the UTF-8 sequence that starts at NAME and write the
   matching \U escape (backslash, 'U' and eight lowercase hex digits,
   so always 10 bytes) to BUFFER.  Returns the number of bytes of
   NAME that made up the sequence.  Aborts if a continuation byte is
   ill-formed.  */

static size_t
utf8_to_ucn (unsigned char *buffer, const unsigned char *name)
{
  static const char hex_digits[] = "0123456789abcdef";
  const unsigned char lead = *name;
  unsigned long code_point;
  unsigned probe = lead;
  int seq_len = 0;
  int k;
  int shift;

  /* The number of leading one bits in the first byte is the length
     of the sequence.  */
  while (probe & 0x80)
    {
      seq_len++;
      probe <<= 1;
    }

  /* The remaining low bits of the first byte are the top of the code
     point; each continuation byte contributes six more bits.  */
  code_point = lead & (0x7F >> seq_len);
  for (k = 1; k < seq_len; k++)
    {
      const unsigned char cont = name[k];

      code_point = (code_point << 6) | (cont & 0x3F);

      /* A continuation byte must look like 10xxxxxx; anything else
         is ill-formed UTF-8.  */
      if ((cont & 0xC0) != 0x80)
        abort ();
    }

  *buffer++ = '\\';
  *buffer++ = 'U';
  for (shift = 28; shift >= 0; shift -= 4)
    *buffer++ = hex_digits[(code_point >> shift) & 0xF];

  return seq_len;
}
2603
2604 /* Given a token TYPE corresponding to a digraph, return a pointer to
2605    the spelling of the digraph.  */
2606 static const unsigned char *
2607 cpp_digraph2name (enum cpp_ttype type)
2608 {
2609   return digraph_spellings[(int) type - (int) CPP_FIRST_DIGRAPH];
2610 }
2611
2612 /* Write the spelling of a token TOKEN to BUFFER.  The buffer must
2613    already contain the enough space to hold the token's spelling.
2614    Returns a pointer to the character after the last character written.
2615    FORSTRING is true if this is to be the spelling after translation
2616    phase 1 (this is different for UCNs).
2617    FIXME: Would be nice if we didn't need the PFILE argument.  */
2618 unsigned char *
2619 cpp_spell_token (cpp_reader *pfile, const cpp_token *token,
2620                  unsigned char *buffer, bool forstring)
2621 {
2622   switch (TOKEN_SPELL (token))
2623     {
2624     case SPELL_OPERATOR:
2625       {
2626         const unsigned char *spelling;
2627         unsigned char c;
2628
2629         if (token->flags & DIGRAPH)
2630           spelling = cpp_digraph2name (token->type);
2631         else if (token->flags & NAMED_OP)
2632           goto spell_ident;
2633         else
2634           spelling = TOKEN_NAME (token);
2635
2636         while ((c = *spelling++) != '\0')
2637           *buffer++ = c;
2638       }
2639       break;
2640
2641     spell_ident:
2642     case SPELL_IDENT:
2643       if (forstring)
2644         {
2645           memcpy (buffer, NODE_NAME (token->val.node.node),
2646                   NODE_LEN (token->val.node.node));
2647           buffer += NODE_LEN (token->val.node.node);
2648         }
2649       else
2650         {
2651           size_t i;
2652           const unsigned char * name = NODE_NAME (token->val.node.node);
2653
2654           for (i = 0; i < NODE_LEN (token->val.node.node); i++)
2655             if (name[i] & ~0x7F)
2656               {
2657                 i += utf8_to_ucn (buffer, name + i) - 1;
2658                 buffer += 10;
2659               }
2660             else
2661               *buffer++ = NODE_NAME (token->val.node.node)[i];
2662         }
2663       break;
2664
2665     case SPELL_LITERAL:
2666       memcpy (buffer, token->val.str.text, token->val.str.len);
2667       buffer += token->val.str.len;
2668       break;
2669
2670     case SPELL_NONE:
2671       cpp_error (pfile, CPP_DL_ICE,
2672                  "unspellable token %s", TOKEN_NAME (token));
2673       break;
2674     }
2675
2676   return buffer;
2677 }
2678
2679 /* Returns TOKEN spelt as a null-terminated string.  The string is
2680    freed when the reader is destroyed.  Useful for diagnostics.  */
2681 unsigned char *
2682 cpp_token_as_text (cpp_reader *pfile, const cpp_token *token)
2683 {
2684   unsigned int len = cpp_token_len (token) + 1;
2685   unsigned char *start = _cpp_unaligned_alloc (pfile, len), *end;
2686
2687   end = cpp_spell_token (pfile, token, start, false);
2688   end[0] = '\0';
2689
2690   return start;
2691 }
2692
2693 /* Returns a pointer to a string which spells the token defined by
2694    TYPE and FLAGS.  Used by C front ends, which really should move to
2695    using cpp_token_as_text.  */
2696 const char *
2697 cpp_type2name (enum cpp_ttype type, unsigned char flags)
2698 {
2699   if (flags & DIGRAPH)
2700     return (const char *) cpp_digraph2name (type);
2701   else if (flags & NAMED_OP)
2702     return cpp_named_operator2name (type);
2703
2704   return (const char *) token_spellings[type].name;
2705 }
2706
2707 /* Writes the spelling of token to FP, without any preceding space.
2708    Separated from cpp_spell_token for efficiency - to avoid stdio
2709    double-buffering.  */
2710 void
2711 cpp_output_token (const cpp_token *token, FILE *fp)
2712 {
2713   switch (TOKEN_SPELL (token))
2714     {
2715     case SPELL_OPERATOR:
2716       {
2717         const unsigned char *spelling;
2718         int c;
2719
2720         if (token->flags & DIGRAPH)
2721           spelling = cpp_digraph2name (token->type);
2722         else if (token->flags & NAMED_OP)
2723           goto spell_ident;
2724         else
2725           spelling = TOKEN_NAME (token);
2726
2727         c = *spelling;
2728         do
2729           putc (c, fp);
2730         while ((c = *++spelling) != '\0');
2731       }
2732       break;
2733
2734     spell_ident:
2735     case SPELL_IDENT:
2736       {
2737         size_t i;
2738         const unsigned char * name = NODE_NAME (token->val.node.node);
2739
2740         for (i = 0; i < NODE_LEN (token->val.node.node); i++)
2741           if (name[i] & ~0x7F)
2742             {
2743               unsigned char buffer[10];
2744               i += utf8_to_ucn (buffer, name + i) - 1;
2745               fwrite (buffer, 1, 10, fp);
2746             }
2747           else
2748             fputc (NODE_NAME (token->val.node.node)[i], fp);
2749       }
2750       break;
2751
2752     case SPELL_LITERAL:
2753       fwrite (token->val.str.text, 1, token->val.str.len, fp);
2754       break;
2755
2756     case SPELL_NONE:
2757       /* An error, most probably.  */
2758       break;
2759     }
2760 }
2761
2762 /* Compare two tokens.  */
2763 int
2764 _cpp_equiv_tokens (const cpp_token *a, const cpp_token *b)
2765 {
2766   if (a->type == b->type && a->flags == b->flags)
2767     switch (TOKEN_SPELL (a))
2768       {
2769       default:                  /* Keep compiler happy.  */
2770       case SPELL_OPERATOR:
2771         /* token_no is used to track where multiple consecutive ##
2772            tokens were originally located.  */
2773         return (a->type != CPP_PASTE || a->val.token_no == b->val.token_no);
2774       case SPELL_NONE:
2775         return (a->type != CPP_MACRO_ARG
2776                 || a->val.macro_arg.arg_no == b->val.macro_arg.arg_no);
2777       case SPELL_IDENT:
2778         return a->val.node.node == b->val.node.node;
2779       case SPELL_LITERAL:
2780         return (a->val.str.len == b->val.str.len
2781                 && !memcmp (a->val.str.text, b->val.str.text,
2782                             a->val.str.len));
2783       }
2784
2785   return 0;
2786 }
2787
2788 /* Returns nonzero if a space should be inserted to avoid an
2789    accidental token paste for output.  For simplicity, it is
2790    conservative, and occasionally advises a space where one is not
2791    needed, e.g. "." and ".2".  */
2792 int
2793 cpp_avoid_paste (cpp_reader *pfile, const cpp_token *token1,
2794                  const cpp_token *token2)
2795 {
2796   enum cpp_ttype a = token1->type, b = token2->type;
2797   cppchar_t c;
2798
2799   if (token1->flags & NAMED_OP)
2800     a = CPP_NAME;
2801   if (token2->flags & NAMED_OP)
2802     b = CPP_NAME;
2803
2804   c = EOF;
2805   if (token2->flags & DIGRAPH)
2806     c = digraph_spellings[(int) b - (int) CPP_FIRST_DIGRAPH][0];
2807   else if (token_spellings[b].category == SPELL_OPERATOR)
2808     c = token_spellings[b].name[0];
2809
2810   /* Quickly get everything that can paste with an '='.  */
2811   if ((int) a <= (int) CPP_LAST_EQ && c == '=')
2812     return 1;
2813
2814   switch (a)
2815     {
2816     case CPP_GREATER:   return c == '>';
2817     case CPP_LESS:      return c == '<' || c == '%' || c == ':';
2818     case CPP_PLUS:      return c == '+';
2819     case CPP_MINUS:     return c == '-' || c == '>';
2820     case CPP_DIV:       return c == '/' || c == '*'; /* Comments.  */
2821     case CPP_MOD:       return c == ':' || c == '>';
2822     case CPP_AND:       return c == '&';
2823     case CPP_OR:        return c == '|';
2824     case CPP_COLON:     return c == ':' || c == '>';
2825     case CPP_DEREF:     return c == '*';
2826     case CPP_DOT:       return c == '.' || c == '%' || b == CPP_NUMBER;
2827     case CPP_HASH:      return c == '#' || c == '%'; /* Digraph form.  */
2828     case CPP_NAME:      return ((b == CPP_NUMBER
2829                                  && name_p (pfile, &token2->val.str))
2830                                 || b == CPP_NAME
2831                                 || b == CPP_CHAR || b == CPP_STRING); /* L */
2832     case CPP_NUMBER:    return (b == CPP_NUMBER || b == CPP_NAME
2833                                 || c == '.' || c == '+' || c == '-');
2834                                       /* UCNs */
2835     case CPP_OTHER:     return ((token1->val.str.text[0] == '\\'
2836                                  && b == CPP_NAME)
2837                                 || (CPP_OPTION (pfile, objc)
2838                                     && token1->val.str.text[0] == '@'
2839                                     && (b == CPP_NAME || b == CPP_STRING)));
2840     case CPP_STRING:
2841     case CPP_WSTRING:
2842     case CPP_UTF8STRING:
2843     case CPP_STRING16:
2844     case CPP_STRING32:  return (CPP_OPTION (pfile, user_literals)
2845                                 && (b == CPP_NAME
2846                                     || (TOKEN_SPELL (token2) == SPELL_LITERAL
2847                                         && ISIDST (token2->val.str.text[0]))));
2848
2849     default:            break;
2850     }
2851
2852   return 0;
2853 }
2854
2855 /* Output all the remaining tokens on the current line, and a newline
2856    character, to FP.  Leading whitespace is removed.  If there are
2857    macros, special token padding is not performed.  */
2858 void
2859 cpp_output_line (cpp_reader *pfile, FILE *fp)
2860 {
2861   const cpp_token *token;
2862
2863   token = cpp_get_token (pfile);
2864   while (token->type != CPP_EOF)
2865     {
2866       cpp_output_token (token, fp);
2867       token = cpp_get_token (pfile);
2868       if (token->flags & PREV_WHITE)
2869         putc (' ', fp);
2870     }
2871
2872   putc ('\n', fp);
2873 }
2874
2875 /* Return a string representation of all the remaining tokens on the
2876    current line.  The result is allocated using xmalloc and must be
2877    freed by the caller.  */
2878 unsigned char *
2879 cpp_output_line_to_string (cpp_reader *pfile, const unsigned char *dir_name)
2880 {
2881   const cpp_token *token;
2882   unsigned int out = dir_name ? ustrlen (dir_name) : 0;
2883   unsigned int alloced = 120 + out;
2884   unsigned char *result = (unsigned char *) xmalloc (alloced);
2885
2886   /* If DIR_NAME is empty, there are no initial contents.  */
2887   if (dir_name)
2888     {
2889       sprintf ((char *) result, "#%s ", dir_name);
2890       out += 2;
2891     }
2892
2893   token = cpp_get_token (pfile);
2894   while (token->type != CPP_EOF)
2895     {
2896       unsigned char *last;
2897       /* Include room for a possible space and the terminating nul.  */
2898       unsigned int len = cpp_token_len (token) + 2;
2899
2900       if (out + len > alloced)
2901         {
2902           alloced *= 2;
2903           if (out + len > alloced)
2904             alloced = out + len;
2905           result = (unsigned char *) xrealloc (result, alloced);
2906         }
2907
2908       last = cpp_spell_token (pfile, token, &result[out], 0);
2909       out = last - result;
2910
2911       token = cpp_get_token (pfile);
2912       if (token->flags & PREV_WHITE)
2913         result[out++] = ' ';
2914     }
2915
2916   result[out] = '\0';
2917   return result;
2918 }
2919
/* Memory buffers.  Changing these three constants can have a dramatic
   effect on performance.  The values here are reasonable defaults,
   but might be tuned.  If you adjust them, be sure to test across a
   range of uses of cpplib, including heavy nested function-like macro
   expansion.  Also check the change in peak memory usage (NJAMD is a
   good tool for this).

   MIN_BUFF_SIZE is the smallest size new_buff will allocate.
   BUFF_SIZE_UPPER_BOUND (MIN_SIZE) is the largest free buffer that
   _cpp_get_buff will hand out for a request of MIN_SIZE bytes.
   EXTENDED_BUFF_SIZE (BUFF, MIN_EXTRA) is the size requested when
   extending BUFF: MIN_EXTRA plus twice the room left in BUFF.  Every
   macro parameter is parenthesized so that an argument containing a
   low-precedence operator expands as a single operand.  */
#define MIN_BUFF_SIZE 8000
#define BUFF_SIZE_UPPER_BOUND(MIN_SIZE) (MIN_BUFF_SIZE + (MIN_SIZE) * 3 / 2)
#define EXTENDED_BUFF_SIZE(BUFF, MIN_EXTRA) \
        ((MIN_EXTRA) + ((BUFF)->limit - (BUFF)->cur) * 2)

#if MIN_BUFF_SIZE > BUFF_SIZE_UPPER_BOUND (0)
  #error BUFF_SIZE_UPPER_BOUND must be at least as large as MIN_BUFF_SIZE!
#endif
2934
2935 /* Create a new allocation buffer.  Place the control block at the end
2936    of the buffer, so that buffer overflows will cause immediate chaos.  */
2937 static _cpp_buff *
2938 new_buff (size_t len)
2939 {
2940   _cpp_buff *result;
2941   unsigned char *base;
2942
2943   if (len < MIN_BUFF_SIZE)
2944     len = MIN_BUFF_SIZE;
2945   len = CPP_ALIGN (len);
2946
2947 #ifdef ENABLE_VALGRIND_CHECKING
2948   /* Valgrind warns about uses of interior pointers, so put _cpp_buff
2949      struct first.  */
2950   size_t slen = CPP_ALIGN2 (sizeof (_cpp_buff), 2 * DEFAULT_ALIGNMENT);
2951   base = XNEWVEC (unsigned char, len + slen);
2952   result = (_cpp_buff *) base;
2953   base += slen;
2954 #else
2955   base = XNEWVEC (unsigned char, len + sizeof (_cpp_buff));
2956   result = (_cpp_buff *) (base + len);
2957 #endif
2958   result->base = base;
2959   result->cur = base;
2960   result->limit = base + len;
2961   result->next = NULL;
2962   return result;
2963 }
2964
2965 /* Place a chain of unwanted allocation buffers on the free list.  */
2966 void
2967 _cpp_release_buff (cpp_reader *pfile, _cpp_buff *buff)
2968 {
2969   _cpp_buff *end = buff;
2970
2971   while (end->next)
2972     end = end->next;
2973   end->next = pfile->free_buffs;
2974   pfile->free_buffs = buff;
2975 }
2976
2977 /* Return a free buffer of size at least MIN_SIZE.  */
2978 _cpp_buff *
2979 _cpp_get_buff (cpp_reader *pfile, size_t min_size)
2980 {
2981   _cpp_buff *result, **p;
2982
2983   for (p = &pfile->free_buffs;; p = &(*p)->next)
2984     {
2985       size_t size;
2986
2987       if (*p == NULL)
2988         return new_buff (min_size);
2989       result = *p;
2990       size = result->limit - result->base;
2991       /* Return a buffer that's big enough, but don't waste one that's
2992          way too big.  */
2993       if (size >= min_size && size <= BUFF_SIZE_UPPER_BOUND (min_size))
2994         break;
2995     }
2996
2997   *p = result->next;
2998   result->next = NULL;
2999   result->cur = result->base;
3000   return result;
3001 }
3002
3003 /* Creates a new buffer with enough space to hold the uncommitted
3004    remaining bytes of BUFF, and at least MIN_EXTRA more bytes.  Copies
3005    the excess bytes to the new buffer.  Chains the new buffer after
3006    BUFF, and returns the new buffer.  */
3007 _cpp_buff *
3008 _cpp_append_extend_buff (cpp_reader *pfile, _cpp_buff *buff, size_t min_extra)
3009 {
3010   size_t size = EXTENDED_BUFF_SIZE (buff, min_extra);
3011   _cpp_buff *new_buff = _cpp_get_buff (pfile, size);
3012
3013   buff->next = new_buff;
3014   memcpy (new_buff->base, buff->cur, BUFF_ROOM (buff));
3015   return new_buff;
3016 }
3017
3018 /* Creates a new buffer with enough space to hold the uncommitted
3019    remaining bytes of the buffer pointed to by BUFF, and at least
3020    MIN_EXTRA more bytes.  Copies the excess bytes to the new buffer.
3021    Chains the new buffer before the buffer pointed to by BUFF, and
3022    updates the pointer to point to the new buffer.  */
3023 void
3024 _cpp_extend_buff (cpp_reader *pfile, _cpp_buff **pbuff, size_t min_extra)
3025 {
3026   _cpp_buff *new_buff, *old_buff = *pbuff;
3027   size_t size = EXTENDED_BUFF_SIZE (old_buff, min_extra);
3028
3029   new_buff = _cpp_get_buff (pfile, size);
3030   memcpy (new_buff->base, old_buff->cur, BUFF_ROOM (old_buff));
3031   new_buff->next = old_buff;
3032   *pbuff = new_buff;
3033 }
3034
3035 /* Free a chain of buffers starting at BUFF.  */
3036 void
3037 _cpp_free_buff (_cpp_buff *buff)
3038 {
3039   _cpp_buff *next;
3040
3041   for (; buff; buff = next)
3042     {
3043       next = buff->next;
3044 #ifdef ENABLE_VALGRIND_CHECKING
3045       free (buff);
3046 #else
3047       free (buff->base);
3048 #endif
3049     }
3050 }
3051
3052 /* Allocate permanent, unaligned storage of length LEN.  */
3053 unsigned char *
3054 _cpp_unaligned_alloc (cpp_reader *pfile, size_t len)
3055 {
3056   _cpp_buff *buff = pfile->u_buff;
3057   unsigned char *result = buff->cur;
3058
3059   if (len > (size_t) (buff->limit - result))
3060     {
3061       buff = _cpp_get_buff (pfile, len);
3062       buff->next = pfile->u_buff;
3063       pfile->u_buff = buff;
3064       result = buff->cur;
3065     }
3066
3067   buff->cur = result + len;
3068   return result;
3069 }
3070
3071 /* Allocate permanent, unaligned storage of length LEN from a_buff.
3072    That buffer is used for growing allocations when saving macro
3073    replacement lists in a #define, and when parsing an answer to an
3074    assertion in #assert, #unassert or #if (and therefore possibly
3075    whilst expanding macros).  It therefore must not be used by any
3076    code that they might call: specifically the lexer and the guts of
3077    the macro expander.
3078
3079    All existing other uses clearly fit this restriction: storing
3080    registered pragmas during initialization.  */
3081 unsigned char *
3082 _cpp_aligned_alloc (cpp_reader *pfile, size_t len)
3083 {
3084   _cpp_buff *buff = pfile->a_buff;
3085   unsigned char *result = buff->cur;
3086
3087   if (len > (size_t) (buff->limit - result))
3088     {
3089       buff = _cpp_get_buff (pfile, len);
3090       buff->next = pfile->a_buff;
3091       pfile->a_buff = buff;
3092       result = buff->cur;
3093     }
3094
3095   buff->cur = result + len;
3096   return result;
3097 }
3098
3099 /* Say which field of TOK is in use.  */
3100
3101 enum cpp_token_fld_kind
3102 cpp_token_val_index (const cpp_token *tok)
3103 {
3104   switch (TOKEN_SPELL (tok))
3105     {
3106     case SPELL_IDENT:
3107       return CPP_TOKEN_FLD_NODE;
3108     case SPELL_LITERAL:
3109       return CPP_TOKEN_FLD_STR;
3110     case SPELL_OPERATOR:
3111       if (tok->type == CPP_PASTE)
3112         return CPP_TOKEN_FLD_TOKEN_NO;
3113       else
3114         return CPP_TOKEN_FLD_NONE;
3115     case SPELL_NONE:
3116       if (tok->type == CPP_MACRO_ARG)
3117         return CPP_TOKEN_FLD_ARG_NO;
3118       else if (tok->type == CPP_PADDING)
3119         return CPP_TOKEN_FLD_SOURCE;
3120       else if (tok->type == CPP_PRAGMA)
3121         return CPP_TOKEN_FLD_PRAGMA;
3122       /* else fall through */
3123     default:
3124       return CPP_TOKEN_FLD_NONE;
3125     }
3126 }
3127
3128 /* All tokens lexed in R after calling this function will be forced to have
3129    their source_location the same as the location referenced by P, until
3130    cpp_stop_forcing_token_locations is called for R.  */
3131
3132 void
3133 cpp_force_token_locations (cpp_reader *r, source_location *p)
3134 {
3135   r->forced_token_location_p = p;
3136 }
3137
3138 /* Go back to assigning locations naturally for lexed tokens.  */
3139
3140 void
3141 cpp_stop_forcing_token_locations (cpp_reader *r)
3142 {
3143   r->forced_token_location_p = NULL;
3144 }