libcpp/lex.c

   1 /* CPP Library - lexical analysis.
   2    Copyright (C) 2000-2014 Free Software Foundation, Inc.
   3    Contributed by Per Bothner, 1994-95.
   4    Based on CCCP program by Paul Rubin, June 1986
   5    Adapted to ANSI C, Richard Stallman, Jan 1987
   6    Broken out to separate file, Zack Weinberg, Mar 2000
   7
   8 This program is free software; you can redistribute it and/or modify it
   9 under the terms of the GNU General Public License as published by the
  10 Free Software Foundation; either version 3, or (at your option) any
  11 later version.
  12
  13 This program is distributed in the hope that it will be useful,
  14 but WITHOUT ANY WARRANTY; without even the implied warranty of
  15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  16 GNU General Public License for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with this program; see the file COPYING3.  If not see
  20 <http://www.gnu.org/licenses/>.  */
  21
  22 #include "config.h"
  23 #include "system.h"
  24 #include "cpplib.h"
  25 #include "internal.h"
  26
  27 enum spell_type
  28 {
  29   SPELL_OPERATOR = 0,
  30   SPELL_IDENT,
  31   SPELL_LITERAL,
  32   SPELL_NONE
  33 };
  34
   /* One row of the token_spellings table: the spelling category of a
      token type and the string associated with it.  */
   35 struct token_spelling
   36 {
   37   enum spell_type category;
   38   const unsigned char *name;
   39 };
  40
   /* Spellings of the six digraph tokens.  */
   41 static const unsigned char *const digraph_spellings[] =
   42 { UC"%:", UC"%:%:", UC"<:", UC":>", UC"<%", UC"%>" };
   43
   /* Build token_spellings by expanding TTYPE_TABLE with temporary
      definitions of OP and TK.  An OP(e, s) entry gets category
      SPELL_OPERATOR and the string S; a TK(e, s) entry gets category
      SPELL_<s> and the stringified enumerator name E.  Both helper macros
      are undefined again once the table has been built.  */
   44 #define OP(e, s) { SPELL_OPERATOR, UC s  },
   45 #define TK(e, s) { SPELL_ ## s,    UC #e },
   46 static const struct token_spelling token_spellings[N_TTYPES] = { TTYPE_TABLE };
   47 #undef OP
   48 #undef TK
   49
   /* Accessors that look up the table row for TOKEN's type.  TOKEN must
      be a pointer; it is parenthesized but evaluated once per use.  */
   50 #define TOKEN_SPELL(token) (token_spellings[(token)->type].category)
   51 #define TOKEN_NAME(token) (token_spellings[(token)->type].name)
  52
   /* Prototypes for static (file-local) helper functions, declared ahead
      of their first use.  */
   53 static void add_line_note (cpp_buffer *, const uchar *, unsigned int);
   54 static int skip_line_comment (cpp_reader *);
   55 static void skip_whitespace (cpp_reader *, cppchar_t);
   56 static void lex_string (cpp_reader *, cpp_token *, const uchar *);
   57 static void save_comment (cpp_reader *, cpp_token *, const uchar *, cppchar_t);
   58 static void store_comment (cpp_reader *, cpp_token *);
   59 static void create_literal (cpp_reader *, cpp_token *, const uchar *,
   60                             unsigned int, enum cpp_ttype);
   61 static bool warn_in_comment (cpp_reader *, _cpp_line_note *);
   62 static int name_p (cpp_reader *, const cpp_string *);
   63 static tokenrun *next_tokenrun (tokenrun *);
   64
   65 static _cpp_buff *new_buff (size_t);
  66
  67
  68 /* Utility routine:
  69
  70    Compares, the token TOKEN to the NUL-terminated string STRING.
  71    TOKEN must be a CPP_NAME.  Returns 1 for equal, 0 for unequal.  */
  72 int
  73 cpp_ideq (const cpp_token *token, const char *string)
  74 {
  75   if (token->type != CPP_NAME)
  76     return 0;
  77
  78   return !ustrcmp (NODE_NAME (token->val.node.node), (const uchar *) string);
  79 }
  80
  81 /* Record a note TYPE at byte POS into the current cleaned logical
  82    line.  */
  83 static void
  84 add_line_note (cpp_buffer *buffer, const uchar *pos, unsigned int type)
  85 {
  86   if (buffer->notes_used == buffer->notes_cap)
  87     {
  88       buffer->notes_cap = buffer->notes_cap * 2 + 200;
  89       buffer->notes = XRESIZEVEC (_cpp_line_note, buffer->notes,
  90                                   buffer->notes_cap);
  91     }
  92
  93   buffer->notes[buffer->notes_used].pos = pos;
  94   buffer->notes[buffer->notes_used].type = type;
  95   buffer->notes_used++;
  96 }
  97
  98 \f
  99 /* Fast path to find line special characters using optimized character
 100    scanning algorithms.  Anything complicated falls back to the slow
 101    path below.  Since this loop is very hot it's worth doing these kinds
 102    of optimizations.
 103
 104    One of the paths through the ifdefs should provide
 105
 106      const uchar *search_line_fast (const uchar *s, const uchar *end);
 107
 108    Between S and END, search for \n, \r, \\, ?.  Return a pointer to
 109    the found character.
 110
 111    Note that the last character of the buffer is *always* a newline,
 112    as forced by _cpp_convert_input.  This fact can be used to avoid
 113    explicitly looking for the end of the buffer.  */
 114
  115 /* Configure gives us an ifdef test.  Give the macro the value 0 when
         it is not defined, so the code below can test it in ordinary `if'
         conditions and in #if expressions.  */
  116 #ifndef WORDS_BIGENDIAN
  117 #define WORDS_BIGENDIAN 0
  118 #endif
  119
  120 /* We'd like the largest integer that fits into a register.  There's nothing
  121    in <stdint.h> that gives us that.  For most hosts this is unsigned long,
  122    but MS decided on an LLP64 model.  Thankfully when building with GCC we
  123    can get the "real" word size.  */
  124 #ifdef __GNUC__
  125 typedef unsigned int word_type __attribute__((__mode__(__word__)));
  126 #else
  127 typedef unsigned long word_type;
  128 #endif
  129
  130 /* The code below is only expecting sizes 4 or 8.
  131    Die at compile-time if this expectation is violated: the array
         bound evaluates to 1 when the size is acceptable and to -1, an
         invalid array size, otherwise.  */
  132 typedef char check_word_type_size
  133   [(sizeof(word_type) == 8 || sizeof(word_type) == 4) * 2 - 1];
 134
 135 /* Return X with the first N bytes forced to values that won't match one
 136    of the interesting characters.  Note that NUL is not interesting.  */
 137
 138 static inline word_type
 139 acc_char_mask_misalign (word_type val, unsigned int n)
 140 {
 141   word_type mask = -1;
 142   if (WORDS_BIGENDIAN)
 143     mask >>= n * 8;
 144   else
 145     mask <<= n * 8;
 146   return val & mask;
 147 }
 148
 149 /* Return X replicated to all byte positions within WORD_TYPE.  */
 150
 151 static inline word_type
 152 acc_char_replicate (uchar x)
 153 {
 154   word_type ret;
 155
 156   ret = (x << 24) | (x << 16) | (x << 8) | x;
 157   if (sizeof(word_type) == 8)
 158     ret = (ret << 16 << 16) | ret;
 159   return ret;
 160 }
 161
  162 /* Return non-zero if some byte of VAL is (probably) C.  C is expected
         to hold the wanted character in every byte position; the caller in
         this file passes values built by acc_char_replicate.  On the
         generic path a non-zero result may be a false positive, which is
         why callers confirm the hit with acc_char_index.  */
  163
  164 static inline word_type
  165 acc_char_cmp (word_type val, word_type c)
  166 {
  167 #if defined(__GNUC__) && defined(__alpha__)
  168   /* We can get exact results using a compare-bytes instruction.
  169      Get (val == c) via (0 >= (val ^ c)).  */
  170   return __builtin_alpha_cmpbge (0, val ^ c);
  171 #else
        /* MAGIC ends up as 0x7efefeff for a 32-bit word and
           0x7efefefefefefeff for a 64-bit word.  The 64-bit value is built
           with two 16-bit shifts so the expression stays valid when
           word_type is only 32 bits wide.  */
  172   word_type magic = 0x7efefefeU;
  173   if (sizeof(word_type) == 8)
  174     magic = (magic << 16 << 16) | 0xfefefefeU;
  175   magic |= 1;
  176
        /* After the XOR, every byte of VAL that equalled the corresponding
           byte of C is zero.  The result below keeps only the bit positions
           that are clear in MAGIC.  */
  177   val ^= c;
  178   return ((val + magic) ^ ~val) & ~magic;
  179 #endif
  180 }
 181
 182 /* Given the result of acc_char_cmp is non-zero, return the index of
 183    the found character.  If this was a false positive, return -1.  */
 184
 185 static inline int
 186 acc_char_index (word_type cmp ATTRIBUTE_UNUSED,
 187                 word_type val ATTRIBUTE_UNUSED)
 188 {
 189 #if defined(__GNUC__) && defined(__alpha__) && !WORDS_BIGENDIAN
 190   /* The cmpbge instruction sets *bits* of the result corresponding to
 191      matches in the bytes with no false positives.  */
 192   return __builtin_ctzl (cmp);
 193 #else
 194   unsigned int i;
 195
 196   /* ??? It would be nice to force unrolling here,
 197      and have all of these constants folded.  */
 198   for (i = 0; i < sizeof(word_type); ++i)
 199     {
 200       uchar c;
 201       if (WORDS_BIGENDIAN)
 202         c = (val >> (sizeof(word_type) - i - 1) * 8) & 0xff;
 203       else
 204         c = (val >> i * 8) & 0xff;
 205
 206       if (c == '\n' || c == '\r' || c == '\\' || c == '?')
 207         return i;
 208     }
 209
 210   return -1;
 211 #endif
 212 }
 213
 214 /* A version of the fast scanner using bit fiddling techniques.
 215
 216    For 32-bit words, one would normally perform 16 comparisons and
 217    16 branches.  With this algorithm one performs 24 arithmetic
 218    operations and one branch.  Whether this is faster with a 32-bit
 219    word size is going to be somewhat system dependent.
 220
 221    For 64-bit words, we eliminate twice the number of comparisons
 222    and branches without increasing the number of arithmetic operations.
 223    It's almost certainly going to be a win with 64-bit word size.  */
 224
 225 static const uchar * search_line_acc_char (const uchar *, const uchar *)
 226   ATTRIBUTE_UNUSED;
 227
 228 static const uchar *
 229 search_line_acc_char (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 230 {
 231   const word_type repl_nl = acc_char_replicate ('\n');
 232   const word_type repl_cr = acc_char_replicate ('\r');
 233   const word_type repl_bs = acc_char_replicate ('\\');
 234   const word_type repl_qm = acc_char_replicate ('?');
 235
 236   unsigned int misalign;
 237   const word_type *p;
 238   word_type val, t;
 239
 240   /* Align the buffer.  Mask out any bytes from before the beginning.  */
 241   p = (word_type *)((uintptr_t)s & -sizeof(word_type));
 242   val = *p;
 243   misalign = (uintptr_t)s & (sizeof(word_type) - 1);
 244   if (misalign)
 245     val = acc_char_mask_misalign (val, misalign);
 246
 247   /* Main loop.  */
 248   while (1)
 249     {
 250       t  = acc_char_cmp (val, repl_nl);
 251       t |= acc_char_cmp (val, repl_cr);
 252       t |= acc_char_cmp (val, repl_bs);
 253       t |= acc_char_cmp (val, repl_qm);
 254
 255       if (__builtin_expect (t != 0, 0))
 256         {
 257           int i = acc_char_index (t, val);
 258           if (i >= 0)
 259             return (const uchar *)p + i;
 260         }
 261
 262       val = *++p;
 263     }
 264 }
 265
 266 /* Disable on Solaris 2/x86 until the following problems can be properly
 267    autoconfed:
 268
 269    The Solaris 9 assembler cannot assemble SSE4.2 insns.
 270    Before Solaris 9 Update 6, SSE insns cannot be executed.
 271    The Solaris 10+ assembler tags objects with the instruction set
 272    extensions used, so SSE4.2 executables cannot run on machines that
 273    don't support that extension.  */
 274
 275 #if (GCC_VERSION >= 4005) && (defined(__i386__) || defined(__x86_64__)) && !(defined(__sun__) && defined(__svr4__))
 276
  277 /* Replicated character data to be shared between implementations.
  278    Recall that outside of a context with vector support we can't
  279    define compatible vector types, therefore these are all defined
  280    in terms of raw characters.
         The rows hold, in order, '\n', '\r', '\\' and '?', each repeated
         16 times.  The MMX scanner loads the first 8 bytes of a row and
         the SSE2 scanner loads all 16.  */
  281 static const char repl_chars[4][16] __attribute__((aligned(16))) = {
  282   { '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
  283     '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n' },
  284   { '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
  285     '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r' },
  286   { '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
  287     '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\' },
  288   { '?', '?', '?', '?', '?', '?', '?', '?',
  289     '?', '?', '?', '?', '?', '?', '?', '?' },
  290 };
 291
  292 /* A version of the fast scanner using MMX vectorized byte compare insns.
  293
  294    This uses the PMOVMSKB instruction which was introduced with "MMX2",
  295    which was packaged into SSE1; it is also present in the AMD MMX
  296    extension.  Mark the function as using "sse" so that we emit a real
  297    "emms" instruction, rather than the 3dNOW "femms" instruction.

         Returns a pointer to the first \n, \r, \\ or ? at or after S.  END
         is unused; the loop only terminates by finding a match.  */
  298
  299 static const uchar *
  300 #ifndef __SSE__
  301 __attribute__((__target__("sse")))
  302 #endif
  303 search_line_mmx (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
  304 {
  305   typedef char v8qi __attribute__ ((__vector_size__ (8)));
  306   typedef int __m64 __attribute__ ((__vector_size__ (8), __may_alias__));
  307
        /* Only the first 8 bytes of each 16-byte repl_chars row are used.  */
  308   const v8qi repl_nl = *(const v8qi *)repl_chars[0];
  309   const v8qi repl_cr = *(const v8qi *)repl_chars[1];
  310   const v8qi repl_bs = *(const v8qi *)repl_chars[2];
  311   const v8qi repl_qm = *(const v8qi *)repl_chars[3];
  312
  313   unsigned int misalign, found, mask;
  314   const v8qi *p;
  315   v8qi data, t, c;
  316
  317   /* Align the source pointer.  While MMX doesn't generate unaligned data
  318      faults, this allows us to safely scan to the end of the buffer without
  319      reading beyond the end of the last page.  */
  320   misalign = (uintptr_t)s & 7;
  321   p = (const v8qi *)((uintptr_t)s & -8);
  322   data = *p;
  323
  324   /* Create a mask for the bytes that are valid within the first
  325      8-byte block, i.e. one that clears the bits for the bytes that
         precede S.  The Idea here is that the AND with the mask
  326      within the loop is "free", since we need some AND or TEST
  327      insn in order to set the flags for the branch anyway.  */
  328   mask = -1u << misalign;
  329
  330   /* Main loop processing 8 bytes at a time.  The first block has
         already been loaded, so jump past the load at the top of the
         loop body.  */
  331   goto start;
  332   do
  333     {
  334       data = *++p;
  335       mask = -1;
  336
  337     start:
            /* OR together the byte-wise equality results for the four
               characters, then collapse them into one bit per byte.  */
  338       t = __builtin_ia32_pcmpeqb(data, repl_nl);
  339       c = __builtin_ia32_pcmpeqb(data, repl_cr);
  340       t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
  341       c = __builtin_ia32_pcmpeqb(data, repl_bs);
  342       t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
  343       c = __builtin_ia32_pcmpeqb(data, repl_qm);
  344       t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
  345       found = __builtin_ia32_pmovmskb (t);
  346       found &= mask;
  347     }
  348   while (!found);
  349
        /* Leave MMX state before returning to ordinary code.  */
  350   __builtin_ia32_emms ();
  351
  352   /* FOUND contains 1 in bits for which we matched a relevant
  353      character.  Conversion to the byte index is trivial.  */
  354   found = __builtin_ctz(found);
  355   return (const uchar *)p + found;
  356 }
 357
  358 /* A version of the fast scanner using SSE2 vectorized byte compare insns.
         Returns a pointer to the first \n, \r, \\ or ? at or after S.  END
         is unused; the loop only terminates by finding a match.  */
  359
  360 static const uchar *
  361 #ifndef __SSE2__
  362 __attribute__((__target__("sse2")))
  363 #endif
  364 search_line_sse2 (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
  365 {
  366   typedef char v16qi __attribute__ ((__vector_size__ (16)));
  367
  368   const v16qi repl_nl = *(const v16qi *)repl_chars[0];
  369   const v16qi repl_cr = *(const v16qi *)repl_chars[1];
  370   const v16qi repl_bs = *(const v16qi *)repl_chars[2];
  371   const v16qi repl_qm = *(const v16qi *)repl_chars[3];
  372
  373   unsigned int misalign, found, mask;
  374   const v16qi *p;
  375   v16qi data, t;
  376
  377   /* Align the source pointer.  The first load therefore starts at or
         before S and never crosses a 16-byte boundary.  */
  378   misalign = (uintptr_t)s & 15;
  379   p = (const v16qi *)((uintptr_t)s & -16);
  380   data = *p;
  381
  382   /* Create a mask for the bytes that are valid within the first
  383      16-byte block, i.e. one that clears the bits for the bytes that
         precede S.  The Idea here is that the AND with the mask
  384      within the loop is "free", since we need some AND or TEST
  385      insn in order to set the flags for the branch anyway.  */
  386   mask = -1u << misalign;
  387
  388   /* Main loop processing 16 bytes at a time.  The first block has
         already been loaded, so jump past the load at the top of the
         loop body.  */
  389   goto start;
  390   do
  391     {
  392       data = *++p;
  393       mask = -1;
  394
  395     start:
            /* OR together the byte-wise equality results for the four
               characters, then collapse them into one bit per byte.  */
  396       t  = __builtin_ia32_pcmpeqb128(data, repl_nl);
  397       t |= __builtin_ia32_pcmpeqb128(data, repl_cr);
  398       t |= __builtin_ia32_pcmpeqb128(data, repl_bs);
  399       t |= __builtin_ia32_pcmpeqb128(data, repl_qm);
  400       found = __builtin_ia32_pmovmskb128 (t);
  401       found &= mask;
  402     }
  403   while (!found);
  404
  405   /* FOUND contains 1 in bits for which we matched a relevant
  406      character.  Conversion to the byte index is trivial.  */
  407   found = __builtin_ctz(found);
  408   return (const uchar *)p + found;
  409 }
 410
  411 #ifdef HAVE_SSE4
  412 /* A version of the fast scanner using SSE 4.2 vectorized string insns.
         Returns a pointer to the first \n, \r, ? or \\ at or after S.  END
         is consulted only to decide whether an unaligned 16-byte read at S
         might run off the end of the buffer's last page.  */
  413
  414 static const uchar *
  415 #ifndef __SSE4_2__
  416 __attribute__((__target__("sse4.2")))
  417 #endif
  418 search_line_sse42 (const uchar *s, const uchar *end)
  419 {
  420   typedef char v16qi __attribute__ ((__vector_size__ (16)));
        /* The set of characters to look for; only the first four elements
           are meaningful, matching the length 4 passed to PCMPESTRI.  */
  421   static const v16qi search = { '\n', '\r', '?', '\\' };
  422
  423   uintptr_t si = (uintptr_t)s;
  424   uintptr_t index;
  425
  426   /* Check for unaligned input.  */
  427   if (si & 15)
  428     {
  429       v16qi sv;
  430
  431       if (__builtin_expect (end - s < 16, 0)
  432           && __builtin_expect ((si & 0xfff) > 0xff0, 0))
  433         {
  434           /* There are less than 16 bytes left in the buffer, and less
  435              than 16 bytes left on the page.  Reading 16 bytes at this
  436              point might generate a spurious page fault.  Defer to the
  437              SSE2 implementation, which already handles alignment.  */
  438           return search_line_sse2 (s, end);
  439         }
  440
  441       /* ??? The builtin doesn't understand that the PCMPESTRI read from
  442          memory need not be aligned.  */
  443       sv = __builtin_ia32_loaddqu ((const char *) s);
  444       index = __builtin_ia32_pcmpestri128 (search, 4, sv, 16, 0);
  445
            /* An index below 16 means one of the 16 bytes matched.  */
  446       if (__builtin_expect (index < 16, 0))
  447         goto found;
  448
  449       /* Advance the pointer to an aligned address.  We will re-scan a
  450          few bytes, but we no longer need care for reading past the
  451          end of a page, since we're guaranteed a match.  */
  452       s = (const uchar *)((si + 16) & -16);
  453     }
  454
  455   /* Main loop, processing 16 bytes at a time.  By doing the whole loop
  456      in inline assembly, we can make proper use of the flags set.
           S is first moved back by 16 so that the ADD at the top of the
           loop restores it on the first pass; the loop repeats while the
           instruction leaves the carry flag clear.  EAX and EDX carry the
           lengths 4 and 16, and the resulting index is read from ECX.  */
  457   __asm (      "sub $16, %1\n"
  458         "       .balign 16\n"
  459         "0:     add $16, %1\n"
  460         "       %vpcmpestri $0, (%1), %2\n"
  461         "       jnc 0b"
  462         : "=&c"(index), "+r"(s)
  463         : "x"(search), "a"(4), "d"(16));
  464
  465  found:
  466   return s + index;
  467 }
  468
  469 #else
  470 /* Work around out-dated assemblers without sse4 support: fall back to
         the SSE2 scanner under the SSE 4.2 name.  */
  471 #define search_line_sse42 search_line_sse2
  472 #endif
 473
 474 /* Check the CPU capabilities.  */
 475
 476 #include "../gcc/config/i386/cpuid.h"
 477
 478 typedef const uchar * (*search_line_fast_type) (const uchar *, const uchar *);
 479 static search_line_fast_type search_line_fast;
 480
 481 #define HAVE_init_vectorized_lexer 1
 482 static inline void
 483 init_vectorized_lexer (void)
 484 {
 485   unsigned dummy, ecx = 0, edx = 0;
 486   search_line_fast_type impl = search_line_acc_char;
 487   int minimum = 0;
 488
 489 #if defined(__SSE4_2__)
 490   minimum = 3;
 491 #elif defined(__SSE2__)
 492   minimum = 2;
 493 #elif defined(__SSE__)
 494   minimum = 1;
 495 #endif
 496
 497   if (minimum == 3)
 498     impl = search_line_sse42;
 499   else if (__get_cpuid (1, &dummy, &dummy, &ecx, &edx) || minimum == 2)
 500     {
 501       if (minimum == 3 || (ecx & bit_SSE4_2))
 502         impl = search_line_sse42;
 503       else if (minimum == 2 || (edx & bit_SSE2))
 504         impl = search_line_sse2;
 505       else if (minimum == 1 || (edx & bit_SSE))
 506         impl = search_line_mmx;
 507     }
 508   else if (__get_cpuid (0x80000001, &dummy, &dummy, &dummy, &edx))
 509     {
 510       if (minimum == 1
 511           || (edx & (bit_MMXEXT | bit_CMOV)) == (bit_MMXEXT | bit_CMOV))
 512         impl = search_line_mmx;
 513     }
 514
 515   search_line_fast = impl;
 516 }
 517
 518 #elif (GCC_VERSION >= 4005) && defined(__ALTIVEC__)
 519
  520 /* A version of the fast scanner using AltiVec vectorized byte compares.
         Returns a pointer to the first \n, \r, \\ or ? at or after S.  END
         is unused; the loop only terminates by finding a match.  */
  521 /* ??? Unfortunately, attribute(target("altivec")) is not yet supported,
  522    so we can't compile this function without -maltivec on the command line
  523    (or implied by some other switch).  */
  524
  525 static const uchar *
  526 search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
  527 {
  528   typedef __attribute__((altivec(vector))) unsigned char vc;
  529
  530   const vc repl_nl = {
  531     '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
  532     '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n'
  533   };
  534   const vc repl_cr = {
  535     '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
  536     '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r'
  537   };
  538   const vc repl_bs = {
  539     '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
  540     '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\'
  541   };
  542   const vc repl_qm = {
  543     '?', '?', '?', '?', '?', '?', '?', '?',
  544     '?', '?', '?', '?', '?', '?', '?', '?',
  545   };
  546   const vc ones = {
  547     -1, -1, -1, -1, -1, -1, -1, -1,
  548     -1, -1, -1, -1, -1, -1, -1, -1,
  549   };
  550   const vc zero = { 0 };
  551
  552   vc data, mask, t;
  553
  554   /* Altivec loads automatically mask addresses with -16.  This lets us
  555      issue the first load as early as possible.  */
  556   data = __builtin_vec_ld(0, (const vc *)s);
  557
  558   /* Discard bytes before the beginning of the buffer.  Do this by
  559      beginning with all ones and shifting in zeros according to the
  560      mis-alignment.  The LVSR instruction pulls the exact shift we
  561      want from the address.  (The little-endian variant below uses
           LVSL with the permute operands swapped.)  */
  562 #ifdef __BIG_ENDIAN__
  563   mask = __builtin_vec_lvsr(0, s);
  564   mask = __builtin_vec_perm(zero, ones, mask);
  565 #else
  566   mask = __builtin_vec_lvsl(0, s);
  567   mask = __builtin_vec_perm(ones, zero, mask);
  568 #endif
  569   data &= mask;
  570
  571   /* While altivec loads mask addresses, we still need to align S so
  572      that the offset we compute at the end is correct.  */
  573   s = (const uchar *)((uintptr_t)s & -16);
  574
  575   /* Main loop processing 16 bytes at a time.  The first block has
         already been loaded and masked, so jump past the load at the top
         of the loop body.  */
  576   goto start;
  577   do
  578     {
  579       vc m_nl, m_cr, m_bs, m_qm;
  580
  581       s += 16;
  582       data = __builtin_vec_ld(0, (const vc *)s);
  583
  584     start:
  585       m_nl = (vc) __builtin_vec_cmpeq(data, repl_nl);
  586       m_cr = (vc) __builtin_vec_cmpeq(data, repl_cr);
  587       m_bs = (vc) __builtin_vec_cmpeq(data, repl_bs);
  588       m_qm = (vc) __builtin_vec_cmpeq(data, repl_qm);
  589       t = (m_nl | m_cr) | (m_bs | m_qm);
  590
  591       /* T now contains 0xff in bytes for which we matched one of the relevant
  592          characters.  We want to exit the loop if any byte in T is non-zero.
  593          Below is the expansion of vec_any_ne(t, zero).  */
  594     }
  595   while (!__builtin_vec_vcmpeq_p(/*__CR6_LT_REV*/3, t, zero));
  596
  597   {
        /* Number of unsigned longs needed to cover one vector.  */
  598 #define N  (sizeof(vc) / sizeof(long))
  599
  600     union {
  601       vc v;
  602       /* Statically assert that N is 2 or 4.  */
  603       unsigned long l[(N == 2 || N == 4) ? N : -1];
  604     } u;
  605     unsigned long l, i = 0;
  606
  607     u.v = t;
  608
  609     /* Find the first word of T that is non-zero, advancing S past each
           all-zero word.  The N == 4 case deliberately falls through into
           the N == 2 case to examine the remaining two words.  */
  610     switch (N)
  611       {
  612       case 4:
  613         l = u.l[i++];
  614         if (l != 0)
  615           break;
  616         s += sizeof(unsigned long);
  617         l = u.l[i++];
  618         if (l != 0)
  619           break;
  620         s += sizeof(unsigned long);
  621       case 2:
  622         l = u.l[i++];
  623         if (l != 0)
  624           break;
  625         s += sizeof(unsigned long);
  626         l = u.l[i];
  627       }
  628
  629     /* L now contains 0xff in bytes for which we matched one of the
  630        relevant characters.  We can find the byte index by finding
  631        its bit index and dividing by 8.  */
  632 #ifdef __BIG_ENDIAN__
  633     l = __builtin_clzl(l) >> 3;
  634 #else
  635     l = __builtin_ctzl(l) >> 3;
  636 #endif
  637     return s + l;
  638
  639 #undef N
  640   }
  641 }
 642
 643 #elif defined (__ARM_NEON__)
 644 #include "arm_neon.h"
 645
  /* A version of the fast scanner using ARM NEON vectorized byte
     compares.  Returns a pointer to the first \n, \r, \\ or ? at or
     after S.  END is unused; the loop only terminates by finding a
     match.  */
  646 static const uchar *
  647 search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
  648 {
  649   const uint8x16_t repl_nl = vdupq_n_u8 ('\n');
  650   const uint8x16_t repl_cr = vdupq_n_u8 ('\r');
  651   const uint8x16_t repl_bs = vdupq_n_u8 ('\\');
  652   const uint8x16_t repl_qm = vdupq_n_u8 ('?');
        /* Each 8-byte half of XMASK holds eight distinct single-bit byte
           values (0x01 through 0x80).  ANDing it with the compare result
           leaves one distinct bit per matching byte.  */
  653   const uint8x16_t xmask = (uint8x16_t) vdupq_n_u64 (0x8040201008040201ULL);
  654
  655   unsigned int misalign, found, mask;
  656   const uint8_t *p;
  657   uint8x16_t data;
  658
  659   /* Align the source pointer.  */
  660   misalign = (uintptr_t)s & 15;
  661   p = (const uint8_t *)((uintptr_t)s & -16);
  662   data = vld1q_u8 (p);
  663
  664   /* Create a mask for the bytes that are valid within the first
  665      16-byte block, i.e. one that clears the bits for the bytes that
         precede S.  The Idea here is that the AND with the mask
  666      within the loop is "free", since we need some AND or TEST
  667      insn in order to set the flags for the branch anyway.  */
  668   mask = (-1u << misalign) & 0xffff;
  669
  670   /* Main loop, processing 16 bytes at a time.  The first block has
         already been loaded, so jump past the load at the top of the
         loop body.  */
  671   goto start;
  672
  673   do
  674     {
  675       uint8x8_t l;
  676       uint16x4_t m;
  677       uint32x2_t n;
  678       uint8x16_t t, u, v, w;
  679
  680       p += 16;
  681       data = vld1q_u8 (p);
  682       mask = 0xffff;
  683
  684     start:
            /* Combine the four byte-wise equality results and keep one
               distinct bit per matching byte.  */
  685       t = vceqq_u8 (data, repl_nl);
  686       u = vceqq_u8 (data, repl_cr);
  687       v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
  688       w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
  689       t = vandq_u8 (vorrq_u8 (v, w), xmask);
            /* Successive pairwise additions sum the per-byte bits of each
               8-byte half into one 32-bit lane of N.  */
  690       l = vpadd_u8 (vget_low_u8 (t), vget_high_u8 (t));
  691       m = vpaddl_u8 (l);
  692       n = vpaddl_u16 (m);
  693
            /* Merge the two lanes of N into the low 32 bits.  The shift by
               24 on the 64-bit view appears intended to place the upper
               lane's sum directly above the lower lane's 8 bits.  */
  694       found = vget_lane_u32 ((uint32x2_t) vorr_u64 ((uint64x1_t) n,
  695               vshr_n_u64 ((uint64x1_t) n, 24)), 0);
  696       found &= mask;
  697     }
  698   while (!found);
  699
  700   /* FOUND contains 1 in bits for which we matched a relevant
  701      character.  Conversion to the byte index is trivial.  */
  702   found = __builtin_ctz (found);
  703   return (const uchar *)p + found;
  704 }
 705
 706 #else
 707
  708 /* We only have one accelerated alternative.  Use a direct call so that
  709    we encourage inlining.  */
 710
 711 #define search_line_fast  search_line_acc_char
 712
 713 #endif
 714
  715 /* Initialize the lexer if needed.  When HAVE_init_vectorized_lexer is
         defined this calls init_vectorized_lexer, which stores the chosen
         scanner in search_line_fast; otherwise the function does nothing.  */
  716
  717 void
  718 _cpp_init_lexer (void)
  719 {
  720 #ifdef HAVE_init_vectorized_lexer
  721   init_vectorized_lexer ();
  722 #endif
  723 }
 724
  725 /* Returns with a logical line that contains no escaped newlines or
  726    trigraphs.  This is a time-critical inner loop.

         The line starts at PFILE's buffer->next_line.  On return
         buffer->cur and buffer->line_base point at its start, the cleaned
         line is terminated by a '\n' stored in the buffer, the line notes
         recorded for it end with a sentinel note, and buffer->next_line
         points just past the last physical character consumed.  Escaped
         newlines and converted trigraphs are removed by rewriting the
         buffer contents in place.  */
  727 void
  728 _cpp_clean_line (cpp_reader *pfile)
  729 {
  730   cpp_buffer *buffer;
  731   const uchar *s;
  732   uchar c, *d, *p;
  733
        /* Reset the per-line state: no notes yet, and the line begins at
           the previous NEXT_LINE.  S is the read position; D, once set, is
           the write position.  */
  734   buffer = pfile->buffer;
  735   buffer->cur_note = buffer->notes_used = 0;
  736   buffer->cur = buffer->line_base = buffer->next_line;
  737   buffer->need_line = false;
  738   s = buffer->next_line;
  739
  740   if (!buffer->from_stage3)
  741     {
  742       const uchar *pbackslash = NULL;
  743
  744       /* Fast path.  This is the common case of an un-escaped line with
  745          no trigraphs.  The primary win here is by not writing any
  746          data back to memory until we have to.  */
  747       while (1)
  748         {
  749           /* Perform an optimized search for \n, \r, \\, ?.  */
  750           s = search_line_fast (s, buffer->rlimit);
  751
  752           c = *s;
  753           if (c == '\\')
  754             {
  755               /* Record the location of the backslash and continue.  */
  756               pbackslash = s++;
  757             }
  758           else if (__builtin_expect (c == '?', 0))
  759             {
  760               if (__builtin_expect (s[1] == '?', false)
  761                    && _cpp_trigraph_map[s[2]])
  762                 {
  763                   /* Have a trigraph.  We may or may not have to convert
  764                      it.  Add a line note regardless, for -Wtrigraphs.  */
  765                   add_line_note (buffer, s, s[2]);
  766                   if (CPP_OPTION (pfile, trigraphs))
  767                     {
  768                       /* We do, and that means we have to switch to the
  769                          slow path.  The replacement character is stored
                             over the first '?' and S is left on the
                             trigraph's last character.  */
  770                       d = (uchar *) s;
  771                       *d = _cpp_trigraph_map[s[2]];
  772                       s += 2;
  773                       goto slow_path;
  774                     }
  775                 }
  776               /* Not a trigraph.  Continue on fast-path.  */
  777               s++;
  778             }
  779           else
  780             break;
  781         }
  782
  783       /* This must be \r or \n.  We're either done, or we'll be forced
  784          to write back to the buffer and continue on the slow path.  */
  785       d = (uchar *) s;
  786
  787       if (__builtin_expect (s == buffer->rlimit, false))
  788         goto done;
  789
  790       /* DOS line ending? */
  791       if (__builtin_expect (c == '\r', false) && s[1] == '\n')
  792         {
  793           s++;
  794           if (s == buffer->rlimit)
  795             goto done;
  796         }
  797
            /* No backslash was seen on this line, so the newline cannot be
               escaped.  */
  798       if (__builtin_expect (pbackslash == NULL, true))
  799         goto done;
  800
  801       /* Check for escaped newline: skip back over any horizontal
               whitespace before the line end and see whether it is
               preceded by the last backslash recorded.  */
  802       p = d;
  803       while (is_nvspace (p[-1]))
  804         p--;
  805       if (p - 1 != pbackslash)
  806         goto done;
  807
  808       /* Have an escaped newline; process it and proceed to
  809          the slow path.  The note type is ' ' when whitespace separated
               the backslash from the newline and '\\' otherwise.  */
  810       add_line_note (buffer, p - 1, p != d ? ' ' : '\\');
  811       d = p - 2;
  812       buffer->next_line = p - 1;
  813
  814     slow_path:
            /* Copy characters from S down to D one at a time.  Here
               buffer->next_line is used to remember where the most recent
               escaped newline was spliced out, so the backward scan below
               does not go past it.  */
  815       while (1)
  816         {
  817           c = *++s;
  818           *++d = c;
  819
  820           if (c == '\n' || c == '\r')
  821             {
  822               /* Handle DOS line endings.  */
  823               if (c == '\r' && s != buffer->rlimit && s[1] == '\n')
  824                 s++;
  825               if (s == buffer->rlimit)
  826                 break;
  827
  828               /* Escaped?  */
  829               p = d;
  830               while (p != buffer->next_line && is_nvspace (p[-1]))
  831                 p--;
  832               if (p == buffer->next_line || p[-1] != '\\')
  833                 break;
  834
  835               add_line_note (buffer, p - 1, p != d ? ' ': '\\');
  836               d = p - 2;
  837               buffer->next_line = p - 1;
  838             }
  839           else if (c == '?' && s[1] == '?' && _cpp_trigraph_map[s[2]])
  840             {
  841               /* Add a note regardless, for the benefit of -Wtrigraphs.  */
  842               add_line_note (buffer, d, s[2]);
  843               if (CPP_OPTION (pfile, trigraphs))
  844                 {
  845                   *d = _cpp_trigraph_map[s[2]];
  846                   s += 2;
  847                 }
  848             }
  849         }
  850     }
  851   else
  852     {
            /* This branch only locates the end of the line; it records no
               trigraph or escaped-newline notes.  */
  853       while (*s != '\n' && *s != '\r')
  854         s++;
  855       d = (uchar *) s;
  856
  857       /* Handle DOS line endings.  */
  858       if (*s == '\r' && s != buffer->rlimit && s[1] == '\n')
  859         s++;
  860     }
  861
  862  done:
        /* Terminate the cleaned line; this also turns a '\r' terminator
           into '\n'.  */
  863   *d = '\n';
  864   /* A sentinel note that should never be processed.  */
  865   add_line_note (buffer, d + 1, '\n');
  866   buffer->next_line = s + 1;
  867 }
 868
 869 /* Return true if the trigraph indicated by NOTE should be warned
 870    about in a comment.  */
 871 static bool
 872 warn_in_comment (cpp_reader *pfile, _cpp_line_note *note)
 873 {
 874   const uchar *p;
 875
 876   /* Within comments we don't warn about trigraphs, unless the
 877      trigraph forms an escaped newline, as that may change
 878      behavior.  */
 879   if (note->type != '/')
 880     return false;
 881
 882   /* If -trigraphs, then this was an escaped newline iff the next note
 883      is coincident.  */
 884   if (CPP_OPTION (pfile, trigraphs))
 885     return note[1].pos == note->pos;
 886
 887   /* Otherwise, see if this forms an escaped newline.  */
 888   p = note->pos + 3;
 889   while (is_nvspace (*p))
 890     p++;
 891
 892   /* There might have been escaped newlines between the trigraph and the
 893      newline we found.  Hence the position test.  */
 894   return (*p == '\n' && p < note[1].pos);
 895 }
 896
 897 /* Process the notes created by add_line_note as far as the current
 898    location.  */
 899 void
 900 _cpp_process_line_notes (cpp_reader *pfile, int in_comment)
 901 {
 902   cpp_buffer *buffer = pfile->buffer;
 903
 904   for (;;)
 905     {
 906       _cpp_line_note *note = &buffer->notes[buffer->cur_note];
 907       unsigned int col;
 908
 909       if (note->pos > buffer->cur)
 910         break;
 911
 912       buffer->cur_note++;
 913       col = CPP_BUF_COLUMN (buffer, note->pos + 1);
 914
 915       if (note->type == '\\' || note->type == ' ')
 916         {
 917           if (note->type == ' ' && !in_comment)
 918             cpp_error_with_line (pfile, CPP_DL_WARNING, pfile->line_table->highest_line, col,
 919                                  "backslash and newline separated by space");
 920
 921           if (buffer->next_line > buffer->rlimit)
 922             {
 923               cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line, col,
 924                                    "backslash-newline at end of file");
 925               /* Prevent "no newline at end of file" warning.  */
 926               buffer->next_line = buffer->rlimit;
 927             }
 928
 929           buffer->line_base = note->pos;
 930           CPP_INCREMENT_LINE (pfile, 0);
 931         }
 932       else if (_cpp_trigraph_map[note->type])
 933         {
 934           if (CPP_OPTION (pfile, warn_trigraphs)
 935               && (!in_comment || warn_in_comment (pfile, note)))
 936             {
 937               if (CPP_OPTION (pfile, trigraphs))
 938                 cpp_warning_with_line (pfile, CPP_W_TRIGRAPHS,
 939                                        pfile->line_table->highest_line, col,
 940                                        "trigraph ??%c converted to %c",
 941                                        note->type,
 942                                        (int) _cpp_trigraph_map[note->type]);
 943               else
 944                 {
 945                   cpp_warning_with_line
 946                     (pfile, CPP_W_TRIGRAPHS,
 947                      pfile->line_table->highest_line, col,
 948                      "trigraph ??%c ignored, use -trigraphs to enable",
 949                      note->type);
 950                 }
 951             }
 952         }
 953       else if (note->type == 0)
 954         /* Already processed in lex_raw_string.  */;
 955       else
 956         abort ();
 957     }
 958 }
 959
 960 /* Skip a C-style block comment.  We find the end of the comment by
 961    seeing if an asterisk is before every '/' we encounter.  Returns
 962    nonzero if comment terminated by EOF, zero otherwise.
 963
 964    Buffer->cur points to the initial asterisk of the comment.  */
 965 bool
 966 _cpp_skip_block_comment (cpp_reader *pfile)
 967 {
 968   cpp_buffer *buffer = pfile->buffer;
 969   const uchar *cur = buffer->cur;
 970   uchar c;
 971
 972   cur++;
 973   if (*cur == '/')
 974     cur++;
 975
 976   for (;;)
 977     {
 978       /* People like decorating comments with '*', so check for '/'
 979          instead for efficiency.  */
 980       c = *cur++;
 981
 982       if (c == '/')
 983         {
 984           if (cur[-2] == '*')
 985             break;
 986
 987           /* Warn about potential nested comments, but not if the '/'
 988              comes immediately before the true comment delimiter.
 989              Don't bother to get it right across escaped newlines.  */
 990           if (CPP_OPTION (pfile, warn_comments)
 991               && cur[0] == '*' && cur[1] != '/')
 992             {
 993               buffer->cur = cur;
 994               cpp_warning_with_line (pfile, CPP_W_COMMENTS,
 995                                      pfile->line_table->highest_line,
 996                                      CPP_BUF_COL (buffer),
 997                                      "\"/*\" within comment");
 998             }
 999         }
1000       else if (c == '\n')
1001         {
1002           unsigned int cols;
1003           buffer->cur = cur - 1;
1004           _cpp_process_line_notes (pfile, true);
1005           if (buffer->next_line >= buffer->rlimit)
1006             return true;
1007           _cpp_clean_line (pfile);
1008
1009           cols = buffer->next_line - buffer->line_base;
1010           CPP_INCREMENT_LINE (pfile, cols);
1011
1012           cur = buffer->cur;
1013         }
1014     }
1015
1016   buffer->cur = cur;
1017   _cpp_process_line_notes (pfile, true);
1018   return false;
1019 }
1020
1021 /* Skip a C++ line comment, leaving buffer->cur pointing to the
1022    terminating newline.  Handles escaped newlines.  Returns nonzero
1023    if a multiline comment.  */
1024 static int
1025 skip_line_comment (cpp_reader *pfile)
1026 {
1027   cpp_buffer *buffer = pfile->buffer;
1028   source_location orig_line = pfile->line_table->highest_line;
1029
1030   while (*buffer->cur != '\n')
1031     buffer->cur++;
1032
1033   _cpp_process_line_notes (pfile, true);
1034   return orig_line != pfile->line_table->highest_line;
1035 }
1036
1037 /* Skips whitespace, saving the next non-whitespace character.  */
1038 static void
1039 skip_whitespace (cpp_reader *pfile, cppchar_t c)
1040 {
1041   cpp_buffer *buffer = pfile->buffer;
1042   bool saw_NUL = false;
1043
1044   do
1045     {
1046       /* Horizontal space always OK.  */
1047       if (c == ' ' || c == '\t')
1048         ;
1049       /* Just \f \v or \0 left.  */
1050       else if (c == '\0')
1051         saw_NUL = true;
1052       else if (pfile->state.in_directive && CPP_PEDANTIC (pfile))
1053         cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line,
1054                              CPP_BUF_COL (buffer),
1055                              "%s in preprocessing directive",
1056                              c == '\f' ? "form feed" : "vertical tab");
1057
1058       c = *buffer->cur++;
1059     }
1060   /* We only want non-vertical space, i.e. ' ' \t \f \v \0.  */
1061   while (is_nvspace (c));
1062
1063   if (saw_NUL)
1064     cpp_error (pfile, CPP_DL_WARNING, "null character(s) ignored");
1065
1066   buffer->cur--;
1067 }
1068
1069 /* See if the characters of a number token are valid in a name (no
1070    '.', '+' or '-').  */
1071 static int
1072 name_p (cpp_reader *pfile, const cpp_string *string)
1073 {
1074   unsigned int i;
1075
1076   for (i = 0; i < string->len; i++)
1077     if (!is_idchar (string->text[i]))
1078       return 0;
1079
1080   return 1;
1081 }
1082
1083 /* After parsing an identifier or other sequence, produce a warning about
1084    sequences not in NFC/NFKC.  */
1085 static void
1086 warn_about_normalization (cpp_reader *pfile,
1087                           const cpp_token *token,
1088                           const struct normalize_state *s)
1089 {
1090   if (CPP_OPTION (pfile, warn_normalize) < NORMALIZE_STATE_RESULT (s)
1091       && !pfile->state.skipping)
1092     {
1093       /* Make sure that the token is printed using UCNs, even
1094          if we'd otherwise happily print UTF-8.  */
1095       unsigned char *buf = XNEWVEC (unsigned char, cpp_token_len (token));
1096       size_t sz;
1097
1098       sz = cpp_spell_token (pfile, token, buf, false) - buf;
1099       if (NORMALIZE_STATE_RESULT (s) == normalized_C)
1100         cpp_warning_with_line (pfile, CPP_W_NORMALIZE, token->src_loc, 0,
1101                                "`%.*s' is not in NFKC", (int) sz, buf);
1102       else
1103         cpp_warning_with_line (pfile, CPP_W_NORMALIZE, token->src_loc, 0,
1104                                "`%.*s' is not in NFC", (int) sz, buf);
1105       free (buf);
1106     }
1107 }
1108
1109 /* Returns TRUE if the sequence starting at buffer->cur is invalid in
1110    an identifier.  FIRST is TRUE if this starts an identifier.  */
1111 static bool
1112 forms_identifier_p (cpp_reader *pfile, int first,
1113                     struct normalize_state *state)
1114 {
1115   cpp_buffer *buffer = pfile->buffer;
1116
1117   if (*buffer->cur == '$')
1118     {
1119       if (!CPP_OPTION (pfile, dollars_in_ident))
1120         return false;
1121
1122       buffer->cur++;
1123       if (CPP_OPTION (pfile, warn_dollars) && !pfile->state.skipping)
1124         {
1125           CPP_OPTION (pfile, warn_dollars) = 0;
1126           cpp_error (pfile, CPP_DL_PEDWARN, "'$' in identifier or number");
1127         }
1128
1129       return true;
1130     }
1131
1132   /* Is this a syntactically valid UCN?  */
1133   if (CPP_OPTION (pfile, extended_identifiers)
1134       && *buffer->cur == '\\'
1135       && (buffer->cur[1] == 'u' || buffer->cur[1] == 'U'))
1136     {
1137       buffer->cur += 2;
1138       if (_cpp_valid_ucn (pfile, &buffer->cur, buffer->rlimit, 1 + !first,
1139                           state))
1140         return true;
1141       buffer->cur -= 2;
1142     }
1143
1144   return false;
1145 }
1146
1147 /* Helper function to get the cpp_hashnode of the identifier BASE.  */
1148 static cpp_hashnode *
1149 lex_identifier_intern (cpp_reader *pfile, const uchar *base)
1150 {
1151   cpp_hashnode *result;
1152   const uchar *cur;
1153   unsigned int len;
1154   unsigned int hash = HT_HASHSTEP (0, *base);
1155
1156   cur = base + 1;
1157   while (ISIDNUM (*cur))
1158     {
1159       hash = HT_HASHSTEP (hash, *cur);
1160       cur++;
1161     }
1162   len = cur - base;
1163   hash = HT_HASHFINISH (hash, len);
1164   result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
1165                                               base, len, hash, HT_ALLOC));
1166
1167   /* Rarely, identifiers require diagnostics when lexed.  */
1168   if (__builtin_expect ((result->flags & NODE_DIAGNOSTIC)
1169                         && !pfile->state.skipping, 0))
1170     {
1171       /* It is allowed to poison the same identifier twice.  */
1172       if ((result->flags & NODE_POISONED) && !pfile->state.poisoned_ok)
1173         cpp_error (pfile, CPP_DL_ERROR, "attempt to use poisoned \"%s\"",
1174                    NODE_NAME (result));
1175
1176       /* Constraint 6.10.3.5: __VA_ARGS__ should only appear in the
1177          replacement list of a variadic macro.  */
1178       if (result == pfile->spec_nodes.n__VA_ARGS__
1179           && !pfile->state.va_args_ok)
1180         cpp_error (pfile, CPP_DL_PEDWARN,
1181                    "__VA_ARGS__ can only appear in the expansion"
1182                    " of a C99 variadic macro");
1183
1184       /* For -Wc++-compat, warn about use of C++ named operators.  */
1185       if (result->flags & NODE_WARN_OPERATOR)
1186         cpp_warning (pfile, CPP_W_CXX_OPERATOR_NAMES,
1187                      "identifier \"%s\" is a special operator name in C++",
1188                      NODE_NAME (result));
1189     }
1190
1191   return result;
1192 }
1193
1194 /* Get the cpp_hashnode of an identifier specified by NAME in
1195    the current cpp_reader object.  If none is found, NULL is returned.  */
1196 cpp_hashnode *
1197 _cpp_lex_identifier (cpp_reader *pfile, const char *name)
1198 {
1199   cpp_hashnode *result;
1200   result = lex_identifier_intern (pfile, (uchar *) name);
1201   return result;
1202 }
1203
1204 /* Lex an identifier starting at BUFFER->CUR - 1.  */
1205 static cpp_hashnode *
1206 lex_identifier (cpp_reader *pfile, const uchar *base, bool starts_ucn,
1207                 struct normalize_state *nst)
1208 {
1209   cpp_hashnode *result;
1210   const uchar *cur;
1211   unsigned int len;
1212   unsigned int hash = HT_HASHSTEP (0, *base);
1213
1214   cur = pfile->buffer->cur;
1215   if (! starts_ucn)
1216     {
1217       while (ISIDNUM (*cur))
1218         {
1219           hash = HT_HASHSTEP (hash, *cur);
1220           cur++;
1221         }
1222       NORMALIZE_STATE_UPDATE_IDNUM (nst, *(cur - 1));
1223     }
1224   pfile->buffer->cur = cur;
1225   if (starts_ucn || forms_identifier_p (pfile, false, nst))
1226     {
1227       /* Slower version for identifiers containing UCNs (or $).  */
1228       do {
1229         while (ISIDNUM (*pfile->buffer->cur))
1230           {
1231             NORMALIZE_STATE_UPDATE_IDNUM (nst, *pfile->buffer->cur);
1232             pfile->buffer->cur++;
1233           }
1234       } while (forms_identifier_p (pfile, false, nst));
1235       result = _cpp_interpret_identifier (pfile, base,
1236                                           pfile->buffer->cur - base);
1237     }
1238   else
1239     {
1240       len = cur - base;
1241       hash = HT_HASHFINISH (hash, len);
1242
1243       result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
1244                                                   base, len, hash, HT_ALLOC));
1245     }
1246
1247   /* Rarely, identifiers require diagnostics when lexed.  */
1248   if (__builtin_expect ((result->flags & NODE_DIAGNOSTIC)
1249                         && !pfile->state.skipping, 0))
1250     {
1251       /* It is allowed to poison the same identifier twice.  */
1252       if ((result->flags & NODE_POISONED) && !pfile->state.poisoned_ok)
1253         cpp_error (pfile, CPP_DL_ERROR, "attempt to use poisoned \"%s\"",
1254                    NODE_NAME (result));
1255
1256       /* Constraint 6.10.3.5: __VA_ARGS__ should only appear in the
1257          replacement list of a variadic macro.  */
1258       if (result == pfile->spec_nodes.n__VA_ARGS__
1259           && !pfile->state.va_args_ok)
1260         cpp_error (pfile, CPP_DL_PEDWARN,
1261                    "__VA_ARGS__ can only appear in the expansion"
1262                    " of a C99 variadic macro");
1263
1264       /* For -Wc++-compat, warn about use of C++ named operators.  */
1265       if (result->flags & NODE_WARN_OPERATOR)
1266         cpp_warning (pfile, CPP_W_CXX_OPERATOR_NAMES,
1267                      "identifier \"%s\" is a special operator name in C++",
1268                      NODE_NAME (result));
1269     }
1270
1271   return result;
1272 }
1273
1274 /* Lex a number to NUMBER starting at BUFFER->CUR - 1.  */
1275 static void
1276 lex_number (cpp_reader *pfile, cpp_string *number,
1277             struct normalize_state *nst)
1278 {
1279   const uchar *cur;
1280   const uchar *base;
1281   uchar *dest;
1282
1283   base = pfile->buffer->cur - 1;
1284   do
1285     {
1286       cur = pfile->buffer->cur;
1287
1288       /* N.B. ISIDNUM does not include $.  */
1289       while (ISIDNUM (*cur) || *cur == '.' || DIGIT_SEP (*cur)
1290              || VALID_SIGN (*cur, cur[-1]))
1291         {
1292           NORMALIZE_STATE_UPDATE_IDNUM (nst, *cur);
1293           cur++;
1294         }
1295
1296       pfile->buffer->cur = cur;
1297     }
1298   while (forms_identifier_p (pfile, false, nst));
1299
1300   number->len = cur - base;
1301   dest = _cpp_unaligned_alloc (pfile, number->len + 1);
1302   memcpy (dest, base, number->len);
1303   dest[number->len] = '\0';
1304   number->text = dest;
1305 }
1306
1307 /* Create a token of type TYPE with a literal spelling.  */
1308 static void
1309 create_literal (cpp_reader *pfile, cpp_token *token, const uchar *base,
1310                 unsigned int len, enum cpp_ttype type)
1311 {
1312   uchar *dest = _cpp_unaligned_alloc (pfile, len + 1);
1313
1314   memcpy (dest, base, len);
1315   dest[len] = '\0';
1316   token->type = type;
1317   token->val.str.len = len;
1318   token->val.str.text = dest;
1319 }
1320
1321 /* Subroutine of lex_raw_string: Append LEN chars from BASE to the buffer
1322    sequence from *FIRST_BUFF_P to LAST_BUFF_P.  */
1323
1324 static void
1325 bufring_append (cpp_reader *pfile, const uchar *base, size_t len,
1326                 _cpp_buff **first_buff_p, _cpp_buff **last_buff_p)
1327 {
1328   _cpp_buff *first_buff = *first_buff_p;
1329   _cpp_buff *last_buff = *last_buff_p;
1330
1331   if (first_buff == NULL)
1332     first_buff = last_buff = _cpp_get_buff (pfile, len);
1333   else if (len > BUFF_ROOM (last_buff))
1334     {
1335       size_t room = BUFF_ROOM (last_buff);
1336       memcpy (BUFF_FRONT (last_buff), base, room);
1337       BUFF_FRONT (last_buff) += room;
1338       base += room;
1339       len -= room;
1340       last_buff = _cpp_append_extend_buff (pfile, last_buff, len);
1341     }
1342
1343   memcpy (BUFF_FRONT (last_buff), base, len);
1344   BUFF_FRONT (last_buff) += len;
1345
1346   *first_buff_p = first_buff;
1347   *last_buff_p = last_buff;
1348 }
1349
1350
1351 /* Returns true if a macro has been defined.
1352    This might not work if compile with -save-temps,
1353    or preprocess separately from compilation.  */
1354
1355 static bool
1356 is_macro(cpp_reader *pfile, const uchar *base)
1357 {
1358   const uchar *cur = base;
1359   if (! ISIDST (*cur))
1360     return false;
1361   unsigned int hash = HT_HASHSTEP (0, *cur);
1362   ++cur;
1363   while (ISIDNUM (*cur))
1364     {
1365       hash = HT_HASHSTEP (hash, *cur);
1366       ++cur;
1367     }
1368   hash = HT_HASHFINISH (hash, cur - base);
1369
1370   cpp_hashnode *result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
1371                                         base, cur - base, hash, HT_NO_INSERT));
1372
1373   return !result ? false : (result->type == NT_MACRO);
1374 }
1375
1376
1377 /* Lexes a raw string.  The stored string contains the spelling, including
1378    double quotes, delimiter string, '(' and ')', any leading
1379    'L', 'u', 'U' or 'u8' and 'R' modifier.  It returns the type of the
1380    literal, or CPP_OTHER if it was not properly terminated.
1381
1382    The spelling is NUL-terminated, but it is not guaranteed that this
1383    is the first NUL since embedded NULs are preserved.  */
1384
1385 static void
1386 lex_raw_string (cpp_reader *pfile, cpp_token *token, const uchar *base,
1387                 const uchar *cur)
1388 {
1389   uchar raw_prefix[17];
1390   uchar temp_buffer[18];
1391   const uchar *orig_base;
1392   unsigned int raw_prefix_len = 0, raw_suffix_len = 0;
1393   enum raw_str_phase { RAW_STR_PREFIX, RAW_STR, RAW_STR_SUFFIX };
1394   raw_str_phase phase = RAW_STR_PREFIX;
1395   enum cpp_ttype type;
1396   size_t total_len = 0;
1397   /* Index into temp_buffer during phases other than RAW_STR,
1398      during RAW_STR phase 17 to tell BUF_APPEND that nothing should
1399      be appended to temp_buffer.  */
1400   size_t temp_buffer_len = 0;
1401   _cpp_buff *first_buff = NULL, *last_buff = NULL;
1402   size_t raw_prefix_start;
1403   _cpp_line_note *note = &pfile->buffer->notes[pfile->buffer->cur_note];
1404
1405   type = (*base == 'L' ? CPP_WSTRING :
1406           *base == 'U' ? CPP_STRING32 :
1407           *base == 'u' ? (base[1] == '8' ? CPP_UTF8STRING : CPP_STRING16)
1408           : CPP_STRING);
1409
1410 #define BUF_APPEND(STR,LEN)                                     \
1411       do {                                                      \
1412         bufring_append (pfile, (const uchar *)(STR), (LEN),     \
1413                         &first_buff, &last_buff);               \
1414         total_len += (LEN);                                     \
1415         if (__builtin_expect (temp_buffer_len < 17, 0)          \
1416             && (const uchar *)(STR) != base                     \
1417             && (LEN) <= 2)                                      \
1418           {                                                     \
1419             memcpy (temp_buffer + temp_buffer_len,              \
1420                     (const uchar *)(STR), (LEN));               \
1421             temp_buffer_len += (LEN);                           \
1422           }                                                     \
1423       } while (0);
1424
1425   orig_base = base;
1426   ++cur;
1427   raw_prefix_start = cur - base;
1428   for (;;)
1429     {
1430       cppchar_t c;
1431
1432       /* If we previously performed any trigraph or line splicing
1433          transformations, undo them in between the opening and closing
1434          double quote.  */
1435       while (note->pos < cur)
1436         ++note;
1437       for (; note->pos == cur; ++note)
1438         {
1439           switch (note->type)
1440             {
1441             case '\\':
1442             case ' ':
1443               /* Restore backslash followed by newline.  */
1444               BUF_APPEND (base, cur - base);
1445               base = cur;
1446               BUF_APPEND ("\\", 1);
1447             after_backslash:
1448               if (note->type == ' ')
1449                 {
1450                   /* GNU backslash whitespace newline extension.  FIXME
1451                      could be any sequence of non-vertical space.  When we
1452                      can properly restore any such sequence, we should mark
1453                      this note as handled so _cpp_process_line_notes
1454                      doesn't warn.  */
1455                   BUF_APPEND (" ", 1);
1456                 }
1457
1458               BUF_APPEND ("\n", 1);
1459               break;
1460
1461             case 0:
1462               /* Already handled.  */
1463               break;
1464
1465             default:
1466               if (_cpp_trigraph_map[note->type])
1467                 {
1468                   /* Don't warn about this trigraph in
1469                      _cpp_process_line_notes, since trigraphs show up as
1470                      trigraphs in raw strings.  */
1471                   uchar type = note->type;
1472                   note->type = 0;
1473
1474                   if (!CPP_OPTION (pfile, trigraphs))
1475                     /* If we didn't convert the trigraph in the first
1476                        place, don't do anything now either.  */
1477                     break;
1478
1479                   BUF_APPEND (base, cur - base);
1480                   base = cur;
1481                   BUF_APPEND ("??", 2);
1482
1483                   /* ??/ followed by newline gets two line notes, one for
1484                      the trigraph and one for the backslash/newline.  */
1485                   if (type == '/' && note[1].pos == cur)
1486                     {
1487                       if (note[1].type != '\\'
1488                           && note[1].type != ' ')
1489                         abort ();
1490                       BUF_APPEND ("/", 1);
1491                       ++note;
1492                       goto after_backslash;
1493                     }
1494                   else
1495                     {
1496                       /* Skip the replacement character.  */
1497                       base = ++cur;
1498                       BUF_APPEND (&type, 1);
1499                       c = type;
1500                       goto check_c;
1501                     }
1502                 }
1503               else
1504                 abort ();
1505               break;
1506             }
1507         }
1508       c = *cur++;
1509       if (__builtin_expect (temp_buffer_len < 17, 0))
1510         temp_buffer[temp_buffer_len++] = c;
1511
1512      check_c:
1513       if (phase == RAW_STR_PREFIX)
1514         {
1515           while (raw_prefix_len < temp_buffer_len)
1516             {
1517               raw_prefix[raw_prefix_len] = temp_buffer[raw_prefix_len];
1518               switch (raw_prefix[raw_prefix_len])
1519                 {
1520                 case ' ': case '(': case ')': case '\\': case '\t':
1521                 case '\v': case '\f': case '\n': default:
1522                   break;
1523                 /* Basic source charset except the above chars.  */
1524                 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
1525                 case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
1526                 case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
1527                 case 's': case 't': case 'u': case 'v': case 'w': case 'x':
1528                 case 'y': case 'z':
1529                 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
1530                 case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
1531                 case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
1532                 case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
1533                 case 'Y': case 'Z':
1534                 case '0': case '1': case '2': case '3': case '4': case '5':
1535                 case '6': case '7': case '8': case '9':
1536                 case '_': case '{': case '}': case '#': case '[': case ']':
1537                 case '<': case '>': case '%': case ':': case ';': case '.':
1538                 case '?': case '*': case '+': case '-': case '/': case '^':
1539                 case '&': case '|': case '~': case '!': case '=': case ',':
1540                 case '"': case '\'':
1541                   if (raw_prefix_len < 16)
1542                     {
1543                       raw_prefix_len++;
1544                       continue;
1545                     }
1546                   break;
1547                 }
1548
1549               if (raw_prefix[raw_prefix_len] != '(')
1550                 {
1551                   int col = CPP_BUF_COLUMN (pfile->buffer, cur) + 1;
1552                   if (raw_prefix_len == 16)
1553                     cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc,
1554                                          col, "raw string delimiter longer "
1555                                               "than 16 characters");
1556                   else if (raw_prefix[raw_prefix_len] == '\n')
1557                     cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc,
1558                                          col, "invalid new-line in raw "
1559                                               "string delimiter");
1560                   else
1561                     cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc,
1562                                          col, "invalid character '%c' in "
1563                                               "raw string delimiter",
1564                                          (int) raw_prefix[raw_prefix_len]);
1565                   pfile->buffer->cur = orig_base + raw_prefix_start - 1;
1566                   create_literal (pfile, token, orig_base,
1567                                   raw_prefix_start - 1, CPP_OTHER);
1568                   if (first_buff)
1569                     _cpp_release_buff (pfile, first_buff);
1570                   return;
1571                 }
1572               raw_prefix[raw_prefix_len] = '"';
1573               phase = RAW_STR;
1574               /* Nothing should be appended to temp_buffer during
1575                  RAW_STR phase.  */
1576               temp_buffer_len = 17;
1577               break;
1578             }
1579           continue;
1580         }
1581       else if (phase == RAW_STR_SUFFIX)
1582         {
1583           while (raw_suffix_len <= raw_prefix_len
1584                  && raw_suffix_len < temp_buffer_len
1585                  && temp_buffer[raw_suffix_len] == raw_prefix[raw_suffix_len])
1586             raw_suffix_len++;
1587           if (raw_suffix_len > raw_prefix_len)
1588             break;
1589           if (raw_suffix_len == temp_buffer_len)
1590             continue;
1591           phase = RAW_STR;
1592           /* Nothing should be appended to temp_buffer during
1593              RAW_STR phase.  */
1594           temp_buffer_len = 17;
1595         }
1596       if (c == ')')
1597         {
1598           phase = RAW_STR_SUFFIX;
1599           raw_suffix_len = 0;
1600           temp_buffer_len = 0;
1601         }
1602       else if (c == '\n')
1603         {
1604           if (pfile->state.in_directive
1605               || (pfile->state.parsing_args
1606                   && pfile->buffer->next_line >= pfile->buffer->rlimit))
1607             {
1608               cur--;
1609               type = CPP_OTHER;
1610               cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc, 0,
1611                                    "unterminated raw string");
1612               break;
1613             }
1614
1615           BUF_APPEND (base, cur - base);
1616
1617           if (pfile->buffer->cur < pfile->buffer->rlimit)
1618             CPP_INCREMENT_LINE (pfile, 0);
1619           pfile->buffer->need_line = true;
1620
1621           pfile->buffer->cur = cur-1;
1622           _cpp_process_line_notes (pfile, false);
1623           if (!_cpp_get_fresh_line (pfile))
1624             {
1625               source_location src_loc = token->src_loc;
1626               token->type = CPP_EOF;
1627               /* Tell the compiler the line number of the EOF token.  */
1628               token->src_loc = pfile->line_table->highest_line;
1629               token->flags = BOL;
1630               if (first_buff != NULL)
1631                 _cpp_release_buff (pfile, first_buff);
1632               cpp_error_with_line (pfile, CPP_DL_ERROR, src_loc, 0,
1633                                    "unterminated raw string");
1634               return;
1635             }
1636
1637           cur = base = pfile->buffer->cur;
1638           note = &pfile->buffer->notes[pfile->buffer->cur_note];
1639         }
1640     }
1641
1642   if (CPP_OPTION (pfile, user_literals))
1643     {
1644       /* If a string format macro, say from inttypes.h, is placed touching
1645          a string literal it could be parsed as a C++11 user-defined string
1646          literal thus breaking the program.
1647          Try to identify macros with is_macro. A warning is issued. */
1648       if (is_macro (pfile, cur))
1649         {
1650           /* Raise a warning, but do not consume subsequent tokens.  */
1651           if (CPP_OPTION (pfile, warn_literal_suffix) && !pfile->state.skipping)
1652             cpp_warning_with_line (pfile, CPP_W_LITERAL_SUFFIX,
1653                                    token->src_loc, 0,
1654                                    "invalid suffix on literal; C++11 requires "
1655                                    "a space between literal and string macro");
1656         }
1657       /* Grab user defined literal suffix.  */
1658       else if (ISIDST (*cur))
1659         {
1660           type = cpp_userdef_string_add_type (type);
1661           ++cur;
1662
1663           while (ISIDNUM (*cur))
1664             ++cur;
1665         }
1666     }
1667
1668   pfile->buffer->cur = cur;
1669   if (first_buff == NULL)
1670     create_literal (pfile, token, base, cur - base, type);
1671   else
1672     {
1673       uchar *dest = _cpp_unaligned_alloc (pfile, total_len + (cur - base) + 1);
1674
1675       token->type = type;
1676       token->val.str.len = total_len + (cur - base);
1677       token->val.str.text = dest;
1678       last_buff = first_buff;
1679       while (last_buff != NULL)
1680         {
1681           memcpy (dest, last_buff->base,
1682                   BUFF_FRONT (last_buff) - last_buff->base);
1683           dest += BUFF_FRONT (last_buff) - last_buff->base;
1684           last_buff = last_buff->next;
1685         }
1686       _cpp_release_buff (pfile, first_buff);
1687       memcpy (dest, base, cur - base);
1688       dest[cur - base] = '\0';
1689     }
1690 }
1691
1692 /* Lexes a string, character constant, or angle-bracketed header file
1693    name.  The stored string contains the spelling, including opening
1694    quote and any leading 'L', 'u', 'U' or 'u8' and optional
1695    'R' modifier.  It returns the type of the literal, or CPP_OTHER
1696    if it was not properly terminated, or CPP_LESS for an unterminated
1697    header name which must be relexed as normal tokens.
1698
1699    The spelling is NUL-terminated, but it is not guaranteed that this
1700    is the first NUL since embedded NULs are preserved.  */
1701 static void
1702 lex_string (cpp_reader *pfile, cpp_token *token, const uchar *base)
1703 {
1704   bool saw_NUL = false;
1705   const uchar *cur;
1706   cppchar_t terminator;
1707   enum cpp_ttype type;
1708
1709   cur = base;
1710   terminator = *cur++;
1711   if (terminator == 'L' || terminator == 'U')
1712     terminator = *cur++;
1713   else if (terminator == 'u')
1714     {
1715       terminator = *cur++;
1716       if (terminator == '8')
1717         terminator = *cur++;
1718     }
1719   if (terminator == 'R')
1720     {
1721       lex_raw_string (pfile, token, base, cur);
1722       return;
1723     }
1724   if (terminator == '"')
1725     type = (*base == 'L' ? CPP_WSTRING :
1726             *base == 'U' ? CPP_STRING32 :
1727             *base == 'u' ? (base[1] == '8' ? CPP_UTF8STRING : CPP_STRING16)
1728                          : CPP_STRING);
1729   else if (terminator == '\'')
1730     type = (*base == 'L' ? CPP_WCHAR :
1731             *base == 'U' ? CPP_CHAR32 :
1732             *base == 'u' ? CPP_CHAR16 : CPP_CHAR);
1733   else
1734     terminator = '>', type = CPP_HEADER_NAME;
1735
1736   for (;;)
1737     {
1738       cppchar_t c = *cur++;
1739
1740       /* In #include-style directives, terminators are not escapable.  */
1741       if (c == '\\' && !pfile->state.angled_headers && *cur != '\n')
1742         cur++;
1743       else if (c == terminator)
1744         break;
1745       else if (c == '\n')
1746         {
1747           cur--;
1748           /* Unmatched quotes always yield undefined behavior, but
1749              greedy lexing means that what appears to be an unterminated
1750              header name may actually be a legitimate sequence of tokens.  */
1751           if (terminator == '>')
1752             {
1753               token->type = CPP_LESS;
1754               return;
1755             }
1756           type = CPP_OTHER;
1757           break;
1758         }
1759       else if (c == '\0')
1760         saw_NUL = true;
1761     }
1762
1763   if (saw_NUL && !pfile->state.skipping)
1764     cpp_error (pfile, CPP_DL_WARNING,
1765                "null character(s) preserved in literal");
1766
1767   if (type == CPP_OTHER && CPP_OPTION (pfile, lang) != CLK_ASM)
1768     cpp_error (pfile, CPP_DL_PEDWARN, "missing terminating %c character",
1769                (int) terminator);
1770
1771   if (CPP_OPTION (pfile, user_literals))
1772     {
1773       /* If a string format macro, say from inttypes.h, is placed touching
1774          a string literal it could be parsed as a C++11 user-defined string
1775          literal thus breaking the program.
1776          Try to identify macros with is_macro. A warning is issued. */
1777       if (is_macro (pfile, cur))
1778         {
1779           /* Raise a warning, but do not consume subsequent tokens.  */
1780           if (CPP_OPTION (pfile, warn_literal_suffix) && !pfile->state.skipping)
1781             cpp_warning_with_line (pfile, CPP_W_LITERAL_SUFFIX,
1782                                    token->src_loc, 0,
1783                                    "invalid suffix on literal; C++11 requires "
1784                                    "a space between literal and string macro");
1785         }
1786       /* Grab user defined literal suffix.  */
1787       else if (ISIDST (*cur))
1788         {
1789           type = cpp_userdef_char_add_type (type);
1790           type = cpp_userdef_string_add_type (type);
1791           ++cur;
1792
1793           while (ISIDNUM (*cur))
1794             ++cur;
1795         }
1796     }
1797
1798   pfile->buffer->cur = cur;
1799   create_literal (pfile, token, base, cur - base, type);
1800 }
1801
1802 /* Return the comment table. The client may not make any assumption
1803    about the ordering of the table.  */
1804 cpp_comment_table *
1805 cpp_get_comments (cpp_reader *pfile)
1806 {
1807   return &pfile->comments;
1808 }
1809
1810 /* Append a comment to the end of the comment table. */
1811 static void
1812 store_comment (cpp_reader *pfile, cpp_token *token)
1813 {
1814   int len;
1815
1816   if (pfile->comments.allocated == 0)
1817     {
1818       pfile->comments.allocated = 256;
1819       pfile->comments.entries = (cpp_comment *) xmalloc
1820         (pfile->comments.allocated * sizeof (cpp_comment));
1821     }
1822
1823   if (pfile->comments.count == pfile->comments.allocated)
1824     {
1825       pfile->comments.allocated *= 2;
1826       pfile->comments.entries = (cpp_comment *) xrealloc
1827         (pfile->comments.entries,
1828          pfile->comments.allocated * sizeof (cpp_comment));
1829     }
1830
1831   len = token->val.str.len;
1832
1833   /* Copy comment. Note, token may not be NULL terminated. */
1834   pfile->comments.entries[pfile->comments.count].comment =
1835     (char *) xmalloc (sizeof (char) * (len + 1));
1836   memcpy (pfile->comments.entries[pfile->comments.count].comment,
1837           token->val.str.text, len);
1838   pfile->comments.entries[pfile->comments.count].comment[len] = '\0';
1839
1840   /* Set source location. */
1841   pfile->comments.entries[pfile->comments.count].sloc = token->src_loc;
1842
1843   /* Increment the count of entries in the comment table. */
1844   pfile->comments.count++;
1845 }
1846
1847 /* The stored comment includes the comment start and any terminator.  */
1848 static void
1849 save_comment (cpp_reader *pfile, cpp_token *token, const unsigned char *from,
1850               cppchar_t type)
1851 {
1852   unsigned char *buffer;
1853   unsigned int len, clen, i;
1854
1855   len = pfile->buffer->cur - from + 1; /* + 1 for the initial '/'.  */
1856
1857   /* C++ comments probably (not definitely) have moved past a new
1858      line, which we don't want to save in the comment.  */
1859   if (is_vspace (pfile->buffer->cur[-1]))
1860     len--;
1861
1862   /* If we are currently in a directive or in argument parsing, then
1863      we need to store all C++ comments as C comments internally, and
1864      so we need to allocate a little extra space in that case.
1865
1866      Note that the only time we encounter a directive here is
1867      when we are saving comments in a "#define".  */
1868   clen = ((pfile->state.in_directive || pfile->state.parsing_args)
1869           && type == '/') ? len + 2 : len;
1870
1871   buffer = _cpp_unaligned_alloc (pfile, clen);
1872
1873   token->type = CPP_COMMENT;
1874   token->val.str.len = clen;
1875   token->val.str.text = buffer;
1876
1877   buffer[0] = '/';
1878   memcpy (buffer + 1, from, len - 1);
1879
1880   /* Finish conversion to a C comment, if necessary.  */
1881   if ((pfile->state.in_directive || pfile->state.parsing_args) && type == '/')
1882     {
1883       buffer[1] = '*';
1884       buffer[clen - 2] = '*';
1885       buffer[clen - 1] = '/';
1886       /* As there can be in a C++ comments illegal sequences for C comments
1887          we need to filter them out.  */
1888       for (i = 2; i < (clen - 2); i++)
1889         if (buffer[i] == '/' && (buffer[i - 1] == '*' || buffer[i + 1] == '*'))
1890           buffer[i] = '|';
1891     }
1892
1893   /* Finally store this comment for use by clients of libcpp. */
1894   store_comment (pfile, token);
1895 }
1896
1897 /* Allocate COUNT tokens for RUN.  */
1898 void
1899 _cpp_init_tokenrun (tokenrun *run, unsigned int count)
1900 {
1901   run->base = XNEWVEC (cpp_token, count);
1902   run->limit = run->base + count;
1903   run->next = NULL;
1904 }
1905
1906 /* Returns the next tokenrun, or creates one if there is none.  */
1907 static tokenrun *
1908 next_tokenrun (tokenrun *run)
1909 {
1910   if (run->next == NULL)
1911     {
1912       run->next = XNEW (tokenrun);
1913       run->next->prev = run;
1914       _cpp_init_tokenrun (run->next, 250);
1915     }
1916
1917   return run->next;
1918 }
1919
1920 /* Return the number of not yet processed token in a given
1921    context.  */
1922 int
1923 _cpp_remaining_tokens_num_in_context (cpp_context *context)
1924 {
1925   if (context->tokens_kind == TOKENS_KIND_DIRECT)
1926     return (LAST (context).token - FIRST (context).token);
1927   else if (context->tokens_kind == TOKENS_KIND_INDIRECT
1928            || context->tokens_kind == TOKENS_KIND_EXTENDED)
1929     return (LAST (context).ptoken - FIRST (context).ptoken);
1930   else
1931       abort ();
1932 }
1933
1934 /* Returns the token present at index INDEX in a given context.  If
1935    INDEX is zero, the next token to be processed is returned.  */
1936 static const cpp_token*
1937 _cpp_token_from_context_at (cpp_context *context, int index)
1938 {
1939   if (context->tokens_kind == TOKENS_KIND_DIRECT)
1940     return &(FIRST (context).token[index]);
1941   else if (context->tokens_kind == TOKENS_KIND_INDIRECT
1942            || context->tokens_kind == TOKENS_KIND_EXTENDED)
1943     return FIRST (context).ptoken[index];
1944  else
1945    abort ();
1946 }
1947
1948 /* Look ahead in the input stream.  */
1949 const cpp_token *
1950 cpp_peek_token (cpp_reader *pfile, int index)
1951 {
1952   cpp_context *context = pfile->context;
1953   const cpp_token *peektok;
1954   int count;
1955
1956   /* First, scan through any pending cpp_context objects.  */
1957   while (context->prev)
1958     {
1959       ptrdiff_t sz = _cpp_remaining_tokens_num_in_context (context);
1960
1961       if (index < (int) sz)
1962         return _cpp_token_from_context_at (context, index);
1963       index -= (int) sz;
1964       context = context->prev;
1965     }
1966
1967   /* We will have to read some new tokens after all (and do so
1968      without invalidating preceding tokens).  */
1969   count = index;
1970   pfile->keep_tokens++;
1971
1972   do
1973     {
1974       peektok = _cpp_lex_token (pfile);
1975       if (peektok->type == CPP_EOF)
1976         return peektok;
1977     }
1978   while (index--);
1979
1980   _cpp_backup_tokens_direct (pfile, count + 1);
1981   pfile->keep_tokens--;
1982
1983   return peektok;
1984 }
1985
/* Allocate a single token that is invalidated at the same time as the
   rest of the tokens on the line.  Has its line and col set to the
   same as the last lexed token, so that diagnostics appear in the
   right place.

   The new token occupies the slot at pfile->cur_token, which is then
   advanced past it.  If lookahead tokens are pending (they are re-read
   from pfile->cur_token onwards), they are first moved up by one slot
   so that they are not overwritten, spilling into the following token
   run when the current one is too short.  Returns the new token.  */
cpp_token *
_cpp_temp_token (cpp_reader *pfile)
{
  cpp_token *old, *result;
  /* Number of slots from cur_token to the end of the current run.  */
  ptrdiff_t sz = pfile->cur_run->limit - pfile->cur_token;
  /* Number of pending lookahead tokens.  */
  ptrdiff_t la = (ptrdiff_t) pfile->lookaheads;

  /* The token just before the insertion point supplies the source
     location of the new token.  */
  old = pfile->cur_token - 1;
  /* Any pre-existing lookaheads must not be clobbered.  */
  if (la)
    {
      if (sz <= la)
        {
          /* After shifting by one, the lookaheads no longer fit in
             what is left of this run; the next run is needed.  */
          tokenrun *next = next_tokenrun (pfile->cur_run);

          /* Shift the first LA - SZ tokens of the next run up by one;
             presumably these are the lookaheads that had already
             spilled over into it.  */
          if (sz < la)
            memmove (next->base + 1, next->base,
                     (la - sz) * sizeof (cpp_token));

          /* The last token of this run is copied to the front of the
             next one.  */
          next->base[0] = pfile->cur_run->limit[-1];
        }

      /* Shift the lookaheads that stay within this run up by one.  At
         most SZ - 1 tokens can move without running past the limit.  */
      if (sz > 1)
        memmove (pfile->cur_token + 1, pfile->cur_token,
                 MIN (la, sz - 1) * sizeof (cpp_token));
    }

  /* No room left in this run: the new token goes at the start of the
     next one.  */
  if (!sz && pfile->cur_token == pfile->cur_run->limit)
    {
      pfile->cur_run = next_tokenrun (pfile->cur_run);
      pfile->cur_token = pfile->cur_run->base;
    }

  result = pfile->cur_token++;
  result->src_loc = old->src_loc;
  return result;
}
2027
/* Lex a token and return a pointer to it (external interface).  Takes
   care of issues like directive handling, token lookahead, multiple
   include optimization and skipping.

   A token that was backed up earlier (pfile->lookaheads) is handed out
   again before anything new is lexed with _cpp_lex_direct.  The loop
   repeats while tokens are being skipped, or when a directive left
   only a CPP_PADDING result.  The returned pointer refers either to a
   slot of the current token run or to pfile->directive_result.  */
const cpp_token *
_cpp_lex_token (cpp_reader *pfile)
{
  cpp_token *result;

  for (;;)
    {
      /* Move on to the next token run when the current one is full.  */
      if (pfile->cur_token == pfile->cur_run->limit)
        {
          pfile->cur_run = next_tokenrun (pfile->cur_run);
          pfile->cur_token = pfile->cur_run->base;
        }
      /* We assume that the current token is somewhere in the current
         run.  */
      if (pfile->cur_token < pfile->cur_run->base
          || pfile->cur_token >= pfile->cur_run->limit)
        abort ();

      if (pfile->lookaheads)
        {
          /* Re-read a token that was lexed before and backed up.  */
          pfile->lookaheads--;
          result = pfile->cur_token++;
        }
      else
        result = _cpp_lex_direct (pfile);

      /* BOL marks the first token of a line; that is where directives
         are recognized and line changes are reported.  */
      if (result->flags & BOL)
        {
          /* Is this a directive.  If _cpp_handle_directive returns
             false, it is an assembler #.  */
          if (result->type == CPP_HASH
              /* 6.10.3 p 11: Directives in a list of macro arguments
                 gives undefined behavior.  This implementation
                 handles the directive as normal.  */
              && pfile->state.parsing_args != 1)
            {
              if (_cpp_handle_directive (pfile, result->flags & PREV_WHITE))
                {
                  /* A CPP_PADDING result means the directive produced
                     nothing to hand out; lex the next token.  */
                  if (pfile->directive_result.type == CPP_PADDING)
                    continue;
                  result = &pfile->directive_result;
                }
            }
          else if (pfile->state.in_deferred_pragma)
            result = &pfile->directive_result;

          /* Notify the client of the new line, unless skipping.  */
          if (pfile->cb.line_change && !pfile->state.skipping)
            pfile->cb.line_change (pfile, result, pfile->state.parsing_args);
        }

      /* We don't skip tokens in directives.  */
      if (pfile->state.in_directive || pfile->state.in_deferred_pragma)
        break;

      /* Outside a directive, invalidate controlling macros.  At file
         EOF, _cpp_lex_direct takes care of popping the buffer, so we never
         get here and MI optimization works.  */
      pfile->mi_valid = false;

      /* While skipping, only CPP_EOF is passed on to the caller.  */
      if (!pfile->state.skipping || result->type == CPP_EOF)
        break;
    }

  return result;
}
2096
2097 /* Returns true if a fresh line has been loaded.  */
2098 bool
2099 _cpp_get_fresh_line (cpp_reader *pfile)
2100 {
2101   int return_at_eof;
2102
2103   /* We can't get a new line until we leave the current directive.  */
2104   if (pfile->state.in_directive)
2105     return false;
2106
2107   for (;;)
2108     {
2109       cpp_buffer *buffer = pfile->buffer;
2110
2111       if (!buffer->need_line)
2112         return true;
2113
2114       if (buffer->next_line < buffer->rlimit)
2115         {
2116           _cpp_clean_line (pfile);
2117           return true;
2118         }
2119
2120       /* First, get out of parsing arguments state.  */
2121       if (pfile->state.parsing_args)
2122         return false;
2123
2124       /* End of buffer.  Non-empty files should end in a newline.  */
2125       if (buffer->buf != buffer->rlimit
2126           && buffer->next_line > buffer->rlimit
2127           && !buffer->from_stage3)
2128         {
2129           /* Clip to buffer size.  */
2130           buffer->next_line = buffer->rlimit;
2131         }
2132
2133       return_at_eof = buffer->return_at_eof;
2134       _cpp_pop_buffer (pfile);
2135       if (pfile->buffer == NULL || return_at_eof)
2136         return false;
2137     }
2138 }
2139
/* Helper for two-character operators.  If the next input character is
   CHAR, consume it and set the token type to THEN_TYPE; otherwise
   leave the input alone and set the type to ELSE_TYPE.  Expects
   `buffer' and `result' to be in scope at the point of use.  */
#define IF_NEXT_IS(CHAR, THEN_TYPE, ELSE_TYPE)          \
  do                                                    \
    {                                                   \
      if (*buffer->cur == CHAR)                         \
        {                                               \
          buffer->cur++;                                \
          result->type = THEN_TYPE;                     \
        }                                               \
      else                                              \
        result->type = ELSE_TYPE;                       \
    }                                                   \
  while (0)
2148
/* Lex a token into pfile->cur_token, which is also incremented, to
   get diagnostics pointing to the correct location.

   Does not handle issues such as token lookahead, multiple-include
   optimization, directives, skipping etc.  This function is only
   suitable for use by _cpp_lex_token, and in special cases like
   lex_expansion_token which doesn't care for any of these issues.

   When meeting a newline, returns CPP_EOF if parsing a directive,
   otherwise returns to the start of the token buffer if permissible.
   Returns a pointer to the lexed token.  This is not necessarily the
   slot claimed on entry: at the start of a fresh line the token run
   is rewound to the base run unless pfile->keep_tokens is set.  */
cpp_token *
_cpp_lex_direct (cpp_reader *pfile)
{
  cppchar_t c;
  cpp_buffer *buffer;
  const unsigned char *comment_start;
  /* Claim the next token slot.  */
  cpp_token *result = pfile->cur_token++;

  /* Entered on the first pass and again after every newline.  */
 fresh_line:
  result->flags = 0;
  buffer = pfile->buffer;
  if (buffer->need_line)
    {
      /* The end of the line closes a deferred pragma.  */
      if (pfile->state.in_deferred_pragma)
        {
          result->type = CPP_PRAGMA_EOL;
          pfile->state.in_deferred_pragma = false;
          if (!pfile->state.pragma_allow_expansion)
            pfile->state.prevent_expansion--;
          return result;
        }
      /* No further line available: report CPP_EOF.  */
      if (!_cpp_get_fresh_line (pfile))
        {
          result->type = CPP_EOF;
          if (!pfile->state.in_directive)
            {
              /* Tell the compiler the line number of the EOF token.  */
              result->src_loc = pfile->line_table->highest_line;
              result->flags = BOL;
            }
          return result;
        }
      /* Unless earlier tokens must be kept, start over at the
         beginning of the base token run.  */
      if (!pfile->keep_tokens)
        {
          pfile->cur_run = &pfile->base_run;
          result = pfile->base_run.base;
          pfile->cur_token = result + 1;
        }
      /* This token is the first one on its line.  */
      result->flags = BOL;
      if (pfile->state.parsing_args == 2)
        result->flags |= PREV_WHITE;
    }
  /* Reload: _cpp_get_fresh_line may have popped the old buffer.  */
  buffer = pfile->buffer;
  /* Also re-entered after a comment that is not saved.  */
 update_tokens_line:
  result->src_loc = pfile->line_table->highest_line;

  /* Re-entered after a run of horizontal whitespace.  */
 skipped_white:
  /* Handle any line notes recorded at or before the current
     position.  */
  if (buffer->cur >= buffer->notes[buffer->cur_note].pos
      && !pfile->overlaid_buffer)
    {
      _cpp_process_line_notes (pfile, false);
      result->src_loc = pfile->line_table->highest_line;
    }
  c = *buffer->cur++;

  /* A forced location overrides the position computed from the
     buffer column.  */
  if (pfile->forced_token_location_p)
    result->src_loc = *pfile->forced_token_location_p;
  else
    result->src_loc = linemap_position_for_column (pfile->line_table,
                                          CPP_BUF_COLUMN (buffer, buffer->cur));

  /* Dispatch on the first character of the token.  */
  switch (c)
    {
    case ' ': case '\t': case '\f': case '\v': case '\0':
      result->flags |= PREV_WHITE;
      skip_whitespace (pfile, c);
      goto skipped_white;

    case '\n':
      if (buffer->cur < buffer->rlimit)
        CPP_INCREMENT_LINE (pfile, 0);
      buffer->need_line = true;
      goto fresh_line;

    case '0': case '1': case '2': case '3': case '4':
    case '5': case '6': case '7': case '8': case '9':
      {
        struct normalize_state nst = INITIAL_NORMALIZE_STATE;
        result->type = CPP_NUMBER;
        lex_number (pfile, &result->val.str, &nst);
        warn_about_normalization (pfile, result, &nst);
        break;
      }

    case 'L':
    case 'u':
    case 'U':
    case 'R':
      /* 'L', 'u', 'U', 'u8' or 'R' may introduce wide characters,
         wide strings or raw strings.  */
      if (c == 'L' || CPP_OPTION (pfile, rliterals)
          || (c != 'R' && CPP_OPTION (pfile, uliterals)))
        {
          /* Accepted continuations: a quote (a single quote not
             after 'R'), R" after a prefix other than 'R', and 8" or
             8R" after 'u'.  The raw forms require rliterals.  */
          if ((*buffer->cur == '\'' && c != 'R')
              || *buffer->cur == '"'
              || (*buffer->cur == 'R'
                  && c != 'R'
                  && buffer->cur[1] == '"'
                  && CPP_OPTION (pfile, rliterals))
              || (*buffer->cur == '8'
                  && c == 'u'
                  && (buffer->cur[1] == '"'
                      || (buffer->cur[1] == 'R' && buffer->cur[2] == '"'
                          && CPP_OPTION (pfile, rliterals)))))
            {
              lex_string (pfile, result, buffer->cur - 1);
              break;
            }
        }
      /* Fall through.  */

    case '_':
    case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
    case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
    case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
    case 's': case 't':           case 'v': case 'w': case 'x':
    case 'y': case 'z':
    case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
    case 'G': case 'H': case 'I': case 'J': case 'K':
    case 'M': case 'N': case 'O': case 'P': case 'Q':
    case 'S': case 'T':           case 'V': case 'W': case 'X':
    case 'Y': case 'Z':
      result->type = CPP_NAME;
      {
        struct normalize_state nst = INITIAL_NORMALIZE_STATE;
        result->val.node.node = lex_identifier (pfile, buffer->cur - 1, false,
                                                &nst);
        warn_about_normalization (pfile, result, &nst);
      }

      /* Convert named operators to their proper types.  */
      if (result->val.node.node->flags & NODE_OPERATOR)
        {
          result->flags |= NAMED_OP;
          result->type = (enum cpp_ttype) result->val.node.node->directive_index;
        }
      break;

    case '\'':
    case '"':
      lex_string (pfile, result, buffer->cur - 1);
      break;

    case '/':
      /* A potential block or line comment.  */
      comment_start = buffer->cur;
      c = *buffer->cur;

      if (c == '*')
        {
          if (_cpp_skip_block_comment (pfile))
            cpp_error (pfile, CPP_DL_ERROR, "unterminated comment");
        }
      else if (c == '/' && (CPP_OPTION (pfile, cplusplus_comments)
                            || cpp_in_system_header (pfile)))
        {
          /* Warn about comments only if pedantically GNUC89, and not
             in system headers.  */
          if (CPP_OPTION (pfile, lang) == CLK_GNUC89 && CPP_PEDANTIC (pfile)
              && ! buffer->warned_cplusplus_comments)
            {
              cpp_error (pfile, CPP_DL_PEDWARN,
                         "C++ style comments are not allowed in ISO C90");
              cpp_error (pfile, CPP_DL_PEDWARN,
                         "(this will be reported only once per input file)");
              buffer->warned_cplusplus_comments = 1;
            }

          if (skip_line_comment (pfile) && CPP_OPTION (pfile, warn_comments))
            cpp_warning (pfile, CPP_W_COMMENTS, "multi-line comment");
        }
      else if (c == '=')
        {
          buffer->cur++;
          result->type = CPP_DIV_EQ;
          break;
        }
      else
        {
          result->type = CPP_DIV;
          break;
        }

      /* Only the two comment branches reach this point.  A comment
         that is not saved counts as whitespace before the next
         token.  */
      if (!pfile->state.save_comments)
        {
          result->flags |= PREV_WHITE;
          goto update_tokens_line;
        }

      /* Save the comment as a token in its own right.  C still holds
         '*' or '/' and tells save_comment which kind it is.  */
      save_comment (pfile, result, comment_start, c);
      break;

    case '<':
      /* Where a header name is expected, try that first.  lex_string
         sets CPP_LESS when the name is unterminated, and the '<' is
         then lexed as an ordinary operator below.  */
      if (pfile->state.angled_headers)
        {
          lex_string (pfile, result, buffer->cur - 1);
          if (result->type != CPP_LESS)
            break;
        }

      result->type = CPP_LESS;
      if (*buffer->cur == '=')
        buffer->cur++, result->type = CPP_LESS_EQ;
      else if (*buffer->cur == '<')
        {
          buffer->cur++;
          IF_NEXT_IS ('=', CPP_LSHIFT_EQ, CPP_LSHIFT);
        }
      else if (CPP_OPTION (pfile, digraphs))
        {
          if (*buffer->cur == ':')
            {
              /* C++11 [2.5/3 lex.pptoken], "Otherwise, if the next
                 three characters are <:: and the subsequent character
                 is neither : nor >, the < is treated as a preprocessor
                 token by itself".  */
              if (CPP_OPTION (pfile, cplusplus)
                  && CPP_OPTION (pfile, lang) != CLK_CXX98
                  && CPP_OPTION (pfile, lang) != CLK_GNUCXX
                  && buffer->cur[1] == ':'
                  && buffer->cur[2] != ':' && buffer->cur[2] != '>')
                break;

              buffer->cur++;
              result->flags |= DIGRAPH;
              result->type = CPP_OPEN_SQUARE;
            }
          else if (*buffer->cur == '%')
            {
              buffer->cur++;
              result->flags |= DIGRAPH;
              result->type = CPP_OPEN_BRACE;
            }
        }
      break;

    case '>':
      result->type = CPP_GREATER;
      if (*buffer->cur == '=')
        buffer->cur++, result->type = CPP_GREATER_EQ;
      else if (*buffer->cur == '>')
        {
          buffer->cur++;
          IF_NEXT_IS ('=', CPP_RSHIFT_EQ, CPP_RSHIFT);
        }
      break;

    case '%':
      result->type = CPP_MOD;
      if (*buffer->cur == '=')
        buffer->cur++, result->type = CPP_MOD_EQ;
      else if (CPP_OPTION (pfile, digraphs))
        {
          if (*buffer->cur == ':')
            {
              /* The digraph %: spells '#', and %:%: spells '##'.  */
              buffer->cur++;
              result->flags |= DIGRAPH;
              result->type = CPP_HASH;
              if (*buffer->cur == '%' && buffer->cur[1] == ':')
                buffer->cur += 2, result->type = CPP_PASTE, result->val.token_no = 0;
            }
          else if (*buffer->cur == '>')
            {
              buffer->cur++;
              result->flags |= DIGRAPH;
              result->type = CPP_CLOSE_BRACE;
            }
        }
      break;

    case '.':
      result->type = CPP_DOT;
      /* A dot followed by a digit starts a number.  */
      if (ISDIGIT (*buffer->cur))
        {
          struct normalize_state nst = INITIAL_NORMALIZE_STATE;
          result->type = CPP_NUMBER;
          lex_number (pfile, &result->val.str, &nst);
          warn_about_normalization (pfile, result, &nst);
        }
      else if (*buffer->cur == '.' && buffer->cur[1] == '.')
        buffer->cur += 2, result->type = CPP_ELLIPSIS;
      else if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
        buffer->cur++, result->type = CPP_DOT_STAR;
      break;

    case '+':
      result->type = CPP_PLUS;
      if (*buffer->cur == '+')
        buffer->cur++, result->type = CPP_PLUS_PLUS;
      else if (*buffer->cur == '=')
        buffer->cur++, result->type = CPP_PLUS_EQ;
      break;

    case '-':
      result->type = CPP_MINUS;
      if (*buffer->cur == '>')
        {
          buffer->cur++;
          result->type = CPP_DEREF;
          if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
            buffer->cur++, result->type = CPP_DEREF_STAR;
        }
      else if (*buffer->cur == '-')
        buffer->cur++, result->type = CPP_MINUS_MINUS;
      else if (*buffer->cur == '=')
        buffer->cur++, result->type = CPP_MINUS_EQ;
      break;

    case '&':
      result->type = CPP_AND;
      if (*buffer->cur == '&')
        buffer->cur++, result->type = CPP_AND_AND;
      else if (*buffer->cur == '=')
        buffer->cur++, result->type = CPP_AND_EQ;
      break;

    case '|':
      result->type = CPP_OR;
      if (*buffer->cur == '|')
        buffer->cur++, result->type = CPP_OR_OR;
      else if (*buffer->cur == '=')
        buffer->cur++, result->type = CPP_OR_EQ;
      break;

    case ':':
      result->type = CPP_COLON;
      if (*buffer->cur == ':' && CPP_OPTION (pfile, cplusplus))
        buffer->cur++, result->type = CPP_SCOPE;
      else if (*buffer->cur == '>' && CPP_OPTION (pfile, digraphs))
        {
          buffer->cur++;
          result->flags |= DIGRAPH;
          result->type = CPP_CLOSE_SQUARE;
        }
      break;

    case '*': IF_NEXT_IS ('=', CPP_MULT_EQ, CPP_MULT); break;
    case '=': IF_NEXT_IS ('=', CPP_EQ_EQ, CPP_EQ); break;
    case '!': IF_NEXT_IS ('=', CPP_NOT_EQ, CPP_NOT); break;
    case '^': IF_NEXT_IS ('=', CPP_XOR_EQ, CPP_XOR); break;
    case '#': IF_NEXT_IS ('#', CPP_PASTE, CPP_HASH); result->val.token_no = 0; break;

    case '?': result->type = CPP_QUERY; break;
    case '~': result->type = CPP_COMPL; break;
    case ',': result->type = CPP_COMMA; break;
    case '(': result->type = CPP_OPEN_PAREN; break;
    case ')': result->type = CPP_CLOSE_PAREN; break;
    case '[': result->type = CPP_OPEN_SQUARE; break;
    case ']': result->type = CPP_CLOSE_SQUARE; break;
    case '{': result->type = CPP_OPEN_BRACE; break;
    case '}': result->type = CPP_CLOSE_BRACE; break;
    case ';': result->type = CPP_SEMICOLON; break;

      /* @ is a punctuator in Objective-C.  */
    case '@': result->type = CPP_ATSIGN; break;

    case '$':
    case '\\':
      {
        /* Step back onto the character so that forms_identifier_p
           can decide whether it starts an identifier.  */
        const uchar *base = --buffer->cur;
        struct normalize_state nst = INITIAL_NORMALIZE_STATE;

        if (forms_identifier_p (pfile, true, &nst))
          {
            result->type = CPP_NAME;
            result->val.node.node = lex_identifier (pfile, base, true, &nst);
            warn_about_normalization (pfile, result, &nst);
            break;
          }
        /* Not an identifier: consume the character again and fall
           through to emit it as a stray CPP_OTHER token.  */
        buffer->cur++;
      }

    default:
      /* Any other character becomes a one-character CPP_OTHER
         token.  */
      create_literal (pfile, result, buffer->cur - 1, 1, CPP_OTHER);
      break;
    }

  return result;
}
2540
2541 /* An upper bound on the number of bytes needed to spell TOKEN.
2542    Does not include preceding whitespace.  */
2543 unsigned int
2544 cpp_token_len (const cpp_token *token)
2545 {
2546   unsigned int len;
2547
2548   switch (TOKEN_SPELL (token))
2549     {
2550     default:            len = 6;                                break;
2551     case SPELL_LITERAL: len = token->val.str.len;               break;
2552     case SPELL_IDENT:   len = NODE_LEN (token->val.node.node) * 10;     break;
2553     }
2554
2555   return len;
2556 }
2557
/* Decode the UTF-8 sequence that starts at NAME and write it to
   BUFFER as a \U escape with eight lower-case hexadecimal digits.
   Exactly 10 bytes are written to BUFFER and no terminating NUL is
   added.  Returns the number of bytes of NAME that make up the
   sequence, as announced by its first byte.  Aborts if a trailing
   byte is not a UTF-8 continuation byte.  */

static size_t
utf8_to_ucn (unsigned char *buffer, const unsigned char *name)
{
  static const char hex_digits[] = "0123456789abcdef";
  unsigned lead = *name;
  unsigned long code_point;
  int seq_len = 0;
  int k;

  /* The number of leading 1 bits in the first byte gives the length
     of the sequence.  */
  while (lead & 0x80)
    {
      seq_len++;
      lead <<= 1;
    }

  /* The payload bits of the first byte sit below the length marker;
     each following byte contributes six more bits.  */
  code_point = *name & (0x7F >> seq_len);
  for (k = 1; k < seq_len; k++)
    {
      const unsigned char trail = name[k];

      code_point = (code_point << 6) | (trail & 0x3F);

      /* Ill-formed UTF-8.  */
      if ((trail & ~0x3F) != 0x80)
        abort ();
    }

  buffer[0] = '\\';
  buffer[1] = 'U';
  /* Most significant nibble first.  */
  for (k = 0; k < 8; k++)
    buffer[2 + k] = hex_digits[(code_point >> (4 * (7 - k))) & 0xF];

  return seq_len;
}
2591
2592 /* Given a token TYPE corresponding to a digraph, return a pointer to
2593    the spelling of the digraph.  */
2594 static const unsigned char *
2595 cpp_digraph2name (enum cpp_ttype type)
2596 {
2597   return digraph_spellings[(int) type - (int) CPP_FIRST_DIGRAPH];
2598 }
2599
2600 /* Write the spelling of a token TOKEN to BUFFER.  The buffer must
2601    already contain the enough space to hold the token's spelling.
2602    Returns a pointer to the character after the last character written.
2603    FORSTRING is true if this is to be the spelling after translation
2604    phase 1 (this is different for UCNs).
2605    FIXME: Would be nice if we didn't need the PFILE argument.  */
2606 unsigned char *
2607 cpp_spell_token (cpp_reader *pfile, const cpp_token *token,
2608                  unsigned char *buffer, bool forstring)
2609 {
2610   switch (TOKEN_SPELL (token))
2611     {
2612     case SPELL_OPERATOR:
2613       {
2614         const unsigned char *spelling;
2615         unsigned char c;
2616
2617         if (token->flags & DIGRAPH)
2618           spelling = cpp_digraph2name (token->type);
2619         else if (token->flags & NAMED_OP)
2620           goto spell_ident;
2621         else
2622           spelling = TOKEN_NAME (token);
2623
2624         while ((c = *spelling++) != '\0')
2625           *buffer++ = c;
2626       }
2627       break;
2628
2629     spell_ident:
2630     case SPELL_IDENT:
2631       if (forstring)
2632         {
2633           memcpy (buffer, NODE_NAME (token->val.node.node),
2634                   NODE_LEN (token->val.node.node));
2635           buffer += NODE_LEN (token->val.node.node);
2636         }
2637       else
2638         {
2639           size_t i;
2640           const unsigned char * name = NODE_NAME (token->val.node.node);
2641
2642           for (i = 0; i < NODE_LEN (token->val.node.node); i++)
2643             if (name[i] & ~0x7F)
2644               {
2645                 i += utf8_to_ucn (buffer, name + i) - 1;
2646                 buffer += 10;
2647               }
2648             else
2649               *buffer++ = NODE_NAME (token->val.node.node)[i];
2650         }
2651       break;
2652
2653     case SPELL_LITERAL:
2654       memcpy (buffer, token->val.str.text, token->val.str.len);
2655       buffer += token->val.str.len;
2656       break;
2657
2658     case SPELL_NONE:
2659       cpp_error (pfile, CPP_DL_ICE,
2660                  "unspellable token %s", TOKEN_NAME (token));
2661       break;
2662     }
2663
2664   return buffer;
2665 }
2666
2667 /* Returns TOKEN spelt as a null-terminated string.  The string is
2668    freed when the reader is destroyed.  Useful for diagnostics.  */
2669 unsigned char *
2670 cpp_token_as_text (cpp_reader *pfile, const cpp_token *token)
2671 {
2672   unsigned int len = cpp_token_len (token) + 1;
2673   unsigned char *start = _cpp_unaligned_alloc (pfile, len), *end;
2674
2675   end = cpp_spell_token (pfile, token, start, false);
2676   end[0] = '\0';
2677
2678   return start;
2679 }
2680
2681 /* Returns a pointer to a string which spells the token defined by
2682    TYPE and FLAGS.  Used by C front ends, which really should move to
2683    using cpp_token_as_text.  */
2684 const char *
2685 cpp_type2name (enum cpp_ttype type, unsigned char flags)
2686 {
2687   if (flags & DIGRAPH)
2688     return (const char *) cpp_digraph2name (type);
2689   else if (flags & NAMED_OP)
2690     return cpp_named_operator2name (type);
2691
2692   return (const char *) token_spellings[type].name;
2693 }
2694
2695 /* Writes the spelling of token to FP, without any preceding space.
2696    Separated from cpp_spell_token for efficiency - to avoid stdio
2697    double-buffering.  */
2698 void
2699 cpp_output_token (const cpp_token *token, FILE *fp)
2700 {
2701   switch (TOKEN_SPELL (token))
2702     {
2703     case SPELL_OPERATOR:
2704       {
2705         const unsigned char *spelling;
2706         int c;
2707
2708         if (token->flags & DIGRAPH)
2709           spelling = cpp_digraph2name (token->type);
2710         else if (token->flags & NAMED_OP)
2711           goto spell_ident;
2712         else
2713           spelling = TOKEN_NAME (token);
2714
2715         c = *spelling;
2716         do
2717           putc (c, fp);
2718         while ((c = *++spelling) != '\0');
2719       }
2720       break;
2721
2722     spell_ident:
2723     case SPELL_IDENT:
2724       {
2725         size_t i;
2726         const unsigned char * name = NODE_NAME (token->val.node.node);
2727
2728         for (i = 0; i < NODE_LEN (token->val.node.node); i++)
2729           if (name[i] & ~0x7F)
2730             {
2731               unsigned char buffer[10];
2732               i += utf8_to_ucn (buffer, name + i) - 1;
2733               fwrite (buffer, 1, 10, fp);
2734             }
2735           else
2736             fputc (NODE_NAME (token->val.node.node)[i], fp);
2737       }
2738       break;
2739
2740     case SPELL_LITERAL:
2741       fwrite (token->val.str.text, 1, token->val.str.len, fp);
2742       break;
2743
2744     case SPELL_NONE:
2745       /* An error, most probably.  */
2746       break;
2747     }
2748 }
2749
2750 /* Compare two tokens.  */
2751 int
2752 _cpp_equiv_tokens (const cpp_token *a, const cpp_token *b)
2753 {
2754   if (a->type == b->type && a->flags == b->flags)
2755     switch (TOKEN_SPELL (a))
2756       {
2757       default:                  /* Keep compiler happy.  */
2758       case SPELL_OPERATOR:
2759         /* token_no is used to track where multiple consecutive ##
2760            tokens were originally located.  */
2761         return (a->type != CPP_PASTE || a->val.token_no == b->val.token_no);
2762       case SPELL_NONE:
2763         return (a->type != CPP_MACRO_ARG
2764                 || a->val.macro_arg.arg_no == b->val.macro_arg.arg_no);
2765       case SPELL_IDENT:
2766         return a->val.node.node == b->val.node.node;
2767       case SPELL_LITERAL:
2768         return (a->val.str.len == b->val.str.len
2769                 && !memcmp (a->val.str.text, b->val.str.text,
2770                             a->val.str.len));
2771       }
2772
2773   return 0;
2774 }
2775
2776 /* Returns nonzero if a space should be inserted to avoid an
2777    accidental token paste for output.  For simplicity, it is
2778    conservative, and occasionally advises a space where one is not
2779    needed, e.g. "." and ".2".  */
2780 int
2781 cpp_avoid_paste (cpp_reader *pfile, const cpp_token *token1,
2782                  const cpp_token *token2)
2783 {
2784   enum cpp_ttype a = token1->type, b = token2->type;
2785   cppchar_t c;
2786
2787   if (token1->flags & NAMED_OP)
2788     a = CPP_NAME;
2789   if (token2->flags & NAMED_OP)
2790     b = CPP_NAME;
2791
2792   c = EOF;
2793   if (token2->flags & DIGRAPH)
2794     c = digraph_spellings[(int) b - (int) CPP_FIRST_DIGRAPH][0];
2795   else if (token_spellings[b].category == SPELL_OPERATOR)
2796     c = token_spellings[b].name[0];
2797
2798   /* Quickly get everything that can paste with an '='.  */
2799   if ((int) a <= (int) CPP_LAST_EQ && c == '=')
2800     return 1;
2801
2802   switch (a)
2803     {
2804     case CPP_GREATER:   return c == '>';
2805     case CPP_LESS:      return c == '<' || c == '%' || c == ':';
2806     case CPP_PLUS:      return c == '+';
2807     case CPP_MINUS:     return c == '-' || c == '>';
2808     case CPP_DIV:       return c == '/' || c == '*'; /* Comments.  */
2809     case CPP_MOD:       return c == ':' || c == '>';
2810     case CPP_AND:       return c == '&';
2811     case CPP_OR:        return c == '|';
2812     case CPP_COLON:     return c == ':' || c == '>';
2813     case CPP_DEREF:     return c == '*';
2814     case CPP_DOT:       return c == '.' || c == '%' || b == CPP_NUMBER;
2815     case CPP_HASH:      return c == '#' || c == '%'; /* Digraph form.  */
2816     case CPP_NAME:      return ((b == CPP_NUMBER
2817                                  && name_p (pfile, &token2->val.str))
2818                                 || b == CPP_NAME
2819                                 || b == CPP_CHAR || b == CPP_STRING); /* L */
2820     case CPP_NUMBER:    return (b == CPP_NUMBER || b == CPP_NAME
2821                                 || c == '.' || c == '+' || c == '-');
2822                                       /* UCNs */
2823     case CPP_OTHER:     return ((token1->val.str.text[0] == '\\'
2824                                  && b == CPP_NAME)
2825                                 || (CPP_OPTION (pfile, objc)
2826                                     && token1->val.str.text[0] == '@'
2827                                     && (b == CPP_NAME || b == CPP_STRING)));
2828     case CPP_STRING:
2829     case CPP_WSTRING:
2830     case CPP_UTF8STRING:
2831     case CPP_STRING16:
2832     case CPP_STRING32:  return (CPP_OPTION (pfile, user_literals)
2833                                 && (b == CPP_NAME
2834                                     || (TOKEN_SPELL (token2) == SPELL_LITERAL
2835                                         && ISIDST (token2->val.str.text[0]))));
2836
2837     default:            break;
2838     }
2839
2840   return 0;
2841 }
2842
2843 /* Output all the remaining tokens on the current line, and a newline
2844    character, to FP.  Leading whitespace is removed.  If there are
2845    macros, special token padding is not performed.  */
2846 void
2847 cpp_output_line (cpp_reader *pfile, FILE *fp)
2848 {
2849   const cpp_token *token;
2850
2851   token = cpp_get_token (pfile);
2852   while (token->type != CPP_EOF)
2853     {
2854       cpp_output_token (token, fp);
2855       token = cpp_get_token (pfile);
2856       if (token->flags & PREV_WHITE)
2857         putc (' ', fp);
2858     }
2859
2860   putc ('\n', fp);
2861 }
2862
2863 /* Return a string representation of all the remaining tokens on the
2864    current line.  The result is allocated using xmalloc and must be
2865    freed by the caller.  */
2866 unsigned char *
2867 cpp_output_line_to_string (cpp_reader *pfile, const unsigned char *dir_name)
2868 {
2869   const cpp_token *token;
2870   unsigned int out = dir_name ? ustrlen (dir_name) : 0;
2871   unsigned int alloced = 120 + out;
2872   unsigned char *result = (unsigned char *) xmalloc (alloced);
2873
2874   /* If DIR_NAME is empty, there are no initial contents.  */
2875   if (dir_name)
2876     {
2877       sprintf ((char *) result, "#%s ", dir_name);
2878       out += 2;
2879     }
2880
2881   token = cpp_get_token (pfile);
2882   while (token->type != CPP_EOF)
2883     {
2884       unsigned char *last;
2885       /* Include room for a possible space and the terminating nul.  */
2886       unsigned int len = cpp_token_len (token) + 2;
2887
2888       if (out + len > alloced)
2889         {
2890           alloced *= 2;
2891           if (out + len > alloced)
2892             alloced = out + len;
2893           result = (unsigned char *) xrealloc (result, alloced);
2894         }
2895
2896       last = cpp_spell_token (pfile, token, &result[out], 0);
2897       out = last - result;
2898
2899       token = cpp_get_token (pfile);
2900       if (token->flags & PREV_WHITE)
2901         result[out++] = ' ';
2902     }
2903
2904   result[out] = '\0';
2905   return result;
2906 }
2907
/* Memory buffers.  Changing these three constants can have a dramatic
   effect on performance.  The values here are reasonable defaults,
   but might be tuned.  If you adjust them, be sure to test across a
   range of uses of cpplib, including heavy nested function-like macro
   expansion.  Also check the change in peak memory usage (NJAMD is a
   good tool for this).

   MIN_BUFF_SIZE is the smallest size new_buff will allocate.
   BUFF_SIZE_UPPER_BOUND (MIN_SIZE) is the largest free buffer that
   _cpp_get_buff will hand out for a request of MIN_SIZE bytes.
   EXTENDED_BUFF_SIZE (BUFF, MIN_EXTRA) is MIN_EXTRA plus twice the
   number of bytes between BUFF->cur and BUFF->limit.  Every macro
   argument is parenthesized so that any expression may be passed.  */
#define MIN_BUFF_SIZE 8000
#define BUFF_SIZE_UPPER_BOUND(MIN_SIZE) (MIN_BUFF_SIZE + (MIN_SIZE) * 3 / 2)
#define EXTENDED_BUFF_SIZE(BUFF, MIN_EXTRA) \
        ((MIN_EXTRA) + ((BUFF)->limit - (BUFF)->cur) * 2)

#if MIN_BUFF_SIZE > BUFF_SIZE_UPPER_BOUND (0)
  #error BUFF_SIZE_UPPER_BOUND must be at least as large as MIN_BUFF_SIZE!
#endif
2922
2923 /* Create a new allocation buffer.  Place the control block at the end
2924    of the buffer, so that buffer overflows will cause immediate chaos.  */
2925 static _cpp_buff *
2926 new_buff (size_t len)
2927 {
2928   _cpp_buff *result;
2929   unsigned char *base;
2930
2931   if (len < MIN_BUFF_SIZE)
2932     len = MIN_BUFF_SIZE;
2933   len = CPP_ALIGN (len);
2934
2935 #ifdef ENABLE_VALGRIND_CHECKING
2936   /* Valgrind warns about uses of interior pointers, so put _cpp_buff
2937      struct first.  */
2938   size_t slen = CPP_ALIGN2 (sizeof (_cpp_buff), 2 * DEFAULT_ALIGNMENT);
2939   base = XNEWVEC (unsigned char, len + slen);
2940   result = (_cpp_buff *) base;
2941   base += slen;
2942 #else
2943   base = XNEWVEC (unsigned char, len + sizeof (_cpp_buff));
2944   result = (_cpp_buff *) (base + len);
2945 #endif
2946   result->base = base;
2947   result->cur = base;
2948   result->limit = base + len;
2949   result->next = NULL;
2950   return result;
2951 }
2952
2953 /* Place a chain of unwanted allocation buffers on the free list.  */
2954 void
2955 _cpp_release_buff (cpp_reader *pfile, _cpp_buff *buff)
2956 {
2957   _cpp_buff *end = buff;
2958
2959   while (end->next)
2960     end = end->next;
2961   end->next = pfile->free_buffs;
2962   pfile->free_buffs = buff;
2963 }
2964
2965 /* Return a free buffer of size at least MIN_SIZE.  */
2966 _cpp_buff *
2967 _cpp_get_buff (cpp_reader *pfile, size_t min_size)
2968 {
2969   _cpp_buff *result, **p;
2970
2971   for (p = &pfile->free_buffs;; p = &(*p)->next)
2972     {
2973       size_t size;
2974
2975       if (*p == NULL)
2976         return new_buff (min_size);
2977       result = *p;
2978       size = result->limit - result->base;
2979       /* Return a buffer that's big enough, but don't waste one that's
2980          way too big.  */
2981       if (size >= min_size && size <= BUFF_SIZE_UPPER_BOUND (min_size))
2982         break;
2983     }
2984
2985   *p = result->next;
2986   result->next = NULL;
2987   result->cur = result->base;
2988   return result;
2989 }
2990
2991 /* Creates a new buffer with enough space to hold the uncommitted
2992    remaining bytes of BUFF, and at least MIN_EXTRA more bytes.  Copies
2993    the excess bytes to the new buffer.  Chains the new buffer after
2994    BUFF, and returns the new buffer.  */
2995 _cpp_buff *
2996 _cpp_append_extend_buff (cpp_reader *pfile, _cpp_buff *buff, size_t min_extra)
2997 {
2998   size_t size = EXTENDED_BUFF_SIZE (buff, min_extra);
2999   _cpp_buff *new_buff = _cpp_get_buff (pfile, size);
3000
3001   buff->next = new_buff;
3002   memcpy (new_buff->base, buff->cur, BUFF_ROOM (buff));
3003   return new_buff;
3004 }
3005
3006 /* Creates a new buffer with enough space to hold the uncommitted
3007    remaining bytes of the buffer pointed to by BUFF, and at least
3008    MIN_EXTRA more bytes.  Copies the excess bytes to the new buffer.
3009    Chains the new buffer before the buffer pointed to by BUFF, and
3010    updates the pointer to point to the new buffer.  */
3011 void
3012 _cpp_extend_buff (cpp_reader *pfile, _cpp_buff **pbuff, size_t min_extra)
3013 {
3014   _cpp_buff *new_buff, *old_buff = *pbuff;
3015   size_t size = EXTENDED_BUFF_SIZE (old_buff, min_extra);
3016
3017   new_buff = _cpp_get_buff (pfile, size);
3018   memcpy (new_buff->base, old_buff->cur, BUFF_ROOM (old_buff));
3019   new_buff->next = old_buff;
3020   *pbuff = new_buff;
3021 }
3022
3023 /* Free a chain of buffers starting at BUFF.  */
3024 void
3025 _cpp_free_buff (_cpp_buff *buff)
3026 {
3027   _cpp_buff *next;
3028
3029   for (; buff; buff = next)
3030     {
3031       next = buff->next;
3032 #ifdef ENABLE_VALGRIND_CHECKING
3033       free (buff);
3034 #else
3035       free (buff->base);
3036 #endif
3037     }
3038 }
3039
3040 /* Allocate permanent, unaligned storage of length LEN.  */
3041 unsigned char *
3042 _cpp_unaligned_alloc (cpp_reader *pfile, size_t len)
3043 {
3044   _cpp_buff *buff = pfile->u_buff;
3045   unsigned char *result = buff->cur;
3046
3047   if (len > (size_t) (buff->limit - result))
3048     {
3049       buff = _cpp_get_buff (pfile, len);
3050       buff->next = pfile->u_buff;
3051       pfile->u_buff = buff;
3052       result = buff->cur;
3053     }
3054
3055   buff->cur = result + len;
3056   return result;
3057 }
3058
3059 /* Allocate permanent, unaligned storage of length LEN from a_buff.
3060    That buffer is used for growing allocations when saving macro
3061    replacement lists in a #define, and when parsing an answer to an
3062    assertion in #assert, #unassert or #if (and therefore possibly
3063    whilst expanding macros).  It therefore must not be used by any
3064    code that they might call: specifically the lexer and the guts of
3065    the macro expander.
3066
3067    All existing other uses clearly fit this restriction: storing
3068    registered pragmas during initialization.  */
3069 unsigned char *
3070 _cpp_aligned_alloc (cpp_reader *pfile, size_t len)
3071 {
3072   _cpp_buff *buff = pfile->a_buff;
3073   unsigned char *result = buff->cur;
3074
3075   if (len > (size_t) (buff->limit - result))
3076     {
3077       buff = _cpp_get_buff (pfile, len);
3078       buff->next = pfile->a_buff;
3079       pfile->a_buff = buff;
3080       result = buff->cur;
3081     }
3082
3083   buff->cur = result + len;
3084   return result;
3085 }
3086
3087 /* Say which field of TOK is in use.  */
3088
3089 enum cpp_token_fld_kind
3090 cpp_token_val_index (const cpp_token *tok)
3091 {
3092   switch (TOKEN_SPELL (tok))
3093     {
3094     case SPELL_IDENT:
3095       return CPP_TOKEN_FLD_NODE;
3096     case SPELL_LITERAL:
3097       return CPP_TOKEN_FLD_STR;
3098     case SPELL_OPERATOR:
3099       if (tok->type == CPP_PASTE)
3100         return CPP_TOKEN_FLD_TOKEN_NO;
3101       else
3102         return CPP_TOKEN_FLD_NONE;
3103     case SPELL_NONE:
3104       if (tok->type == CPP_MACRO_ARG)
3105         return CPP_TOKEN_FLD_ARG_NO;
3106       else if (tok->type == CPP_PADDING)
3107         return CPP_TOKEN_FLD_SOURCE;
3108       else if (tok->type == CPP_PRAGMA)
3109         return CPP_TOKEN_FLD_PRAGMA;
3110       /* else fall through */
3111     default:
3112       return CPP_TOKEN_FLD_NONE;
3113     }
3114 }
3115
3116 /* All tokens lexed in R after calling this function will be forced to have
3117    their source_location the same as the location referenced by P, until
3118    cpp_stop_forcing_token_locations is called for R.  */
3119
3120 void
3121 cpp_force_token_locations (cpp_reader *r, source_location *p)
3122 {
3123   r->forced_token_location_p = p;
3124 }
3125
3126 /* Go back to assigning locations naturally for lexed tokens.  */
3127
3128 void
3129 cpp_stop_forcing_token_locations (cpp_reader *r)
3130 {
3131   r->forced_token_location_p = NULL;
3132 }