libcpp/lex.c

   1 /* CPP Library - lexical analysis.
   2    Copyright (C) 2000-2017 Free Software Foundation, Inc.
   3    Contributed by Per Bothner, 1994-95.
   4    Based on CCCP program by Paul Rubin, June 1986
   5    Adapted to ANSI C, Richard Stallman, Jan 1987
   6    Broken out to separate file, Zack Weinberg, Mar 2000
   7
   8 This program is free software; you can redistribute it and/or modify it
   9 under the terms of the GNU General Public License as published by the
  10 Free Software Foundation; either version 3, or (at your option) any
  11 later version.
  12
  13 This program is distributed in the hope that it will be useful,
  14 but WITHOUT ANY WARRANTY; without even the implied warranty of
  15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  16 GNU General Public License for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with this program; see the file COPYING3.  If not see
  20 <http://www.gnu.org/licenses/>.  */
  21
  22 #include "config.h"
  23 #include "system.h"
  24 #include "cpplib.h"
  25 #include "internal.h"
  26
  27 enum spell_type
  28 {
  29   SPELL_OPERATOR = 0,
  30   SPELL_IDENT,
  31   SPELL_LITERAL,
  32   SPELL_NONE
  33 };
  34
/* One row of the token spelling table: the spelling category of a
   token type together with a name string for it.  */
struct token_spelling
{
  enum spell_type category;     /* One of the SPELL_* values.  */
  const unsigned char *name;    /* String supplied by OP or TK below.  */
};

/* The six digraph spellings.
   NOTE(review): the indexing scheme used by callers is not visible
   here -- confirm before reordering.  */
static const unsigned char *const digraph_spellings[] =
{ UC"%:", UC"%:%:", UC"<:", UC":>", UC"<%", UC"%>" };

/* Expand TTYPE_TABLE into one token_spelling initializer per token
   type.  OP entries are always SPELL_OPERATOR and use the string S as
   their name; TK entries take their category from S (pasted onto
   SPELL_) and use the stringified enumerator E as their name.  */
#define OP(e, s) { SPELL_OPERATOR, UC s  },
#define TK(e, s) { SPELL_ ## s,    UC #e },
static const struct token_spelling token_spellings[N_TTYPES] = { TTYPE_TABLE };
#undef OP
#undef TK

/* Look up the spelling category / name string of TOKEN, indexing the
   table by TOKEN->type.  */
#define TOKEN_SPELL(token) (token_spellings[(token)->type].category)
#define TOKEN_NAME(token) (token_spellings[(token)->type].name)
  52
/* Forward declarations of helpers private to this file.  */
static void add_line_note (cpp_buffer *, const uchar *, unsigned int);
static int skip_line_comment (cpp_reader *);
static void skip_whitespace (cpp_reader *, cppchar_t);
static void lex_string (cpp_reader *, cpp_token *, const uchar *);
static void save_comment (cpp_reader *, cpp_token *, const uchar *, cppchar_t);
static void store_comment (cpp_reader *, cpp_token *);
static void create_literal (cpp_reader *, cpp_token *, const uchar *,
                            unsigned int, enum cpp_ttype);
static bool warn_in_comment (cpp_reader *, _cpp_line_note *);
static int name_p (cpp_reader *, const cpp_string *);
static tokenrun *next_tokenrun (tokenrun *);

static _cpp_buff *new_buff (size_t);
  66
  67
  68 /* Utility routine:
  69
  70    Compares, the token TOKEN to the NUL-terminated string STRING.
  71    TOKEN must be a CPP_NAME.  Returns 1 for equal, 0 for unequal.  */
  72 int
  73 cpp_ideq (const cpp_token *token, const char *string)
  74 {
  75   if (token->type != CPP_NAME)
  76     return 0;
  77
  78   return !ustrcmp (NODE_NAME (token->val.node.node), (const uchar *) string);
  79 }
  80
  81 /* Record a note TYPE at byte POS into the current cleaned logical
  82    line.  */
  83 static void
  84 add_line_note (cpp_buffer *buffer, const uchar *pos, unsigned int type)
  85 {
  86   if (buffer->notes_used == buffer->notes_cap)
  87     {
  88       buffer->notes_cap = buffer->notes_cap * 2 + 200;
  89       buffer->notes = XRESIZEVEC (_cpp_line_note, buffer->notes,
  90                                   buffer->notes_cap);
  91     }
  92
  93   buffer->notes[buffer->notes_used].pos = pos;
  94   buffer->notes[buffer->notes_used].type = type;
  95   buffer->notes_used++;
  96 }
  97
  98 \f
  99 /* Fast path to find line special characters using optimized character
 100    scanning algorithms.  Anything complicated falls back to the slow
 101    path below.  Since this loop is very hot it's worth doing these kinds
 102    of optimizations.
 103
 104    One of the paths through the ifdefs should provide
 105
 106      const uchar *search_line_fast (const uchar *s, const uchar *end);
 107
 108    Between S and END, search for \n, \r, \\, ?.  Return a pointer to
 109    the found character.
 110
 111    Note that the last character of the buffer is *always* a newline,
 112    as forced by _cpp_convert_input.  This fact can be used to avoid
 113    explicitly looking for the end of the buffer.  */
 114
 115 /* Configure gives us an ifdef test.  */
 116 #ifndef WORDS_BIGENDIAN
 117 #define WORDS_BIGENDIAN 0
 118 #endif
 119
 120 /* We'd like the largest integer that fits into a register.  There's nothing
 121    in <stdint.h> that gives us that.  For most hosts this is unsigned long,
 122    but MS decided on an LLP64 model.  Thankfully when building with GCC we
 123    can get the "real" word size.  */
 124 #ifdef __GNUC__
 125 typedef unsigned int word_type __attribute__((__mode__(__word__)));
 126 #else
 127 typedef unsigned long word_type;
 128 #endif
 129
 130 /* The code below is only expecting sizes 4 or 8.
 131    Die at compile-time if this expectation is violated.  */
 132 typedef char check_word_type_size
 133   [(sizeof(word_type) == 8 || sizeof(word_type) == 4) * 2 - 1];
 134
 135 /* Return X with the first N bytes forced to values that won't match one
 136    of the interesting characters.  Note that NUL is not interesting.  */
 137
 138 static inline word_type
 139 acc_char_mask_misalign (word_type val, unsigned int n)
 140 {
 141   word_type mask = -1;
 142   if (WORDS_BIGENDIAN)
 143     mask >>= n * 8;
 144   else
 145     mask <<= n * 8;
 146   return val & mask;
 147 }
 148
 149 /* Return X replicated to all byte positions within WORD_TYPE.  */
 150
 151 static inline word_type
 152 acc_char_replicate (uchar x)
 153 {
 154   word_type ret;
 155
 156   ret = (x << 24) | (x << 16) | (x << 8) | x;
 157   if (sizeof(word_type) == 8)
 158     ret = (ret << 16 << 16) | ret;
 159   return ret;
 160 }
 161
 162 /* Return non-zero if some byte of VAL is (probably) C.  */
 163
 164 static inline word_type
 165 acc_char_cmp (word_type val, word_type c)
 166 {
 167 #if defined(__GNUC__) && defined(__alpha__)
 168   /* We can get exact results using a compare-bytes instruction.
 169      Get (val == c) via (0 >= (val ^ c)).  */
 170   return __builtin_alpha_cmpbge (0, val ^ c);
 171 #else
 172   word_type magic = 0x7efefefeU;
 173   if (sizeof(word_type) == 8)
 174     magic = (magic << 16 << 16) | 0xfefefefeU;
 175   magic |= 1;
 176
 177   val ^= c;
 178   return ((val + magic) ^ ~val) & ~magic;
 179 #endif
 180 }
 181
 182 /* Given the result of acc_char_cmp is non-zero, return the index of
 183    the found character.  If this was a false positive, return -1.  */
 184
 185 static inline int
 186 acc_char_index (word_type cmp ATTRIBUTE_UNUSED,
 187                 word_type val ATTRIBUTE_UNUSED)
 188 {
 189 #if defined(__GNUC__) && defined(__alpha__) && !WORDS_BIGENDIAN
 190   /* The cmpbge instruction sets *bits* of the result corresponding to
 191      matches in the bytes with no false positives.  */
 192   return __builtin_ctzl (cmp);
 193 #else
 194   unsigned int i;
 195
 196   /* ??? It would be nice to force unrolling here,
 197      and have all of these constants folded.  */
 198   for (i = 0; i < sizeof(word_type); ++i)
 199     {
 200       uchar c;
 201       if (WORDS_BIGENDIAN)
 202         c = (val >> (sizeof(word_type) - i - 1) * 8) & 0xff;
 203       else
 204         c = (val >> i * 8) & 0xff;
 205
 206       if (c == '\n' || c == '\r' || c == '\\' || c == '?')
 207         return i;
 208     }
 209
 210   return -1;
 211 #endif
 212 }
 213
 214 /* A version of the fast scanner using bit fiddling techniques.
 215
 216    For 32-bit words, one would normally perform 16 comparisons and
 217    16 branches.  With this algorithm one performs 24 arithmetic
 218    operations and one branch.  Whether this is faster with a 32-bit
 219    word size is going to be somewhat system dependent.
 220
 221    For 64-bit words, we eliminate twice the number of comparisons
 222    and branches without increasing the number of arithmetic operations.
 223    It's almost certainly going to be a win with 64-bit word size.  */
 224
 225 static const uchar * search_line_acc_char (const uchar *, const uchar *)
 226   ATTRIBUTE_UNUSED;
 227
 228 static const uchar *
 229 search_line_acc_char (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 230 {
 231   const word_type repl_nl = acc_char_replicate ('\n');
 232   const word_type repl_cr = acc_char_replicate ('\r');
 233   const word_type repl_bs = acc_char_replicate ('\\');
 234   const word_type repl_qm = acc_char_replicate ('?');
 235
 236   unsigned int misalign;
 237   const word_type *p;
 238   word_type val, t;
 239
 240   /* Align the buffer.  Mask out any bytes from before the beginning.  */
 241   p = (word_type *)((uintptr_t)s & -sizeof(word_type));
 242   val = *p;
 243   misalign = (uintptr_t)s & (sizeof(word_type) - 1);
 244   if (misalign)
 245     val = acc_char_mask_misalign (val, misalign);
 246
 247   /* Main loop.  */
 248   while (1)
 249     {
 250       t  = acc_char_cmp (val, repl_nl);
 251       t |= acc_char_cmp (val, repl_cr);
 252       t |= acc_char_cmp (val, repl_bs);
 253       t |= acc_char_cmp (val, repl_qm);
 254
 255       if (__builtin_expect (t != 0, 0))
 256         {
 257           int i = acc_char_index (t, val);
 258           if (i >= 0)
 259             return (const uchar *)p + i;
 260         }
 261
 262       val = *++p;
 263     }
 264 }
 265
 266 /* Disable on Solaris 2/x86 until the following problem can be properly
 267    autoconfed:
 268
 269    The Solaris 10+ assembler tags objects with the instruction set
 270    extensions used, so SSE4.2 executables cannot run on machines that
 271    don't support that extension.  */
 272
 273 #if (GCC_VERSION >= 4005) && (__GNUC__ >= 5 || !defined(__PIC__)) && (defined(__i386__) || defined(__x86_64__)) && !(defined(__sun__) && defined(__svr4__))
 274
 275 /* Replicated character data to be shared between implementations.
 276    Recall that outside of a context with vector support we can't
 277    define compatible vector types, therefore these are all defined
 278    in terms of raw characters.  */
 279 static const char repl_chars[4][16] __attribute__((aligned(16))) = {
 280   { '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
 281     '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n' },
 282   { '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
 283     '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r' },
 284   { '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
 285     '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\' },
 286   { '?', '?', '?', '?', '?', '?', '?', '?',
 287     '?', '?', '?', '?', '?', '?', '?', '?' },
 288 };
 289
 290 /* A version of the fast scanner using MMX vectorized byte compare insns.
 291
 292    This uses the PMOVMSKB instruction which was introduced with "MMX2",
 293    which was packaged into SSE1; it is also present in the AMD MMX
 294    extension.  Mark the function as using "sse" so that we emit a real
 295    "emms" instruction, rather than the 3dNOW "femms" instruction.  */
 296
 297 static const uchar *
 298 #ifndef __SSE__
 299 __attribute__((__target__("sse")))
 300 #endif
 301 search_line_mmx (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 302 {
 303   typedef char v8qi __attribute__ ((__vector_size__ (8)));
 304   typedef int __m64 __attribute__ ((__vector_size__ (8), __may_alias__));
 305
 306   const v8qi repl_nl = *(const v8qi *)repl_chars[0];
 307   const v8qi repl_cr = *(const v8qi *)repl_chars[1];
 308   const v8qi repl_bs = *(const v8qi *)repl_chars[2];
 309   const v8qi repl_qm = *(const v8qi *)repl_chars[3];
 310
 311   unsigned int misalign, found, mask;
 312   const v8qi *p;
 313   v8qi data, t, c;
 314
 315   /* Align the source pointer.  While MMX doesn't generate unaligned data
 316      faults, this allows us to safely scan to the end of the buffer without
 317      reading beyond the end of the last page.  */
 318   misalign = (uintptr_t)s & 7;
 319   p = (const v8qi *)((uintptr_t)s & -8);
 320   data = *p;
 321
 322   /* Create a mask for the bytes that are valid within the first
 323      16-byte block.  The Idea here is that the AND with the mask
 324      within the loop is "free", since we need some AND or TEST
 325      insn in order to set the flags for the branch anyway.  */
 326   mask = -1u << misalign;
 327
 328   /* Main loop processing 8 bytes at a time.  */
 329   goto start;
 330   do
 331     {
 332       data = *++p;
 333       mask = -1;
 334
 335     start:
 336       t = __builtin_ia32_pcmpeqb(data, repl_nl);
 337       c = __builtin_ia32_pcmpeqb(data, repl_cr);
 338       t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
 339       c = __builtin_ia32_pcmpeqb(data, repl_bs);
 340       t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
 341       c = __builtin_ia32_pcmpeqb(data, repl_qm);
 342       t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
 343       found = __builtin_ia32_pmovmskb (t);
 344       found &= mask;
 345     }
 346   while (!found);
 347
 348   __builtin_ia32_emms ();
 349
 350   /* FOUND contains 1 in bits for which we matched a relevant
 351      character.  Conversion to the byte index is trivial.  */
 352   found = __builtin_ctz(found);
 353   return (const uchar *)p + found;
 354 }
 355
 356 /* A version of the fast scanner using SSE2 vectorized byte compare insns.  */
 357
 358 static const uchar *
 359 #ifndef __SSE2__
 360 __attribute__((__target__("sse2")))
 361 #endif
 362 search_line_sse2 (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 363 {
 364   typedef char v16qi __attribute__ ((__vector_size__ (16)));
 365
 366   const v16qi repl_nl = *(const v16qi *)repl_chars[0];
 367   const v16qi repl_cr = *(const v16qi *)repl_chars[1];
 368   const v16qi repl_bs = *(const v16qi *)repl_chars[2];
 369   const v16qi repl_qm = *(const v16qi *)repl_chars[3];
 370
 371   unsigned int misalign, found, mask;
 372   const v16qi *p;
 373   v16qi data, t;
 374
 375   /* Align the source pointer.  */
 376   misalign = (uintptr_t)s & 15;
 377   p = (const v16qi *)((uintptr_t)s & -16);
 378   data = *p;
 379
 380   /* Create a mask for the bytes that are valid within the first
 381      16-byte block.  The Idea here is that the AND with the mask
 382      within the loop is "free", since we need some AND or TEST
 383      insn in order to set the flags for the branch anyway.  */
 384   mask = -1u << misalign;
 385
 386   /* Main loop processing 16 bytes at a time.  */
 387   goto start;
 388   do
 389     {
 390       data = *++p;
 391       mask = -1;
 392
 393     start:
 394       t  = __builtin_ia32_pcmpeqb128(data, repl_nl);
 395       t |= __builtin_ia32_pcmpeqb128(data, repl_cr);
 396       t |= __builtin_ia32_pcmpeqb128(data, repl_bs);
 397       t |= __builtin_ia32_pcmpeqb128(data, repl_qm);
 398       found = __builtin_ia32_pmovmskb128 (t);
 399       found &= mask;
 400     }
 401   while (!found);
 402
 403   /* FOUND contains 1 in bits for which we matched a relevant
 404      character.  Conversion to the byte index is trivial.  */
 405   found = __builtin_ctz(found);
 406   return (const uchar *)p + found;
 407 }
 408
#ifdef HAVE_SSE4
/* A version of the fast scanner using SSE 4.2 vectorized string insns.

   Search from S for the first '\n', '\r', '?' or '\\' and return a
   pointer to it.  END is only consulted to decide whether an initial
   unaligned 16-byte load is safe; the main loop has no end check and
   relies on an interesting character being present.  */

static const uchar *
#ifndef __SSE4_2__
__attribute__((__target__("sse4.2")))
#endif
search_line_sse42 (const uchar *s, const uchar *end)
{
  typedef char v16qi __attribute__ ((__vector_size__ (16)));
  /* The set of characters to look for; only the first 4 elements are
     used (the length operand passed to PCMPESTRI below is 4).  */
  static const v16qi search = { '\n', '\r', '?', '\\' };

  uintptr_t si = (uintptr_t)s;
  uintptr_t index;

  /* Check for unaligned input.  */
  if (si & 15)
    {
      v16qi sv;

      if (__builtin_expect (end - s < 16, 0)
          && __builtin_expect ((si & 0xfff) > 0xff0, 0))
        {
          /* There are less than 16 bytes left in the buffer, and less
             than 16 bytes left on the page.  Reading 16 bytes at this
             point might generate a spurious page fault.  Defer to the
             SSE2 implementation, which already handles alignment.  */
          return search_line_sse2 (s, end);
        }

      /* ??? The builtin doesn't understand that the PCMPESTRI read from
         memory need not be aligned.  */
      sv = __builtin_ia32_loaddqu ((const char *) s);
      index = __builtin_ia32_pcmpestri128 (search, 4, sv, 16, 0);

      /* An index below 16 is the offset of a match within SV.  */
      if (__builtin_expect (index < 16, 0))
        goto found;

      /* Advance the pointer to an aligned address.  We will re-scan a
         few bytes, but we no longer need care for reading past the
         end of a page, since we're guaranteed a match.  */
      s = (const uchar *)((si + 15) & -16);
    }

  /* Main loop, processing 16 bytes at a time.  */
#ifdef __GCC_ASM_FLAG_OUTPUTS__
  while (1)
    {
      char f;

      /* By using inline assembly instead of the builtin,
         we can use the result, as well as the flags set.  F receives
         the carry flag, which is non-zero when a match was found.  */
      __asm ("%vpcmpestri\t$0, %2, %3"
             : "=c"(index), "=@ccc"(f)
             : "m"(*s), "x"(search), "a"(4), "d"(16));
      if (f)
        break;

      s += 16;
    }
#else
  /* Pre-decrement, because the loop below adds 16 before each
     compare.  */
  s -= 16;
  /* By doing the whole loop in inline assembly,
     we can make proper use of the flags set.  */
  __asm (      ".balign 16\n"
        "0:     add $16, %1\n"
        "       %vpcmpestri\t$0, (%1), %2\n"
        "       jnc 0b"
        : "=&c"(index), "+r"(s)
        : "x"(search), "a"(4), "d"(16));
#endif

 found:
  return s + index;
}

#else
/* Work around out-dated assemblers without sse4 support.  */
#define search_line_sse42 search_line_sse2
#endif
 489
 490 /* Check the CPU capabilities.  */
 491
 492 #include "../gcc/config/i386/cpuid.h"
 493
 494 typedef const uchar * (*search_line_fast_type) (const uchar *, const uchar *);
 495 static search_line_fast_type search_line_fast;
 496
 497 #define HAVE_init_vectorized_lexer 1
 498 static inline void
 499 init_vectorized_lexer (void)
 500 {
 501   unsigned dummy, ecx = 0, edx = 0;
 502   search_line_fast_type impl = search_line_acc_char;
 503   int minimum = 0;
 504
 505 #if defined(__SSE4_2__)
 506   minimum = 3;
 507 #elif defined(__SSE2__)
 508   minimum = 2;
 509 #elif defined(__SSE__)
 510   minimum = 1;
 511 #endif
 512
 513   if (minimum == 3)
 514     impl = search_line_sse42;
 515   else if (__get_cpuid (1, &dummy, &dummy, &ecx, &edx) || minimum == 2)
 516     {
 517       if (minimum == 3 || (ecx & bit_SSE4_2))
 518         impl = search_line_sse42;
 519       else if (minimum == 2 || (edx & bit_SSE2))
 520         impl = search_line_sse2;
 521       else if (minimum == 1 || (edx & bit_SSE))
 522         impl = search_line_mmx;
 523     }
 524   else if (__get_cpuid (0x80000001, &dummy, &dummy, &dummy, &edx))
 525     {
 526       if (minimum == 1
 527           || (edx & (bit_MMXEXT | bit_CMOV)) == (bit_MMXEXT | bit_CMOV))
 528         impl = search_line_mmx;
 529     }
 530
 531   search_line_fast = impl;
 532 }
 533
 534 #elif defined(_ARCH_PWR8) && defined(__ALTIVEC__)
 535
/* A version of the fast scanner using AltiVec vectorized byte compares
   and VSX unaligned loads (when VSX is available).  This is otherwise
   the same as the pre-GCC 5 version.

   Search from S for the first '\n', '\r', '\\' or '?' and return a
   pointer to it.  END is unused; the loop has no end check.  */

ATTRIBUTE_NO_SANITIZE_UNDEFINED
static const uchar *
search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
{
  typedef __attribute__((altivec(vector))) unsigned char vc;

  const vc repl_nl = {
    '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
    '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n'
  };
  const vc repl_cr = {
    '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
    '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r'
  };
  const vc repl_bs = {
    '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
    '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\'
  };
  const vc repl_qm = {
    '?', '?', '?', '?', '?', '?', '?', '?',
    '?', '?', '?', '?', '?', '?', '?', '?',
  };
  const vc zero = { 0 };

  vc data, t;

  /* Main loop processing 16 bytes at a time.  */
  do
    {
      vc m_nl, m_cr, m_bs, m_qm;

      /* Load directly from S, which is not aligned first.  */
      data = *((const vc *)s);
      s += 16;

      m_nl = (vc) __builtin_vec_cmpeq(data, repl_nl);
      m_cr = (vc) __builtin_vec_cmpeq(data, repl_cr);
      m_bs = (vc) __builtin_vec_cmpeq(data, repl_bs);
      m_qm = (vc) __builtin_vec_cmpeq(data, repl_qm);
      t = (m_nl | m_cr) | (m_bs | m_qm);

      /* T now contains 0xff in bytes for which we matched one of the relevant
         characters.  We want to exit the loop if any byte in T is non-zero.
         Below is the expansion of vec_any_ne(t, zero).  */
    }
  while (!__builtin_vec_vcmpeq_p(/*__CR6_LT_REV*/3, t, zero));

  /* Restore s to point to the 16 bytes we just processed.  */
  s -= 16;

  {
#define N  (sizeof(vc) / sizeof(long))

    /* View the match vector as N unsigned longs.  */
    union {
      vc v;
      /* Statically assert that N is 2 or 4.  */
      unsigned long l[(N == 2 || N == 4) ? N : -1];
    } u;
    unsigned long l, i = 0;

    u.v = t;

    /* Find the first word of T that is non-zero.  S is advanced past
       every all-zero word that is skipped.  */
    switch (N)
      {
      case 4:
        l = u.l[i++];
        if (l != 0)
          break;
        s += sizeof(unsigned long);
        l = u.l[i++];
        if (l != 0)
          break;
        s += sizeof(unsigned long);
        /* FALLTHRU */
      case 2:
        l = u.l[i++];
        if (l != 0)
          break;
        s += sizeof(unsigned long);
        l = u.l[i];
      }

    /* L now contains 0xff in bytes for which we matched one of the
       relevant characters.  We can find the byte index by finding
       its bit index and dividing by 8.  The first byte in memory is
       the most significant one on big-endian and the least significant
       one on little-endian, hence the choice of clz vs. ctz.  */
#ifdef __BIG_ENDIAN__
    l = __builtin_clzl(l) >> 3;
#else
    l = __builtin_ctzl(l) >> 3;
#endif
    return s + l;

#undef N
  }
}
 635
 636 #elif (GCC_VERSION >= 4005) && defined(__ALTIVEC__) && defined (__BIG_ENDIAN__)
 637
/* A version of the fast scanner using AltiVec vectorized byte compares.
   This cannot be used for little endian because vec_lvsl/lvsr are
   deprecated for little endian and the code won't work properly.  */
/* ??? Unfortunately, attribute(target("altivec")) is not yet supported,
   so we can't compile this function without -maltivec on the command line
   (or implied by some other switch).  */

/* Search from S for the first '\n', '\r', '\\' or '?' and return a
   pointer to it.  END is unused; the loop has no end check.  */
static const uchar *
search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
{
  typedef __attribute__((altivec(vector))) unsigned char vc;

  const vc repl_nl = {
    '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
    '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n'
  };
  const vc repl_cr = {
    '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
    '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r'
  };
  const vc repl_bs = {
    '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
    '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\'
  };
  const vc repl_qm = {
    '?', '?', '?', '?', '?', '?', '?', '?',
    '?', '?', '?', '?', '?', '?', '?', '?',
  };
  const vc ones = {
    -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1,
  };
  const vc zero = { 0 };

  vc data, mask, t;

  /* Altivec loads automatically mask addresses with -16.  This lets us
     issue the first load as early as possible.  */
  data = __builtin_vec_ld(0, (const vc *)s);

  /* Discard bytes before the beginning of the buffer.  Do this by
     beginning with all ones and shifting in zeros according to the
     mis-alignment.  The LVSR instruction pulls the exact shift we
     want from the address.  */
  mask = __builtin_vec_lvsr(0, s);
  mask = __builtin_vec_perm(zero, ones, mask);
  data &= mask;

  /* While altivec loads mask addresses, we still need to align S so
     that the offset we compute at the end is correct.  */
  s = (const uchar *)((uintptr_t)s & -16);

  /* Main loop processing 16 bytes at a time.  The first block is
     already loaded and masked, so enter past the load.  */
  goto start;
  do
    {
      vc m_nl, m_cr, m_bs, m_qm;

      s += 16;
      data = __builtin_vec_ld(0, (const vc *)s);

    start:
      m_nl = (vc) __builtin_vec_cmpeq(data, repl_nl);
      m_cr = (vc) __builtin_vec_cmpeq(data, repl_cr);
      m_bs = (vc) __builtin_vec_cmpeq(data, repl_bs);
      m_qm = (vc) __builtin_vec_cmpeq(data, repl_qm);
      t = (m_nl | m_cr) | (m_bs | m_qm);

      /* T now contains 0xff in bytes for which we matched one of the relevant
         characters.  We want to exit the loop if any byte in T is non-zero.
         Below is the expansion of vec_any_ne(t, zero).  */
    }
  while (!__builtin_vec_vcmpeq_p(/*__CR6_LT_REV*/3, t, zero));

  {
#define N  (sizeof(vc) / sizeof(long))

    /* View the match vector as N unsigned longs.  */
    union {
      vc v;
      /* Statically assert that N is 2 or 4.  */
      unsigned long l[(N == 2 || N == 4) ? N : -1];
    } u;
    unsigned long l, i = 0;

    u.v = t;

    /* Find the first word of T that is non-zero.  S is advanced past
       every all-zero word that is skipped.  */
    switch (N)
      {
      case 4:
        l = u.l[i++];
        if (l != 0)
          break;
        s += sizeof(unsigned long);
        l = u.l[i++];
        if (l != 0)
          break;
        s += sizeof(unsigned long);
        /* FALLTHROUGH */
      case 2:
        l = u.l[i++];
        if (l != 0)
          break;
        s += sizeof(unsigned long);
        l = u.l[i];
      }

    /* L now contains 0xff in bytes for which we matched one of the
       relevant characters.  We can find the byte index by finding
       its bit index and dividing by 8.  This variant is big-endian
       only, so the first byte in memory is the most significant.  */
    l = __builtin_clzl(l) >> 3;
    return s + l;

#undef N
  }
}
 754
 755 #elif defined (__ARM_NEON) && defined (__ARM_64BIT_STATE)
 756 #include "arm_neon.h"
 757
 758 /* This doesn't have to be the exact page size, but no system may use
 759    a size smaller than this.  ARMv8 requires a minimum page size of
 760    4k.  The impact of being conservative here is a small number of
 761    cases will take the slightly slower entry path into the main
 762    loop.  */
 763
 764 #define AARCH64_MIN_PAGE_SIZE 4096
 765
 766 static const uchar *
 767 search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 768 {
 769   const uint8x16_t repl_nl = vdupq_n_u8 ('\n');
 770   const uint8x16_t repl_cr = vdupq_n_u8 ('\r');
 771   const uint8x16_t repl_bs = vdupq_n_u8 ('\\');
 772   const uint8x16_t repl_qm = vdupq_n_u8 ('?');
 773   const uint8x16_t xmask = (uint8x16_t) vdupq_n_u64 (0x8040201008040201ULL);
 774
 775 #ifdef __AARCH64EB
 776   const int16x8_t shift = {8, 8, 8, 8, 0, 0, 0, 0};
 777 #else
 778   const int16x8_t shift = {0, 0, 0, 0, 8, 8, 8, 8};
 779 #endif
 780
 781   unsigned int found;
 782   const uint8_t *p;
 783   uint8x16_t data;
 784   uint8x16_t t;
 785   uint16x8_t m;
 786   uint8x16_t u, v, w;
 787
 788   /* Align the source pointer.  */
 789   p = (const uint8_t *)((uintptr_t)s & -16);
 790
 791   /* Assuming random string start positions, with a 4k page size we'll take
 792      the slow path about 0.37% of the time.  */
 793   if (__builtin_expect ((AARCH64_MIN_PAGE_SIZE
 794                          - (((uintptr_t) s) & (AARCH64_MIN_PAGE_SIZE - 1)))
 795                         < 16, 0))
 796     {
 797       /* Slow path: the string starts near a possible page boundary.  */
 798       uint32_t misalign, mask;
 799
 800       misalign = (uintptr_t)s & 15;
 801       mask = (-1u << misalign) & 0xffff;
 802       data = vld1q_u8 (p);
 803       t = vceqq_u8 (data, repl_nl);
 804       u = vceqq_u8 (data, repl_cr);
 805       v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
 806       w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
 807       t = vorrq_u8 (v, w);
 808       t = vandq_u8 (t, xmask);
 809       m = vpaddlq_u8 (t);
 810       m = vshlq_u16 (m, shift);
 811       found = vaddvq_u16 (m);
 812       found &= mask;
 813       if (found)
 814         return (const uchar*)p + __builtin_ctz (found);
 815     }
 816   else
 817     {
 818       data = vld1q_u8 ((const uint8_t *) s);
 819       t = vceqq_u8 (data, repl_nl);
 820       u = vceqq_u8 (data, repl_cr);
 821       v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
 822       w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
 823       t = vorrq_u8 (v, w);
 824       if (__builtin_expect (vpaddd_u64 ((uint64x2_t)t) != 0, 0))
 825         goto done;
 826     }
 827
 828   do
 829     {
 830       p += 16;
 831       data = vld1q_u8 (p);
 832       t = vceqq_u8 (data, repl_nl);
 833       u = vceqq_u8 (data, repl_cr);
 834       v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
 835       w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
 836       t = vorrq_u8 (v, w);
 837     } while (!vpaddd_u64 ((uint64x2_t)t));
 838
 839 done:
 840   /* Now that we've found the terminating substring, work out precisely where
 841      we need to stop.  */
 842   t = vandq_u8 (t, xmask);
 843   m = vpaddlq_u8 (t);
 844   m = vshlq_u16 (m, shift);
 845   found = vaddvq_u16 (m);
 846   return (((((uintptr_t) p) < (uintptr_t) s) ? s : (const uchar *)p)
 847           + __builtin_ctz (found));
 848 }
 849
 850 #elif defined (__ARM_NEON)
 851 #include "arm_neon.h"
 852
/* A version of the fast scanner using 32-bit ARM NEON.

   Search from S for the first '\n', '\r', '\\' or '?' and return a
   pointer to it.  Only aligned 16-byte blocks are read, starting with
   the block that contains S.  END is unused.  */
static const uchar *
search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
{
  const uint8x16_t repl_nl = vdupq_n_u8 ('\n');
  const uint8x16_t repl_cr = vdupq_n_u8 ('\r');
  const uint8x16_t repl_bs = vdupq_n_u8 ('\\');
  const uint8x16_t repl_qm = vdupq_n_u8 ('?');
  /* One distinct bit per byte within each 8-byte half.  */
  const uint8x16_t xmask = (uint8x16_t) vdupq_n_u64 (0x8040201008040201ULL);

  unsigned int misalign, found, mask;
  const uint8_t *p;
  uint8x16_t data;

  /* Align the source pointer.  */
  misalign = (uintptr_t)s & 15;
  p = (const uint8_t *)((uintptr_t)s & -16);
  data = vld1q_u8 (p);

  /* Create a mask for the bytes that are valid within the first
     16-byte block.  The Idea here is that the AND with the mask
     within the loop is "free", since we need some AND or TEST
     insn in order to set the flags for the branch anyway.  */
  mask = (-1u << misalign) & 0xffff;

  /* Main loop, processing 16 bytes at a time.  The first block is
     already loaded, so enter past the load.  */
  goto start;

  do
    {
      uint8x8_t l;
      uint16x4_t m;
      uint32x2_t n;
      uint8x16_t t, u, v, w;

      p += 16;
      data = vld1q_u8 (p);
      mask = 0xffff;

    start:
      t = vceqq_u8 (data, repl_nl);
      u = vceqq_u8 (data, repl_cr);
      v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
      w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
      t = vandq_u8 (vorrq_u8 (v, w), xmask);
      /* Three rounds of pairwise adds collapse each 8-byte half into
         an 8-bit match mask held in one 32-bit lane.  */
      l = vpadd_u8 (vget_low_u8 (t), vget_high_u8 (t));
      m = vpaddl_u8 (l);
      n = vpaddl_u16 (m);

      /* Combine the two lanes: the 64-bit shift right by 24 brings
         the upper lane's mask down to bits 8-15 of lane 0.  */
      found = vget_lane_u32 ((uint32x2_t) vorr_u64 ((uint64x1_t) n,
              vshr_n_u64 ((uint64x1_t) n, 24)), 0);
      found &= mask;
    }
  while (!found);

  /* FOUND contains 1 in bits for which we matched a relevant
     character.  Conversion to the byte index is trivial.  */
  found = __builtin_ctz (found);
  return (const uchar *)p + found;
}
 912
 913 #else
 914
 915 /* We only have one accelerated alternative.  Use a direct call so that
 916    we encourage inlining.  This branch applies when none of the
        preceding preprocessor conditions selected another definition of
        search_line_fast; the name then simply stands for
        search_line_acc_char.  */
 917
 918 #define search_line_fast  search_line_acc_char
 919
 920 #endif
 921
 922 /* Initialize the lexer if needed.  */
 923
 924 void
 925 _cpp_init_lexer (void)
 926 {
 927 #ifdef HAVE_init_vectorized_lexer
 928   init_vectorized_lexer ();
 929 #endif
 930 }
 931
 932 /* Returns with a logical line that contains no escaped newlines or
 933    trigraphs.  This is a time-critical inner loop.

        The line starts at buffer->next_line.  On entry buffer->cur and
        buffer->line_base are reset to that position and the list of line
        notes is emptied.  Escaped newlines (and, with -trigraphs,
        trigraphs) are removed by rewriting the buffer in place; a line
        note is recorded for each of them.  On exit the end of the logical
        line has been overwritten with '\n', a sentinel note has been
        added just past it, and buffer->next_line points one past the last
        input byte that was consumed.

        For buffers with from_stage3 set, only the end of the line is
        located; no trigraph or escaped-newline processing takes place.  */
 934 void
 935 _cpp_clean_line (cpp_reader *pfile)
 936 {
 937   cpp_buffer *buffer;
       /* S is the read position.  D is the write position; it only falls
          behind S once the slow path has started removing characters.
          P is a scratch pointer used when looking backwards from a line
          ending.  */
 938   const uchar *s;
 939   uchar c, *d, *p;
 940
 941   buffer = pfile->buffer;
 942   buffer->cur_note = buffer->notes_used = 0;
 943   buffer->cur = buffer->line_base = buffer->next_line;
 944   buffer->need_line = false;
 945   s = buffer->next_line;
 946
 947   if (!buffer->from_stage3)
 948     {
           /* Location of the most recent backslash seen on the fast path,
              or NULL if there has been none.  */
 949       const uchar *pbackslash = NULL;
 950
 951       /* Fast path.  This is the common case of an un-escaped line with
 952          no trigraphs.  The primary win here is by not writing any
 953          data back to memory until we have to.  */
 954       while (1)
 955         {
 956           /* Perform an optimized search for \n, \r, \\, ?.  */
 957           s = search_line_fast (s, buffer->rlimit);
 958
 959           c = *s;
 960           if (c == '\\')
 961             {
 962               /* Record the location of the backslash and continue.  */
 963               pbackslash = s++;
 964             }
 965           else if (__builtin_expect (c == '?', 0))
 966             {
 967               if (__builtin_expect (s[1] == '?', false)
 968                    && _cpp_trigraph_map[s[2]])
 969                 {
 970                   /* Have a trigraph.  We may or may not have to convert
 971                      it.  Add a line note regardless, for -Wtrigraphs.  */
 972                   add_line_note (buffer, s, s[2]);
 973                   if (CPP_OPTION (pfile, trigraphs))
 974                     {
 975                       /* We do, and that means we have to switch to the
 976                          slow path.  The first '?' is overwritten with
                              the replacement character and the read
                              position moves to the last character of the
                              trigraph.  */
 977                       d = (uchar *) s;
 978                       *d = _cpp_trigraph_map[s[2]];
 979                       s += 2;
 980                       goto slow_path;
 981                     }
 982                 }
 983               /* Not a trigraph, or one that is not being converted.
                      Continue on fast-path.  */
 984               s++;
 985             }
 986           else
 987             break;
 988         }
 989
 990       /* This must be \r or \n.  We're either done, or we'll be forced
 991          to write back to the buffer and continue on the slow path.  */
 992       d = (uchar *) s;
 993
 994       if (__builtin_expect (s == buffer->rlimit, false))
 995         goto done;
 996
 997       /* DOS line ending? */
 998       if (__builtin_expect (c == '\r', false) && s[1] == '\n')
 999         {
1000           s++;
1001           if (s == buffer->rlimit)
1002             goto done;
1003         }
1004
           /* Without a backslash on the line the newline cannot be
              escaped.  */
1005       if (__builtin_expect (pbackslash == NULL, true))
1006         goto done;
1007
1008       /* Check for escaped newline.  Step back over any horizontal
              whitespace in front of the line ending and see whether the
              character before it is the last backslash we recorded.  */
1009       p = d;
1010       while (is_nvspace (p[-1]))
1011         p--;
1012       if (p - 1 != pbackslash)
1013         goto done;
1014
1015       /* Have an escaped newline; process it and proceed to
1016          the slow path.  The note type is ' ' when whitespace separated
              the backslash from the newline and '\\' otherwise.  D is set
              so that the next character written lands on the backslash.
              buffer->next_line is used from here on as the lower bound for
              the backward scans in the slow path; it receives its final
              value at DONE.  */
1017       add_line_note (buffer, p - 1, p != d ? ' ' : '\\');
1018       d = p - 2;
1019       buffer->next_line = p - 1;
1020
1021     slow_path:
           /* Copy the remainder of the logical line down one character at
              a time, dropping further escaped newlines and converting
              trigraphs as they are met.  */
1022       while (1)
1023         {
1024           c = *++s;
1025           *++d = c;
1026
1027           if (c == '\n' || c == '\r')
1028             {
1029               /* Handle DOS line endings.  */
1030               if (c == '\r' && s != buffer->rlimit && s[1] == '\n')
1031                 s++;
1032               if (s == buffer->rlimit)
1033                 break;
1034
1035               /* Escaped?  Do not look back beyond the point where the
                      previous escaped newline was removed.  */
1036               p = d;
1037               while (p != buffer->next_line && is_nvspace (p[-1]))
1038                 p--;
1039               if (p == buffer->next_line || p[-1] != '\\')
1040                 break;
1041
1042               add_line_note (buffer, p - 1, p != d ? ' ': '\\');
1043               d = p - 2;
1044               buffer->next_line = p - 1;
1045             }
1046           else if (c == '?' && s[1] == '?' && _cpp_trigraph_map[s[2]])
1047             {
1048               /* Add a note regardless, for the benefit of -Wtrigraphs.  */
1049               add_line_note (buffer, d, s[2]);
1050               if (CPP_OPTION (pfile, trigraphs))
1051                 {
                       /* Replace the '?' just written and skip the other
                          two characters of the trigraph.  */
1052                   *d = _cpp_trigraph_map[s[2]];
1053                   s += 2;
1054                 }
1055             }
1056         }
1057     }
1058   else
1059     {
           /* Only find the end of the line.  */
1060       while (*s != '\n' && *s != '\r')
1061         s++;
1062       d = (uchar *) s;
1063
1064       /* Handle DOS line endings.  */
1065       if (*s == '\r' && s != buffer->rlimit && s[1] == '\n')
1066         s++;
1067     }
1068
1069  done:
       /* D is where the logical line ends in the (possibly compacted)
          text; S is the last input byte consumed.  */
1070   *d = '\n';
1071   /* A sentinel note that should never be processed.  */
1072   add_line_note (buffer, d + 1, '\n');
1073   buffer->next_line = s + 1;
1074 }
1075
1076 /* Return true if the trigraph indicated by NOTE should be warned
1077    about in a comment.  */
1078 static bool
1079 warn_in_comment (cpp_reader *pfile, _cpp_line_note *note)
1080 {
1081   const uchar *p;
1082
1083   /* Within comments we don't warn about trigraphs, unless the
1084      trigraph forms an escaped newline, as that may change
1085      behavior.  */
1086   if (note->type != '/')
1087     return false;
1088
1089   /* If -trigraphs, then this was an escaped newline iff the next note
1090      is coincident.  */
1091   if (CPP_OPTION (pfile, trigraphs))
1092     return note[1].pos == note->pos;
1093
1094   /* Otherwise, see if this forms an escaped newline.  */
1095   p = note->pos + 3;
1096   while (is_nvspace (*p))
1097     p++;
1098
1099   /* There might have been escaped newlines between the trigraph and the
1100      newline we found.  Hence the position test.  */
1101   return (*p == '\n' && p < note[1].pos);
1102 }
1103
1104 /* Process the notes created by add_line_note as far as the current
1105    location.  */
1106 void
1107 _cpp_process_line_notes (cpp_reader *pfile, int in_comment)
1108 {
1109   cpp_buffer *buffer = pfile->buffer;
1110
1111   for (;;)
1112     {
1113       _cpp_line_note *note = &buffer->notes[buffer->cur_note];
1114       unsigned int col;
1115
1116       if (note->pos > buffer->cur)
1117         break;
1118
1119       buffer->cur_note++;
1120       col = CPP_BUF_COLUMN (buffer, note->pos + 1);
1121
1122       if (note->type == '\\' || note->type == ' ')
1123         {
1124           if (note->type == ' ' && !in_comment)
1125             cpp_error_with_line (pfile, CPP_DL_WARNING, pfile->line_table->highest_line, col,
1126                                  "backslash and newline separated by space");
1127
1128           if (buffer->next_line > buffer->rlimit)
1129             {
1130               cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line, col,
1131                                    "backslash-newline at end of file");
1132               /* Prevent "no newline at end of file" warning.  */
1133               buffer->next_line = buffer->rlimit;
1134             }
1135
1136           buffer->line_base = note->pos;
1137           CPP_INCREMENT_LINE (pfile, 0);
1138         }
1139       else if (_cpp_trigraph_map[note->type])
1140         {
1141           if (CPP_OPTION (pfile, warn_trigraphs)
1142               && (!in_comment || warn_in_comment (pfile, note)))
1143             {
1144               if (CPP_OPTION (pfile, trigraphs))
1145                 cpp_warning_with_line (pfile, CPP_W_TRIGRAPHS,
1146                                        pfile->line_table->highest_line, col,
1147                                        "trigraph ??%c converted to %c",
1148                                        note->type,
1149                                        (int) _cpp_trigraph_map[note->type]);
1150               else
1151                 {
1152                   cpp_warning_with_line
1153                     (pfile, CPP_W_TRIGRAPHS,
1154                      pfile->line_table->highest_line, col,
1155                      "trigraph ??%c ignored, use -trigraphs to enable",
1156                      note->type);
1157                 }
1158             }
1159         }
1160       else if (note->type == 0)
1161         /* Already processed in lex_raw_string.  */;
1162       else
1163         abort ();
1164     }
1165 }
1166
1167 /* Skip a C-style block comment.  We find the end of the comment by
1168    seeing if an asterisk is before every '/' we encounter.  Returns
1169    nonzero if comment terminated by EOF, zero otherwise.
1170
1171    Buffer->cur points to the initial asterisk of the comment.  */
1172 bool
1173 _cpp_skip_block_comment (cpp_reader *pfile)
1174 {
1175   cpp_buffer *buffer = pfile->buffer;
1176   const uchar *cur = buffer->cur;
1177   uchar c;
1178
1179   cur++;
1180   if (*cur == '/')
1181     cur++;
1182
1183   for (;;)
1184     {
1185       /* People like decorating comments with '*', so check for '/'
1186          instead for efficiency.  */
1187       c = *cur++;
1188
1189       if (c == '/')
1190         {
1191           if (cur[-2] == '*')
1192             break;
1193
1194           /* Warn about potential nested comments, but not if the '/'
1195              comes immediately before the true comment delimiter.
1196              Don't bother to get it right across escaped newlines.  */
1197           if (CPP_OPTION (pfile, warn_comments)
1198               && cur[0] == '*' && cur[1] != '/')
1199             {
1200               buffer->cur = cur;
1201               cpp_warning_with_line (pfile, CPP_W_COMMENTS,
1202                                      pfile->line_table->highest_line,
1203                                      CPP_BUF_COL (buffer),
1204                                      "\"/*\" within comment");
1205             }
1206         }
1207       else if (c == '\n')
1208         {
1209           unsigned int cols;
1210           buffer->cur = cur - 1;
1211           _cpp_process_line_notes (pfile, true);
1212           if (buffer->next_line >= buffer->rlimit)
1213             return true;
1214           _cpp_clean_line (pfile);
1215
1216           cols = buffer->next_line - buffer->line_base;
1217           CPP_INCREMENT_LINE (pfile, cols);
1218
1219           cur = buffer->cur;
1220         }
1221     }
1222
1223   buffer->cur = cur;
1224   _cpp_process_line_notes (pfile, true);
1225   return false;
1226 }
1227
1228 /* Skip a C++ line comment, leaving buffer->cur pointing to the
1229    terminating newline.  Handles escaped newlines.  Returns nonzero
1230    if a multiline comment.  */
1231 static int
1232 skip_line_comment (cpp_reader *pfile)
1233 {
1234   cpp_buffer *buffer = pfile->buffer;
1235   source_location orig_line = pfile->line_table->highest_line;
1236
1237   while (*buffer->cur != '\n')
1238     buffer->cur++;
1239
1240   _cpp_process_line_notes (pfile, true);
1241   return orig_line != pfile->line_table->highest_line;
1242 }
1243
1244 /* Skips whitespace, saving the next non-whitespace character.  */
1245 static void
1246 skip_whitespace (cpp_reader *pfile, cppchar_t c)
1247 {
1248   cpp_buffer *buffer = pfile->buffer;
1249   bool saw_NUL = false;
1250
1251   do
1252     {
1253       /* Horizontal space always OK.  */
1254       if (c == ' ' || c == '\t')
1255         ;
1256       /* Just \f \v or \0 left.  */
1257       else if (c == '\0')
1258         saw_NUL = true;
1259       else if (pfile->state.in_directive && CPP_PEDANTIC (pfile))
1260         cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line,
1261                              CPP_BUF_COL (buffer),
1262                              "%s in preprocessing directive",
1263                              c == '\f' ? "form feed" : "vertical tab");
1264
1265       c = *buffer->cur++;
1266     }
1267   /* We only want non-vertical space, i.e. ' ' \t \f \v \0.  */
1268   while (is_nvspace (c));
1269
1270   if (saw_NUL)
1271     cpp_error (pfile, CPP_DL_WARNING, "null character(s) ignored");
1272
1273   buffer->cur--;
1274 }
1275
1276 /* See if the characters of a number token are valid in a name (no
1277    '.', '+' or '-').  */
1278 static int
1279 name_p (cpp_reader *pfile, const cpp_string *string)
1280 {
1281   unsigned int i;
1282
1283   for (i = 0; i < string->len; i++)
1284     if (!is_idchar (string->text[i]))
1285       return 0;
1286
1287   return 1;
1288 }
1289
1290 /* After parsing an identifier or other sequence, produce a warning about
1291    sequences not in NFC/NFKC.  */
1292 static void
1293 warn_about_normalization (cpp_reader *pfile,
1294                           const cpp_token *token,
1295                           const struct normalize_state *s)
1296 {
1297   if (CPP_OPTION (pfile, warn_normalize) < NORMALIZE_STATE_RESULT (s)
1298       && !pfile->state.skipping)
1299     {
1300       /* Make sure that the token is printed using UCNs, even
1301          if we'd otherwise happily print UTF-8.  */
1302       unsigned char *buf = XNEWVEC (unsigned char, cpp_token_len (token));
1303       size_t sz;
1304
1305       sz = cpp_spell_token (pfile, token, buf, false) - buf;
1306       if (NORMALIZE_STATE_RESULT (s) == normalized_C)
1307         cpp_warning_with_line (pfile, CPP_W_NORMALIZE, token->src_loc, 0,
1308                                "`%.*s' is not in NFKC", (int) sz, buf);
1309       else
1310         cpp_warning_with_line (pfile, CPP_W_NORMALIZE, token->src_loc, 0,
1311                                "`%.*s' is not in NFC", (int) sz, buf);
1312       free (buf);
1313     }
1314 }
1315
1316 /* Returns TRUE if the sequence starting at buffer->cur is invalid in
1317    an identifier.  FIRST is TRUE if this starts an identifier.  */
1318 static bool
1319 forms_identifier_p (cpp_reader *pfile, int first,
1320                     struct normalize_state *state)
1321 {
1322   cpp_buffer *buffer = pfile->buffer;
1323
1324   if (*buffer->cur == '$')
1325     {
1326       if (!CPP_OPTION (pfile, dollars_in_ident))
1327         return false;
1328
1329       buffer->cur++;
1330       if (CPP_OPTION (pfile, warn_dollars) && !pfile->state.skipping)
1331         {
1332           CPP_OPTION (pfile, warn_dollars) = 0;
1333           cpp_error (pfile, CPP_DL_PEDWARN, "'$' in identifier or number");
1334         }
1335
1336       return true;
1337     }
1338
1339   /* Is this a syntactically valid UCN?  */
1340   if (CPP_OPTION (pfile, extended_identifiers)
1341       && *buffer->cur == '\\'
1342       && (buffer->cur[1] == 'u' || buffer->cur[1] == 'U'))
1343     {
1344       cppchar_t s;
1345       buffer->cur += 2;
1346       if (_cpp_valid_ucn (pfile, &buffer->cur, buffer->rlimit, 1 + !first,
1347                           state, &s, NULL, NULL))
1348         return true;
1349       buffer->cur -= 2;
1350     }
1351
1352   return false;
1353 }
1354
1355 /* Helper function to get the cpp_hashnode of the identifier BASE.  */
1356 static cpp_hashnode *
1357 lex_identifier_intern (cpp_reader *pfile, const uchar *base)
1358 {
1359   cpp_hashnode *result;
1360   const uchar *cur;
1361   unsigned int len;
1362   unsigned int hash = HT_HASHSTEP (0, *base);
1363
1364   cur = base + 1;
1365   while (ISIDNUM (*cur))
1366     {
1367       hash = HT_HASHSTEP (hash, *cur);
1368       cur++;
1369     }
1370   len = cur - base;
1371   hash = HT_HASHFINISH (hash, len);
1372   result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
1373                                               base, len, hash, HT_ALLOC));
1374
1375   /* Rarely, identifiers require diagnostics when lexed.  */
1376   if (__builtin_expect ((result->flags & NODE_DIAGNOSTIC)
1377                         && !pfile->state.skipping, 0))
1378     {
1379       /* It is allowed to poison the same identifier twice.  */
1380       if ((result->flags & NODE_POISONED) && !pfile->state.poisoned_ok)
1381         cpp_error (pfile, CPP_DL_ERROR, "attempt to use poisoned \"%s\"",
1382                    NODE_NAME (result));
1383
1384       /* Constraint 6.10.3.5: __VA_ARGS__ should only appear in the
1385          replacement list of a variadic macro.  */
1386       if (result == pfile->spec_nodes.n__VA_ARGS__
1387           && !pfile->state.va_args_ok)
1388         {
1389           if (CPP_OPTION (pfile, cplusplus))
1390             cpp_error (pfile, CPP_DL_PEDWARN,
1391                        "__VA_ARGS__ can only appear in the expansion"
1392                        " of a C++11 variadic macro");
1393           else
1394             cpp_error (pfile, CPP_DL_PEDWARN,
1395                        "__VA_ARGS__ can only appear in the expansion"
1396                        " of a C99 variadic macro");
1397         }
1398
1399       /* For -Wc++-compat, warn about use of C++ named operators.  */
1400       if (result->flags & NODE_WARN_OPERATOR)
1401         cpp_warning (pfile, CPP_W_CXX_OPERATOR_NAMES,
1402                      "identifier \"%s\" is a special operator name in C++",
1403                      NODE_NAME (result));
1404     }
1405
1406   return result;
1407 }
1408
1409 /* Get the cpp_hashnode of an identifier specified by NAME in
1410    the current cpp_reader object.  If none is found, NULL is returned.  */
1411 cpp_hashnode *
1412 _cpp_lex_identifier (cpp_reader *pfile, const char *name)
1413 {
1414   cpp_hashnode *result;
1415   result = lex_identifier_intern (pfile, (uchar *) name);
1416   return result;
1417 }
1418
1419 /* Lex an identifier starting at BUFFER->CUR - 1.  */
1420 static cpp_hashnode *
1421 lex_identifier (cpp_reader *pfile, const uchar *base, bool starts_ucn,
1422                 struct normalize_state *nst, cpp_hashnode **spelling)
1423 {
1424   cpp_hashnode *result;
1425   const uchar *cur;
1426   unsigned int len;
1427   unsigned int hash = HT_HASHSTEP (0, *base);
1428
1429   cur = pfile->buffer->cur;
1430   if (! starts_ucn)
1431     {
1432       while (ISIDNUM (*cur))
1433         {
1434           hash = HT_HASHSTEP (hash, *cur);
1435           cur++;
1436         }
1437       NORMALIZE_STATE_UPDATE_IDNUM (nst, *(cur - 1));
1438     }
1439   pfile->buffer->cur = cur;
1440   if (starts_ucn || forms_identifier_p (pfile, false, nst))
1441     {
1442       /* Slower version for identifiers containing UCNs (or $).  */
1443       do {
1444         while (ISIDNUM (*pfile->buffer->cur))
1445           {
1446             NORMALIZE_STATE_UPDATE_IDNUM (nst, *pfile->buffer->cur);
1447             pfile->buffer->cur++;
1448           }
1449       } while (forms_identifier_p (pfile, false, nst));
1450       result = _cpp_interpret_identifier (pfile, base,
1451                                           pfile->buffer->cur - base);
1452       *spelling = cpp_lookup (pfile, base, pfile->buffer->cur - base);
1453     }
1454   else
1455     {
1456       len = cur - base;
1457       hash = HT_HASHFINISH (hash, len);
1458
1459       result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
1460                                                   base, len, hash, HT_ALLOC));
1461       *spelling = result;
1462     }
1463
1464   /* Rarely, identifiers require diagnostics when lexed.  */
1465   if (__builtin_expect ((result->flags & NODE_DIAGNOSTIC)
1466                         && !pfile->state.skipping, 0))
1467     {
1468       /* It is allowed to poison the same identifier twice.  */
1469       if ((result->flags & NODE_POISONED) && !pfile->state.poisoned_ok)
1470         cpp_error (pfile, CPP_DL_ERROR, "attempt to use poisoned \"%s\"",
1471                    NODE_NAME (result));
1472
1473       /* Constraint 6.10.3.5: __VA_ARGS__ should only appear in the
1474          replacement list of a variadic macro.  */
1475       if (result == pfile->spec_nodes.n__VA_ARGS__
1476           && !pfile->state.va_args_ok)
1477         {
1478           if (CPP_OPTION (pfile, cplusplus))
1479             cpp_error (pfile, CPP_DL_PEDWARN,
1480                        "__VA_ARGS__ can only appear in the expansion"
1481                        " of a C++11 variadic macro");
1482           else
1483             cpp_error (pfile, CPP_DL_PEDWARN,
1484                        "__VA_ARGS__ can only appear in the expansion"
1485                        " of a C99 variadic macro");
1486         }
1487
1488       /* For -Wc++-compat, warn about use of C++ named operators.  */
1489       if (result->flags & NODE_WARN_OPERATOR)
1490         cpp_warning (pfile, CPP_W_CXX_OPERATOR_NAMES,
1491                      "identifier \"%s\" is a special operator name in C++",
1492                      NODE_NAME (result));
1493     }
1494
1495   return result;
1496 }
1497
1498 /* Lex a number to NUMBER starting at BUFFER->CUR - 1.  */
1499 static void
1500 lex_number (cpp_reader *pfile, cpp_string *number,
1501             struct normalize_state *nst)
1502 {
1503   const uchar *cur;
1504   const uchar *base;
1505   uchar *dest;
1506
1507   base = pfile->buffer->cur - 1;
1508   do
1509     {
1510       cur = pfile->buffer->cur;
1511
1512       /* N.B. ISIDNUM does not include $.  */
1513       while (ISIDNUM (*cur) || *cur == '.' || DIGIT_SEP (*cur)
1514              || VALID_SIGN (*cur, cur[-1]))
1515         {
1516           NORMALIZE_STATE_UPDATE_IDNUM (nst, *cur);
1517           cur++;
1518         }
1519       /* A number can't end with a digit separator.  */
1520       while (cur > pfile->buffer->cur && DIGIT_SEP (cur[-1]))
1521         --cur;
1522
1523       pfile->buffer->cur = cur;
1524     }
1525   while (forms_identifier_p (pfile, false, nst));
1526
1527   number->len = cur - base;
1528   dest = _cpp_unaligned_alloc (pfile, number->len + 1);
1529   memcpy (dest, base, number->len);
1530   dest[number->len] = '\0';
1531   number->text = dest;
1532 }
1533
1534 /* Create a token of type TYPE with a literal spelling.  */
1535 static void
1536 create_literal (cpp_reader *pfile, cpp_token *token, const uchar *base,
1537                 unsigned int len, enum cpp_ttype type)
1538 {
1539   uchar *dest = _cpp_unaligned_alloc (pfile, len + 1);
1540
1541   memcpy (dest, base, len);
1542   dest[len] = '\0';
1543   token->type = type;
1544   token->val.str.len = len;
1545   token->val.str.text = dest;
1546 }
1547
1548 /* Subroutine of lex_raw_string: Append LEN chars from BASE to the buffer
1549    sequence from *FIRST_BUFF_P to LAST_BUFF_P.  */
1550
1551 static void
1552 bufring_append (cpp_reader *pfile, const uchar *base, size_t len,
1553                 _cpp_buff **first_buff_p, _cpp_buff **last_buff_p)
1554 {
1555   _cpp_buff *first_buff = *first_buff_p;
1556   _cpp_buff *last_buff = *last_buff_p;
1557
1558   if (first_buff == NULL)
1559     first_buff = last_buff = _cpp_get_buff (pfile, len);
1560   else if (len > BUFF_ROOM (last_buff))
1561     {
1562       size_t room = BUFF_ROOM (last_buff);
1563       memcpy (BUFF_FRONT (last_buff), base, room);
1564       BUFF_FRONT (last_buff) += room;
1565       base += room;
1566       len -= room;
1567       last_buff = _cpp_append_extend_buff (pfile, last_buff, len);
1568     }
1569
1570   memcpy (BUFF_FRONT (last_buff), base, len);
1571   BUFF_FRONT (last_buff) += len;
1572
1573   *first_buff_p = first_buff;
1574   *last_buff_p = last_buff;
1575 }
1576
1577
1578 /* Returns true if a macro has been defined.
1579    This might not work if compile with -save-temps,
1580    or preprocess separately from compilation.  */
1581
1582 static bool
1583 is_macro(cpp_reader *pfile, const uchar *base)
1584 {
1585   const uchar *cur = base;
1586   if (! ISIDST (*cur))
1587     return false;
1588   unsigned int hash = HT_HASHSTEP (0, *cur);
1589   ++cur;
1590   while (ISIDNUM (*cur))
1591     {
1592       hash = HT_HASHSTEP (hash, *cur);
1593       ++cur;
1594     }
1595   hash = HT_HASHFINISH (hash, cur - base);
1596
1597   cpp_hashnode *result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
1598                                         base, cur - base, hash, HT_NO_INSERT));
1599
1600   return !result ? false : (result->type == NT_MACRO);
1601 }
1602
1603
1604 /* Lexes a raw string.  The stored string contains the spelling, including
1605    double quotes, delimiter string, '(' and ')', any leading
1606    'L', 'u', 'U' or 'u8' and 'R' modifier.  It returns the type of the
1607    literal, or CPP_OTHER if it was not properly terminated.
1608
1609    The spelling is NUL-terminated, but it is not guaranteed that this
1610    is the first NUL since embedded NULs are preserved.  */
1611
1612 static void
1613 lex_raw_string (cpp_reader *pfile, cpp_token *token, const uchar *base,
1614                 const uchar *cur)
1615 {
1616   uchar raw_prefix[17];
1617   uchar temp_buffer[18];
1618   const uchar *orig_base;
1619   unsigned int raw_prefix_len = 0, raw_suffix_len = 0;
1620   enum raw_str_phase { RAW_STR_PREFIX, RAW_STR, RAW_STR_SUFFIX };
1621   raw_str_phase phase = RAW_STR_PREFIX;
1622   enum cpp_ttype type;
1623   size_t total_len = 0;
1624   /* Index into temp_buffer during phases other than RAW_STR,
1625      during RAW_STR phase 17 to tell BUF_APPEND that nothing should
1626      be appended to temp_buffer.  */
1627   size_t temp_buffer_len = 0;
1628   _cpp_buff *first_buff = NULL, *last_buff = NULL;
1629   size_t raw_prefix_start;
1630   _cpp_line_note *note = &pfile->buffer->notes[pfile->buffer->cur_note];
1631
1632   type = (*base == 'L' ? CPP_WSTRING :
1633           *base == 'U' ? CPP_STRING32 :
1634           *base == 'u' ? (base[1] == '8' ? CPP_UTF8STRING : CPP_STRING16)
1635           : CPP_STRING);
1636
1637 #define BUF_APPEND(STR,LEN)                                     \
1638       do {                                                      \
1639         bufring_append (pfile, (const uchar *)(STR), (LEN),     \
1640                         &first_buff, &last_buff);               \
1641         total_len += (LEN);                                     \
1642         if (__builtin_expect (temp_buffer_len < 17, 0)          \
1643             && (const uchar *)(STR) != base                     \
1644             && (LEN) <= 2)                                      \
1645           {                                                     \
1646             memcpy (temp_buffer + temp_buffer_len,              \
1647                     (const uchar *)(STR), (LEN));               \
1648             temp_buffer_len += (LEN);                           \
1649           }                                                     \
1650       } while (0);
1651
1652   orig_base = base;
1653   ++cur;
1654   raw_prefix_start = cur - base;
1655   for (;;)
1656     {
1657       cppchar_t c;
1658
1659       /* If we previously performed any trigraph or line splicing
1660          transformations, undo them in between the opening and closing
1661          double quote.  */
1662       while (note->pos < cur)
1663         ++note;
1664       for (; note->pos == cur; ++note)
1665         {
1666           switch (note->type)
1667             {
1668             case '\\':
1669             case ' ':
1670               /* Restore backslash followed by newline.  */
1671               BUF_APPEND (base, cur - base);
1672               base = cur;
1673               BUF_APPEND ("\\", 1);
1674             after_backslash:
1675               if (note->type == ' ')
1676                 {
1677                   /* GNU backslash whitespace newline extension.  FIXME
1678                      could be any sequence of non-vertical space.  When we
1679                      can properly restore any such sequence, we should mark
1680                      this note as handled so _cpp_process_line_notes
1681                      doesn't warn.  */
1682                   BUF_APPEND (" ", 1);
1683                 }
1684
1685               BUF_APPEND ("\n", 1);
1686               break;
1687
1688             case 0:
1689               /* Already handled.  */
1690               break;
1691
1692             default:
1693               if (_cpp_trigraph_map[note->type])
1694                 {
1695                   /* Don't warn about this trigraph in
1696                      _cpp_process_line_notes, since trigraphs show up as
1697                      trigraphs in raw strings.  */
1698                   uchar type = note->type;
1699                   note->type = 0;
1700
1701                   if (!CPP_OPTION (pfile, trigraphs))
1702                     /* If we didn't convert the trigraph in the first
1703                        place, don't do anything now either.  */
1704                     break;
1705
1706                   BUF_APPEND (base, cur - base);
1707                   base = cur;
1708                   BUF_APPEND ("??", 2);
1709
1710                   /* ??/ followed by newline gets two line notes, one for
1711                      the trigraph and one for the backslash/newline.  */
1712                   if (type == '/' && note[1].pos == cur)
1713                     {
1714                       if (note[1].type != '\\'
1715                           && note[1].type != ' ')
1716                         abort ();
1717                       BUF_APPEND ("/", 1);
1718                       ++note;
1719                       goto after_backslash;
1720                     }
1721                   else
1722                     {
1723                       /* Skip the replacement character.  */
1724                       base = ++cur;
1725                       BUF_APPEND (&type, 1);
1726                       c = type;
1727                       goto check_c;
1728                     }
1729                 }
1730               else
1731                 abort ();
1732               break;
1733             }
1734         }
1735       c = *cur++;
1736       if (__builtin_expect (temp_buffer_len < 17, 0))
1737         temp_buffer[temp_buffer_len++] = c;
1738
1739      check_c:
1740       if (phase == RAW_STR_PREFIX)
1741         {
1742           while (raw_prefix_len < temp_buffer_len)
1743             {
1744               raw_prefix[raw_prefix_len] = temp_buffer[raw_prefix_len];
1745               switch (raw_prefix[raw_prefix_len])
1746                 {
1747                 case ' ': case '(': case ')': case '\\': case '\t':
1748                 case '\v': case '\f': case '\n': default:
1749                   break;
1750                 /* Basic source charset except the above chars.  */
1751                 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
1752                 case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
1753                 case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
1754                 case 's': case 't': case 'u': case 'v': case 'w': case 'x':
1755                 case 'y': case 'z':
1756                 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
1757                 case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
1758                 case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
1759                 case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
1760                 case 'Y': case 'Z':
1761                 case '0': case '1': case '2': case '3': case '4': case '5':
1762                 case '6': case '7': case '8': case '9':
1763                 case '_': case '{': case '}': case '#': case '[': case ']':
1764                 case '<': case '>': case '%': case ':': case ';': case '.':
1765                 case '?': case '*': case '+': case '-': case '/': case '^':
1766                 case '&': case '|': case '~': case '!': case '=': case ',':
1767                 case '"': case '\'':
1768                   if (raw_prefix_len < 16)
1769                     {
1770                       raw_prefix_len++;
1771                       continue;
1772                     }
1773                   break;
1774                 }
1775
1776               if (raw_prefix[raw_prefix_len] != '(')
1777                 {
1778                   int col = CPP_BUF_COLUMN (pfile->buffer, cur) + 1;
1779                   if (raw_prefix_len == 16)
1780                     cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc,
1781                                          col, "raw string delimiter longer "
1782                                               "than 16 characters");
1783                   else if (raw_prefix[raw_prefix_len] == '\n')
1784                     cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc,
1785                                          col, "invalid new-line in raw "
1786                                               "string delimiter");
1787                   else
1788                     cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc,
1789                                          col, "invalid character '%c' in "
1790                                               "raw string delimiter",
1791                                          (int) raw_prefix[raw_prefix_len]);
1792                   pfile->buffer->cur = orig_base + raw_prefix_start - 1;
1793                   create_literal (pfile, token, orig_base,
1794                                   raw_prefix_start - 1, CPP_OTHER);
1795                   if (first_buff)
1796                     _cpp_release_buff (pfile, first_buff);
1797                   return;
1798                 }
1799               raw_prefix[raw_prefix_len] = '"';
1800               phase = RAW_STR;
1801               /* Nothing should be appended to temp_buffer during
1802                  RAW_STR phase.  */
1803               temp_buffer_len = 17;
1804               break;
1805             }
1806           continue;
1807         }
1808       else if (phase == RAW_STR_SUFFIX)
1809         {
1810           while (raw_suffix_len <= raw_prefix_len
1811                  && raw_suffix_len < temp_buffer_len
1812                  && temp_buffer[raw_suffix_len] == raw_prefix[raw_suffix_len])
1813             raw_suffix_len++;
1814           if (raw_suffix_len > raw_prefix_len)
1815             break;
1816           if (raw_suffix_len == temp_buffer_len)
1817             continue;
1818           phase = RAW_STR;
1819           /* Nothing should be appended to temp_buffer during
1820              RAW_STR phase.  */
1821           temp_buffer_len = 17;
1822         }
1823       if (c == ')')
1824         {
1825           phase = RAW_STR_SUFFIX;
1826           raw_suffix_len = 0;
1827           temp_buffer_len = 0;
1828         }
1829       else if (c == '\n')
1830         {
1831           if (pfile->state.in_directive
1832               || (pfile->state.parsing_args
1833                   && pfile->buffer->next_line >= pfile->buffer->rlimit))
1834             {
1835               cur--;
1836               type = CPP_OTHER;
1837               cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc, 0,
1838                                    "unterminated raw string");
1839               break;
1840             }
1841
1842           BUF_APPEND (base, cur - base);
1843
1844           if (pfile->buffer->cur < pfile->buffer->rlimit)
1845             CPP_INCREMENT_LINE (pfile, 0);
1846           pfile->buffer->need_line = true;
1847
1848           pfile->buffer->cur = cur-1;
1849           _cpp_process_line_notes (pfile, false);
1850           if (!_cpp_get_fresh_line (pfile))
1851             {
1852               source_location src_loc = token->src_loc;
1853               token->type = CPP_EOF;
1854               /* Tell the compiler the line number of the EOF token.  */
1855               token->src_loc = pfile->line_table->highest_line;
1856               token->flags = BOL;
1857               if (first_buff != NULL)
1858                 _cpp_release_buff (pfile, first_buff);
1859               cpp_error_with_line (pfile, CPP_DL_ERROR, src_loc, 0,
1860                                    "unterminated raw string");
1861               return;
1862             }
1863
1864           cur = base = pfile->buffer->cur;
1865           note = &pfile->buffer->notes[pfile->buffer->cur_note];
1866         }
1867     }
1868
1869   if (CPP_OPTION (pfile, user_literals))
1870     {
1871       /* If a string format macro, say from inttypes.h, is placed touching
1872          a string literal it could be parsed as a C++11 user-defined string
1873          literal thus breaking the program.
1874          Try to identify macros with is_macro. A warning is issued. */
1875       if (is_macro (pfile, cur))
1876         {
1877           /* Raise a warning, but do not consume subsequent tokens.  */
1878           if (CPP_OPTION (pfile, warn_literal_suffix) && !pfile->state.skipping)
1879             cpp_warning_with_line (pfile, CPP_W_LITERAL_SUFFIX,
1880                                    token->src_loc, 0,
1881                                    "invalid suffix on literal; C++11 requires "
1882                                    "a space between literal and string macro");
1883         }
1884       /* Grab user defined literal suffix.  */
1885       else if (ISIDST (*cur))
1886         {
1887           type = cpp_userdef_string_add_type (type);
1888           ++cur;
1889
1890           while (ISIDNUM (*cur))
1891             ++cur;
1892         }
1893     }
1894
1895   pfile->buffer->cur = cur;
1896   if (first_buff == NULL)
1897     create_literal (pfile, token, base, cur - base, type);
1898   else
1899     {
1900       uchar *dest = _cpp_unaligned_alloc (pfile, total_len + (cur - base) + 1);
1901
1902       token->type = type;
1903       token->val.str.len = total_len + (cur - base);
1904       token->val.str.text = dest;
1905       last_buff = first_buff;
1906       while (last_buff != NULL)
1907         {
1908           memcpy (dest, last_buff->base,
1909                   BUFF_FRONT (last_buff) - last_buff->base);
1910           dest += BUFF_FRONT (last_buff) - last_buff->base;
1911           last_buff = last_buff->next;
1912         }
1913       _cpp_release_buff (pfile, first_buff);
1914       memcpy (dest, base, cur - base);
1915       dest[cur - base] = '\0';
1916     }
1917 }
1918
1919 /* Lexes a string, character constant, or angle-bracketed header file
1920    name starting at BASE.  The stored string contains the spelling,
1921    including opening quote and any leading 'L', 'u', 'U' or 'u8' and
1922    optional 'R' modifier (raw strings are passed to lex_raw_string).
1923    Nothing is returned: the literal's type is given to create_literal
1924    along with TOKEN, or CPP_OTHER if it was not properly terminated; an
1925    unterminated header name sets TOKEN's type to CPP_LESS instead and
1926    must be relexed as normal tokens.  The spelling is NUL-terminated, but
1927    need not end at the first NUL since embedded NULs are preserved.  */
1928 static void
1929 lex_string (cpp_reader *pfile, cpp_token *token, const uchar *base)
1930 {
1931   bool saw_NUL = false;
1932   const uchar *cur;
1933   cppchar_t terminator;
1934   enum cpp_ttype type;
1935   /* Step over any encoding prefix to reach the opening delimiter.  */
1936   cur = base;
1937   terminator = *cur++;
1938   if (terminator == 'L' || terminator == 'U')
1939     terminator = *cur++;
1940   else if (terminator == 'u')
1941     {
1942       terminator = *cur++;
1943       if (terminator == '8')
1944         terminator = *cur++;
1945     }
1946   if (terminator == 'R') /* Raw string: lexed by lex_raw_string.  */
1947     {
1948       lex_raw_string (pfile, token, base, cur);
1949       return;
1950     }
1951   if (terminator == '"')
1952     type = (*base == 'L' ? CPP_WSTRING :
1953             *base == 'U' ? CPP_STRING32 :
1954             *base == 'u' ? (base[1] == '8' ? CPP_UTF8STRING : CPP_STRING16)
1955                          : CPP_STRING);
1956   else if (terminator == '\'')
1957     type = (*base == 'L' ? CPP_WCHAR :
1958             *base == 'U' ? CPP_CHAR32 :
1959             *base == 'u' ? (base[1] == '8' ? CPP_UTF8CHAR : CPP_CHAR16)
1960                          : CPP_CHAR);
1961   else /* Neither quote: treat it as a header name closed by '>'.  */
1962     terminator = '>', type = CPP_HEADER_NAME;
1963   /* Scan for the closing delimiter, stopping at the end of the line.  */
1964   for (;;)
1965     {
1966       cppchar_t c = *cur++;
1967
1968       /* In #include-style directives, terminators are not escapable.  */
1969       if (c == '\\' && !pfile->state.angled_headers && *cur != '\n')
1970         cur++; /* Skip the character after the backslash.  */
1971       else if (c == terminator)
1972         break;
1973       else if (c == '\n')
1974         {
1975           cur--; /* Leave the newline itself unconsumed.  */
1976           /* Unmatched quotes always yield undefined behavior, but
1977              greedy lexing means that what appears to be an unterminated
1978              header name may actually be a legitimate sequence of tokens.  */
1979           if (terminator == '>')
1980             {
1981               token->type = CPP_LESS;
1982               return;
1983             }
1984           type = CPP_OTHER;
1985           break;
1986         }
1987       else if (c == '\0')
1988         saw_NUL = true;
1989     }
1990
1991   if (saw_NUL && !pfile->state.skipping)
1992     cpp_error (pfile, CPP_DL_WARNING,
1993                "null character(s) preserved in literal");
1994
1995   if (type == CPP_OTHER && CPP_OPTION (pfile, lang) != CLK_ASM)
1996     cpp_error (pfile, CPP_DL_PEDWARN, "missing terminating %c character",
1997                (int) terminator);
1998
1999   if (CPP_OPTION (pfile, user_literals))
2000     {
2001       /* If a string format macro, say from inttypes.h, is placed touching
2002          a string literal it could be parsed as a C++11 user-defined string
2003          literal thus breaking the program.
2004          Try to identify macros with is_macro. A warning is issued. */
2005       if (is_macro (pfile, cur))
2006         {
2007           /* Raise a warning, but do not consume subsequent tokens.  */
2008           if (CPP_OPTION (pfile, warn_literal_suffix) && !pfile->state.skipping)
2009             cpp_warning_with_line (pfile, CPP_W_LITERAL_SUFFIX,
2010                                    token->src_loc, 0,
2011                                    "invalid suffix on literal; C++11 requires "
2012                                    "a space between literal and string macro");
2013         }
2014       /* Grab user defined literal suffix.  */
2015       else if (ISIDST (*cur))
2016         {
2017           type = cpp_userdef_char_add_type (type);
2018           type = cpp_userdef_string_add_type (type);
2019           ++cur;
2020
2021           while (ISIDNUM (*cur))
2022             ++cur;
2023         }
2024     }
2025   else if (CPP_OPTION (pfile, cpp_warn_cxx11_compat)
2026            && is_macro (pfile, cur)
2027            && !pfile->state.skipping)
2028     cpp_warning_with_line (pfile, CPP_W_CXX11_COMPAT,
2029                            token->src_loc, 0, "C++11 requires a space "
2030                            "between string literal and macro");
2031   /* Commit the scan position, then hand the spelling to create_literal.  */
2032   pfile->buffer->cur = cur;
2033   create_literal (pfile, token, base, cur - base, type);
2034 }
2035
2036 /* Return a pointer to PFILE's comment table.  The client may not make
2037    any assumption about the ordering of the table.  */
2038 cpp_comment_table *
2039 cpp_get_comments (cpp_reader *pfile)
2040 {
2041   return &pfile->comments;
2042 }
2043
2044 /* Record TOKEN's text and location as the new last entry of PFILE's comment table.  */
2045 static void
2046 store_comment (cpp_reader *pfile, cpp_token *token)
2047 {
2048   cpp_comment_table *table = &pfile->comments;
2049   cpp_comment *slot;
2050   char *copy;
2051   int len = token->val.str.len;
2052
2053   /* The table starts out with room for 256 entries...  */
2054   if (table->allocated == 0)
2055     {
2056       table->allocated = 256;
2057       table->entries
2058         = (cpp_comment *) xmalloc (table->allocated * sizeof (cpp_comment));
2059     }
2060
2061   /* ...and doubles in capacity whenever it is full.  */
2062   if (table->count == table->allocated)
2063     {
2064       table->allocated *= 2;
2065       table->entries
2066         = (cpp_comment *) xrealloc (table->entries,
2067                                     table->allocated * sizeof (cpp_comment));
2068     }
2069
2070   /* Duplicate the text, adding the NUL terminator the token may lack.  */
2071   copy = (char *) xmalloc (sizeof (char) * (len + 1));
2072   memcpy (copy, token->val.str.text, len);
2073   copy[len] = '\0';
2074   /* Fill in the next free slot, then account for it.  */
2075   slot = &table->entries[table->count];
2076   slot->comment = copy;
2077   slot->sloc = token->src_loc;
2078   table->count++;
2079 }
2080
2081 /* Build TOKEN as a CPP_COMMENT; the stored comment includes the comment start and any terminator.  TYPE is '/' for a C++ comment.  */
2082 static void
2083 save_comment (cpp_reader *pfile, cpp_token *token, const unsigned char *from,
2084               cppchar_t type)
2085 {
2086   unsigned char *buffer;
2087   unsigned int len, clen, i;
2088   /* FROM points just past the comment's initial '/'.  */
2089   len = pfile->buffer->cur - from + 1; /* + 1 for the initial '/'.  */
2090
2091   /* C++ comments probably (not definitely) have moved past a new
2092      line, which we don't want to save in the comment.  */
2093   if (is_vspace (pfile->buffer->cur[-1]))
2094     len--;
2095
2096   /* If we are currently in a directive or in argument parsing, then
2097      we need to store all C++ comments as C comments internally, and
2098      so we need to allocate a little extra space in that case.
2099
2100      Note that the only time we encounter a directive here is
2101      when we are saving comments in a "#define".  */
2102   clen = ((pfile->state.in_directive || pfile->state.parsing_args)
2103           && type == '/') ? len + 2 : len;
2104   /* Exactly CLEN bytes are allocated, so the text gets no NUL terminator.  */
2105   buffer = _cpp_unaligned_alloc (pfile, clen);
2106
2107   token->type = CPP_COMMENT;
2108   token->val.str.len = clen;
2109   token->val.str.text = buffer;
2110   /* Rebuild the leading '/' and copy the rest of the comment after it.  */
2111   buffer[0] = '/';
2112   memcpy (buffer + 1, from, len - 1);
2113
2114   /* Finish conversion to a C comment, if necessary.  */
2115   if ((pfile->state.in_directive || pfile->state.parsing_args) && type == '/')
2116     {
2117       buffer[1] = '*'; /* Turn the second '/' into '*'.  */
2118       buffer[clen - 2] = '*'; /* The two extra bytes close the comment.  */
2119       buffer[clen - 1] = '/';
2120       /* A C++ comment may contain sequences that are not valid inside a C
2121          comment, so every '/' adjacent to a '*' is replaced by '|'.  */
2122       for (i = 2; i < (clen - 2); i++)
2123         if (buffer[i] == '/' && (buffer[i - 1] == '*' || buffer[i + 1] == '*'))
2124           buffer[i] = '|';
2125     }
2126
2127   /* Finally store this comment for use by clients of libcpp. */
2128   store_comment (pfile, token);
2129 }
2130
2131 /* Returns true if comment at COMMENT_START ('*' there means a block
2132    comment, anything else is treated as a line comment) is a recognized
2133    FALLTHROUGH comment at the current -Wimplicit-fallthrough level.  */
2134 static bool
2135 fallthrough_comment_p (cpp_reader *pfile, const unsigned char *comment_start)
2136 {
2137   const unsigned char *from = comment_start + 1;
2138
2139   switch (CPP_OPTION (pfile, cpp_warn_implicit_fallthrough))
2140     {
2141       /* For both -Wimplicit-fallthrough=0 and -Wimplicit-fallthrough=5 we
2142          don't recognize any comments.  The latter only checks attributes,
2143          the former doesn't warn.  */
2144     case 0:
2145     default:
2146       return false;
2147       /* -Wimplicit-fallthrough=1 considers any comment, no matter what
2148          content it has.  */
2149     case 1:
2150       return true;
2151     case 2:
2152       /* -Wimplicit-fallthrough=2 looks for (case insensitive)
2153          .*falls?[ \t-]*thr(u|ough).* regex.  */
2154       for (; (size_t) (pfile->buffer->cur - from) >= sizeof "fallthru" - 1;
2155            from++)
2156         {
2157           /* Look ahead through P so that FROM only ever moves one character
2158              per iteration and no candidate start of a match is skipped.  */
2159           if (from[0] != 'f' && from[0] != 'F')
2160             continue;
2161           if (from[1] != 'a' && from[1] != 'A')
2162             continue;
2163           if (from[2] != 'l' && from[2] != 'L')
2164             continue;
2165           if (from[3] != 'l' && from[3] != 'L')
2166             continue;
2167           const unsigned char *p = from + (sizeof "fall" - 1);
2168           if (p[0] == 's' || p[0] == 'S')
2169             p++;
2170           while (*p == ' ' || *p == '\t' || *p == '-')
2171             p++;
2172           if (p[0] != 't' && p[0] != 'T')
2173             continue;
2174           if (p[1] != 'h' && p[1] != 'H')
2175             continue;
2176           if (p[2] != 'r' && p[2] != 'R')
2177             continue;
2178           if (p[3] == 'u' || p[3] == 'U')
2179             return true;
2180           if (p[3] != 'o' && p[3] != 'O')
2181             continue;
2182           if (p[4] != 'u' && p[4] != 'U')
2183             continue;
2184           if (p[5] != 'g' && p[5] != 'G')
2185             continue;
2186           if (p[6] != 'h' && p[6] != 'H')
2187             continue;
2188           return true;
2189         }
2190       return false;
2191     case 3:
2192     case 4:
2193       break;
2194     }
2195
2196   /* Whole comment contents:
2197      -fallthrough
2198      @fallthrough@
2199    */
2200   if (*from == '-' || *from == '@')
2201     {
2202       size_t len = sizeof "fallthrough" - 1;
2203       if ((size_t) (pfile->buffer->cur - from - 1) < len)
2204         return false;
2205       if (memcmp (from + 1, "fallthrough", len))
2206         return false;
2207       if (*from == '@')
2208         {
2209           if (from[len + 1] != '@')
2210             return false;
2211           len++;
2212         }
2213       from += 1 + len;
2214     }
2215   /* Whole comment contents (regex):
2216      lint -fallthrough[ \t]*
2217    */
2218   else if (*from == 'l')
2219     {
2220       size_t len = sizeof "int -fallthrough" - 1;
2221       if ((size_t) (pfile->buffer->cur - from - 1) < len)
2222         return false;
2223       if (memcmp (from + 1, "int -fallthrough", len))
2224         return false;
2225       from += 1 + len;
2226       while (*from == ' ' || *from == '\t')
2227         from++;
2228     }
2229   /* Whole comment contents (regex):
2230      [ \t]*FALLTHR(U|OUGH)[ \t]*
2231    */
2232   else if (CPP_OPTION (pfile, cpp_warn_implicit_fallthrough) == 4)
2233     {
2234       while (*from == ' ' || *from == '\t')
2235         from++;
2236       if ((size_t) (pfile->buffer->cur - from)  < sizeof "FALLTHRU" - 1)
2237         return false;
2238       if (memcmp (from, "FALLTHR", sizeof "FALLTHR" - 1))
2239         return false;
2240       from += sizeof "FALLTHR" - 1;
2241       if (*from == 'U')
2242         from++;
2243       else if ((size_t) (pfile->buffer->cur - from)  < sizeof "OUGH" - 1)
2244         return false;
2245       else if (memcmp (from, "OUGH", sizeof "OUGH" - 1))
2246         return false;
2247       else
2248         from += sizeof "OUGH" - 1;
2249       while (*from == ' ' || *from == '\t')
2250         from++;
2251     }
2252   /* Whole comment contents (regex):
2253      [ \t.!]*(ELSE,? |INTENTIONAL(LY)? )?FALL(S | |-)?THR(OUGH|U)[ \t.!]*(-[^\n\r]*)?
2254      [ \t.!]*(Else,? |Intentional(ly)? )?Fall((s | |-)[Tt]|t)hr(ough|u)[ \t.!]*(-[^\n\r]*)?
2255      [ \t.!]*([Ee]lse,? |[Ii]ntentional(ly)? )?fall(s | |-)?thr(ough|u)[ \t.!]*(-[^\n\r]*)?
2256    */
2257   else
2258     {
2259       while (*from == ' ' || *from == '\t' || *from == '.' || *from == '!')
2260         from++;
2261       unsigned char f = *from; /* First letter of the phrase part being matched.  */
2262       bool all_upper = false; /* Set once an upper-case-only spelling is seen.  */
2263       if (f == 'E' || f == 'e')
2264         {
2265           if ((size_t) (pfile->buffer->cur - from)
2266               < sizeof "else fallthru" - 1)
2267             return false;
2268           if (f == 'E' && memcmp (from + 1, "LSE", sizeof "LSE" - 1) == 0)
2269             all_upper = true;
2270           else if (memcmp (from + 1, "lse", sizeof "lse" - 1))
2271             return false;
2272           from += sizeof "else" - 1;
2273           if (*from == ',')
2274             from++;
2275           if (*from != ' ')
2276             return false;
2277           from++;
2278           if (all_upper && *from == 'f')
2279             return false;
2280           if (f == 'e' && *from == 'F')
2281             return false;
2282           f = *from;
2283         }
2284       else if (f == 'I' || f == 'i')
2285         {
2286           if ((size_t) (pfile->buffer->cur - from)
2287               < sizeof "intentional fallthru" - 1)
2288             return false;
2289           if (f == 'I' && memcmp (from + 1, "NTENTIONAL",
2290                                   sizeof "NTENTIONAL" - 1) == 0)
2291             all_upper = true;
2292           else if (memcmp (from + 1, "ntentional",
2293                            sizeof "ntentional" - 1))
2294             return false;
2295           from += sizeof "intentional" - 1;
2296           if (*from == ' ')
2297             {
2298               from++;
2299               if (all_upper && *from == 'f')
2300                 return false;
2301             }
2302           else if (all_upper)
2303             {
2304               if (memcmp (from, "LY F", sizeof "LY F" - 1))
2305                 return false;
2306               from += sizeof "LY " - 1;
2307             }
2308           else
2309             {
2310               if (memcmp (from, "ly ", sizeof "ly " - 1))
2311                 return false;
2312               from += sizeof "ly " - 1;
2313             }
2314           if (f == 'i' && *from == 'F')
2315             return false;
2316           f = *from;
2317         }
2318       if (f != 'F' && f != 'f')
2319         return false;
2320       if ((size_t) (pfile->buffer->cur - from) < sizeof "fallthru" - 1)
2321         return false;
2322       if (f == 'F' && memcmp (from + 1, "ALL", sizeof "ALL" - 1) == 0)
2323         all_upper = true;
2324       else if (all_upper)
2325         return false;
2326       else if (memcmp (from + 1, "all", sizeof "all" - 1))
2327         return false;
2328       from += sizeof "fall" - 1;
2329       if (*from == (all_upper ? 'S' : 's') && from[1] == ' ')
2330         from += 2;
2331       else if (*from == ' ' || *from == '-')
2332         from++;
2333       else if (*from != (all_upper ? 'T' : 't'))
2334         return false;
2335       if ((f == 'f' || *from != 'T') && (all_upper || *from != 't'))
2336         return false;
2337       if ((size_t) (pfile->buffer->cur - from) < sizeof "thru" - 1)
2338         return false;
2339       if (memcmp (from + 1, all_upper ? "HRU" : "hru", sizeof "hru" - 1))
2340         {
2341           if ((size_t) (pfile->buffer->cur - from) < sizeof "through" - 1)
2342             return false;
2343           if (memcmp (from + 1, all_upper ? "HROUGH" : "hrough",
2344                       sizeof "hrough" - 1))
2345             return false;
2346           from += sizeof "through" - 1;
2347         }
2348       else
2349         from += sizeof "thru" - 1;
2350       while (*from == ' ' || *from == '\t' || *from == '.' || *from == '!')
2351         from++;
2352       if (*from == '-') /* Optional trailing "-..." remark.  */
2353         {
2354           from++;
2355           if (*comment_start == '*')
2356             {
2357               do
2358                 {
2359                   while (*from && *from != '*'
2360                          && *from != '\n' && *from != '\r')
2361                     from++;
2362                   if (*from != '*' || from[1] == '/')
2363                     break;
2364                   from++;
2365                 }
2366               while (1);
2367             }
2368           else
2369             while (*from && *from != '\n' && *from != '\r')
2370               from++;
2371         }
2372     }
2373   /* C block comment.  */
2374   if (*comment_start == '*')
2375     {
2376       if (*from != '*' || from[1] != '/')
2377         return false;
2378     }
2379   /* C++ line comment.  */
2380   else if (*from != '\n')
2381     return false;
2382
2383   return true;
2384 }
2385
2386 /* Give RUN freshly allocated room for COUNT tokens and no successor.  */
2387 void
2388 _cpp_init_tokenrun (tokenrun *run, unsigned int count)
2389 {
2390   run->next = NULL;
2391   run->base = XNEWVEC (cpp_token, count);
2392   run->limit = &run->base[count];
2393 }
2394
2395 /* Hand back the run that follows RUN, allocating and linking it first if RUN has none.  */
2396 static tokenrun *
2397 next_tokenrun (tokenrun *run)
2398 {
2399   tokenrun *succ = run->next;
2400   if (!succ)
2401     {
2402       succ = run->next = XNEW (tokenrun);
2403       succ->prev = run;
2404       _cpp_init_tokenrun (succ, 250);
2405     }
2406   return succ;
2407 }
2408
2409 /* Count the tokens of CONTEXT that have not been processed yet.  */
2410 int
2411 _cpp_remaining_tokens_num_in_context (cpp_context *context)
2412 {
2413   const bool direct = context->tokens_kind == TOKENS_KIND_DIRECT;
2414   const bool by_pointer = (context->tokens_kind == TOKENS_KIND_INDIRECT
2415                            || context->tokens_kind == TOKENS_KIND_EXTENDED);
2416   if (!direct && !by_pointer)
2417     abort ();
2418   if (direct)
2419     return LAST (context).token - FIRST (context).token;
2420   return LAST (context).ptoken - FIRST (context).ptoken;
2421 }
2422
2423 /* Fetch the token INDEX positions ahead in CONTEXT; an INDEX of zero
2424    designates the next token to be processed.  */
2425 static const cpp_token *
2426 _cpp_token_from_context_at (cpp_context *context, int index)
2427 {
2428   if (context->tokens_kind == TOKENS_KIND_DIRECT)
2429     return FIRST (context).token + index;
2430   /* Every other supported kind stores pointers to tokens.  */
2431   if (context->tokens_kind != TOKENS_KIND_INDIRECT
2432       && context->tokens_kind != TOKENS_KIND_EXTENDED)
2433     abort ();
2434   return FIRST (context).ptoken[index];
2435 }
2436
2437 /* Look ahead in the input stream: return the token INDEX tokens ahead (zero being the next one), or the CPP_EOF token if that is reached first.  */
2438 const cpp_token *
2439 cpp_peek_token (cpp_reader *pfile, int index)
2440 {
2441   cpp_context *context = pfile->context;
2442   const cpp_token *peektok;
2443   int count;
2444
2445   /* First, scan through any pending cpp_context objects.  */
2446   while (context->prev)
2447     {
2448       ptrdiff_t sz = _cpp_remaining_tokens_num_in_context (context);
2449
2450       if (index < (int) sz)
2451         return _cpp_token_from_context_at (context, index);
2452       index -= (int) sz; /* Skip past this context's tokens.  */
2453       context = context->prev;
2454     }
2455
2456   /* We will have to read some new tokens after all (and do so
2457      without invalidating preceding tokens).  */
2458   count = index;
2459   pfile->keep_tokens++;
2460
2461   /* For peeked tokens temporarily disable line_change reporting,
2462      until the tokens are parsed for real.  */
2463   void (*line_change) (cpp_reader *, const cpp_token *, int)
2464     = pfile->cb.line_change;
2465   pfile->cb.line_change = NULL;
2466   /* Lex INDEX + 1 tokens, stopping early at CPP_EOF.  */
2467   do
2468     {
2469       peektok = _cpp_lex_token (pfile);
2470       if (peektok->type == CPP_EOF)
2471         {
2472           index--; /* Keeps COUNT - INDEX equal to the tokens lexed.  */
2473           break;
2474         }
2475     }
2476   while (index--);
2477   /* Back up over the COUNT - INDEX tokens lexed above, then restore state.  */
2478   _cpp_backup_tokens_direct (pfile, count - index);
2479   pfile->keep_tokens--;
2480   pfile->cb.line_change = line_change;
2481
2482   return peektok;
2483 }
2484
2485 /* Allocate a single token that is invalidated at the same time as the
2486    rest of the tokens on the line.  Has its line and col set to the
2487    same as the last lexed token, so that diagnostics appear in the
2488    right place.  Lookahead tokens are moved up one slot to make room.  */
2489 cpp_token *
2490 _cpp_temp_token (cpp_reader *pfile)
2491 {
2492   cpp_token *old, *result;
2493   ptrdiff_t sz = pfile->cur_run->limit - pfile->cur_token; /* Slots left in this run.  */
2494   ptrdiff_t la = (ptrdiff_t) pfile->lookaheads; /* Number of lookahead tokens.  */
2495
2496   old = pfile->cur_token - 1; /* Token just before the new one.  */
2497   /* Any pre-existing lookaheads must not be clobbered.  */
2498   if (la)
2499     {
2500       if (sz <= la)
2501         {
2502           tokenrun *next = next_tokenrun (pfile->cur_run);
2503           /* Lookaheads already in the next run move up one slot...  */
2504           if (sz < la)
2505             memmove (next->base + 1, next->base,
2506                      (la - sz) * sizeof (cpp_token));
2507           /* ...and the last token of this run is copied into its first slot.  */
2508           next->base[0] = pfile->cur_run->limit[-1];
2509         }
2510       /* Shift the lookaheads that stay in this run up by one.  */
2511       if (sz > 1)
2512         memmove (pfile->cur_token + 1, pfile->cur_token,
2513                  MIN (la, sz - 1) * sizeof (cpp_token));
2514     }
2515   /* With no slot left in this run, the new token starts the next run.  */
2516   if (!sz && pfile->cur_token == pfile->cur_run->limit)
2517     {
2518       pfile->cur_run = next_tokenrun (pfile->cur_run);
2519       pfile->cur_token = pfile->cur_run->base;
2520     }
2521
2522   result = pfile->cur_token++;
2523   result->src_loc = old->src_loc;
2524   return result;
2525 }
2526
2527 /* Lex the next token and return a pointer to it (external interface).
2528    Takes care of issues like directive handling, token lookahead,
2529    multiple include optimization and skipping.  */
2530 const cpp_token *
2531 _cpp_lex_token (cpp_reader *pfile)
2532 {
2533   cpp_token *result;
2534
2535   for (;;)
2536     {
2537       if (pfile->cur_token == pfile->cur_run->limit)
2538         {
2539           pfile->cur_run = next_tokenrun (pfile->cur_run);
2540           pfile->cur_token = pfile->cur_run->base;
2541         }
2542       /* We assume that the current token is somewhere in the current
2543          run.  */
2544       if (pfile->cur_token < pfile->cur_run->base
2545           || pfile->cur_token >= pfile->cur_run->limit)
2546         abort ();
2547       /* Reuse a token that was lexed ahead, if any; otherwise lex afresh.  */
2548       if (pfile->lookaheads)
2549         {
2550           pfile->lookaheads--;
2551           result = pfile->cur_token++;
2552         }
2553       else
2554         result = _cpp_lex_direct (pfile);
2555       /* Tokens flagged BOL get directive and line-change processing.  */
2556       if (result->flags & BOL)
2557         {
2558           /* Is this a directive.  If _cpp_handle_directive returns
2559              false, it is an assembler #.  */
2560           if (result->type == CPP_HASH
2561               /* 6.10.3 p 11: Directives in a list of macro arguments
2562                  gives undefined behavior.  This implementation
2563                  handles the directive as normal.  */
2564               && pfile->state.parsing_args != 1)
2565             {
2566               if (_cpp_handle_directive (pfile, result->flags & PREV_WHITE))
2567                 {
2568                   if (pfile->directive_result.type == CPP_PADDING)
2569                     continue; /* The directive produced no token; lex on.  */
2570                   result = &pfile->directive_result;
2571                 }
2572             }
2573           else if (pfile->state.in_deferred_pragma)
2574             result = &pfile->directive_result;
2575
2576           if (pfile->cb.line_change && !pfile->state.skipping)
2577             pfile->cb.line_change (pfile, result, pfile->state.parsing_args);
2578         }
2579
2580       /* We don't skip tokens in directives.  */
2581       if (pfile->state.in_directive || pfile->state.in_deferred_pragma)
2582         break;
2583
2584       /* Outside a directive, invalidate controlling macros.  At file
2585          EOF, _cpp_lex_direct takes care of popping the buffer, so we never
2586          get here and MI optimization works.  */
2587       pfile->mi_valid = false;
2588       /* While skipping, only CPP_EOF is handed back to the caller.  */
2589       if (!pfile->state.skipping || result->type == CPP_EOF)
2590         break;
2591     }
2592
2593   return result;
2594 }
2595
2596 /* Returns true once a line is ready to be lexed: either no new line was
2597    needed or a fresh one has been loaded.  Exhausted buffers are popped.  */
2598 bool
2599 _cpp_get_fresh_line (cpp_reader *pfile)
2600 {
2601   /* We can't get a new line until we leave the current directive.  */
2602   if (pfile->state.in_directive)
2603     return false;
2604
2605   while (1)
2606     {
2607       cpp_buffer *const buf = pfile->buffer;
2608       int stop_at_eof;
2609
2610       /* No new line is needed.  */
2611       if (!buf->need_line)
2612         return true;
2613       /* More text remains in this buffer; let _cpp_clean_line set it up.  */
2614       if (buf->next_line < buf->rlimit)
2615         {
2616           _cpp_clean_line (pfile);
2617           return true;
2618         }
2619
2620       /* First, get out of parsing arguments state.  */
2621       if (pfile->state.parsing_args)
2622         return false;
2623
2624       /* End of buffer.  Non-empty files should end in a newline, so clip
2625          NEXT_LINE to the buffer size if it ran past the end.  */
2626       if (!(buf->buf == buf->rlimit
2627             || buf->next_line <= buf->rlimit
2628             || buf->from_stage3))
2629         buf->next_line = buf->rlimit;
2630
2631       /* Note the flag before popping, then stop or try the next buffer.  */
2632       stop_at_eof = buf->return_at_eof;
2633       _cpp_pop_buffer (pfile);
2634       if (stop_at_eof || pfile->buffer == NULL)
2635         return false;
2636     }
2637 }
2638
     /* Used by _cpp_lex_direct for two-character operators.  Sets
        result->type to ELSE_TYPE; then, if the next unread character is
        CHAR, steps buffer->cur past it and sets result->type to THEN_TYPE
        instead.  Relies on variables named `result' and `buffer' being in
        scope where the macro is expanded.  */
2639 #define IF_NEXT_IS(CHAR, THEN_TYPE, ELSE_TYPE)          \
2640   do                                                    \
2641     {                                                   \
2642       result->type = ELSE_TYPE;                         \
2643       if (*buffer->cur == CHAR)                         \
2644         buffer->cur++, result->type = THEN_TYPE;        \
2645     }                                                   \
2646   while (0)
2647
2648 /* Lex a token into pfile->cur_token, which is also incremented, to
2649    get diagnostics pointing to the correct location.
2650
2651    Does not handle issues such as token lookahead, multiple-include
2652    optimization, directives, skipping etc.  This function is only
2653    suitable for use by _cpp_lex_token, and in special cases like
2654    lex_expansion_token which doesn't care for any of these issues.
2655
2656    When meeting a newline, returns CPP_EOF if parsing a directive,
2657    otherwise returns to the start of the token buffer if permissible.
2658    Returns a pointer to the lexed token.

        Horizontal whitespace is skipped and recorded as PREV_WHITE in the
        token's flags; comments are treated the same way unless
        pfile->state.save_comments is set, in which case a comment becomes
        a token of its own.  The first token lexed after a fresh line is
        loaded carries BOL.  On the normal exit path the token's src_loc
        is widened to a range covering the whole token.  */
2659 cpp_token *
2660 _cpp_lex_direct (cpp_reader *pfile)
2661 {
2662   cppchar_t c;
2663   cpp_buffer *buffer;
2664   const unsigned char *comment_start;
2665   bool fallthrough_comment = false;
2666   cpp_token *result = pfile->cur_token++;
2667
       /* Entered at the start and again after every newline.  */
2668  fresh_line:
2669   result->flags = 0;
2670   buffer = pfile->buffer;
2671   if (buffer->need_line)
2672     {
           /* The end of a deferred pragma's line is reported as its own
              token rather than by loading another line.  */
2673       if (pfile->state.in_deferred_pragma)
2674         {
2675           result->type = CPP_PRAGMA_EOL;
2676           pfile->state.in_deferred_pragma = false;
2677           if (!pfile->state.pragma_allow_expansion)
2678             pfile->state.prevent_expansion--;
2679           return result;
2680         }
2681       if (!_cpp_get_fresh_line (pfile))
2682         {
2683           result->type = CPP_EOF;
2684           if (!pfile->state.in_directive)
2685             {
2686               /* Tell the compiler the line number of the EOF token.  */
2687               result->src_loc = pfile->line_table->highest_line;
2688               result->flags = BOL;
2689             }
2690           return result;
2691         }
           /* A different buffer is now current: drop any pending
              fallthrough-comment state.  */
2692       if (buffer != pfile->buffer)
2693         fallthrough_comment = false;
           /* Tokens are not being kept, so start again at the base of the
              first token run.  */
2694       if (!pfile->keep_tokens)
2695         {
2696           pfile->cur_run = &pfile->base_run;
2697           result = pfile->base_run.base;
2698           pfile->cur_token = result + 1;
2699         }
2700       result->flags = BOL;
2701       if (pfile->state.parsing_args == 2)
2702         result->flags |= PREV_WHITE;
2703     }
2704   buffer = pfile->buffer;
       /* Re-entered after a comment that is not saved as a token.  */
2705  update_tokens_line:
2706   result->src_loc = pfile->line_table->highest_line;
2707
       /* Re-entered after a run of horizontal whitespace.  */
2708  skipped_white:
2709   if (buffer->cur >= buffer->notes[buffer->cur_note].pos
2710       && !pfile->overlaid_buffer)
2711     {
2712       _cpp_process_line_notes (pfile, false);
2713       result->src_loc = pfile->line_table->highest_line;
2714     }
2715   c = *buffer->cur++;
2716
       /* A forced location, when set, overrides the position computed
          from the buffer column.  */
2717   if (pfile->forced_token_location_p)
2718     result->src_loc = *pfile->forced_token_location_p;
2719   else
2720     result->src_loc = linemap_position_for_column (pfile->line_table,
2721                                           CPP_BUF_COLUMN (buffer, buffer->cur));
2722
2723   switch (c)
2724     {
2725     case ' ': case '\t': case '\f': case '\v': case '\0':
2726       result->flags |= PREV_WHITE;
2727       skip_whitespace (pfile, c);
2728       goto skipped_white;
2729
2730     case '\n':
           /* The line counter is not advanced for a newline that sits at
              the very end of the buffer.  */
2731       if (buffer->cur < buffer->rlimit)
2732         CPP_INCREMENT_LINE (pfile, 0);
2733       buffer->need_line = true;
2734       goto fresh_line;
2735
2736     case '0': case '1': case '2': case '3': case '4':
2737     case '5': case '6': case '7': case '8': case '9':
2738       {
2739         struct normalize_state nst = INITIAL_NORMALIZE_STATE;
2740         result->type = CPP_NUMBER;
2741         lex_number (pfile, &result->val.str, &nst);
2742         warn_about_normalization (pfile, result, &nst);
2743         break;
2744       }
2745
2746     case 'L':
2747     case 'u':
2748     case 'U':
2749     case 'R':
2750       /* 'L', 'u', 'U', 'u8' or 'R' may introduce wide characters,
2751          wide strings or raw strings.  */
2752       if (c == 'L' || CPP_OPTION (pfile, rliterals)
2753           || (c != 'R' && CPP_OPTION (pfile, uliterals)))
2754         {
2755           if ((*buffer->cur == '\'' && c != 'R')
2756               || *buffer->cur == '"'
2757               || (*buffer->cur == 'R'
2758                   && c != 'R'
2759                   && buffer->cur[1] == '"'
2760                   && CPP_OPTION (pfile, rliterals))
2761               || (*buffer->cur == '8'
2762                   && c == 'u'
2763                   && ((buffer->cur[1] == '"' || (buffer->cur[1] == '\''
2764                                 && CPP_OPTION (pfile, utf8_char_literals)))
2765                       || (buffer->cur[1] == 'R' && buffer->cur[2] == '"'
2766                           && CPP_OPTION (pfile, rliterals)))))
2767             {
2768               lex_string (pfile, result, buffer->cur - 1);
2769               break;
2770             }
2771         }
2772       /* Fall through.  */
2773
           /* 'L', 'R', 'U' and 'u' are absent from the lists below because
              the cases above handle them and fall through to here when no
              literal prefix was recognised.  */
2774     case '_':
2775     case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
2776     case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
2777     case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
2778     case 's': case 't':           case 'v': case 'w': case 'x':
2779     case 'y': case 'z':
2780     case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
2781     case 'G': case 'H': case 'I': case 'J': case 'K':
2782     case 'M': case 'N': case 'O': case 'P': case 'Q':
2783     case 'S': case 'T':           case 'V': case 'W': case 'X':
2784     case 'Y': case 'Z':
2785       result->type = CPP_NAME;
2786       {
2787         struct normalize_state nst = INITIAL_NORMALIZE_STATE;
2788         result->val.node.node = lex_identifier (pfile, buffer->cur - 1, false,
2789                                                 &nst,
2790                                                 &result->val.node.spelling);
2791         warn_about_normalization (pfile, result, &nst);
2792       }
2793
2794       /* Convert named operators to their proper types.  */
2795       if (result->val.node.node->flags & NODE_OPERATOR)
2796         {
2797           result->flags |= NAMED_OP;
2798           result->type = (enum cpp_ttype) result->val.node.node->directive_index;
2799         }
2800
2801       /* Signal FALLTHROUGH comment followed by another token.  */
2802       if (fallthrough_comment)
2803         result->flags |= PREV_FALLTHROUGH;
2804       break;
2805
2806     case '\'':
2807     case '"':
2808       lex_string (pfile, result, buffer->cur - 1);
2809       break;
2810
2811     case '/':
2812       /* A potential block or line comment.  */
2813       comment_start = buffer->cur;
2814       c = *buffer->cur;
2815
2816       if (c == '*')
2817         {
2818           if (_cpp_skip_block_comment (pfile))
2819             cpp_error (pfile, CPP_DL_ERROR, "unterminated comment");
2820         }
2821       else if (c == '/' && ! CPP_OPTION (pfile, traditional))
2822         {
2823           /* Don't warn for system headers.  */
2824           if (cpp_in_system_header (pfile))
2825             ;
2826           /* Warn about comments if pedantically GNUC89, and not
2827              in system headers.  */
2828           else if (CPP_OPTION (pfile, lang) == CLK_GNUC89
2829                    && CPP_PEDANTIC (pfile)
2830                    && ! buffer->warned_cplusplus_comments)
2831             {
2832               cpp_error (pfile, CPP_DL_PEDWARN,
2833                          "C++ style comments are not allowed in ISO C90");
2834               cpp_error (pfile, CPP_DL_PEDWARN,
2835                          "(this will be reported only once per input file)");
2836               buffer->warned_cplusplus_comments = 1;
2837             }
2838           /* Or if specifically desired via -Wc90-c99-compat.  */
2839           else if (CPP_OPTION (pfile, cpp_warn_c90_c99_compat) > 0
2840                    && ! CPP_OPTION (pfile, cplusplus)
2841                    && ! buffer->warned_cplusplus_comments)
2842             {
2843               cpp_error (pfile, CPP_DL_WARNING,
2844                          "C++ style comments are incompatible with C90");
2845               cpp_error (pfile, CPP_DL_WARNING,
2846                          "(this will be reported only once per input file)");
2847               buffer->warned_cplusplus_comments = 1;
2848             }
2849           /* In C89/C94, C++ style comments are forbidden.  */
2850           else if ((CPP_OPTION (pfile, lang) == CLK_STDC89
2851                     || CPP_OPTION (pfile, lang) == CLK_STDC94))
2852             {
2853               /* But don't be confused about valid code such as
2854                  - // immediately followed by *,
2855                  - // in a preprocessing directive,
2856                  - // in an #if 0 block.  */
2857               if (buffer->cur[1] == '*'
2858                   || pfile->state.in_directive
2859                   || pfile->state.skipping)
2860                 {
2861                   result->type = CPP_DIV;
2862                   break;
2863                 }
2864               else if (! buffer->warned_cplusplus_comments)
2865                 {
2866                   cpp_error (pfile, CPP_DL_ERROR,
2867                              "C++ style comments are not allowed in ISO C90");
2868                   cpp_error (pfile, CPP_DL_ERROR,
2869                              "(this will be reported only once per input "
2870                              "file)");
2871                   buffer->warned_cplusplus_comments = 1;
2872                 }
2873             }
2874           if (skip_line_comment (pfile) && CPP_OPTION (pfile, warn_comments))
2875             cpp_warning (pfile, CPP_W_COMMENTS, "multi-line comment");
2876         }
2877       else if (c == '=')
2878         {
2879           buffer->cur++;
2880           result->type = CPP_DIV_EQ;
2881           break;
2882         }
2883       else
2884         {
2885           result->type = CPP_DIV;
2886           break;
2887         }
2888
           /* Only reached after a comment was skipped (every other branch
              above breaks out); C is '*' for a block comment and '/' for a
              line comment.  */
2889       if (fallthrough_comment_p (pfile, comment_start))
2890         fallthrough_comment = true;
2891
           /* COMMENT_START points just past the leading '/', hence the
              - 1 and + 1 adjustments so the callback sees the whole
              comment.  */
2892       if (pfile->cb.comment)
2893         {
2894           size_t len = pfile->buffer->cur - comment_start;
2895           pfile->cb.comment (pfile, result->src_loc, comment_start - 1,
2896                              len + 1);
2897         }
2898
           /* Unless comments are being saved, a comment counts as
              whitespace and lexing continues with what follows it.  */
2899       if (!pfile->state.save_comments)
2900         {
2901           result->flags |= PREV_WHITE;
2902           goto update_tokens_line;
2903         }
2904
2905       if (fallthrough_comment)
2906         result->flags |= PREV_FALLTHROUGH;
2907
2908       /* Save the comment as a token in its own right.  */
2909       save_comment (pfile, result, comment_start, c);
2910       break;
2911
2912     case '<':
           /* In angled-header mode lex_string gets the first try; the
              ordinary '<' handling below runs only if it left the token
              typed as CPP_LESS.  */
2913       if (pfile->state.angled_headers)
2914         {
2915           lex_string (pfile, result, buffer->cur - 1);
2916           if (result->type != CPP_LESS)
2917             break;
2918         }
2919
2920       result->type = CPP_LESS;
2921       if (*buffer->cur == '=')
2922         buffer->cur++, result->type = CPP_LESS_EQ;
2923       else if (*buffer->cur == '<')
2924         {
2925           buffer->cur++;
2926           IF_NEXT_IS ('=', CPP_LSHIFT_EQ, CPP_LSHIFT);
2927         }
2928       else if (CPP_OPTION (pfile, digraphs))
2929         {
2930           if (*buffer->cur == ':')
2931             {
2932               /* C++11 [2.5/3 lex.pptoken], "Otherwise, if the next
2933                  three characters are <:: and the subsequent character
2934                  is neither : nor >, the < is treated as a preprocessor
2935                  token by itself".  */
2936               if (CPP_OPTION (pfile, cplusplus)
2937                   && CPP_OPTION (pfile, lang) != CLK_CXX98
2938                   && CPP_OPTION (pfile, lang) != CLK_GNUCXX
2939                   && buffer->cur[1] == ':'
2940                   && buffer->cur[2] != ':' && buffer->cur[2] != '>')
2941                 break;
2942
2943               buffer->cur++;
2944               result->flags |= DIGRAPH;
2945               result->type = CPP_OPEN_SQUARE;
2946             }
2947           else if (*buffer->cur == '%')
2948             {
2949               buffer->cur++;
2950               result->flags |= DIGRAPH;
2951               result->type = CPP_OPEN_BRACE;
2952             }
2953         }
2954       break;
2955
2956     case '>':
2957       result->type = CPP_GREATER;
2958       if (*buffer->cur == '=')
2959         buffer->cur++, result->type = CPP_GREATER_EQ;
2960       else if (*buffer->cur == '>')
2961         {
2962           buffer->cur++;
2963           IF_NEXT_IS ('=', CPP_RSHIFT_EQ, CPP_RSHIFT);
2964         }
2965       break;
2966
2967     case '%':
2968       result->type = CPP_MOD;
2969       if (*buffer->cur == '=')
2970         buffer->cur++, result->type = CPP_MOD_EQ;
2971       else if (CPP_OPTION (pfile, digraphs))
2972         {
2973           if (*buffer->cur == ':')
2974             {
2975               buffer->cur++;
2976               result->flags |= DIGRAPH;
2977               result->type = CPP_HASH;
                   /* "%:%:" is the digraph spelling of "##".  */
2978               if (*buffer->cur == '%' && buffer->cur[1] == ':')
2979                 buffer->cur += 2, result->type = CPP_PASTE, result->val.token_no = 0;
2980             }
2981           else if (*buffer->cur == '>')
2982             {
2983               buffer->cur++;
2984               result->flags |= DIGRAPH;
2985               result->type = CPP_CLOSE_BRACE;
2986             }
2987         }
2988       break;
2989
2990     case '.':
2991       result->type = CPP_DOT;
2992       if (ISDIGIT (*buffer->cur))
2993         {
2994           struct normalize_state nst = INITIAL_NORMALIZE_STATE;
2995           result->type = CPP_NUMBER;
2996           lex_number (pfile, &result->val.str, &nst);
2997           warn_about_normalization (pfile, result, &nst);
2998         }
2999       else if (*buffer->cur == '.' && buffer->cur[1] == '.')
3000         buffer->cur += 2, result->type = CPP_ELLIPSIS;
3001       else if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
3002         buffer->cur++, result->type = CPP_DOT_STAR;
3003       break;
3004
3005     case '+':
3006       result->type = CPP_PLUS;
3007       if (*buffer->cur == '+')
3008         buffer->cur++, result->type = CPP_PLUS_PLUS;
3009       else if (*buffer->cur == '=')
3010         buffer->cur++, result->type = CPP_PLUS_EQ;
3011       break;
3012
3013     case '-':
3014       result->type = CPP_MINUS;
3015       if (*buffer->cur == '>')
3016         {
3017           buffer->cur++;
3018           result->type = CPP_DEREF;
3019           if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
3020             buffer->cur++, result->type = CPP_DEREF_STAR;
3021         }
3022       else if (*buffer->cur == '-')
3023         buffer->cur++, result->type = CPP_MINUS_MINUS;
3024       else if (*buffer->cur == '=')
3025         buffer->cur++, result->type = CPP_MINUS_EQ;
3026       break;
3027
3028     case '&':
3029       result->type = CPP_AND;
3030       if (*buffer->cur == '&')
3031         buffer->cur++, result->type = CPP_AND_AND;
3032       else if (*buffer->cur == '=')
3033         buffer->cur++, result->type = CPP_AND_EQ;
3034       break;
3035
3036     case '|':
3037       result->type = CPP_OR;
3038       if (*buffer->cur == '|')
3039         buffer->cur++, result->type = CPP_OR_OR;
3040       else if (*buffer->cur == '=')
3041         buffer->cur++, result->type = CPP_OR_EQ;
3042       break;
3043
3044     case ':':
3045       result->type = CPP_COLON;
3046       if (*buffer->cur == ':' && CPP_OPTION (pfile, cplusplus))
3047         buffer->cur++, result->type = CPP_SCOPE;
3048       else if (*buffer->cur == '>' && CPP_OPTION (pfile, digraphs))
3049         {
3050           buffer->cur++;
3051           result->flags |= DIGRAPH;
3052           result->type = CPP_CLOSE_SQUARE;
3053         }
3054       break;
3055
3056     case '*': IF_NEXT_IS ('=', CPP_MULT_EQ, CPP_MULT); break;
3057     case '=': IF_NEXT_IS ('=', CPP_EQ_EQ, CPP_EQ); break;
3058     case '!': IF_NEXT_IS ('=', CPP_NOT_EQ, CPP_NOT); break;
3059     case '^': IF_NEXT_IS ('=', CPP_XOR_EQ, CPP_XOR); break;
3060     case '#': IF_NEXT_IS ('#', CPP_PASTE, CPP_HASH); result->val.token_no = 0; break;
3061
3062     case '?': result->type = CPP_QUERY; break;
3063     case '~': result->type = CPP_COMPL; break;
3064     case ',': result->type = CPP_COMMA; break;
3065     case '(': result->type = CPP_OPEN_PAREN; break;
3066     case ')': result->type = CPP_CLOSE_PAREN; break;
3067     case '[': result->type = CPP_OPEN_SQUARE; break;
3068     case ']': result->type = CPP_CLOSE_SQUARE; break;
3069     case '{': result->type = CPP_OPEN_BRACE; break;
3070     case '}': result->type = CPP_CLOSE_BRACE; break;
3071     case ';': result->type = CPP_SEMICOLON; break;
3072
3073       /* @ is a punctuator in Objective-C.  */
3074     case '@': result->type = CPP_ATSIGN; break;
3075
3076     case '$':
3077     case '\\':
3078       {
             /* Step back onto the character so that forms_identifier_p
                examines it; the step is undone below when no identifier
                starts here, and the character becomes a CPP_OTHER.  */
3079         const uchar *base = --buffer->cur;
3080         struct normalize_state nst = INITIAL_NORMALIZE_STATE;
3081
3082         if (forms_identifier_p (pfile, true, &nst))
3083           {
3084             result->type = CPP_NAME;
3085             result->val.node.node = lex_identifier (pfile, base, true, &nst,
3086                                                     &result->val.node.spelling);
3087             warn_about_normalization (pfile, result, &nst);
3088             break;
3089           }
3090         buffer->cur++;
3091       }
3092       /* FALLTHRU */
3093
3094     default:
3095       create_literal (pfile, result, buffer->cur - 1, 1, CPP_OTHER);
3096       break;
3097     }
3098
3099   /* Potentially convert the location of the token to a range.  */
3100   if (result->src_loc >= RESERVED_LOCATION_COUNT
3101       && result->type != CPP_EOF)
3102     {
3103       /* Ensure that any line notes are processed, so that we have the
3104          correct physical line/column for the end-point of the token even
3105          when a logical line is split via one or more backslashes.  */
3106       if (buffer->cur >= buffer->notes[buffer->cur_note].pos
3107           && !pfile->overlaid_buffer)
3108         _cpp_process_line_notes (pfile, false);
3109
           /* The range starts at the location recorded before the switch
              and finishes at the column buffer->cur has reached.  */
3110       source_range tok_range;
3111       tok_range.m_start = result->src_loc;
3112       tok_range.m_finish
3113         = linemap_position_for_column (pfile->line_table,
3114                                        CPP_BUF_COLUMN (buffer, buffer->cur));
3115
3116       result->src_loc = COMBINE_LOCATION_DATA (pfile->line_table,
3117                                                result->src_loc,
3118                                                tok_range, NULL);
3119     }
3120
3121   return result;
3122 }
3123
3124 /* An upper bound on the number of bytes needed to spell TOKEN.
3125    Does not include preceding whitespace.

        Literals report their stored length.  Identifiers report ten times
        the length of their name, which leaves room for a 10-byte \U escape
        per name byte when the identifier is spelled with UCNs.  Every
        other token is given a fixed allowance of 6 bytes.  */
3126 unsigned int
3127 cpp_token_len (const cpp_token *token)
3128 {
3129   unsigned int len;
3130
3131   switch (TOKEN_SPELL (token))
3132     {
3133     default:            len = 6;                                break;
3134     case SPELL_LITERAL: len = token->val.str.len;               break;
3135     case SPELL_IDENT:   len = NODE_LEN (token->val.node.node) * 10;     break;
3136     }
3137
3138   return len;
3139 }
3140
3141 /* Parse UTF-8 out of NAME and place a \U escape in BUFFER.
3142    Return the number of bytes read out of NAME.  (There are always
3143    10 bytes written to BUFFER.)

        NAME must point at the lead byte of a multi-byte sequence; the
        length is taken from the number of leading one bits in that byte.
        The escape is written as a backslash, 'U' and eight lowercase
        hexadecimal digits, with no terminating NUL.  Calls abort () if a
        continuation byte does not have the form 10xxxxxx.  */
3144
3145 static size_t
3146 utf8_to_ucn (unsigned char *buffer, const unsigned char *name)
3147 {
3148   int j;
3149   int ucn_len = 0;
3150   int ucn_len_c;
3151   unsigned t;
3152   unsigned long utf32;
3153
3154   /* Compute the length of the UTF-8 sequence.  */
3155   for (t = *name; t & 0x80; t <<= 1)
3156     ucn_len++;
3157
       /* Keep only the payload bits of the lead byte, then append six
          payload bits from each continuation byte.  */
3158   utf32 = *name & (0x7F >> ucn_len);
3159   for (ucn_len_c = 1; ucn_len_c < ucn_len; ucn_len_c++)
3160     {
3161       utf32 = (utf32 << 6) | (*++name & 0x3F);
3162
3163       /* Ill-formed UTF-8.  */
3164       if ((*name & ~0x3F) != 0x80)
3165         abort ();
3166     }
3167
3168   *buffer++ = '\\';
3169   *buffer++ = 'U';
       /* Eight hex digits, most significant nibble first.  */
3170   for (j = 7; j >= 0; j--)
3171     *buffer++ = "0123456789abcdef"[(utf32 >> (4 * j)) & 0xF];
3172   return ucn_len;
3173 }
3174
3175 /* Given a token TYPE corresponding to a digraph, return a pointer to
3176    the spelling of the digraph.  The spelling is looked up in
        digraph_spellings at TYPE's offset from CPP_FIRST_DIGRAPH; the
        index is not range-checked, so TYPE must be one of the token types
        that table covers.  */
3177 static const unsigned char *
3178 cpp_digraph2name (enum cpp_ttype type)
3179 {
3180   return digraph_spellings[(int) type - (int) CPP_FIRST_DIGRAPH];
3181 }
3182
3183 /* Write the spelling of an identifier IDENT, using UCNs, to BUFFER.
3184    The buffer must already contain enough space to hold the
3185    token's spelling.  Returns a pointer to the character after the
3186    last character written.  No terminating NUL is written.  */
3187 unsigned char *
3188 _cpp_spell_ident_ucns (unsigned char *buffer, cpp_hashnode *ident)
3189 {
3190   size_t i;
3191   const unsigned char *name = NODE_NAME (ident);
3192
       /* Bytes with the top bit set start a multi-byte sequence, which
          utf8_to_ucn turns into a 10-byte escape; the - 1 allows for the
          loop's own i++.  All other bytes are copied unchanged.  */
3193   for (i = 0; i < NODE_LEN (ident); i++)
3194     if (name[i] & ~0x7F)
3195       {
3196         i += utf8_to_ucn (buffer, name + i) - 1;
3197         buffer += 10;
3198       }
3199     else
3200       *buffer++ = name[i];
3201
3202   return buffer;
3203 }
3204
3205 /* Write the spelling of a token TOKEN to BUFFER.  The buffer must
3206    already contain enough space to hold the token's spelling.
3207    Returns a pointer to the character after the last character written.
3208    FORSTRING is true if this is to be the spelling after translation
3209    phase 1 (with the original spelling of extended identifiers), false
3210    if extended identifiers should always be written using UCNs (there is
3211    no option for always writing them in the internal UTF-8 form).
        The output is not NUL-terminated.  A token with no spelling
        (SPELL_NONE) is reported through cpp_error at CPP_DL_ICE level and
        nothing is written for it.
3212    FIXME: Would be nice if we didn't need the PFILE argument.  */
3213 unsigned char *
3214 cpp_spell_token (cpp_reader *pfile, const cpp_token *token,
3215                  unsigned char *buffer, bool forstring)
3216 {
3217   switch (TOKEN_SPELL (token))
3218     {
3219     case SPELL_OPERATOR:
3220       {
3221         const unsigned char *spelling;
3222         unsigned char c;
3223
             /* Digraphs have their own spelling table; named operators
                are spelled like identifiers.  */
3224         if (token->flags & DIGRAPH)
3225           spelling = cpp_digraph2name (token->type);
3226         else if (token->flags & NAMED_OP)
3227           goto spell_ident;
3228         else
3229           spelling = TOKEN_NAME (token);
3230
3231         while ((c = *spelling++) != '\0')
3232           *buffer++ = c;
3233       }
3234       break;
3235
3236     spell_ident:
3237     case SPELL_IDENT:
3238       if (forstring)
3239         {
3240           memcpy (buffer, NODE_NAME (token->val.node.spelling),
3241                   NODE_LEN (token->val.node.spelling));
3242           buffer += NODE_LEN (token->val.node.spelling);
3243         }
3244       else
3245         buffer = _cpp_spell_ident_ucns (buffer, token->val.node.node);
3246       break;
3247
3248     case SPELL_LITERAL:
3249       memcpy (buffer, token->val.str.text, token->val.str.len);
3250       buffer += token->val.str.len;
3251       break;
3252
3253     case SPELL_NONE:
3254       cpp_error (pfile, CPP_DL_ICE,
3255                  "unspellable token %s", TOKEN_NAME (token));
3256       break;
3257     }
3258
3259   return buffer;
3260 }
3261
3262 /* Returns TOKEN spelt as a null-terminated string.  The string is
3263    freed when the reader is destroyed.  Useful for diagnostics.
        Extended identifiers are written using UCNs (cpp_spell_token is
        called with FORSTRING false).  */
3264 unsigned char *
3265 cpp_token_as_text (cpp_reader *pfile, const cpp_token *token)
3266 {
       /* One byte beyond the spelling bound, for the terminating NUL.  */
3267   unsigned int len = cpp_token_len (token) + 1;
3268   unsigned char *start = _cpp_unaligned_alloc (pfile, len), *end;
3269
3270   end = cpp_spell_token (pfile, token, start, false);
3271   end[0] = '\0';
3272
3273   return start;
3274 }
3275
3276 /* Returns a pointer to a string which spells the token defined by
3277    TYPE and FLAGS.  Used by C front ends, which really should move to
3278    using cpp_token_as_text.  DIGRAPH in FLAGS selects the digraph
        spelling and takes precedence over NAMED_OP, which selects the
        named-operator spelling; otherwise the entry for TYPE in
        token_spellings is returned.  */
3279 const char *
3280 cpp_type2name (enum cpp_ttype type, unsigned char flags)
3281 {
3282   if (flags & DIGRAPH)
3283     return (const char *) cpp_digraph2name (type);
3284   else if (flags & NAMED_OP)
3285     return cpp_named_operator2name (type);
3286
3287   return (const char *) token_spellings[type].name;
3288 }
3289
3290 /* Writes the spelling of token to FP, without any preceding space.
3291    Separated from cpp_spell_token for efficiency - to avoid stdio
3292    double-buffering.  Identifiers and named operators are always
        written with UCNs in place of non-ASCII bytes.  Tokens with no
        spelling (SPELL_NONE) produce no output.  */
3293 void
3294 cpp_output_token (const cpp_token *token, FILE *fp)
3295 {
3296   switch (TOKEN_SPELL (token))
3297     {
3298     case SPELL_OPERATOR:
3299       {
3300         const unsigned char *spelling;
3301         int c;
3302
3303         if (token->flags & DIGRAPH)
3304           spelling = cpp_digraph2name (token->type);
3305         else if (token->flags & NAMED_OP)
3306           goto spell_ident;
3307         else
3308           spelling = TOKEN_NAME (token);
3309
             /* The first character is written unconditionally, so the
                spelling is expected to be non-empty.  */
3310         c = *spelling;
3311         do
3312           putc (c, fp);
3313         while ((c = *++spelling) != '\0');
3314       }
3315       break;
3316
3317     spell_ident:
3318     case SPELL_IDENT:
3319       {
3320         size_t i;
3321         const unsigned char * name = NODE_NAME (token->val.node.node);
3322
             /* Same scheme as _cpp_spell_ident_ucns, but each escape is
                built in a 10-byte scratch buffer and written straight to
                FP.  */
3323         for (i = 0; i < NODE_LEN (token->val.node.node); i++)
3324           if (name[i] & ~0x7F)
3325             {
3326               unsigned char buffer[10];
3327               i += utf8_to_ucn (buffer, name + i) - 1;
3328               fwrite (buffer, 1, 10, fp);
3329             }
3330           else
3331             fputc (NODE_NAME (token->val.node.node)[i], fp);
3332       }
3333       break;
3334
3335     case SPELL_LITERAL:
3336       fwrite (token->val.str.text, 1, token->val.str.len, fp);
3337       break;
3338
3339     case SPELL_NONE:
3340       /* An error, most probably.  */
3341       break;
3342     }
3343 }
3344
3345 /* Compare two tokens.  Returns nonzero if A and B are equivalent and
        zero otherwise.  Tokens whose type or flags differ are never
        equivalent; beyond that the comparison depends on how the token is
        spelled.  */
3346 int
3347 _cpp_equiv_tokens (const cpp_token *a, const cpp_token *b)
3348 {
3349   if (a->type == b->type && a->flags == b->flags)
3350     switch (TOKEN_SPELL (a))
3351       {
3352       default:                  /* Keep compiler happy.  */
3353       case SPELL_OPERATOR:
3354         /* token_no is used to track where multiple consecutive ##
3355            tokens were originally located.  */
3356         return (a->type != CPP_PASTE || a->val.token_no == b->val.token_no);
3357       case SPELL_NONE:
             /* Only macro arguments carry a payload to compare: the
                argument number and the spelling pointer.  */
3358         return (a->type != CPP_MACRO_ARG
3359                 || (a->val.macro_arg.arg_no == b->val.macro_arg.arg_no
3360                     && a->val.macro_arg.spelling == b->val.macro_arg.spelling));
3361       case SPELL_IDENT:
             /* Identifiers are compared by node and spelling pointers, not
                by their text.  */
3362         return (a->val.node.node == b->val.node.node
3363                 && a->val.node.spelling == b->val.node.spelling);
3364       case SPELL_LITERAL:
3365         return (a->val.str.len == b->val.str.len
3366                 && !memcmp (a->val.str.text, b->val.str.text,
3367                             a->val.str.len));
3368       }
3369
3370   return 0;
3371 }
3372
3373 /* Returns nonzero if a space should be inserted to avoid an
3374    accidental token paste for output.  For simplicity, it is
3375    conservative, and occasionally advises a space where one is not
3376    needed, e.g. "." and ".2".  TOKEN1 is the token written first and
        TOKEN2 the one that would follow it.  */
3377 int
3378 cpp_avoid_paste (cpp_reader *pfile, const cpp_token *token1,
3379                  const cpp_token *token2)
3380 {
3381   enum cpp_ttype a = token1->type, b = token2->type;
3382   cppchar_t c;
3383
       /* Named operators are treated as names for pasting purposes.  */
3384   if (token1->flags & NAMED_OP)
3385     a = CPP_NAME;
3386   if (token2->flags & NAMED_OP)
3387     b = CPP_NAME;
3388
       /* C is the first character of TOKEN2's spelling when TOKEN2 is a
          digraph or an operator, and EOF otherwise.  */
3389   c = EOF;
3390   if (token2->flags & DIGRAPH)
3391     c = digraph_spellings[(int) b - (int) CPP_FIRST_DIGRAPH][0];
3392   else if (token_spellings[b].category == SPELL_OPERATOR)
3393     c = token_spellings[b].name[0];
3394
3395   /* Quickly get everything that can paste with an '='.  */
3396   if ((int) a <= (int) CPP_LAST_EQ && c == '=')
3397     return 1;
3398
3399   switch (a)
3400     {
3401     case CPP_GREATER:   return c == '>';
3402     case CPP_LESS:      return c == '<' || c == '%' || c == ':';
3403     case CPP_PLUS:      return c == '+';
3404     case CPP_MINUS:     return c == '-' || c == '>';
3405     case CPP_DIV:       return c == '/' || c == '*'; /* Comments.  */
3406     case CPP_MOD:       return c == ':' || c == '>';
3407     case CPP_AND:       return c == '&';
3408     case CPP_OR:        return c == '|';
3409     case CPP_COLON:     return c == ':' || c == '>';
3410     case CPP_DEREF:     return c == '*';
3411     case CPP_DOT:       return c == '.' || c == '%' || b == CPP_NUMBER;
3412     case CPP_HASH:      return c == '#' || c == '%'; /* Digraph form.  */
3413     case CPP_NAME:      return ((b == CPP_NUMBER
3414                                  && name_p (pfile, &token2->val.str))
3415                                 || b == CPP_NAME
3416                                 || b == CPP_CHAR || b == CPP_STRING); /* L */
3417     case CPP_NUMBER:    return (b == CPP_NUMBER || b == CPP_NAME
3418                                 || c == '.' || c == '+' || c == '-');
3419                                       /* UCNs */
3420     case CPP_OTHER:     return ((token1->val.str.text[0] == '\\'
3421                                  && b == CPP_NAME)
3422                                 || (CPP_OPTION (pfile, objc)
3423                                     && token1->val.str.text[0] == '@'
3424                                     && (b == CPP_NAME || b == CPP_STRING)));
         /* With user-defined literals enabled, a string followed by a name,
            or by a literal starting with an identifier-start character,
            needs a space.  */
3425     case CPP_STRING:
3426     case CPP_WSTRING:
3427     case CPP_UTF8STRING:
3428     case CPP_STRING16:
3429     case CPP_STRING32:  return (CPP_OPTION (pfile, user_literals)
3430                                 && (b == CPP_NAME
3431                                     || (TOKEN_SPELL (token2) == SPELL_LITERAL
3432                                         && ISIDST (token2->val.str.text[0]))));
3433
3434     default:            break;
3435     }
3436
3437   return 0;
3438 }
3439
3440 /* Output all the remaining tokens on the current line, and a newline
3441    character, to FP.  Leading whitespace is removed.  If there are
3442    macros, special token padding is not performed.  Tokens are read
        with cpp_get_token until CPP_EOF is returned.  */
3443 void
3444 cpp_output_line (cpp_reader *pfile, FILE *fp)
3445 {
3446   const cpp_token *token;
3447
3448   token = cpp_get_token (pfile);
3449   while (token->type != CPP_EOF)
3450     {
3451       cpp_output_token (token, fp);
           /* The separating space belongs to the following token, so fetch
              that token before deciding whether to write one.  The first
              token's own PREV_WHITE is never looked at, which is how
              leading whitespace is dropped.  */
3452       token = cpp_get_token (pfile);
3453       if (token->flags & PREV_WHITE)
3454         putc (' ', fp);
3455     }
3456
3457   putc ('\n', fp);
3458 }
3459
3460 /* Return a string representation of all the remaining tokens on the
3461    current line.  The result is allocated using xmalloc and must be
3462    freed by the caller.  If DIR_NAME is not NULL the string begins
        with "#", DIR_NAME and a space.  Extended identifiers are written
        using UCNs.  */
3463 unsigned char *
3464 cpp_output_line_to_string (cpp_reader *pfile, const unsigned char *dir_name)
3465 {
3466   const cpp_token *token;
       /* OUT is the number of bytes of RESULT in use so far.  */
3467   unsigned int out = dir_name ? ustrlen (dir_name) : 0;
3468   unsigned int alloced = 120 + out;
3469   unsigned char *result = (unsigned char *) xmalloc (alloced);
3470
3471   /* If DIR_NAME is NULL, there are no initial contents.  */
3472   if (dir_name)
3473     {
3474       sprintf ((char *) result, "#%s ", dir_name);
           /* Account for the '#' and the space around the name.  */
3475       out += 2;
3476     }
3477
3478   token = cpp_get_token (pfile);
3479   while (token->type != CPP_EOF)
3480     {
3481       unsigned char *last;
3482       /* Include room for a possible space and the terminating nul.  */
3483       unsigned int len = cpp_token_len (token) + 2;
3484
           /* Grow by doubling, or to exactly what is needed when doubling
              is still not enough.  */
3485       if (out + len > alloced)
3486         {
3487           alloced *= 2;
3488           if (out + len > alloced)
3489             alloced = out + len;
3490           result = (unsigned char *) xrealloc (result, alloced);
3491         }
3492
3493       last = cpp_spell_token (pfile, token, &result[out], 0);
3494       out = last - result;
3495
3496       token = cpp_get_token (pfile);
3497       if (token->flags & PREV_WHITE)
3498         result[out++] = ' ';
3499     }
3500
3501   result[out] = '\0';
3502   return result;
3503 }
3504
/* Memory buffers.  Changing these three constants can have a dramatic
   effect on performance.  The values here are reasonable defaults,
   but might be tuned.  If you adjust them, be sure to test across a
   range of uses of cpplib, including heavy nested function-like macro
   expansion.  Also check the change in peak memory usage (NJAMD is a
   good tool for this).

   MIN_BUFF_SIZE is the smallest payload new_buff will allocate.
   BUFF_SIZE_UPPER_BOUND (MIN_SIZE) is the largest free buffer that
   _cpp_get_buff will hand out for a request of MIN_SIZE bytes.
   EXTENDED_BUFF_SIZE (BUFF, MIN_EXTRA) is the size requested when
   extending BUFF: MIN_EXTRA plus twice the distance from BUFF->cur to
   BUFF->limit.  Every macro parameter is parenthesized so that an
   argument containing a low-precedence operator expands as a unit.  */
#define MIN_BUFF_SIZE 8000
#define BUFF_SIZE_UPPER_BOUND(MIN_SIZE) (MIN_BUFF_SIZE + (MIN_SIZE) * 3 / 2)
#define EXTENDED_BUFF_SIZE(BUFF, MIN_EXTRA) \
	((MIN_EXTRA) + ((BUFF)->limit - (BUFF)->cur) * 2)

#if MIN_BUFF_SIZE > BUFF_SIZE_UPPER_BOUND (0)
  #error BUFF_SIZE_UPPER_BOUND must be at least as large as MIN_BUFF_SIZE!
#endif
3519
3520 /* Create a new allocation buffer.  Place the control block at the end
3521    of the buffer, so that buffer overflows will cause immediate chaos.  */
3522 static _cpp_buff *
3523 new_buff (size_t len)
3524 {
3525   _cpp_buff *result;
3526   unsigned char *base;
3527
3528   if (len < MIN_BUFF_SIZE)
3529     len = MIN_BUFF_SIZE;
3530   len = CPP_ALIGN (len);
3531
3532 #ifdef ENABLE_VALGRIND_ANNOTATIONS
3533   /* Valgrind warns about uses of interior pointers, so put _cpp_buff
3534      struct first.  */
3535   size_t slen = CPP_ALIGN2 (sizeof (_cpp_buff), 2 * DEFAULT_ALIGNMENT);
3536   base = XNEWVEC (unsigned char, len + slen);
3537   result = (_cpp_buff *) base;
3538   base += slen;
3539 #else
3540   base = XNEWVEC (unsigned char, len + sizeof (_cpp_buff));
3541   result = (_cpp_buff *) (base + len);
3542 #endif
3543   result->base = base;
3544   result->cur = base;
3545   result->limit = base + len;
3546   result->next = NULL;
3547   return result;
3548 }
3549
3550 /* Place a chain of unwanted allocation buffers on the free list.  */
3551 void
3552 _cpp_release_buff (cpp_reader *pfile, _cpp_buff *buff)
3553 {
3554   _cpp_buff *end = buff;
3555
3556   while (end->next)
3557     end = end->next;
3558   end->next = pfile->free_buffs;
3559   pfile->free_buffs = buff;
3560 }
3561
3562 /* Return a free buffer of size at least MIN_SIZE.  */
3563 _cpp_buff *
3564 _cpp_get_buff (cpp_reader *pfile, size_t min_size)
3565 {
3566   _cpp_buff *result, **p;
3567
3568   for (p = &pfile->free_buffs;; p = &(*p)->next)
3569     {
3570       size_t size;
3571
3572       if (*p == NULL)
3573         return new_buff (min_size);
3574       result = *p;
3575       size = result->limit - result->base;
3576       /* Return a buffer that's big enough, but don't waste one that's
3577          way too big.  */
3578       if (size >= min_size && size <= BUFF_SIZE_UPPER_BOUND (min_size))
3579         break;
3580     }
3581
3582   *p = result->next;
3583   result->next = NULL;
3584   result->cur = result->base;
3585   return result;
3586 }
3587
3588 /* Creates a new buffer with enough space to hold the uncommitted
3589    remaining bytes of BUFF, and at least MIN_EXTRA more bytes.  Copies
3590    the excess bytes to the new buffer.  Chains the new buffer after
3591    BUFF, and returns the new buffer.  */
3592 _cpp_buff *
3593 _cpp_append_extend_buff (cpp_reader *pfile, _cpp_buff *buff, size_t min_extra)
3594 {
3595   size_t size = EXTENDED_BUFF_SIZE (buff, min_extra);
3596   _cpp_buff *new_buff = _cpp_get_buff (pfile, size);
3597
3598   buff->next = new_buff;
3599   memcpy (new_buff->base, buff->cur, BUFF_ROOM (buff));
3600   return new_buff;
3601 }
3602
3603 /* Creates a new buffer with enough space to hold the uncommitted
3604    remaining bytes of the buffer pointed to by BUFF, and at least
3605    MIN_EXTRA more bytes.  Copies the excess bytes to the new buffer.
3606    Chains the new buffer before the buffer pointed to by BUFF, and
3607    updates the pointer to point to the new buffer.  */
3608 void
3609 _cpp_extend_buff (cpp_reader *pfile, _cpp_buff **pbuff, size_t min_extra)
3610 {
3611   _cpp_buff *new_buff, *old_buff = *pbuff;
3612   size_t size = EXTENDED_BUFF_SIZE (old_buff, min_extra);
3613
3614   new_buff = _cpp_get_buff (pfile, size);
3615   memcpy (new_buff->base, old_buff->cur, BUFF_ROOM (old_buff));
3616   new_buff->next = old_buff;
3617   *pbuff = new_buff;
3618 }
3619
3620 /* Free a chain of buffers starting at BUFF.  */
3621 void
3622 _cpp_free_buff (_cpp_buff *buff)
3623 {
3624   _cpp_buff *next;
3625
3626   for (; buff; buff = next)
3627     {
3628       next = buff->next;
3629 #ifdef ENABLE_VALGRIND_ANNOTATIONS
3630       free (buff);
3631 #else
3632       free (buff->base);
3633 #endif
3634     }
3635 }
3636
3637 /* Allocate permanent, unaligned storage of length LEN.  */
3638 unsigned char *
3639 _cpp_unaligned_alloc (cpp_reader *pfile, size_t len)
3640 {
3641   _cpp_buff *buff = pfile->u_buff;
3642   unsigned char *result = buff->cur;
3643
3644   if (len > (size_t) (buff->limit - result))
3645     {
3646       buff = _cpp_get_buff (pfile, len);
3647       buff->next = pfile->u_buff;
3648       pfile->u_buff = buff;
3649       result = buff->cur;
3650     }
3651
3652   buff->cur = result + len;
3653   return result;
3654 }
3655
3656 /* Allocate permanent, unaligned storage of length LEN from a_buff.
3657    That buffer is used for growing allocations when saving macro
3658    replacement lists in a #define, and when parsing an answer to an
3659    assertion in #assert, #unassert or #if (and therefore possibly
3660    whilst expanding macros).  It therefore must not be used by any
3661    code that they might call: specifically the lexer and the guts of
3662    the macro expander.
3663
3664    All existing other uses clearly fit this restriction: storing
3665    registered pragmas during initialization.  */
3666 unsigned char *
3667 _cpp_aligned_alloc (cpp_reader *pfile, size_t len)
3668 {
3669   _cpp_buff *buff = pfile->a_buff;
3670   unsigned char *result = buff->cur;
3671
3672   if (len > (size_t) (buff->limit - result))
3673     {
3674       buff = _cpp_get_buff (pfile, len);
3675       buff->next = pfile->a_buff;
3676       pfile->a_buff = buff;
3677       result = buff->cur;
3678     }
3679
3680   buff->cur = result + len;
3681   return result;
3682 }
3683
3684 /* Say which field of TOK is in use.  */
3685
3686 enum cpp_token_fld_kind
3687 cpp_token_val_index (const cpp_token *tok)
3688 {
3689   switch (TOKEN_SPELL (tok))
3690     {
3691     case SPELL_IDENT:
3692       return CPP_TOKEN_FLD_NODE;
3693     case SPELL_LITERAL:
3694       return CPP_TOKEN_FLD_STR;
3695     case SPELL_OPERATOR:
3696       if (tok->type == CPP_PASTE)
3697         return CPP_TOKEN_FLD_TOKEN_NO;
3698       else
3699         return CPP_TOKEN_FLD_NONE;
3700     case SPELL_NONE:
3701       if (tok->type == CPP_MACRO_ARG)
3702         return CPP_TOKEN_FLD_ARG_NO;
3703       else if (tok->type == CPP_PADDING)
3704         return CPP_TOKEN_FLD_SOURCE;
3705       else if (tok->type == CPP_PRAGMA)
3706         return CPP_TOKEN_FLD_PRAGMA;
3707       /* fall through */
3708     default:
3709       return CPP_TOKEN_FLD_NONE;
3710     }
3711 }
3712
3713 /* All tokens lexed in R after calling this function will be forced to have
3714    their source_location the same as the location referenced by P, until
3715    cpp_stop_forcing_token_locations is called for R.  */
3716
3717 void
3718 cpp_force_token_locations (cpp_reader *r, source_location *p)
3719 {
3720   r->forced_token_location_p = p;
3721 }
3722
3723 /* Go back to assigning locations naturally for lexed tokens.  */
3724
3725 void
3726 cpp_stop_forcing_token_locations (cpp_reader *r)
3727 {
3728   r->forced_token_location_p = NULL;
3729 }