libcpp/lex.c

   1 /* CPP Library - lexical analysis.
   2    Copyright (C) 2000-2021 Free Software Foundation, Inc.
   3    Contributed by Per Bothner, 1994-95.
   4    Based on CCCP program by Paul Rubin, June 1986
   5    Adapted to ANSI C, Richard Stallman, Jan 1987
   6    Broken out to separate file, Zack Weinberg, Mar 2000
   7
   8 This program is free software; you can redistribute it and/or modify it
   9 under the terms of the GNU General Public License as published by the
  10 Free Software Foundation; either version 3, or (at your option) any
  11 later version.
  12
  13 This program is distributed in the hope that it will be useful,
  14 but WITHOUT ANY WARRANTY; without even the implied warranty of
  15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  16 GNU General Public License for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with this program; see the file COPYING3.  If not see
  20 <http://www.gnu.org/licenses/>.  */
  21
  22 #include "config.h"
  23 #include "system.h"
  24 #include "cpplib.h"
  25 #include "internal.h"
  26
     /* Classifies how a token type is spelled.  Every row of the
        token_spellings table is tagged with one of these values: OP()
        entries of TTYPE_TABLE always get SPELL_OPERATOR, while TK()
        entries get the SPELL_* value named by their second argument.  */
  27 enum spell_type
  28 {
  29   SPELL_OPERATOR = 0,  /* Fixed text, stored in the table row itself.  */
  30   SPELL_IDENT,         /* NOTE(review): presumably spelled via the
                               identifier node -- confirm.  */
  31   SPELL_LITERAL,       /* NOTE(review): presumably spelled via the text
                               kept with the token -- confirm.  */
  32   SPELL_NONE           /* NOTE(review): presumably no spelling of its
                               own -- confirm.  */
  33 };
  34
     /* One row of the token_spellings table.  */
  35 struct token_spelling
  36 {
       /* How tokens of this type are spelled.  */
  37   enum spell_type category;
       /* The operator text for OP() rows; the stringified enumerator name
          for TK() rows.  */
  38   const unsigned char *name;
  39 };
  40
     /* Spellings of the six digraph tokens.  NOTE(review): the order
        presumably mirrors the order of the corresponding operator token
        types so the array can be indexed relative to the first of them;
        the code that indexes it is not in this part of the file --
        confirm before reordering.  */
  41 static const unsigned char *const digraph_spellings[] =
  42 { UC"%:", UC"%:%:", UC"<:", UC":>", UC"<%", UC"%>" };
  43
     /* Build the spelling table, indexed by token type, by expanding
        TTYPE_TABLE with local definitions of OP and TK:
          OP(e, s) yields { SPELL_OPERATOR, s } -- S is the operator's text;
          TK(e, s) yields { SPELL_<s>, "<e>" }  -- S names the category and
                                                   the enumerator is
                                                   stringified.
        Both helper macros are undefined again right after the table.  */
  44 #define OP(e, s) { SPELL_OPERATOR, UC s  },
  45 #define TK(e, s) { SPELL_ ## s,    UC #e },
  46 static const struct token_spelling token_spellings[N_TTYPES] = { TTYPE_TABLE };
  47 #undef OP
  48 #undef TK
  49
     /* Look up the spelling category, respectively the table string, for
        the type of the token that TOKEN points to.  */
  50 #define TOKEN_SPELL(token) (token_spellings[(token)->type].category)
  51 #define TOKEN_NAME(token) (token_spellings[(token)->type].name)
  52
     /* Forward declarations of functions private to this file.  */
  53 static void add_line_note (cpp_buffer *, const uchar *, unsigned int);
  54 static int skip_line_comment (cpp_reader *);
  55 static void skip_whitespace (cpp_reader *, cppchar_t);
  56 static void lex_string (cpp_reader *, cpp_token *, const uchar *);
  57 static void save_comment (cpp_reader *, cpp_token *, const uchar *, cppchar_t);
  58 static void store_comment (cpp_reader *, cpp_token *);
  59 static void create_literal (cpp_reader *, cpp_token *, const uchar *,
  60                             unsigned int, enum cpp_ttype);
  61 static bool warn_in_comment (cpp_reader *, _cpp_line_note *);
  62 static int name_p (cpp_reader *, const cpp_string *);
  63 static tokenrun *next_tokenrun (tokenrun *);
  64
  65 static _cpp_buff *new_buff (size_t);
  66
  67
  68 /* Utility routine:
  69
  70    Compares, the token TOKEN to the NUL-terminated string STRING.
  71    TOKEN must be a CPP_NAME.  Returns 1 for equal, 0 for unequal.  */
  72 int
  73 cpp_ideq (const cpp_token *token, const char *string)
  74 {
  75   if (token->type != CPP_NAME)
  76     return 0;
  77
  78   return !ustrcmp (NODE_NAME (token->val.node.node), (const uchar *) string);
  79 }
  80
  81 /* Record a note TYPE at byte POS into the current cleaned logical
  82    line.  */
  83 static void
  84 add_line_note (cpp_buffer *buffer, const uchar *pos, unsigned int type)
  85 {
  86   if (buffer->notes_used == buffer->notes_cap)
  87     {
  88       buffer->notes_cap = buffer->notes_cap * 2 + 200;
  89       buffer->notes = XRESIZEVEC (_cpp_line_note, buffer->notes,
  90                                   buffer->notes_cap);
  91     }
  92
  93   buffer->notes[buffer->notes_used].pos = pos;
  94   buffer->notes[buffer->notes_used].type = type;
  95   buffer->notes_used++;
  96 }
  97
  98 \f
  99 /* Fast path to find line special characters using optimized character
 100    scanning algorithms.  Anything complicated falls back to the slow
 101    path below.  Since this loop is very hot it's worth doing these kinds
 102    of optimizations.
 103
 104    One of the paths through the ifdefs should provide
 105
 106      const uchar *search_line_fast (const uchar *s, const uchar *end);
 107
 108    Between S and END, search for \n, \r, \\, ?.  Return a pointer to
 109    the found character.
 110
 111    Note that the last character of the buffer is *always* a newline,
 112    as forced by _cpp_convert_input.  This fact can be used to avoid
 113    explicitly looking for the end of the buffer.  */
 114
 115 /* Configure gives us an ifdef test.  Give the macro the value 0 when
        it is absent, so that it can be tested in an ordinary `if'.  */
 116 #ifndef WORDS_BIGENDIAN
 117 #define WORDS_BIGENDIAN 0
 118 #endif
 119
 120 /* We'd like the largest integer that fits into a register.  There's nothing
 121    in <stdint.h> that gives us that.  For most hosts this is unsigned long,
 122    but MS decided on an LLP64 model.  Thankfully when building with GCC we
 123    can get the "real" word size.  */
 124 #ifdef __GNUC__
 125 typedef unsigned int word_type __attribute__((__mode__(__word__)));
 126 #else
 127 typedef unsigned long word_type;
 128 #endif
 129
 130 /* The code below is only expecting sizes 4 or 8.
 131    Die at compile-time if this expectation is violated: the array bound
        below evaluates to 1 for an accepted size and to -1, which is not a
        valid array size, for any other.  */
 132 typedef char check_word_type_size
 133   [(sizeof(word_type) == 8 || sizeof(word_type) == 4) * 2 - 1];
 134
 135 /* Return X with the first N bytes forced to values that won't match one
 136    of the interesting characters.  Note that NUL is not interesting.  */
 137
 138 static inline word_type
 139 acc_char_mask_misalign (word_type val, unsigned int n)
 140 {
 141   word_type mask = -1;
 142   if (WORDS_BIGENDIAN)
 143     mask >>= n * 8;
 144   else
 145     mask <<= n * 8;
 146   return val & mask;
 147 }
 148
 149 /* Return X replicated to all byte positions within WORD_TYPE.  */
 150
 151 static inline word_type
 152 acc_char_replicate (uchar x)
 153 {
 154   word_type ret;
 155
 156   ret = (x << 24) | (x << 16) | (x << 8) | x;
 157   if (sizeof(word_type) == 8)
 158     ret = (ret << 16 << 16) | ret;
 159   return ret;
 160 }
 161
 162 /* Return non-zero if some byte of VAL is (probably) C.  */
 163
 164 static inline word_type
 165 acc_char_cmp (word_type val, word_type c)
 166 {
 167 #if defined(__GNUC__) && defined(__alpha__)
 168   /* We can get exact results using a compare-bytes instruction.
 169      Get (val == c) via (0 >= (val ^ c)).  */
 170   return __builtin_alpha_cmpbge (0, val ^ c);
 171 #else
 172   word_type magic = 0x7efefefeU;
 173   if (sizeof(word_type) == 8)
 174     magic = (magic << 16 << 16) | 0xfefefefeU;
 175   magic |= 1;
 176
 177   val ^= c;
 178   return ((val + magic) ^ ~val) & ~magic;
 179 #endif
 180 }
 181
 182 /* Given the result of acc_char_cmp is non-zero, return the index of
 183    the found character.  If this was a false positive, return -1.  */
 184
 185 static inline int
 186 acc_char_index (word_type cmp ATTRIBUTE_UNUSED,
 187                 word_type val ATTRIBUTE_UNUSED)
 188 {
 189 #if defined(__GNUC__) && defined(__alpha__) && !WORDS_BIGENDIAN
 190   /* The cmpbge instruction sets *bits* of the result corresponding to
 191      matches in the bytes with no false positives.  */
 192   return __builtin_ctzl (cmp);
 193 #else
 194   unsigned int i;
 195
 196   /* ??? It would be nice to force unrolling here,
 197      and have all of these constants folded.  */
 198   for (i = 0; i < sizeof(word_type); ++i)
 199     {
 200       uchar c;
 201       if (WORDS_BIGENDIAN)
 202         c = (val >> (sizeof(word_type) - i - 1) * 8) & 0xff;
 203       else
 204         c = (val >> i * 8) & 0xff;
 205
 206       if (c == '\n' || c == '\r' || c == '\\' || c == '?')
 207         return i;
 208     }
 209
 210   return -1;
 211 #endif
 212 }
 213
 214 /* A version of the fast scanner using bit fiddling techniques.
 215
 216    For 32-bit words, one would normally perform 16 comparisons and
 217    16 branches.  With this algorithm one performs 24 arithmetic
 218    operations and one branch.  Whether this is faster with a 32-bit
 219    word size is going to be somewhat system dependent.
 220
 221    For 64-bit words, we eliminate twice the number of comparisons
 222    and branches without increasing the number of arithmetic operations.
 223    It's almost certainly going to be a win with 64-bit word size.  */
 224
 225 static const uchar * search_line_acc_char (const uchar *, const uchar *)
 226   ATTRIBUTE_UNUSED;
 227
 228 static const uchar *
 229 search_line_acc_char (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 230 {
 231   const word_type repl_nl = acc_char_replicate ('\n');
 232   const word_type repl_cr = acc_char_replicate ('\r');
 233   const word_type repl_bs = acc_char_replicate ('\\');
 234   const word_type repl_qm = acc_char_replicate ('?');
 235
 236   unsigned int misalign;
 237   const word_type *p;
 238   word_type val, t;
 239
 240   /* Align the buffer.  Mask out any bytes from before the beginning.  */
 241   p = (word_type *)((uintptr_t)s & -sizeof(word_type));
 242   val = *p;
 243   misalign = (uintptr_t)s & (sizeof(word_type) - 1);
 244   if (misalign)
 245     val = acc_char_mask_misalign (val, misalign);
 246
 247   /* Main loop.  */
 248   while (1)
 249     {
 250       t  = acc_char_cmp (val, repl_nl);
 251       t |= acc_char_cmp (val, repl_cr);
 252       t |= acc_char_cmp (val, repl_bs);
 253       t |= acc_char_cmp (val, repl_qm);
 254
 255       if (__builtin_expect (t != 0, 0))
 256         {
 257           int i = acc_char_index (t, val);
 258           if (i >= 0)
 259             return (const uchar *)p + i;
 260         }
 261
 262       val = *++p;
 263     }
 264 }
 265
 266 /* Disable on Solaris 2/x86 until the following problem can be properly
 267    autoconfed:
 268
 269    The Solaris 10+ assembler tags objects with the instruction set
 270    extensions used, so SSE4.2 executables cannot run on machines that
 271    don't support that extension.  */
 272
 273 #if (GCC_VERSION >= 4005) && (__GNUC__ >= 5 || !defined(__PIC__)) && (defined(__i386__) || defined(__x86_64__)) && !(defined(__sun__) && defined(__svr4__))
 274
 275 /* Replicated character data to be shared between implementations.
 276    Recall that outside of a context with vector support we can't
 277    define compatible vector types, therefore these are all defined
 278    in terms of raw characters.  Row 0 holds '\n', row 1 '\r', row 2 '\\'
        and row 3 '?'.  The MMX scanner reads the first 8 bytes of a row as
        one vector and the SSE2 scanner reads all 16.  */
 279 static const char repl_chars[4][16] __attribute__((aligned(16))) = {
 280   { '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
 281     '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n' },
 282   { '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
 283     '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r' },
 284   { '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
 285     '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\' },
 286   { '?', '?', '?', '?', '?', '?', '?', '?',
 287     '?', '?', '?', '?', '?', '?', '?', '?' },
 288 };
 289
 290 /* A version of the fast scanner using MMX vectorized byte compare insns.
 291
 292    This uses the PMOVMSKB instruction which was introduced with "MMX2",
 293    which was packaged into SSE1; it is also present in the AMD MMX
 294    extension.  Mark the function as using "sse" so that we emit a real
 295    "emms" instruction, rather than the 3dNOW "femms" instruction.
        Returns a pointer to the first '\n', '\r', '\\' or '?' at or after S.
        END is not consulted.  */
 296
 297 static const uchar *
 298 #ifndef __SSE__
 299 __attribute__((__target__("sse")))
 300 #endif
 301 search_line_mmx (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 302 {
 303   typedef char v8qi __attribute__ ((__vector_size__ (8)));
 304   typedef int __m64 __attribute__ ((__vector_size__ (8), __may_alias__));
 305
       /* Each constant is the first 8 bytes of one row of repl_chars, i.e.
          eight copies of one of the characters being searched for.  */
 306   const v8qi repl_nl = *(const v8qi *)repl_chars[0];
 307   const v8qi repl_cr = *(const v8qi *)repl_chars[1];
 308   const v8qi repl_bs = *(const v8qi *)repl_chars[2];
 309   const v8qi repl_qm = *(const v8qi *)repl_chars[3];
 310
 311   unsigned int misalign, found, mask;
 312   const v8qi *p;
 313   v8qi data, t, c;
 314
 315   /* Align the source pointer.  While MMX doesn't generate unaligned data
 316      faults, this allows us to safely scan to the end of the buffer without
 317      reading beyond the end of the last page.  */
 318   misalign = (uintptr_t)s & 7;
 319   p = (const v8qi *)((uintptr_t)s & -8);
 320   data = *p;
 321
 322   /* Create a mask for the bytes that are valid within the first
 323      8-byte block, i.e. those at or after S.  The idea here is that the
 324      AND with the mask within the loop is "free", since we need some AND
 325      or TEST insn in order to set the flags for the branch anyway.  */
 326   mask = -1u << misalign;
 327
 328   /* Main loop processing 8 bytes at a time.  It is entered in the
          middle so that the block loaded above is tested, with its partial
          mask, before any further block is fetched.  */
 329   goto start;
 330   do
 331     {
 332       data = *++p;
           /* Every byte of a later block is valid.  */
 333       mask = -1;
 334
 335     start:
           /* OR together the byte-wise equality results for the four
              characters, then collapse them to one bit per byte.  */
 336       t = __builtin_ia32_pcmpeqb(data, repl_nl);
 337       c = __builtin_ia32_pcmpeqb(data, repl_cr);
 338       t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
 339       c = __builtin_ia32_pcmpeqb(data, repl_bs);
 340       t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
 341       c = __builtin_ia32_pcmpeqb(data, repl_qm);
 342       t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
 343       found = __builtin_ia32_pmovmskb (t);
 344       found &= mask;
 345     }
 346   while (!found);
 347
 348   __builtin_ia32_emms ();
 349
 350   /* FOUND contains 1 in bits for which we matched a relevant
 351      character.  Conversion to the byte index is trivial.  */
 352   found = __builtin_ctz(found);
 353   return (const uchar *)p + found;
 354 }
 355
 356 /* A version of the fast scanner using SSE2 vectorized byte compare insns.
        Returns a pointer to the first '\n', '\r', '\\' or '?' at or after S.
        END is not consulted.  */
 357
 358 static const uchar *
 359 #ifndef __SSE2__
 360 __attribute__((__target__("sse2")))
 361 #endif
 362 search_line_sse2 (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 363 {
 364   typedef char v16qi __attribute__ ((__vector_size__ (16)));
 365
       /* Each constant is one full row of repl_chars, i.e. sixteen copies
          of one of the characters being searched for.  */
 366   const v16qi repl_nl = *(const v16qi *)repl_chars[0];
 367   const v16qi repl_cr = *(const v16qi *)repl_chars[1];
 368   const v16qi repl_bs = *(const v16qi *)repl_chars[2];
 369   const v16qi repl_qm = *(const v16qi *)repl_chars[3];
 370
 371   unsigned int misalign, found, mask;
 372   const v16qi *p;
 373   v16qi data, t;
 374
 375   /* Align the source pointer down to a 16-byte boundary and load the
          block that contains S.  */
 376   misalign = (uintptr_t)s & 15;
 377   p = (const v16qi *)((uintptr_t)s & -16);
 378   data = *p;
 379
 380   /* Create a mask for the bytes that are valid within the first
 381      16-byte block, i.e. those at or after S.  The idea here is that the
 382      AND with the mask within the loop is "free", since we need some AND
 383      or TEST insn in order to set the flags for the branch anyway.  */
 384   mask = -1u << misalign;
 385
 386   /* Main loop processing 16 bytes at a time.  It is entered in the
          middle so that the block loaded above is tested, with its partial
          mask, before any further block is fetched.  */
 387   goto start;
 388   do
 389     {
 390       data = *++p;
           /* Every byte of a later block is valid.  */
 391       mask = -1;
 392
 393     start:
           /* Vector comparisons give all-ones bytes where DATA matches;
              collapse the combined result to one bit per byte.  */
 394       t  = data == repl_nl;
 395       t |= data == repl_cr;
 396       t |= data == repl_bs;
 397       t |= data == repl_qm;
 398       found = __builtin_ia32_pmovmskb128 (t);
 399       found &= mask;
 400     }
 401   while (!found);
 402
 403   /* FOUND contains 1 in bits for which we matched a relevant
 404      character.  Conversion to the byte index is trivial.  */
 405   found = __builtin_ctz(found);
 406   return (const uchar *)p + found;
 407 }
 408
 409 #ifdef HAVE_SSE4
 410 /* A version of the fast scanner using SSE 4.2 vectorized string insns.
        Returns a pointer to the first '\n', '\r', '\\' or '?' at or after S.
        END is used only to decide whether an unaligned 16-byte read at S
        could run past the buffer.  */
 411
 412 static const uchar *
 413 #ifndef __SSE4_2__
 414 __attribute__((__target__("sse4.2")))
 415 #endif
 416 search_line_sse42 (const uchar *s, const uchar *end)
 417 {
 418   typedef char v16qi __attribute__ ((__vector_size__ (16)));
       /* The set of characters to look for; only its first four elements
          are significant, matching the length 4 passed to PCMPESTRI.  */
 419   static const v16qi search = { '\n', '\r', '?', '\\' };
 420
 421   uintptr_t si = (uintptr_t)s;
 422   uintptr_t index;
 423
 424   /* Check for unaligned input.  */
 425   if (si & 15)
 426     {
 427       v16qi sv;
 428
 429       if (__builtin_expect (end - s < 16, 0)
 430           && __builtin_expect ((si & 0xfff) > 0xff0, 0))
 431         {
 432           /* There are less than 16 bytes left in the buffer, and less
 433              than 16 bytes left on the page.  Reading 16 bytes at this
 434              point might generate a spurious page fault.  Defer to the
 435              SSE2 implementation, which already handles alignment.  */
 436           return search_line_sse2 (s, end);
 437         }
 438
 439       /* ??? The builtin doesn't understand that the PCMPESTRI read from
 440          memory need not be aligned.  */
 441       sv = __builtin_ia32_loaddqu ((const char *) s);
 442       index = __builtin_ia32_pcmpestri128 (search, 4, sv, 16, 0);
 443
           /* An index below 16 is the offset of a match within SV.  */
 444       if (__builtin_expect (index < 16, 0))
 445         goto found;
 446
 447       /* Advance the pointer to an aligned address.  We will re-scan a
 448          few bytes, but we no longer need care for reading past the
 449          end of a page, since we're guaranteed a match.  */
 450       s = (const uchar *)((si + 15) & -16);
 451     }
 452
 453   /* Main loop, processing 16 bytes at a time.  */
 454 #ifdef __GCC_ASM_FLAG_OUTPUTS__
 455   while (1)
 456     {
 457       char f;
 458
 459       /* By using inline assembly instead of the builtin,
 460          we can use the result, as well as the flags set.  F receives
              the carry flag; the loop stops as soon as it is set.  */
 461       __asm ("%vpcmpestri\t$0, %2, %3"
 462              : "=c"(index), "=@ccc"(f)
 463              : "m"(*s), "x"(search), "a"(4), "d"(16));
 464       if (f)
 465         break;
 466
 467       s += 16;
 468     }
 469 #else
       /* The assembly loop advances S before each comparison, so start one
          block early.  */
 470   s -= 16;
 471   /* By doing the whole loop in inline assembly,
 472      we can make proper use of the flags set.  */
 473   __asm (      ".balign 16\n"
 474         "0:     add $16, %1\n"
 475         "       %vpcmpestri\t$0, (%1), %2\n"
 476         "       jnc 0b"
 477         : "=&c"(index), "+r"(s)
 478         : "x"(search), "a"(4), "d"(16));
 479 #endif
 480
 481  found:
       /* INDEX is the offset of the match within the block starting at S.  */
 482   return s + index;
 483 }
 484
 485 #else
 486 /* Work around out-dated assemblers without sse4 support: every use of
        search_line_sse42 becomes a use of the SSE2 scanner instead.  */
 487 #define search_line_sse42 search_line_sse2
 488 #endif
 489
 490 /* Check the CPU capabilities.  */
 491
 492 #include "../gcc/config/i386/cpuid.h"
 493
     /* The signature shared by all scanner implementations, and the
        pointer through which the implementation chosen by
        init_vectorized_lexer is called.  */
 494 typedef const uchar * (*search_line_fast_type) (const uchar *, const uchar *);
 495 static search_line_fast_type search_line_fast;
 496
     /* Tells _cpp_init_lexer that init_vectorized_lexer exists and has to
        be called.  */
 497 #define HAVE_init_vectorized_lexer 1
 498 static inline void
 499 init_vectorized_lexer (void)
 500 {
 501   unsigned dummy, ecx = 0, edx = 0;
 502   search_line_fast_type impl = search_line_acc_char;
 503   int minimum = 0;
 504
 505 #if defined(__SSE4_2__)
 506   minimum = 3;
 507 #elif defined(__SSE2__)
 508   minimum = 2;
 509 #elif defined(__SSE__)
 510   minimum = 1;
 511 #endif
 512
 513   if (minimum == 3)
 514     impl = search_line_sse42;
 515   else if (__get_cpuid (1, &dummy, &dummy, &ecx, &edx) || minimum == 2)
 516     {
 517       if (minimum == 3 || (ecx & bit_SSE4_2))
 518         impl = search_line_sse42;
 519       else if (minimum == 2 || (edx & bit_SSE2))
 520         impl = search_line_sse2;
 521       else if (minimum == 1 || (edx & bit_SSE))
 522         impl = search_line_mmx;
 523     }
 524   else if (__get_cpuid (0x80000001, &dummy, &dummy, &dummy, &edx))
 525     {
 526       if (minimum == 1
 527           || (edx & (bit_MMXEXT | bit_CMOV)) == (bit_MMXEXT | bit_CMOV))
 528         impl = search_line_mmx;
 529     }
 530
 531   search_line_fast = impl;
 532 }
 533
 534 #elif (GCC_VERSION >= 4005) && defined(_ARCH_PWR8) && defined(__ALTIVEC__)
 535
 536 /* A version of the fast scanner using AltiVec vectorized byte compares
 537    and VSX unaligned loads (when VSX is available).  This is otherwise
 538    the same as the AltiVec version.  Because the loads are unaligned,
        scanning starts directly at S and no leading bytes have to be masked
        off.  Returns a pointer to the first '\n', '\r', '\\' or '?' at or
        after S.  END is not consulted.  */
 539
 540 ATTRIBUTE_NO_SANITIZE_UNDEFINED
 541 static const uchar *
 542 search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 543 {
 544   typedef __attribute__((altivec(vector))) unsigned char vc;
 545
       /* Sixteen copies of each character being searched for.  */
 546   const vc repl_nl = {
 547     '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
 548     '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n'
 549   };
 550   const vc repl_cr = {
 551     '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
 552     '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r'
 553   };
 554   const vc repl_bs = {
 555     '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
 556     '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\'
 557   };
 558   const vc repl_qm = {
 559     '?', '?', '?', '?', '?', '?', '?', '?',
 560     '?', '?', '?', '?', '?', '?', '?', '?',
 561   };
 562   const vc zero = { 0 };
 563
 564   vc data, t;
 565
 566   /* Main loop processing 16 bytes at a time.  */
 567   do
 568     {
 569       vc m_nl, m_cr, m_bs, m_qm;
 570
 571       data = __builtin_vec_vsx_ld (0, s);
 572       s += 16;
 573
 574       m_nl = (vc) __builtin_vec_cmpeq(data, repl_nl);
 575       m_cr = (vc) __builtin_vec_cmpeq(data, repl_cr);
 576       m_bs = (vc) __builtin_vec_cmpeq(data, repl_bs);
 577       m_qm = (vc) __builtin_vec_cmpeq(data, repl_qm);
 578       t = (m_nl | m_cr) | (m_bs | m_qm);
 579
 580       /* T now contains 0xff in bytes for which we matched one of the relevant
 581          characters.  We want to exit the loop if any byte in T is non-zero.
 582          Below is the expansion of vec_any_ne(t, zero).  */
 583     }
 584   while (!__builtin_vec_vcmpeq_p(/*__CR6_LT_REV*/3, t, zero));
 585
 586   /* Restore s to point to the 16 bytes we just processed.  */
 587   s -= 16;
 588
 589   {
       /* Number of unsigned longs that make up one vector.  */
 590 #define N  (sizeof(vc) / sizeof(long))
 591
         /* View the match vector as an array of unsigned longs.  */
 592     union {
 593       vc v;
 594       /* Statically assert that N is 2 or 4.  */
 595       unsigned long l[(N == 2 || N == 4) ? N : -1];
 596     } u;
 597     unsigned long l, i = 0;
 598
 599     u.v = t;
 600
 601     /* Find the first word of T that is non-zero.  S is advanced past
            every all-zero word that is skipped.  The last word is not
            tested: the loop above only ends when T has a non-zero byte.  */
 602     switch (N)
 603       {
 604       case 4:
 605         l = u.l[i++];
 606         if (l != 0)
 607           break;
 608         s += sizeof(unsigned long);
 609         l = u.l[i++];
 610         if (l != 0)
 611           break;
 612         s += sizeof(unsigned long);
 613         /* FALLTHRU */
 614       case 2:
 615         l = u.l[i++];
 616         if (l != 0)
 617           break;
 618         s += sizeof(unsigned long);
 619         l = u.l[i];
 620       }
 621
 622     /* L now contains 0xff in bytes for which we matched one of the
 623        relevant characters.  We can find the byte index by finding
 624        its bit index and dividing by 8.  The first byte in memory is the
            most significant one on big-endian and the least significant
            one otherwise, hence the choice between counting leading and
            trailing zeros.  */
 625 #ifdef __BIG_ENDIAN__
 626     l = __builtin_clzl(l) >> 3;
 627 #else
 628     l = __builtin_ctzl(l) >> 3;
 629 #endif
 630     return s + l;
 631
 632 #undef N
 633   }
 634 }
 635
 636 #elif (GCC_VERSION >= 4005) && defined(__ALTIVEC__) && defined (__BIG_ENDIAN__)
 637
 638 /* A version of the fast scanner using AltiVec vectorized byte compares.
 639    This cannot be used for little endian because vec_lvsl/lvsr are
 640    deprecated for little endian and the code won't work properly.
        Returns a pointer to the first '\n', '\r', '\\' or '?' at or after S.
        END is not consulted.  */
 641 /* ??? Unfortunately, attribute(target("altivec")) is not yet supported,
 642    so we can't compile this function without -maltivec on the command line
 643    (or implied by some other switch).  */
 644
 645 static const uchar *
 646 search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 647 {
 648   typedef __attribute__((altivec(vector))) unsigned char vc;
 649
       /* Sixteen copies of each character being searched for.  */
 650   const vc repl_nl = {
 651     '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
 652     '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n'
 653   };
 654   const vc repl_cr = {
 655     '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
 656     '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r'
 657   };
 658   const vc repl_bs = {
 659     '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
 660     '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\'
 661   };
 662   const vc repl_qm = {
 663     '?', '?', '?', '?', '?', '?', '?', '?',
 664     '?', '?', '?', '?', '?', '?', '?', '?',
 665   };
       /* All-ones and all-zeros vectors, used to build the mask for the
          first block.  */
 666   const vc ones = {
 667     -1, -1, -1, -1, -1, -1, -1, -1,
 668     -1, -1, -1, -1, -1, -1, -1, -1,
 669   };
 670   const vc zero = { 0 };
 671
 672   vc data, mask, t;
 673
 674   /* Altivec loads automatically mask addresses with -16.  This lets us
 675      issue the first load as early as possible.  */
 676   data = __builtin_vec_ld(0, (const vc *)s);
 677
 678   /* Discard bytes before the beginning of the buffer.  Do this by
 679      beginning with all ones and shifting in zeros according to the
 680      mis-alignment.  The LVSR instruction pulls the exact shift we
 681      want from the address.  */
 682   mask = __builtin_vec_lvsr(0, s);
 683   mask = __builtin_vec_perm(zero, ones, mask);
 684   data &= mask;
 685
 686   /* While altivec loads mask addresses, we still need to align S so
 687      that the offset we compute at the end is correct.  */
 688   s = (const uchar *)((uintptr_t)s & -16);
 689
 690   /* Main loop processing 16 bytes at a time.  It is entered in the
          middle so that the masked first block is tested before any further
          block is loaded.  */
 691   goto start;
 692   do
 693     {
 694       vc m_nl, m_cr, m_bs, m_qm;
 695
 696       s += 16;
 697       data = __builtin_vec_ld(0, (const vc *)s);
 698
 699     start:
 700       m_nl = (vc) __builtin_vec_cmpeq(data, repl_nl);
 701       m_cr = (vc) __builtin_vec_cmpeq(data, repl_cr);
 702       m_bs = (vc) __builtin_vec_cmpeq(data, repl_bs);
 703       m_qm = (vc) __builtin_vec_cmpeq(data, repl_qm);
 704       t = (m_nl | m_cr) | (m_bs | m_qm);
 705
 706       /* T now contains 0xff in bytes for which we matched one of the relevant
 707          characters.  We want to exit the loop if any byte in T is non-zero.
 708          Below is the expansion of vec_any_ne(t, zero).  */
 709     }
 710   while (!__builtin_vec_vcmpeq_p(/*__CR6_LT_REV*/3, t, zero));
 711
 712   {
       /* Number of unsigned longs that make up one vector.  */
 713 #define N  (sizeof(vc) / sizeof(long))
 714
         /* View the match vector as an array of unsigned longs.  */
 715     union {
 716       vc v;
 717       /* Statically assert that N is 2 or 4.  */
 718       unsigned long l[(N == 2 || N == 4) ? N : -1];
 719     } u;
 720     unsigned long l, i = 0;
 721
 722     u.v = t;
 723
 724     /* Find the first word of T that is non-zero.  S is advanced past
            every all-zero word that is skipped.  The last word is not
            tested: the loop above only ends when T has a non-zero byte.  */
 725     switch (N)
 726       {
 727       case 4:
 728         l = u.l[i++];
 729         if (l != 0)
 730           break;
 731         s += sizeof(unsigned long);
 732         l = u.l[i++];
 733         if (l != 0)
 734           break;
 735         s += sizeof(unsigned long);
 736         /* FALLTHROUGH */
 737       case 2:
 738         l = u.l[i++];
 739         if (l != 0)
 740           break;
 741         s += sizeof(unsigned long);
 742         l = u.l[i];
 743       }
 744
 745     /* L now contains 0xff in bytes for which we matched one of the
 746        relevant characters.  We can find the byte index by finding
 747        its bit index and dividing by 8.  This variant is big-endian
            only, so the first byte in memory is the most significant one
            and leading zeros are counted.  */
 748     l = __builtin_clzl(l) >> 3;
 749     return s + l;
 750
 751 #undef N
 752   }
 753 }
 754
 755 #elif defined (__ARM_NEON) && defined (__ARM_64BIT_STATE)
 756 #include "arm_neon.h"
 757
 758 /* This doesn't have to be the exact page size, but no system may use
 759    a size smaller than this.  ARMv8 requires a minimum page size of
 760    4k.  The impact of being conservative here is a small number of
 761    cases will take the slightly slower entry path into the main
 762    loop.  */
 763
 764 #define AARCH64_MIN_PAGE_SIZE 4096
 765
     /* A version of the fast scanner using AArch64 NEON byte compares.
        Returns a pointer to the first '\n', '\r', '\\' or '?' at or after S.
        END is not consulted.  */
 766 static const uchar *
 767 search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 768 {
 769   const uint8x16_t repl_nl = vdupq_n_u8 ('\n');
 770   const uint8x16_t repl_cr = vdupq_n_u8 ('\r');
 771   const uint8x16_t repl_bs = vdupq_n_u8 ('\\');
 772   const uint8x16_t repl_qm = vdupq_n_u8 ('?');
       /* XMASK and SHIFT are used to turn a vector of 0x00/0xff match bytes
          into a 16-bit integer with one bit per byte: XMASK keeps a
          different single bit in each byte of an 8-byte half, and SHIFT
          moves the sums of one half 8 bits up before everything is added
          together.  */
 773   const uint8x16_t xmask = (uint8x16_t) vdupq_n_u64 (0x8040201008040201ULL);
 774
 775 #ifdef __ARM_BIG_ENDIAN
 776   const int16x8_t shift = {8, 8, 8, 8, 0, 0, 0, 0};
 777 #else
 778   const int16x8_t shift = {0, 0, 0, 0, 8, 8, 8, 8};
 779 #endif
 780
 781   unsigned int found;
 782   const uint8_t *p;
 783   uint8x16_t data;
 784   uint8x16_t t;
 785   uint16x8_t m;
 786   uint8x16_t u, v, w;
 787
 788   /* Align the source pointer.  */
 789   p = (const uint8_t *)((uintptr_t)s & -16);
 790
 791   /* Assuming random string start positions, with a 4k page size we'll take
 792      the slow path about 0.37% of the time.  */
 793   if (__builtin_expect ((AARCH64_MIN_PAGE_SIZE
 794                          - (((uintptr_t) s) & (AARCH64_MIN_PAGE_SIZE - 1)))
 795                         < 16, 0))
 796     {
 797       /* Slow path: the string starts near a possible page boundary.
              Load the aligned block containing S and ignore, via MASK, the
              matches in bytes that precede S.  */
 798       uint32_t misalign, mask;
 799
 800       misalign = (uintptr_t)s & 15;
 801       mask = (-1u << misalign) & 0xffff;
 802       data = vld1q_u8 (p);
 803       t = vceqq_u8 (data, repl_nl);
 804       u = vceqq_u8 (data, repl_cr);
 805       v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
 806       w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
 807       t = vorrq_u8 (v, w);
 808       t = vandq_u8 (t, xmask);
 809       m = vpaddlq_u8 (t);
 810       m = vshlq_u16 (m, shift);
 811       found = vaddvq_u16 (m);
 812       found &= mask;
 813       if (found)
 814         return (const uchar*)p + __builtin_ctz (found);
 815     }
 816   else
 817     {
           /* Fast path: an unaligned 16-byte load straight from S.  P is
              left at the aligned address at or below S.  */
 818       data = vld1q_u8 ((const uint8_t *) s);
 819       t = vceqq_u8 (data, repl_nl);
 820       u = vceqq_u8 (data, repl_cr);
 821       v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
 822       w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
 823       t = vorrq_u8 (v, w);
 824       if (__builtin_expect (vpaddd_u64 ((uint64x2_t)t) != 0, 0))
 825         goto done;
 826     }
 827
       /* Nothing found in the first block; continue with aligned 16-byte
          blocks until some byte matches.  */
 828   do
 829     {
 830       p += 16;
 831       data = vld1q_u8 (p);
 832       t = vceqq_u8 (data, repl_nl);
 833       u = vceqq_u8 (data, repl_cr);
 834       v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
 835       w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
 836       t = vorrq_u8 (v, w);
 837     } while (!vpaddd_u64 ((uint64x2_t)t));
 838
 839 done:
 840   /* Now that we've found the terminating substring, work out precisely where
 841      we need to stop.  */
 842   t = vandq_u8 (t, xmask);
 843   m = vpaddlq_u8 (t);
 844   m = vshlq_u16 (m, shift);
 845   found = vaddvq_u16 (m);
       /* When we arrive here from the fast path, T describes the block that
          was loaded from S itself and P has not moved past S, so the offset
          is relative to S.  Otherwise it is relative to P.  */
 846   return (((((uintptr_t) p) < (uintptr_t) s) ? s : (const uchar *)p)
 847           + __builtin_ctz (found));
 848 }
 849
 850 #elif defined (__ARM_NEON)
 851 #include "arm_neon.h"
 852
     /* A version of the fast scanner using NEON byte compares, for ARM
        targets that are not in the 64-bit state.  Returns a pointer to the
        first '\n', '\r', '\\' or '?' at or after S.  END is not
        consulted.  */
 853 static const uchar *
 854 search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 855 {
 856   const uint8x16_t repl_nl = vdupq_n_u8 ('\n');
 857   const uint8x16_t repl_cr = vdupq_n_u8 ('\r');
 858   const uint8x16_t repl_bs = vdupq_n_u8 ('\\');
 859   const uint8x16_t repl_qm = vdupq_n_u8 ('?');
       /* Keeps a different single bit in each byte of an 8-byte half, so
          that the match bytes can later be added up into a bit mask.  */
 860   const uint8x16_t xmask = (uint8x16_t) vdupq_n_u64 (0x8040201008040201ULL);
 861
 862   unsigned int misalign, found, mask;
 863   const uint8_t *p;
 864   uint8x16_t data;
 865
 866   /* Align the source pointer.  */
 867   misalign = (uintptr_t)s & 15;
 868   p = (const uint8_t *)((uintptr_t)s & -16);
 869   data = vld1q_u8 (p);
 870
 871   /* Create a mask for the bytes that are valid within the first
 872      16-byte block, i.e. those at or after S.  The idea here is that the
 873      AND with the mask within the loop is "free", since we need some AND
 874      or TEST insn in order to set the flags for the branch anyway.  */
 875   mask = (-1u << misalign) & 0xffff;
 876
 877   /* Main loop, processing 16 bytes at a time.  It is entered in the
          middle so that the block loaded above is tested, with its partial
          mask, before any further block is fetched.  */
 878   goto start;
 879
 880   do
 881     {
 882       uint8x8_t l;
 883       uint16x4_t m;
 884       uint32x2_t n;
 885       uint8x16_t t, u, v, w;
 886
 887       p += 16;
 888       data = vld1q_u8 (p);
           /* Every byte of a later block is valid.  */
 889       mask = 0xffff;
 890
 891     start:
 892       t = vceqq_u8 (data, repl_nl);
 893       u = vceqq_u8 (data, repl_cr);
 894       v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
 895       w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
 896       t = vandq_u8 (vorrq_u8 (v, w), xmask);
           /* Reduce the sixteen single-bit bytes by repeated pairwise
              addition.  */
 897       l = vpadd_u8 (vget_low_u8 (t), vget_high_u8 (t));
 898       m = vpaddl_u8 (l);
 899       n = vpaddl_u16 (m);
 900
           /* Merge the two 32-bit partial sums into the low 16 bits of
              FOUND, one bit per byte of the block.  */
 901       found = vget_lane_u32 ((uint32x2_t) vorr_u64 ((uint64x1_t) n,
 902               vshr_n_u64 ((uint64x1_t) n, 24)), 0);
 903       found &= mask;
 904     }
 905   while (!found);
 906
 907   /* FOUND contains 1 in bits for which we matched a relevant
 908      character.  Conversion to the byte index is trivial.  */
 909   found = __builtin_ctz (found);
 910   return (const uchar *)p + found;
 911 }
 912
 913 #else
 914
 915 /* We only have one accelerated alternative.  Use a direct call so that
 916    we encourage inlining.  */
 917
 918 #define search_line_fast  search_line_acc_char
 919
 920 #endif
 921
 922 /* Initialize the lexer if needed.  */
 923
 924 void
 925 _cpp_init_lexer (void)
 926 {
 927 #ifdef HAVE_init_vectorized_lexer
 928   init_vectorized_lexer ();
 929 #endif
 930 }
 931
/* Returns with a logical line that contains no escaped newlines or
   trigraphs.  This is a time-critical inner loop.

   The line starts at buffer->next_line.  On return buffer->cur and
   buffer->line_base point at that start, a '\n' has been stored at the
   end of the cleaned line, buffer->next_line is one past the character
   at which scanning stopped, and buffer->notes holds a note for every
   escaped newline and trigraph seen, followed by a sentinel note.

   For buffers with from_stage3 set only the line terminator is
   located; escaped newlines and trigraphs are not looked for.  */
void
_cpp_clean_line (cpp_reader *pfile)
{
  cpp_buffer *buffer;
  const uchar *s;
  uchar c, *d, *p;

  buffer = pfile->buffer;
  /* Start this line with an empty list of notes.  */
  buffer->cur_note = buffer->notes_used = 0;
  buffer->cur = buffer->line_base = buffer->next_line;
  buffer->need_line = false;
  s = buffer->next_line;

  if (!buffer->from_stage3)
    {
      /* Most recent backslash seen on the fast path, if any.  */
      const uchar *pbackslash = NULL;

      /* Fast path.  This is the common case of an un-escaped line with
         no trigraphs.  The primary win here is by not writing any
         data back to memory until we have to.  */
      while (1)
        {
          /* Perform an optimized search for \n, \r, \\, ?.  */
          s = search_line_fast (s, buffer->rlimit);

          c = *s;
          if (c == '\\')
            {
              /* Record the location of the backslash and continue.  */
              pbackslash = s++;
            }
          else if (__builtin_expect (c == '?', 0))
            {
              if (__builtin_expect (s[1] == '?', false)
                   && _cpp_trigraph_map[s[2]])
                {
                  /* Have a trigraph.  We may or may not have to convert
                     it.  Add a line note regardless, for -Wtrigraphs.  */
                  add_line_note (buffer, s, s[2]);
                  if (CPP_OPTION (pfile, trigraphs))
                    {
                      /* We do, and that means we have to switch to the
                         slow path.  The replacement character is stored
                         over the first '?', and S is left on the last
                         character of the trigraph so that the slow
                         path's pre-increment resumes just after it.  */
                      d = (uchar *) s;
                      *d = _cpp_trigraph_map[s[2]];
                      s += 2;
                      goto slow_path;
                    }
                }
              /* Not a trigraph.  Continue on fast-path.  */
              s++;
            }
          else
            break;
        }

      /* This must be \r or \n.  We're either done, or we'll be forced
         to write back to the buffer and continue on the slow path.  */
      d = (uchar *) s;

      /* Stopped exactly at the buffer limit: nothing more to look at.  */
      if (__builtin_expect (s == buffer->rlimit, false))
        goto done;

      /* DOS line ending? */
      if (__builtin_expect (c == '\r', false) && s[1] == '\n')
        {
          s++;
          if (s == buffer->rlimit)
            goto done;
        }

      /* Without a backslash on the line there can be no escaped
         newline.  */
      if (__builtin_expect (pbackslash == NULL, true))
        goto done;

      /* Check for escaped newline.  Walk back from the line end over
         non-vertical whitespace; the scan needs no lower bound because
         PBACKSLASH lies earlier on this line and a backslash is not
         such whitespace.  */
      p = d;
      while (is_nvspace (p[-1]))
        p--;
      if (p - 1 != pbackslash)
        goto done;

      /* Have an escaped newline; process it and proceed to
         the slow path.  The note type is ' ' when whitespace separated
         the backslash from the newline and '\\' otherwise.  D is set so
         that the next character copied overwrites the backslash, and
         next_line is reused to mark where the continued line resumes;
         the slow path uses it to bound its backward scans.  */
      add_line_note (buffer, p - 1, p != d ? ' ' : '\\');
      d = p - 2;
      buffer->next_line = p - 1;

    slow_path:
      /* S is the read position and D the write position.  Both are
         advanced before use, so characters are copied down over any
         escaped newlines and converted trigraphs already removed.  */
      while (1)
        {
          c = *++s;
          *++d = c;

          if (c == '\n' || c == '\r')
            {
              /* Handle DOS line endings.  */
              if (c == '\r' && s != buffer->rlimit && s[1] == '\n')
                s++;
              if (s == buffer->rlimit)
                break;

              /* Escaped?  */
              p = d;
              while (p != buffer->next_line && is_nvspace (p[-1]))
                p--;
              if (p == buffer->next_line || p[-1] != '\\')
                break;

              /* Another escaped newline: note it and drop it just as
                 on the fast path.  */
              add_line_note (buffer, p - 1, p != d ? ' ': '\\');
              d = p - 2;
              buffer->next_line = p - 1;
            }
          else if (c == '?' && s[1] == '?' && _cpp_trigraph_map[s[2]])
            {
              /* Add a note regardless, for the benefit of -Wtrigraphs.
                 The note is attached to the write position D.  */
              add_line_note (buffer, d, s[2]);
              if (CPP_OPTION (pfile, trigraphs))
                {
                  /* Replace the '?' just copied and skip the other two
                     characters of the trigraph.  */
                  *d = _cpp_trigraph_map[s[2]];
                  s += 2;
                }
            }
        }
    }
  else
    {
      /* Just find the end of the line.  There is no check against
         buffer->rlimit in this loop; it stops only at '\n' or '\r'.  */
      while (*s != '\n' && *s != '\r')
        s++;
      d = (uchar *) s;

      /* Handle DOS line endings.  */
      if (*s == '\r' && s + 1 != buffer->rlimit && s[1] == '\n')
        s++;
    }

 done:
  /* Terminate the cleaned line at the write position.  */
  *d = '\n';
  /* A sentinel note that should never be processed.  */
  add_line_note (buffer, d + 1, '\n');
  buffer->next_line = s + 1;
}
1075
1076 /* Return true if the trigraph indicated by NOTE should be warned
1077    about in a comment.  */
1078 static bool
1079 warn_in_comment (cpp_reader *pfile, _cpp_line_note *note)
1080 {
1081   const uchar *p;
1082
1083   /* Within comments we don't warn about trigraphs, unless the
1084      trigraph forms an escaped newline, as that may change
1085      behavior.  */
1086   if (note->type != '/')
1087     return false;
1088
1089   /* If -trigraphs, then this was an escaped newline iff the next note
1090      is coincident.  */
1091   if (CPP_OPTION (pfile, trigraphs))
1092     return note[1].pos == note->pos;
1093
1094   /* Otherwise, see if this forms an escaped newline.  */
1095   p = note->pos + 3;
1096   while (is_nvspace (*p))
1097     p++;
1098
1099   /* There might have been escaped newlines between the trigraph and the
1100      newline we found.  Hence the position test.  */
1101   return (*p == '\n' && p < note[1].pos);
1102 }
1103
1104 /* Process the notes created by add_line_note as far as the current
1105    location.  */
1106 void
1107 _cpp_process_line_notes (cpp_reader *pfile, int in_comment)
1108 {
1109   cpp_buffer *buffer = pfile->buffer;
1110
1111   for (;;)
1112     {
1113       _cpp_line_note *note = &buffer->notes[buffer->cur_note];
1114       unsigned int col;
1115
1116       if (note->pos > buffer->cur)
1117         break;
1118
1119       buffer->cur_note++;
1120       col = CPP_BUF_COLUMN (buffer, note->pos + 1);
1121
1122       if (note->type == '\\' || note->type == ' ')
1123         {
1124           if (note->type == ' ' && !in_comment)
1125             cpp_error_with_line (pfile, CPP_DL_WARNING, pfile->line_table->highest_line, col,
1126                                  "backslash and newline separated by space");
1127
1128           if (buffer->next_line > buffer->rlimit)
1129             {
1130               cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line, col,
1131                                    "backslash-newline at end of file");
1132               /* Prevent "no newline at end of file" warning.  */
1133               buffer->next_line = buffer->rlimit;
1134             }
1135
1136           buffer->line_base = note->pos;
1137           CPP_INCREMENT_LINE (pfile, 0);
1138         }
1139       else if (_cpp_trigraph_map[note->type])
1140         {
1141           if (CPP_OPTION (pfile, warn_trigraphs)
1142               && (!in_comment || warn_in_comment (pfile, note)))
1143             {
1144               if (CPP_OPTION (pfile, trigraphs))
1145                 cpp_warning_with_line (pfile, CPP_W_TRIGRAPHS,
1146                                        pfile->line_table->highest_line, col,
1147                                        "trigraph ??%c converted to %c",
1148                                        note->type,
1149                                        (int) _cpp_trigraph_map[note->type]);
1150               else
1151                 {
1152                   cpp_warning_with_line
1153                     (pfile, CPP_W_TRIGRAPHS,
1154                      pfile->line_table->highest_line, col,
1155                      "trigraph ??%c ignored, use -trigraphs to enable",
1156                      note->type);
1157                 }
1158             }
1159         }
1160       else if (note->type == 0)
1161         /* Already processed in lex_raw_string.  */;
1162       else
1163         abort ();
1164     }
1165 }
1166
1167 /* Skip a C-style block comment.  We find the end of the comment by
1168    seeing if an asterisk is before every '/' we encounter.  Returns
1169    nonzero if comment terminated by EOF, zero otherwise.
1170
1171    Buffer->cur points to the initial asterisk of the comment.  */
1172 bool
1173 _cpp_skip_block_comment (cpp_reader *pfile)
1174 {
1175   cpp_buffer *buffer = pfile->buffer;
1176   const uchar *cur = buffer->cur;
1177   uchar c;
1178
1179   cur++;
1180   if (*cur == '/')
1181     cur++;
1182
1183   for (;;)
1184     {
1185       /* People like decorating comments with '*', so check for '/'
1186          instead for efficiency.  */
1187       c = *cur++;
1188
1189       if (c == '/')
1190         {
1191           if (cur[-2] == '*')
1192             break;
1193
1194           /* Warn about potential nested comments, but not if the '/'
1195              comes immediately before the true comment delimiter.
1196              Don't bother to get it right across escaped newlines.  */
1197           if (CPP_OPTION (pfile, warn_comments)
1198               && cur[0] == '*' && cur[1] != '/')
1199             {
1200               buffer->cur = cur;
1201               cpp_warning_with_line (pfile, CPP_W_COMMENTS,
1202                                      pfile->line_table->highest_line,
1203                                      CPP_BUF_COL (buffer),
1204                                      "\"/*\" within comment");
1205             }
1206         }
1207       else if (c == '\n')
1208         {
1209           unsigned int cols;
1210           buffer->cur = cur - 1;
1211           _cpp_process_line_notes (pfile, true);
1212           if (buffer->next_line >= buffer->rlimit)
1213             return true;
1214           _cpp_clean_line (pfile);
1215
1216           cols = buffer->next_line - buffer->line_base;
1217           CPP_INCREMENT_LINE (pfile, cols);
1218
1219           cur = buffer->cur;
1220         }
1221     }
1222
1223   buffer->cur = cur;
1224   _cpp_process_line_notes (pfile, true);
1225   return false;
1226 }
1227
1228 /* Skip a C++ line comment, leaving buffer->cur pointing to the
1229    terminating newline.  Handles escaped newlines.  Returns nonzero
1230    if a multiline comment.  */
1231 static int
1232 skip_line_comment (cpp_reader *pfile)
1233 {
1234   cpp_buffer *buffer = pfile->buffer;
1235   location_t orig_line = pfile->line_table->highest_line;
1236
1237   while (*buffer->cur != '\n')
1238     buffer->cur++;
1239
1240   _cpp_process_line_notes (pfile, true);
1241   return orig_line != pfile->line_table->highest_line;
1242 }
1243
1244 /* Skips whitespace, saving the next non-whitespace character.  */
1245 static void
1246 skip_whitespace (cpp_reader *pfile, cppchar_t c)
1247 {
1248   cpp_buffer *buffer = pfile->buffer;
1249   bool saw_NUL = false;
1250
1251   do
1252     {
1253       /* Horizontal space always OK.  */
1254       if (c == ' ' || c == '\t')
1255         ;
1256       /* Just \f \v or \0 left.  */
1257       else if (c == '\0')
1258         saw_NUL = true;
1259       else if (pfile->state.in_directive && CPP_PEDANTIC (pfile))
1260         cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line,
1261                              CPP_BUF_COL (buffer),
1262                              "%s in preprocessing directive",
1263                              c == '\f' ? "form feed" : "vertical tab");
1264
1265       c = *buffer->cur++;
1266     }
1267   /* We only want non-vertical space, i.e. ' ' \t \f \v \0.  */
1268   while (is_nvspace (c));
1269
1270   if (saw_NUL)
1271     {
1272       encoding_rich_location rich_loc (pfile);
1273       cpp_error_at (pfile, CPP_DL_WARNING, &rich_loc,
1274                     "null character(s) ignored");
1275     }
1276
1277   buffer->cur--;
1278 }
1279
1280 /* See if the characters of a number token are valid in a name (no
1281    '.', '+' or '-').  */
1282 static int
1283 name_p (cpp_reader *pfile, const cpp_string *string)
1284 {
1285   unsigned int i;
1286
1287   for (i = 0; i < string->len; i++)
1288     if (!is_idchar (string->text[i]))
1289       return 0;
1290
1291   return 1;
1292 }
1293
1294 /* After parsing an identifier or other sequence, produce a warning about
1295    sequences not in NFC/NFKC.  */
1296 static void
1297 warn_about_normalization (cpp_reader *pfile,
1298                           const cpp_token *token,
1299                           const struct normalize_state *s)
1300 {
1301   if (CPP_OPTION (pfile, warn_normalize) < NORMALIZE_STATE_RESULT (s)
1302       && !pfile->state.skipping)
1303     {
1304       location_t loc = token->src_loc;
1305
1306       /* If possible, create a location range for the token.  */
1307       if (loc >= RESERVED_LOCATION_COUNT
1308           && token->type != CPP_EOF
1309           /* There must be no line notes to process.  */
1310           && (!(pfile->buffer->cur
1311                 >= pfile->buffer->notes[pfile->buffer->cur_note].pos
1312                 && !pfile->overlaid_buffer)))
1313         {
1314           source_range tok_range;
1315           tok_range.m_start = loc;
1316           tok_range.m_finish
1317             = linemap_position_for_column (pfile->line_table,
1318                                            CPP_BUF_COLUMN (pfile->buffer,
1319                                                            pfile->buffer->cur));
1320           loc = COMBINE_LOCATION_DATA (pfile->line_table,
1321                                        loc, tok_range, NULL);
1322         }
1323
1324       encoding_rich_location rich_loc (pfile, loc);
1325
1326       /* Make sure that the token is printed using UCNs, even
1327          if we'd otherwise happily print UTF-8.  */
1328       unsigned char *buf = XNEWVEC (unsigned char, cpp_token_len (token));
1329       size_t sz;
1330
1331       sz = cpp_spell_token (pfile, token, buf, false) - buf;
1332       if (NORMALIZE_STATE_RESULT (s) == normalized_C)
1333         cpp_warning_at (pfile, CPP_W_NORMALIZE, &rich_loc,
1334                         "`%.*s' is not in NFKC", (int) sz, buf);
1335       else if (CPP_OPTION (pfile, cxx23_identifiers))
1336         cpp_pedwarning_at (pfile, CPP_W_NORMALIZE, &rich_loc,
1337                                   "`%.*s' is not in NFC", (int) sz, buf);
1338       else
1339         cpp_warning_at (pfile, CPP_W_NORMALIZE, &rich_loc,
1340                         "`%.*s' is not in NFC", (int) sz, buf);
1341       free (buf);
1342     }
1343 }
1344
1345 static const cppchar_t utf8_signifier = 0xC0;
1346
1347 /* Returns TRUE if the sequence starting at buffer->cur is valid in
1348    an identifier.  FIRST is TRUE if this starts an identifier.  */
1349 static bool
1350 forms_identifier_p (cpp_reader *pfile, int first,
1351                     struct normalize_state *state)
1352 {
1353   cpp_buffer *buffer = pfile->buffer;
1354
1355   if (*buffer->cur == '$')
1356     {
1357       if (!CPP_OPTION (pfile, dollars_in_ident))
1358         return false;
1359
1360       buffer->cur++;
1361       if (CPP_OPTION (pfile, warn_dollars) && !pfile->state.skipping)
1362         {
1363           CPP_OPTION (pfile, warn_dollars) = 0;
1364           cpp_error (pfile, CPP_DL_PEDWARN, "'$' in identifier or number");
1365         }
1366
1367       return true;
1368     }
1369
1370   /* Is this a syntactically valid UCN or a valid UTF-8 char?  */
1371   if (CPP_OPTION (pfile, extended_identifiers))
1372     {
1373       cppchar_t s;
1374       if (*buffer->cur >= utf8_signifier)
1375         {
1376           if (_cpp_valid_utf8 (pfile, &buffer->cur, buffer->rlimit, 1 + !first,
1377                                state, &s))
1378             return true;
1379         }
1380       else if (*buffer->cur == '\\'
1381                && (buffer->cur[1] == 'u' || buffer->cur[1] == 'U'))
1382         {
1383           buffer->cur += 2;
1384           if (_cpp_valid_ucn (pfile, &buffer->cur, buffer->rlimit, 1 + !first,
1385                               state, &s, NULL, NULL))
1386             return true;
1387           buffer->cur -= 2;
1388         }
1389     }
1390
1391   return false;
1392 }
1393
1394 /* Helper function to issue error about improper __VA_OPT__ use.  */
1395 static void
1396 maybe_va_opt_error (cpp_reader *pfile)
1397 {
1398   if (CPP_PEDANTIC (pfile) && !CPP_OPTION (pfile, va_opt))
1399     {
1400       /* __VA_OPT__ should not be accepted at all, but allow it in
1401          system headers.  */
1402       if (!_cpp_in_system_header (pfile))
1403         cpp_error (pfile, CPP_DL_PEDWARN,
1404                    "__VA_OPT__ is not available until C++20");
1405     }
1406   else if (!pfile->state.va_args_ok)
1407     {
1408       /* __VA_OPT__ should only appear in the replacement list of a
1409          variadic macro.  */
1410       cpp_error (pfile, CPP_DL_PEDWARN,
1411                  "__VA_OPT__ can only appear in the expansion"
1412                  " of a C++20 variadic macro");
1413     }
1414 }
1415
1416 /* Helper function to get the cpp_hashnode of the identifier BASE.  */
1417 static cpp_hashnode *
1418 lex_identifier_intern (cpp_reader *pfile, const uchar *base)
1419 {
1420   cpp_hashnode *result;
1421   const uchar *cur;
1422   unsigned int len;
1423   unsigned int hash = HT_HASHSTEP (0, *base);
1424
1425   cur = base + 1;
1426   while (ISIDNUM (*cur))
1427     {
1428       hash = HT_HASHSTEP (hash, *cur);
1429       cur++;
1430     }
1431   len = cur - base;
1432   hash = HT_HASHFINISH (hash, len);
1433   result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
1434                                               base, len, hash, HT_ALLOC));
1435
1436   /* Rarely, identifiers require diagnostics when lexed.  */
1437   if (__builtin_expect ((result->flags & NODE_DIAGNOSTIC)
1438                         && !pfile->state.skipping, 0))
1439     {
1440       /* It is allowed to poison the same identifier twice.  */
1441       if ((result->flags & NODE_POISONED) && !pfile->state.poisoned_ok)
1442         cpp_error (pfile, CPP_DL_ERROR, "attempt to use poisoned \"%s\"",
1443                    NODE_NAME (result));
1444
1445       /* Constraint 6.10.3.5: __VA_ARGS__ should only appear in the
1446          replacement list of a variadic macro.  */
1447       if (result == pfile->spec_nodes.n__VA_ARGS__
1448           && !pfile->state.va_args_ok)
1449         {
1450           if (CPP_OPTION (pfile, cplusplus))
1451             cpp_error (pfile, CPP_DL_PEDWARN,
1452                        "__VA_ARGS__ can only appear in the expansion"
1453                        " of a C++11 variadic macro");
1454           else
1455             cpp_error (pfile, CPP_DL_PEDWARN,
1456                        "__VA_ARGS__ can only appear in the expansion"
1457                        " of a C99 variadic macro");
1458         }
1459
1460       if (result == pfile->spec_nodes.n__VA_OPT__)
1461         maybe_va_opt_error (pfile);
1462
1463       /* For -Wc++-compat, warn about use of C++ named operators.  */
1464       if (result->flags & NODE_WARN_OPERATOR)
1465         cpp_warning (pfile, CPP_W_CXX_OPERATOR_NAMES,
1466                      "identifier \"%s\" is a special operator name in C++",
1467                      NODE_NAME (result));
1468     }
1469
1470   return result;
1471 }
1472
1473 /* Get the cpp_hashnode of an identifier specified by NAME in
1474    the current cpp_reader object.  If none is found, NULL is returned.  */
1475 cpp_hashnode *
1476 _cpp_lex_identifier (cpp_reader *pfile, const char *name)
1477 {
1478   cpp_hashnode *result;
1479   result = lex_identifier_intern (pfile, (uchar *) name);
1480   return result;
1481 }
1482
1483 /* Lex an identifier starting at BUFFER->CUR - 1.  */
1484 static cpp_hashnode *
1485 lex_identifier (cpp_reader *pfile, const uchar *base, bool starts_ucn,
1486                 struct normalize_state *nst, cpp_hashnode **spelling)
1487 {
1488   cpp_hashnode *result;
1489   const uchar *cur;
1490   unsigned int len;
1491   unsigned int hash = HT_HASHSTEP (0, *base);
1492
1493   cur = pfile->buffer->cur;
1494   if (! starts_ucn)
1495     {
1496       while (ISIDNUM (*cur))
1497         {
1498           hash = HT_HASHSTEP (hash, *cur);
1499           cur++;
1500         }
1501       NORMALIZE_STATE_UPDATE_IDNUM (nst, *(cur - 1));
1502     }
1503   pfile->buffer->cur = cur;
1504   if (starts_ucn || forms_identifier_p (pfile, false, nst))
1505     {
1506       /* Slower version for identifiers containing UCNs
1507          or extended chars (including $).  */
1508       do {
1509         while (ISIDNUM (*pfile->buffer->cur))
1510           {
1511             NORMALIZE_STATE_UPDATE_IDNUM (nst, *pfile->buffer->cur);
1512             pfile->buffer->cur++;
1513           }
1514       } while (forms_identifier_p (pfile, false, nst));
1515       result = _cpp_interpret_identifier (pfile, base,
1516                                           pfile->buffer->cur - base);
1517       *spelling = cpp_lookup (pfile, base, pfile->buffer->cur - base);
1518     }
1519   else
1520     {
1521       len = cur - base;
1522       hash = HT_HASHFINISH (hash, len);
1523
1524       result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
1525                                                   base, len, hash, HT_ALLOC));
1526       *spelling = result;
1527     }
1528
1529   /* Rarely, identifiers require diagnostics when lexed.  */
1530   if (__builtin_expect ((result->flags & NODE_DIAGNOSTIC)
1531                         && !pfile->state.skipping, 0))
1532     {
1533       /* It is allowed to poison the same identifier twice.  */
1534       if ((result->flags & NODE_POISONED) && !pfile->state.poisoned_ok)
1535         cpp_error (pfile, CPP_DL_ERROR, "attempt to use poisoned \"%s\"",
1536                    NODE_NAME (result));
1537
1538       /* Constraint 6.10.3.5: __VA_ARGS__ should only appear in the
1539          replacement list of a variadic macro.  */
1540       if (result == pfile->spec_nodes.n__VA_ARGS__
1541           && !pfile->state.va_args_ok)
1542         {
1543           if (CPP_OPTION (pfile, cplusplus))
1544             cpp_error (pfile, CPP_DL_PEDWARN,
1545                        "__VA_ARGS__ can only appear in the expansion"
1546                        " of a C++11 variadic macro");
1547           else
1548             cpp_error (pfile, CPP_DL_PEDWARN,
1549                        "__VA_ARGS__ can only appear in the expansion"
1550                        " of a C99 variadic macro");
1551         }
1552
1553       /* __VA_OPT__ should only appear in the replacement list of a
1554          variadic macro.  */
1555       if (result == pfile->spec_nodes.n__VA_OPT__)
1556         maybe_va_opt_error (pfile);
1557
1558       /* For -Wc++-compat, warn about use of C++ named operators.  */
1559       if (result->flags & NODE_WARN_OPERATOR)
1560         cpp_warning (pfile, CPP_W_CXX_OPERATOR_NAMES,
1561                      "identifier \"%s\" is a special operator name in C++",
1562                      NODE_NAME (result));
1563     }
1564
1565   return result;
1566 }
1567
1568 /* Lex a number to NUMBER starting at BUFFER->CUR - 1.  */
1569 static void
1570 lex_number (cpp_reader *pfile, cpp_string *number,
1571             struct normalize_state *nst)
1572 {
1573   const uchar *cur;
1574   const uchar *base;
1575   uchar *dest;
1576
1577   base = pfile->buffer->cur - 1;
1578   do
1579     {
1580       const uchar *adj_digit_sep = NULL;
1581       cur = pfile->buffer->cur;
1582
1583       /* N.B. ISIDNUM does not include $.  */
1584       while (ISIDNUM (*cur)
1585              || (*cur == '.' && !DIGIT_SEP (cur[-1]))
1586              || DIGIT_SEP (*cur)
1587              || (VALID_SIGN (*cur, cur[-1]) && !DIGIT_SEP (cur[-2])))
1588         {
1589           NORMALIZE_STATE_UPDATE_IDNUM (nst, *cur);
1590           /* Adjacent digit separators do not form part of the pp-number syntax.
1591              However, they can safely be diagnosed here as an error, since '' is
1592              not a valid preprocessing token.  */
1593           if (DIGIT_SEP (*cur) && DIGIT_SEP (cur[-1]) && !adj_digit_sep)
1594             adj_digit_sep = cur;
1595           cur++;
1596         }
1597       /* A number can't end with a digit separator.  */
1598       while (cur > pfile->buffer->cur && DIGIT_SEP (cur[-1]))
1599         --cur;
1600       if (adj_digit_sep && adj_digit_sep < cur)
1601         cpp_error (pfile, CPP_DL_ERROR, "adjacent digit separators");
1602
1603       pfile->buffer->cur = cur;
1604     }
1605   while (forms_identifier_p (pfile, false, nst));
1606
1607   number->len = cur - base;
1608   dest = _cpp_unaligned_alloc (pfile, number->len + 1);
1609   memcpy (dest, base, number->len);
1610   dest[number->len] = '\0';
1611   number->text = dest;
1612 }
1613
1614 /* Create a token of type TYPE with a literal spelling.  */
1615 static void
1616 create_literal (cpp_reader *pfile, cpp_token *token, const uchar *base,
1617                 unsigned int len, enum cpp_ttype type)
1618 {
1619   token->type = type;
1620   token->val.str.len = len;
1621   token->val.str.text = cpp_alloc_token_string (pfile, base, len);
1622 }
1623
1624 const uchar *
1625 cpp_alloc_token_string (cpp_reader *pfile,
1626                         const unsigned char *ptr, unsigned len)
1627 {
1628   uchar *dest = _cpp_unaligned_alloc (pfile, len + 1);
1629
1630   dest[len] = 0;
1631   memcpy (dest, ptr, len);
1632   return dest;
1633 }
1634
1635 /* A pair of raw buffer pointers.  The currently open one is [1], the
1636    first one is [0].  Used for string literal lexing.  */
1637 struct lit_accum {
1638   _cpp_buff *first;
1639   _cpp_buff *last;
1640   const uchar *rpos;
1641   size_t accum;
1642
1643   lit_accum ()
1644     : first (NULL), last (NULL), rpos (0), accum (0)
1645   {
1646   }
1647
1648   void append (cpp_reader *, const uchar *, size_t);
1649
1650   void read_begin (cpp_reader *);
1651   bool reading_p () const
1652   {
1653     return rpos != NULL;
1654   }
1655   char read_char ()
1656   {
1657     char c = *rpos++;
1658     if (rpos == BUFF_FRONT (last))
1659       rpos = NULL;
1660     return c;
1661   }
1662 };
1663
1664 /* Subroutine of lex_raw_string: Append LEN chars from BASE to the buffer
1665    sequence from *FIRST_BUFF_P to LAST_BUFF_P.  */
1666
1667 void
1668 lit_accum::append (cpp_reader *pfile, const uchar *base, size_t len)
1669 {
1670   if (!last)
1671     /* Starting.  */
1672     first = last = _cpp_get_buff (pfile, len);
1673   else if (len > BUFF_ROOM (last))
1674     {
1675       /* There is insufficient room in the buffer.  Copy what we can,
1676          and then either extend or create a new one.  */
1677       size_t room = BUFF_ROOM (last);
1678       memcpy (BUFF_FRONT (last), base, room);
1679       BUFF_FRONT (last) += room;
1680       base += room;
1681       len -= room;
1682       accum += room;
1683
1684       gcc_checking_assert (!rpos);
1685
1686       last = _cpp_append_extend_buff (pfile, last, len);
1687     }
1688
1689   memcpy (BUFF_FRONT (last), base, len);
1690   BUFF_FRONT (last) += len;
1691   accum += len;
1692 }
1693
1694 void
1695 lit_accum::read_begin (cpp_reader *pfile)
1696 {
1697   /* We never accumulate more than 4 chars to read.  */
1698   if (BUFF_ROOM (last) < 4)
1699
1700     last = _cpp_append_extend_buff (pfile, last, 4);
1701   rpos = BUFF_FRONT (last);
1702 }
1703
1704 /* Returns true if a macro has been defined.
1705    This might not work if compile with -save-temps,
1706    or preprocess separately from compilation.  */
1707
1708 static bool
1709 is_macro(cpp_reader *pfile, const uchar *base)
1710 {
1711   const uchar *cur = base;
1712   if (! ISIDST (*cur))
1713     return false;
1714   unsigned int hash = HT_HASHSTEP (0, *cur);
1715   ++cur;
1716   while (ISIDNUM (*cur))
1717     {
1718       hash = HT_HASHSTEP (hash, *cur);
1719       ++cur;
1720     }
1721   hash = HT_HASHFINISH (hash, cur - base);
1722
1723   cpp_hashnode *result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
1724                                         base, cur - base, hash, HT_NO_INSERT));
1725
1726   return result && cpp_macro_p (result);
1727 }
1728
1729 /* Returns true if a literal suffix does not have the expected form
1730    and is defined as a macro.  */
1731
1732 static bool
1733 is_macro_not_literal_suffix(cpp_reader *pfile, const uchar *base)
1734 {
1735   /* User-defined literals outside of namespace std must start with a single
1736      underscore, so assume anything of that form really is a UDL suffix.
1737      We don't need to worry about UDLs defined inside namespace std because
1738      their names are reserved, so cannot be used as macro names in valid
1739      programs.  */
1740   if (base[0] == '_' && base[1] != '_')
1741     return false;
1742   return is_macro (pfile, base);
1743 }
1744
1745 /* Lexes a raw string.  The stored string contains the spelling,
1746    including double quotes, delimiter string, '(' and ')', any leading
1747    'L', 'u', 'U' or 'u8' and 'R' modifier.  The created token contains
1748    the type of the literal, or CPP_OTHER if it was not properly
1749    terminated.
1750
1751    BASE is the start of the token.  Updates pfile->buffer->cur to just
1752    after the lexed string.
1753
1754    The spelling is NUL-terminated, but it is not guaranteed that this
1755    is the first NUL since embedded NULs are preserved.  */
1756
1757 static void
1758 lex_raw_string (cpp_reader *pfile, cpp_token *token, const uchar *base)
1759 {
       /* POS is the read cursor into the current line buffer.  BASE is
          advanced whenever text is copied into ACCUM, so [BASE, POS) is
          always the part of the spelling that has not been copied yet.  */
1760   const uchar *pos = base;
1761
1762   /* 'tis a pity this information isn't passed down from the lexer's
1763      initial categorization of the token.  */
1764   enum cpp_ttype type = CPP_STRING;
1765
       /* Re-derive the literal type from the encoding prefix (L, U, u or
          u8) and step POS over it.  With no prefix TYPE stays CPP_STRING.  */
1766   if (*pos == 'L')
1767     {
1768       type = CPP_WSTRING;
1769       pos++;
1770     }
1771   else if (*pos == 'U')
1772     {
1773       type = CPP_STRING32;
1774       pos++;
1775     }
1776   else if (*pos == 'u')
1777     {
1778       if (pos[1] == '8')
1779         {
1780           type = CPP_UTF8STRING;
1781           pos++;
1782         }
1783       else
1784         type = CPP_STRING16;
1785       pos++;
1786     }
1787
       /* The caller only gets here for R" ; step over both characters.  */
1788   gcc_checking_assert (pos[0] == 'R' && pos[1] == '"');
1789   pos += 2;
1790
       /* Line notes mark the positions of escaped newlines and trigraphs
          (see the switch below).  Start from the buffer's current note.  */
1791   _cpp_line_note *note = &pfile->buffer->notes[pfile->buffer->cur_note];
1792
1793   /* Skip notes before the ".  */
1794   while (note->pos < pos)
1795     ++note;
1796
       /* ACCUM collects the pieces of the spelling that cannot be taken
          directly from one contiguous stretch of the line buffer.
          NOTE(review): lit_accum is defined outside this function; its
          exact semantics should be confirmed there.  */
1797   lit_accum accum;
1798
       /* PREFIX holds the delimiter (at most 16 characters, enforced
          below) followed by the closing '"', hence 17 bytes.  Once the
          '(' has been seen it is exactly the text that must follow a ')'
          to end the literal.  PREFIX_LEN == 0 after the '(' means the
          delimiter was invalid (failure mode).  */
1799   uchar prefix[17];
1800   unsigned prefix_len = 0;
       /* PHASE_PREFIX: still reading the delimiter, before the '('.
          PHASE_NONE: inside the string body, not matching a terminator.
          PHASE_SUFFIX and above: a ')' was seen and PHASE is the index in
          PREFIX of the next character expected.  */
1801   enum Phase
1802   {
1803    PHASE_PREFIX = -2,
1804    PHASE_NONE = -1,
1805    PHASE_SUFFIX = 0
1806   } phase = PHASE_PREFIX;
1807
1808   for (;;)
1809     {
1810       gcc_checking_assert (note->pos >= pos);
1811
1812       /* Undo any escaped newlines and trigraphs.  */
1813       if (!accum.reading_p () && note->pos == pos)
1814         switch (note->type)
1815           {
1816           case '\\':
1817           case ' ':
1818             /* Restore backslash followed by newline.  */
1819             accum.append (pfile, base, pos - base);
1820             base = pos;
1821             accum.read_begin (pfile);
1822             accum.append (pfile, UC"\\", 1);
1823
1824           after_backslash:
1825             if (note->type == ' ')
1826               /* GNU backslash whitespace newline extension.  FIXME
1827                  could be any sequence of non-vertical space.  When we
1828                  can properly restore any such sequence, we should
1829                  mark this note as handled so _cpp_process_line_notes
1830                  doesn't warn.  */
1831               accum.append (pfile, UC" ", 1);
1832
1833             accum.append (pfile, UC"\n", 1);
1834             note++;
1835             break;
1836
1837           case '\n':
1838             /* This can happen for ??/<NEWLINE> when trigraphs are not
1839                being interpreted.  */
1840             gcc_checking_assert (!CPP_OPTION (pfile, trigraphs));
1841             note->type = 0;
1842             note++;
1843             break;
1844
1845           default:
1846             gcc_checking_assert (_cpp_trigraph_map[note->type]);
1847
1848             /* Don't warn about this trigraph in
1849                _cpp_process_line_notes, since trigraphs show up as
1850                trigraphs in raw strings.  */
1851             uchar type = note->type;
1852             note->type = 0;
1853
                 /* Only when trigraphs are enabled was the sequence replaced
                    in the buffer, so only then is there anything to undo.  */
1854             if (CPP_OPTION (pfile, trigraphs))
1855               {
1856                 accum.append (pfile, base, pos - base);
1857                 base = pos;
1858                 accum.read_begin (pfile);
1859                 accum.append (pfile, UC"??", 2);
1860                 accum.append (pfile, &type, 1);
1861
1862                 /* ??/ followed by newline gets two line notes, one for
1863                    the trigraph and one for the backslash/newline.  */
1864                 if (type == '/' && note[1].pos == pos)
1865                   {
1866                     note++;
1867                     gcc_assert (note->type == '\\' || note->type == ' ');
1868                     goto after_backslash;
1869                   }
1870                 /* Skip the replacement character.  */
1871                 base = ++pos;
1872               }
1873
1874             note++;
1875             break;
1876           }
1877
1878       /* Now get a char to process.  Either from an expanded note, or
1879          from the line buffer.  */
1880       bool read_note = accum.reading_p ();
1881       char c = read_note ? accum.read_char () : *pos++;
1882
1883       if (phase == PHASE_PREFIX)
1884         {
1885           if (c == '(')
1886             {
1887               /* Done.  Append the closing quote so that PREFIX is the
                      complete text expected after a ')'.  */
1888               phase = PHASE_NONE;
1889               prefix[prefix_len++] = '"';
1890             }
1891           else if (prefix_len < 16
1892                    /* Prefix chars are any of the basic character set,
1893                       [lex.charset] except for '
1894                       ()\\\t\v\f\n'. Optimized for a contiguous
1895                       alphabet.  */
1896                    /* Unlike a switch, this collapses down to one or
1897                       two shift and bitmask operations on an ASCII
1898                       system, with an outlier or two.   */
1899                    && (('Z' - 'A' == 25
1900                         ? ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z'))
1901                         : ISIDST (c))
1902                        || (c >= '0' && c <= '9')
1903                        || c == '_' || c == '{' || c == '}'
1904                        || c == '[' || c == ']' || c == '#'
1905                        || c == '<' || c == '>' || c == '%'
1906                        || c == ':' || c == ';' || c == '.' || c == '?'
1907                        || c == '*' || c == '+' || c == '-' || c == '/'
1908                        || c == '^' || c == '&' || c == '|' || c == '~'
1909                        || c == '!' || c == '=' || c == ','
1910                        || c == '"' || c == '\''))
1911             prefix[prefix_len++] = c;
1912           else
1913             {
1914               /* Something is wrong.  */
1915               int col = CPP_BUF_COLUMN (pfile->buffer, pos) + read_note;
1916               if (prefix_len == 16)
1917                 cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc,
1918                                      col, "raw string delimiter longer "
1919                                      "than 16 characters");
1920               else if (c == '\n')
1921                 cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc,
1922                                      col, "invalid new-line in raw "
1923                                      "string delimiter");
1924               else
1925                 cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc,
1926                                      col, "invalid character '%c' in "
1927                                      "raw string delimiter", c);
1928               type = CPP_OTHER;
1929               phase = PHASE_NONE;
1930               /* Continue until we get a close quote, that's probably
1931                  the best failure mode.  */
1932               prefix_len = 0;
1933             }
               /* A newline is not consumed here: fall through so that the
                  end-of-line handling below gets to see it.  */
1934           if (c != '\n')
1935             continue;
1936         }
1937
           /* A ')' was seen earlier: compare C with the next character of
              the expected terminator.  On a mismatch matching is abandoned
              and C is reconsidered by the code below (so a ')' can restart
              the match).  */
1938       if (phase != PHASE_NONE)
1939         {
1940           if (prefix[phase] != c)
1941             phase = PHASE_NONE;
1942           else if (unsigned (phase + 1) == prefix_len)
                 /* The whole terminator matched; the literal is complete.  */
1943             break;
1944           else
1945             {
1946               phase = Phase (phase + 1);
1947               continue;
1948             }
1949         }
1950
1951       if (!prefix_len && c == '"')
1952         /* Failure mode lexing.  */
1953         goto out;
1954       else if (prefix_len && c == ')')
             /* Possible start of the terminator.  */
1955         phase = PHASE_SUFFIX;
1956       else if (!read_note && c == '\n')
1957         {
               /* End of the current line inside the literal.  Back up so
                  that POS, and the buffer's cursor, sit on the newline.  */
1958           pos--;
1959           pfile->buffer->cur = pos;
               /* No further line is fetched in a directive, nor while
                  parsing macro arguments once next_line has reached
                  rlimit: the raw string is unterminated.  */
1960           if (pfile->state.in_directive
1961               || (pfile->state.parsing_args
1962                   && pfile->buffer->next_line >= pfile->buffer->rlimit))
1963             {
1964               cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc, 0,
1965                                    "unterminated raw string");
1966               type = CPP_OTHER;
1967               goto out;
1968             }
1969
               /* Save the rest of this line, including the newline (the
                  + 1), before a fresh line is fetched.  */
1970           accum.append (pfile, base, pos - base + 1);
1971           _cpp_process_line_notes (pfile, false);
1972
1973           if (pfile->buffer->next_line < pfile->buffer->rlimit)
1974             CPP_INCREMENT_LINE (pfile, 0);
1975           pfile->buffer->need_line = true;
1976
1977           if (!_cpp_get_fresh_line (pfile))
1978             {
1979               /* We ran out of file and failed to get a line.  */
1980               location_t src_loc = token->src_loc;
1981               token->type = CPP_EOF;
1982               /* Tell the compiler the line number of the EOF token.  */
1983               token->src_loc = pfile->line_table->highest_line;
1984               token->flags = BOL;
1985               if (accum.first)
1986                 _cpp_release_buff (pfile, accum.first);
1987               cpp_error_with_line (pfile, CPP_DL_ERROR, src_loc, 0,
1988                                    "unterminated raw string");
1989               /* Now pop the buffer that _cpp_get_fresh_line did not.  */
1990               _cpp_pop_buffer (pfile);
1991               return;
1992             }
1993
               /* Continue lexing at the start of the new line, with the
                  note cursor reset to the buffer's current note.  */
1994           pos = base = pfile->buffer->cur;
1995           note = &pfile->buffer->notes[pfile->buffer->cur_note];
1996         }
1997     }
1998
       /* Reached only via the break above, i.e. for a properly terminated
          literal; the failure paths jump straight to OUT.  */
1999   if (CPP_OPTION (pfile, user_literals))
2000     {
2001       /* If a string format macro, say from inttypes.h, is placed touching
2002          a string literal it could be parsed as a C++11 user-defined string
2003          literal thus breaking the program.  */
2004       if (is_macro_not_literal_suffix (pfile, pos))
2005         {
2006           /* Raise a warning, but do not consume subsequent tokens.  */
2007           if (CPP_OPTION (pfile, warn_literal_suffix) && !pfile->state.skipping)
2008             cpp_warning_with_line (pfile, CPP_W_LITERAL_SUFFIX,
2009                                    token->src_loc, 0,
2010                                    "invalid suffix on literal; C++11 requires "
2011                                    "a space between literal and string macro");
2012         }
2013       /* Grab user defined literal suffix.  */
2014       else if (ISIDST (*pos))
2015         {
2016           type = cpp_userdef_string_add_type (type);
2017           ++pos;
2018
2019           while (ISIDNUM (*pos))
2020             ++pos;
2021         }
2022     }
2023
2024  out:
2025   pfile->buffer->cur = pos;
       /* Nothing accumulated: the spelling is the contiguous buffer range
          [BASE, POS).  Otherwise build it from the accumulated buffers in
          order, then the uncopied tail [BASE, POS), then a NUL.  */
2026   if (!accum.accum)
2027     create_literal (pfile, token, base, pos - base, type);
2028   else
2029     {
2030       size_t extra_len = pos - base;
2031       uchar *dest = _cpp_unaligned_alloc (pfile, accum.accum + extra_len + 1);
2032
2033       token->type = type;
2034       token->val.str.len = accum.accum + extra_len;
2035       token->val.str.text = dest;
2036       for (_cpp_buff *buf = accum.first; buf; buf = buf->next)
2037         {
2038           size_t len = BUFF_FRONT (buf) - buf->base;
2039           memcpy (dest, buf->base, len);
2040           dest += len;
2041         }
2042       _cpp_release_buff (pfile, accum.first);
2043       memcpy (dest, base, extra_len);
2044       dest[extra_len] = '\0';
2045     }
2046 }
2047
2048 /* Lexes a string, character constant, or angle-bracketed header file
2049    name.  The stored string contains the spelling, including opening
2050    quote and any leading 'L', 'u', 'U' or 'u8' and optional
2051    'R' modifier.  It returns the type of the literal, or CPP_OTHER
2052    if it was not properly terminated, or CPP_LESS for an unterminated
2053    header name which must be relexed as normal tokens.
2054
2055    The spelling is NUL-terminated, but it is not guaranteed that this
2056    is the first NUL since embedded NULs are preserved.  */
2057 static void
2058 lex_string (cpp_reader *pfile, cpp_token *token, const uchar *base)
2059 {
2060   bool saw_NUL = false;
2061   const uchar *cur;
2062   cppchar_t terminator;
2063   enum cpp_ttype type;
2064
2065   cur = base;
2066   terminator = *cur++;
2067   if (terminator == 'L' || terminator == 'U')
2068     terminator = *cur++;
2069   else if (terminator == 'u')
2070     {
2071       terminator = *cur++;
2072       if (terminator == '8')
2073         terminator = *cur++;
2074     }
2075   if (terminator == 'R')
2076     {
2077       lex_raw_string (pfile, token, base);
2078       return;
2079     }
2080   if (terminator == '"')
2081     type = (*base == 'L' ? CPP_WSTRING :
2082             *base == 'U' ? CPP_STRING32 :
2083             *base == 'u' ? (base[1] == '8' ? CPP_UTF8STRING : CPP_STRING16)
2084                          : CPP_STRING);
2085   else if (terminator == '\'')
2086     type = (*base == 'L' ? CPP_WCHAR :
2087             *base == 'U' ? CPP_CHAR32 :
2088             *base == 'u' ? (base[1] == '8' ? CPP_UTF8CHAR : CPP_CHAR16)
2089                          : CPP_CHAR);
2090   else
2091     terminator = '>', type = CPP_HEADER_NAME;
2092
2093   for (;;)
2094     {
2095       cppchar_t c = *cur++;
2096
2097       /* In #include-style directives, terminators are not escapable.  */
2098       if (c == '\\' && !pfile->state.angled_headers && *cur != '\n')
2099         cur++;
2100       else if (c == terminator)
2101         break;
2102       else if (c == '\n')
2103         {
2104           cur--;
2105           /* Unmatched quotes always yield undefined behavior, but
2106              greedy lexing means that what appears to be an unterminated
2107              header name may actually be a legitimate sequence of tokens.  */
2108           if (terminator == '>')
2109             {
2110               token->type = CPP_LESS;
2111               return;
2112             }
2113           type = CPP_OTHER;
2114           break;
2115         }
2116       else if (c == '\0')
2117         saw_NUL = true;
2118     }
2119
2120   if (saw_NUL && !pfile->state.skipping)
2121     cpp_error (pfile, CPP_DL_WARNING,
2122                "null character(s) preserved in literal");
2123
2124   if (type == CPP_OTHER && CPP_OPTION (pfile, lang) != CLK_ASM)
2125     cpp_error (pfile, CPP_DL_PEDWARN, "missing terminating %c character",
2126                (int) terminator);
2127
2128   if (CPP_OPTION (pfile, user_literals))
2129     {
2130       /* If a string format macro, say from inttypes.h, is placed touching
2131          a string literal it could be parsed as a C++11 user-defined string
2132          literal thus breaking the program.  */
2133       if (is_macro_not_literal_suffix (pfile, cur))
2134         {
2135           /* Raise a warning, but do not consume subsequent tokens.  */
2136           if (CPP_OPTION (pfile, warn_literal_suffix) && !pfile->state.skipping)
2137             cpp_warning_with_line (pfile, CPP_W_LITERAL_SUFFIX,
2138                                    token->src_loc, 0,
2139                                    "invalid suffix on literal; C++11 requires "
2140                                    "a space between literal and string macro");
2141         }
2142       /* Grab user defined literal suffix.  */
2143       else if (ISIDST (*cur))
2144         {
2145           type = cpp_userdef_char_add_type (type);
2146           type = cpp_userdef_string_add_type (type);
2147           ++cur;
2148
2149           while (ISIDNUM (*cur))
2150             ++cur;
2151         }
2152     }
2153   else if (CPP_OPTION (pfile, cpp_warn_cxx11_compat)
2154            && is_macro (pfile, cur)
2155            && !pfile->state.skipping)
2156     cpp_warning_with_line (pfile, CPP_W_CXX11_COMPAT,
2157                            token->src_loc, 0, "C++11 requires a space "
2158                            "between string literal and macro");
2159
2160   pfile->buffer->cur = cur;
2161   create_literal (pfile, token, base, cur - base, type);
2162 }
2163
2164 /* Return the comment table. The client may not make any assumption
2165    about the ordering of the table.  */
2166 cpp_comment_table *
2167 cpp_get_comments (cpp_reader *pfile)
2168 {
2169   return &pfile->comments;
2170 }
2171
2172 /* Append a comment to the end of the comment table. */
2173 static void
2174 store_comment (cpp_reader *pfile, cpp_token *token)
2175 {
2176   int len;
2177
2178   if (pfile->comments.allocated == 0)
2179     {
2180       pfile->comments.allocated = 256;
2181       pfile->comments.entries = (cpp_comment *) xmalloc
2182         (pfile->comments.allocated * sizeof (cpp_comment));
2183     }
2184
2185   if (pfile->comments.count == pfile->comments.allocated)
2186     {
2187       pfile->comments.allocated *= 2;
2188       pfile->comments.entries = (cpp_comment *) xrealloc
2189         (pfile->comments.entries,
2190          pfile->comments.allocated * sizeof (cpp_comment));
2191     }
2192
2193   len = token->val.str.len;
2194
2195   /* Copy comment. Note, token may not be NULL terminated. */
2196   pfile->comments.entries[pfile->comments.count].comment =
2197     (char *) xmalloc (sizeof (char) * (len + 1));
2198   memcpy (pfile->comments.entries[pfile->comments.count].comment,
2199           token->val.str.text, len);
2200   pfile->comments.entries[pfile->comments.count].comment[len] = '\0';
2201
2202   /* Set source location. */
2203   pfile->comments.entries[pfile->comments.count].sloc = token->src_loc;
2204
2205   /* Increment the count of entries in the comment table. */
2206   pfile->comments.count++;
2207 }
2208
2209 /* The stored comment includes the comment start and any terminator.  */
2210 static void
2211 save_comment (cpp_reader *pfile, cpp_token *token, const unsigned char *from,
2212               cppchar_t type)
2213 {
2214   unsigned char *buffer;
2215   unsigned int len, clen, i;
2216
2217   len = pfile->buffer->cur - from + 1; /* + 1 for the initial '/'.  */
2218
2219   /* C++ comments probably (not definitely) have moved past a new
2220      line, which we don't want to save in the comment.  */
2221   if (is_vspace (pfile->buffer->cur[-1]))
2222     len--;
2223
2224   /* If we are currently in a directive or in argument parsing, then
2225      we need to store all C++ comments as C comments internally, and
2226      so we need to allocate a little extra space in that case.
2227
2228      Note that the only time we encounter a directive here is
2229      when we are saving comments in a "#define".  */
2230   clen = ((pfile->state.in_directive || pfile->state.parsing_args)
2231           && type == '/') ? len + 2 : len;
2232
2233   buffer = _cpp_unaligned_alloc (pfile, clen);
2234
2235   token->type = CPP_COMMENT;
2236   token->val.str.len = clen;
2237   token->val.str.text = buffer;
2238
2239   buffer[0] = '/';
2240   memcpy (buffer + 1, from, len - 1);
2241
2242   /* Finish conversion to a C comment, if necessary.  */
2243   if ((pfile->state.in_directive || pfile->state.parsing_args) && type == '/')
2244     {
2245       buffer[1] = '*';
2246       buffer[clen - 2] = '*';
2247       buffer[clen - 1] = '/';
2248       /* As there can be in a C++ comments illegal sequences for C comments
2249          we need to filter them out.  */
2250       for (i = 2; i < (clen - 2); i++)
2251         if (buffer[i] == '/' && (buffer[i - 1] == '*' || buffer[i + 1] == '*'))
2252           buffer[i] = '|';
2253     }
2254
2255   /* Finally store this comment for use by clients of libcpp. */
2256   store_comment (pfile, token);
2257 }
2258
2259 /* Returns true if comment at COMMENT_START is a recognized FALLTHROUGH
2260    comment.

        COMMENT_START points at the second character of the comment
        opener: '*' for a block comment, otherwise the comment is treated
        as a line comment (see the checks at the end).  PFILE supplies the
        cpp_warn_implicit_fallthrough level, which selects what is
        recognized, and PFILE->buffer->cur, which serves as the end bound
        for the length checks below.  */
2261
2262 static bool
2263 fallthrough_comment_p (cpp_reader *pfile, const unsigned char *comment_start)
2264 {
       /* FROM is the scan cursor, starting at the first character of the
          comment's contents.  */
2265   const unsigned char *from = comment_start + 1;
2266
2267   switch (CPP_OPTION (pfile, cpp_warn_implicit_fallthrough))
2268     {
2269       /* For both -Wimplicit-fallthrough=0 and -Wimplicit-fallthrough=5 we
2270          don't recognize any comments.  The latter only checks attributes,
2271          the former doesn't warn.  */
2272     case 0:
2273     default:
2274       return false;
2275       /* -Wimplicit-fallthrough=1 considers any comment, no matter what
2276          content it has.  */
2277     case 1:
2278       return true;
2279     case 2:
2280       /* -Wimplicit-fallthrough=2 looks for (case insensitive)
2281          .*falls?[ \t-]*thr(u|ough).* regex.  */
           /* Try every start position that still leaves room for at least
              "fallthru".  Each failed comparison `continue's, which
              advances FROM by one and retries; note that by then FROM may
              already have been moved past a matched "fall" prefix.  */
2282       for (; (size_t) (pfile->buffer->cur - from) >= sizeof "fallthru" - 1;
2283            from++)
2284         {
2285           /* Is there anything like strpbrk with upper boundary, or
2286              memchr looking for 2 characters rather than just one?  */
2287           if (from[0] != 'f' && from[0] != 'F')
2288             continue;
2289           if (from[1] != 'a' && from[1] != 'A')
2290             continue;
2291           if (from[2] != 'l' && from[2] != 'L')
2292             continue;
2293           if (from[3] != 'l' && from[3] != 'L')
2294             continue;
2295           from += sizeof "fall" - 1;
               /* Optional 's', then any run of spaces, tabs and dashes.  */
2296           if (from[0] == 's' || from[0] == 'S')
2297             from++;
2298           while (*from == ' ' || *from == '\t' || *from == '-')
2299             from++;
2300           if (from[0] != 't' && from[0] != 'T')
2301             continue;
2302           if (from[1] != 'h' && from[1] != 'H')
2303             continue;
2304           if (from[2] != 'r' && from[2] != 'R')
2305             continue;
               /* "thru" is enough; otherwise require "through".  */
2306           if (from[3] == 'u' || from[3] == 'U')
2307             return true;
2308           if (from[3] != 'o' && from[3] != 'O')
2309             continue;
2310           if (from[4] != 'u' && from[4] != 'U')
2311             continue;
2312           if (from[5] != 'g' && from[5] != 'G')
2313             continue;
2314           if (from[6] != 'h' && from[6] != 'H')
2315             continue;
2316           return true;
2317         }
2318       return false;
           /* Levels 3 and 4 use the whole-comment patterns below.  */
2319     case 3:
2320     case 4:
2321       break;
2322     }
2323
       /* Each branch below advances FROM over the text it accepts; the
          final check requires FROM to end up exactly at the end of the
          comment, so the pattern must cover the whole comment.  */
2324   /* Whole comment contents:
2325      -fallthrough
2326      @fallthrough@
2327    */
2328   if (*from == '-' || *from == '@')
2329     {
2330       size_t len = sizeof "fallthrough" - 1;
2331       if ((size_t) (pfile->buffer->cur - from - 1) < len)
2332         return false;
2333       if (memcmp (from + 1, "fallthrough", len))
2334         return false;
2335       if (*from == '@')
2336         {
               /* The '@' form needs a closing '@' as well.  */
2337           if (from[len + 1] != '@')
2338             return false;
2339           len++;
2340         }
2341       from += 1 + len;
2342     }
2343   /* Whole comment contents (regex):
2344      lint -fallthrough[ \t]*
2345    */
2346   else if (*from == 'l')
2347     {
2348       size_t len = sizeof "int -fallthrough" - 1;
2349       if ((size_t) (pfile->buffer->cur - from - 1) < len)
2350         return false;
2351       if (memcmp (from + 1, "int -fallthrough", len))
2352         return false;
2353       from += 1 + len;
2354       while (*from == ' ' || *from == '\t')
2355         from++;
2356     }
2357   /* Whole comment contents (regex):
2358      [ \t]*FALLTHR(U|OUGH)[ \t]*
2359    */
2360   else if (CPP_OPTION (pfile, cpp_warn_implicit_fallthrough) == 4)
2361     {
2362       while (*from == ' ' || *from == '\t')
2363         from++;
2364       if ((size_t) (pfile->buffer->cur - from)  < sizeof "FALLTHRU" - 1)
2365         return false;
2366       if (memcmp (from, "FALLTHR", sizeof "FALLTHR" - 1))
2367         return false;
2368       from += sizeof "FALLTHR" - 1;
2369       if (*from == 'U')
2370         from++;
2371       else if ((size_t) (pfile->buffer->cur - from)  < sizeof "OUGH" - 1)
2372         return false;
2373       else if (memcmp (from, "OUGH", sizeof "OUGH" - 1))
2374         return false;
2375       else
2376         from += sizeof "OUGH" - 1;
2377       while (*from == ' ' || *from == '\t')
2378         from++;
2379     }
2380   /* Whole comment contents (regex):
2381      [ \t.!]*(ELSE,? |INTENTIONAL(LY)? )?FALL(S | |-)?THR(OUGH|U)[ \t.!]*(-[^\n\r]*)?
2382      [ \t.!]*(Else,? |Intentional(ly)? )?Fall((s | |-)[Tt]|t)hr(ough|u)[ \t.!]*(-[^\n\r]*)?
2383      [ \t.!]*([Ee]lse,? |[Ii]ntentional(ly)? )?fall(s | |-)?thr(ough|u)[ \t.!]*(-[^\n\r]*)?
2384    */
2385   else
2386     {
2387       while (*from == ' ' || *from == '\t' || *from == '.' || *from == '!')
2388         from++;
           /* F is the first letter of the word currently being matched;
              ALL_UPPER records that an all-capitals spelling was seen, which
              the rest of the phrase must then follow.  */
2389       unsigned char f = *from;
2390       bool all_upper = false;
2391       if (f == 'E' || f == 'e')
2392         {
               /* Optional leading "else" / "Else" / "ELSE", with an optional
                  comma, followed by exactly one space.  */
2393           if ((size_t) (pfile->buffer->cur - from)
2394               < sizeof "else fallthru" - 1)
2395             return false;
2396           if (f == 'E' && memcmp (from + 1, "LSE", sizeof "LSE" - 1) == 0)
2397             all_upper = true;
2398           else if (memcmp (from + 1, "lse", sizeof "lse" - 1))
2399             return false;
2400           from += sizeof "else" - 1;
2401           if (*from == ',')
2402             from++;
2403           if (*from != ' ')
2404             return false;
2405           from++;
               /* Reject mixed capitalization between the two words.  */
2406           if (all_upper && *from == 'f')
2407             return false;
2408           if (f == 'e' && *from == 'F')
2409             return false;
2410           f = *from;
2411         }
2412       else if (f == 'I' || f == 'i')
2413         {
               /* Optional leading "intentional" or "intentionally" in one of
                  the three capitalizations, followed by a space.  */
2414           if ((size_t) (pfile->buffer->cur - from)
2415               < sizeof "intentional fallthru" - 1)
2416             return false;
2417           if (f == 'I' && memcmp (from + 1, "NTENTIONAL",
2418                                   sizeof "NTENTIONAL" - 1) == 0)
2419             all_upper = true;
2420           else if (memcmp (from + 1, "ntentional",
2421                            sizeof "ntentional" - 1))
2422             return false;
2423           from += sizeof "intentional" - 1;
2424           if (*from == ' ')
2425             {
2426               from++;
2427               if (all_upper && *from == 'f')
2428                 return false;
2429             }
2430           else if (all_upper)
2431             {
                   /* "LY F" is compared, but only "LY " is skipped, so FROM
                      is left on the 'F'.  */
2432               if (memcmp (from, "LY F", sizeof "LY F" - 1))
2433                 return false;
2434               from += sizeof "LY " - 1;
2435             }
2436           else
2437             {
2438               if (memcmp (from, "ly ", sizeof "ly " - 1))
2439                 return false;
2440               from += sizeof "ly " - 1;
2441             }
2442           if (f == 'i' && *from == 'F')
2443             return false;
2444           f = *from;
2445         }
           /* Now the mandatory "fall" word.  */
2446       if (f != 'F' && f != 'f')
2447         return false;
2448       if ((size_t) (pfile->buffer->cur - from) < sizeof "fallthru" - 1)
2449         return false;
2450       if (f == 'F' && memcmp (from + 1, "ALL", sizeof "ALL" - 1) == 0)
2451         all_upper = true;
2452       else if (all_upper)
2453         return false;
2454       else if (memcmp (from + 1, "all", sizeof "all" - 1))
2455         return false;
2456       from += sizeof "fall" - 1;
           /* Optional separator between "fall" and "thr...": "s ", a single
              space, or a dash.  With no separator the 't' must follow
              directly, in the case matching ALL_UPPER.  */
2457       if (*from == (all_upper ? 'S' : 's') && from[1] == ' ')
2458         from += 2;
2459       else if (*from == ' ' || *from == '-')
2460         from++;
2461       else if (*from != (all_upper ? 'T' : 't'))
2462         return false;
           /* Accept 't' unless the text is all upper case, and 'T' only
              when the phrase started with a capital 'F'.  */
2463       if ((f == 'f' || *from != 'T') && (all_upper || *from != 't'))
2464         return false;
2465       if ((size_t) (pfile->buffer->cur - from) < sizeof "thru" - 1)
2466         return false;
           /* The rest is "hru" or "hrough", in the case selected by
              ALL_UPPER.  */
2467       if (memcmp (from + 1, all_upper ? "HRU" : "hru", sizeof "hru" - 1))
2468         {
2469           if ((size_t) (pfile->buffer->cur - from) < sizeof "through" - 1)
2470             return false;
2471           if (memcmp (from + 1, all_upper ? "HROUGH" : "hrough",
2472                       sizeof "hrough" - 1))
2473             return false;
2474           from += sizeof "through" - 1;
2475         }
2476       else
2477         from += sizeof "thru" - 1;
2478       while (*from == ' ' || *from == '\t' || *from == '.' || *from == '!')
2479         from++;
           /* An optional trailing "-..." part: skip to the end of the
              comment's text.  */
2480       if (*from == '-')
2481         {
2482           from++;
2483           if (*comment_start == '*')
2484             {
                   /* Block comment: stop at the closing star-slash, at a
                      NUL, or at a line end; a '*' that is not followed by
                      '/' is skipped.  */
2485               do
2486                 {
2487                   while (*from && *from != '*'
2488                          && *from != '\n' && *from != '\r')
2489                     from++;
2490                   if (*from != '*' || from[1] == '/')
2491                     break;
2492                   from++;
2493                 }
2494               while (1);
2495             }
2496           else
2497             while (*from && *from != '\n' && *from != '\r')
2498               from++;
2499         }
2500     }
       /* Whatever pattern matched, FROM must now sit exactly at the end of
          the comment.  */
2501   /* C block comment.  */
2502   if (*comment_start == '*')
2503     {
2504       if (*from != '*' || from[1] != '/')
2505         return false;
2506     }
2507   /* C++ line comment.  */
2508   else if (*from != '\n')
2509     return false;
2510
2511   return true;
2512 }
2513
2514 /* Allocate COUNT tokens for RUN.  */
2515 void
2516 _cpp_init_tokenrun (tokenrun *run, unsigned int count)
2517 {
2518   run->base = XNEWVEC (cpp_token, count);
2519   run->limit = run->base + count;
2520   run->next = NULL;
2521 }
2522
2523 /* Returns the next tokenrun, or creates one if there is none.  */
2524 static tokenrun *
2525 next_tokenrun (tokenrun *run)
2526 {
2527   if (run->next == NULL)
2528     {
2529       run->next = XNEW (tokenrun);
2530       run->next->prev = run;
2531       _cpp_init_tokenrun (run->next, 250);
2532     }
2533
2534   return run->next;
2535 }
2536
2537 /* Return the number of not yet processed token in a given
2538    context.  */
2539 int
2540 _cpp_remaining_tokens_num_in_context (cpp_context *context)
2541 {
2542   if (context->tokens_kind == TOKENS_KIND_DIRECT)
2543     return (LAST (context).token - FIRST (context).token);
2544   else if (context->tokens_kind == TOKENS_KIND_INDIRECT
2545            || context->tokens_kind == TOKENS_KIND_EXTENDED)
2546     return (LAST (context).ptoken - FIRST (context).ptoken);
2547   else
2548       abort ();
2549 }
2550
2551 /* Returns the token present at index INDEX in a given context.  If
2552    INDEX is zero, the next token to be processed is returned.  */
2553 static const cpp_token*
2554 _cpp_token_from_context_at (cpp_context *context, int index)
2555 {
2556   if (context->tokens_kind == TOKENS_KIND_DIRECT)
2557     return &(FIRST (context).token[index]);
2558   else if (context->tokens_kind == TOKENS_KIND_INDIRECT
2559            || context->tokens_kind == TOKENS_KIND_EXTENDED)
2560     return FIRST (context).ptoken[index];
2561  else
2562    abort ();
2563 }
2564
2565 /* Look ahead in the input stream.  */
2566 const cpp_token *
2567 cpp_peek_token (cpp_reader *pfile, int index)
2568 {
2569   cpp_context *context = pfile->context;
2570   const cpp_token *peektok;
2571   int count;
2572
2573   /* First, scan through any pending cpp_context objects.  */
2574   while (context->prev)
2575     {
2576       ptrdiff_t sz = _cpp_remaining_tokens_num_in_context (context);
2577
2578       if (index < (int) sz)
2579         return _cpp_token_from_context_at (context, index);
2580       index -= (int) sz;
2581       context = context->prev;
2582     }
2583
2584   /* We will have to read some new tokens after all (and do so
2585      without invalidating preceding tokens).  */
2586   count = index;
2587   pfile->keep_tokens++;
2588
2589   /* For peeked tokens temporarily disable line_change reporting,
2590      until the tokens are parsed for real.  */
2591   void (*line_change) (cpp_reader *, const cpp_token *, int)
2592     = pfile->cb.line_change;
2593   pfile->cb.line_change = NULL;
2594
2595   do
2596     {
2597       peektok = _cpp_lex_token (pfile);
2598       if (peektok->type == CPP_EOF)
2599         {
2600           index--;
2601           break;
2602         }
2603       else if (peektok->type == CPP_PRAGMA)
2604         {
2605           /* Don't peek past a pragma.  */
2606           if (peektok == &pfile->directive_result)
2607             /* Save the pragma in the buffer.  */
2608             *pfile->cur_token++ = *peektok;
2609           index--;
2610           break;
2611         }
2612     }
2613   while (index--);
2614
2615   _cpp_backup_tokens_direct (pfile, count - index);
2616   pfile->keep_tokens--;
2617   pfile->cb.line_change = line_change;
2618
2619   return peektok;
2620 }
2621
2622 /* Allocate a single token that is invalidated at the same time as the
2623    rest of the tokens on the line.  Has its line and col set to the
2624    same as the last lexed token, so that diagnostics appear in the
2625    right place.

        The new token takes the slot at PFILE->cur_token, which is then
        advanced past it.  Only the token's src_loc is set here.  */
2626 cpp_token *
2627 _cpp_temp_token (cpp_reader *pfile)
2628 {
2629   cpp_token *old, *result;
       /* SZ is the number of slots from cur_token up to the end of the
          current run; LA is the number of pending lookahead tokens.  */
2630   ptrdiff_t sz = pfile->cur_run->limit - pfile->cur_token;
2631   ptrdiff_t la = (ptrdiff_t) pfile->lookaheads;
2632
       /* The token just before cur_token; its location is copied into the
          result below.  */
2633   old = pfile->cur_token - 1;
2634   /* Any pre-existing lookaheads must not be clobbered.  */
       /* NOTE(review): the shifting below appears to assume that the LA
          lookahead tokens are stored from cur_token onwards, continuing at
          the base of the next run -- confirm against the code that sets
          pfile->lookaheads.  Everything is moved up by one slot to free
          the slot at cur_token.  */
2635   if (la)
2636     {
2637       if (sz <= la)
2638         {
               /* The lookaheads reach the end of this run, so the shift
                  pushes this run's last token into the next run.  */
2639           tokenrun *next = next_tokenrun (pfile->cur_run);
2640
               /* First move up the LA - SZ lookaheads that already live in
                  the next run ...  */
2641           if (sz < la)
2642             memmove (next->base + 1, next->base,
2643                      (la - sz) * sizeof (cpp_token));
2644
               /* ... then carry this run's last token over into the freed
                  first slot.  */
2645           next->base[0] = pfile->cur_run->limit[-1];
2646         }
2647
           /* Shift the lookaheads that stay inside this run; at most
              SZ - 1 tokens can move without running past the limit.  */
2648       if (sz > 1)
2649         memmove (pfile->cur_token + 1, pfile->cur_token,
2650                  MIN (la, sz - 1) * sizeof (cpp_token));
2651     }
2652
       /* No room left in the current run: continue in the next one.  */
2653   if (!sz && pfile->cur_token == pfile->cur_run->limit)
2654     {
2655       pfile->cur_run = next_tokenrun (pfile->cur_run);
2656       pfile->cur_token = pfile->cur_run->base;
2657     }
2658
2659   result = pfile->cur_token++;
2660   result->src_loc = old->src_loc;
2661   return result;
2662 }
2663
2664 /* We're at the beginning of a logical line (so not in
2665   directives-mode) and RESULT is a CPP_NAME with NODE_MODULE set.  See
2666   if we should enter deferred_pragma mode to tokenize the rest of the
2667   line as a module control-line.  */
2668
2669 static void
2670 cpp_maybe_module_directive (cpp_reader *pfile, cpp_token *result)
2671 {
2672   unsigned backup = 0; /* Tokens we peeked.  */
2673   cpp_hashnode *node = result->val.node.node;
2674   cpp_token *peek = result;
2675   cpp_token *keyword = peek;
2676   cpp_hashnode *(&n_modules)[spec_nodes::M_HWM][2] = pfile->spec_nodes.n_modules;
2677   int header_count = 0;
2678
2679   /* Make sure the incoming state is as we expect it.  This way we
2680      can restore it using constants.  */
2681   gcc_checking_assert (!pfile->state.in_deferred_pragma
2682                        && !pfile->state.skipping
2683                        && !pfile->state.parsing_args
2684                        && !pfile->state.angled_headers
2685                        && (pfile->state.save_comments
2686                            == !CPP_OPTION (pfile, discard_comments)));
2687
2688   /* Enter directives mode sufficiently for peeking.  We don't have
2689      to actually set in_directive.  */
2690   pfile->state.in_deferred_pragma = true;
2691
2692   /* These two fields are needed to process tokenization in deferred
2693      pragma mode.  They are not used outside deferred pragma mode or
2694      directives mode.  */
2695   pfile->state.pragma_allow_expansion = true;
2696   pfile->directive_line = result->src_loc;
2697
2698   /* Saving comments is incompatible with directives mode.   */
2699   pfile->state.save_comments = 0;
2700
2701   if (node == n_modules[spec_nodes::M_EXPORT][0])
2702     {
2703       peek = _cpp_lex_direct (pfile);
2704       keyword = peek;
2705       backup++;
2706       if (keyword->type != CPP_NAME)
2707         goto not_module;
2708       node = keyword->val.node.node;
2709       if (!(node->flags & NODE_MODULE))
2710         goto not_module;
2711     }
2712
2713   if (node == n_modules[spec_nodes::M__IMPORT][0])
2714     /* __import  */
2715     header_count = backup + 2 + 16;
2716   else if (node == n_modules[spec_nodes::M_IMPORT][0])
2717     /* import  */
2718     header_count = backup + 2 + (CPP_OPTION (pfile, preprocessed) ? 16 : 0);
2719   else if (node == n_modules[spec_nodes::M_MODULE][0])
2720     ; /* module  */
2721   else
2722     goto not_module;
2723
2724   /* We've seen [export] {module|import|__import}.  Check the next token.  */
2725   if (header_count)
2726     /* After '{,__}import' a header name may appear.  */
2727     pfile->state.angled_headers = true;
2728   peek = _cpp_lex_direct (pfile);
2729   backup++;
2730
2731   /* ... import followed by identifier, ':', '<' or
2732      header-name preprocessing tokens, or module
2733      followed by cpp-identifier, ':' or ';' preprocessing
2734      tokens.  C++ keywords are not yet relevant.  */
2735   if (peek->type == CPP_NAME
2736       || peek->type == CPP_COLON
2737       ||  (header_count
2738            ? (peek->type == CPP_LESS
2739               || (peek->type == CPP_STRING && peek->val.str.text[0] != 'R')
2740               || peek->type == CPP_HEADER_NAME)
2741            : peek->type == CPP_SEMICOLON))
2742     {
2743       pfile->state.pragma_allow_expansion = !CPP_OPTION (pfile, preprocessed);
2744       if (!pfile->state.pragma_allow_expansion)
2745         pfile->state.prevent_expansion++;
2746
2747       if (!header_count && linemap_included_from
2748           (LINEMAPS_LAST_ORDINARY_MAP (pfile->line_table)))
2749         cpp_error_with_line (pfile, CPP_DL_ERROR, keyword->src_loc, 0,
2750                              "module control-line cannot be in included file");
2751
2752       /* The first one or two tokens cannot be macro names.  */
2753       for (int ix = backup; ix--;)
2754         {
2755           cpp_token *tok = ix ? keyword : result;
2756           cpp_hashnode *node = tok->val.node.node;
2757
2758           /* Don't attempt to expand the token.  */
2759           tok->flags |= NO_EXPAND;
2760           if (_cpp_defined_macro_p (node)
2761               && _cpp_maybe_notify_macro_use (pfile, node, tok->src_loc)
2762               && !cpp_fun_like_macro_p (node))
2763             cpp_error_with_line (pfile, CPP_DL_ERROR, tok->src_loc, 0,
2764                                  "module control-line \"%s\" cannot be"
2765                                  " an object-like macro",
2766                                  NODE_NAME (node));
2767         }
2768
2769       /* Map to underbar variants.  */
2770       keyword->val.node.node = n_modules[header_count
2771                                          ? spec_nodes::M_IMPORT
2772                                          : spec_nodes::M_MODULE][1];
2773       if (backup != 1)
2774         result->val.node.node = n_modules[spec_nodes::M_EXPORT][1];
2775
2776       /* Maybe tell the tokenizer we expect a header-name down the
2777          road.  */
2778       pfile->state.directive_file_token = header_count;
2779     }
2780   else
2781     {
2782     not_module:
2783       /* Drop out of directive mode.  */
2784       /* We aaserted save_comments had this value upon entry.  */
2785       pfile->state.save_comments
2786         = !CPP_OPTION (pfile, discard_comments);
2787       pfile->state.in_deferred_pragma = false;
2788       /* Do not let this remain on.  */
2789       pfile->state.angled_headers = false;
2790     }
2791
2792   /* In either case we want to backup the peeked tokens.  */
2793   if (backup)
2794     {
2795       /* If we saw EOL, we should drop it, because this isn't a module
2796          control-line after all.  */
2797       bool eol = peek->type == CPP_PRAGMA_EOL;
2798       if (!eol || backup > 1)
2799         {
2800           /* Put put the peeked tokens back  */
2801           _cpp_backup_tokens_direct (pfile, backup);
2802           /* But if the last one was an EOL, forget it.  */
2803           if (eol)
2804             pfile->lookaheads--;
2805         }
2806     }
2807 }
2808
2809 /* Lex a token into RESULT (external interface).  Takes care of issues
2810    like directive handling, token lookahead, multiple include
2811    optimization and skipping.  */
2812 const cpp_token *
2813 _cpp_lex_token (cpp_reader *pfile)
2814 {
2815   cpp_token *result;
2816
2817   for (;;)
2818     {
2819       if (pfile->cur_token == pfile->cur_run->limit)
2820         {
2821           pfile->cur_run = next_tokenrun (pfile->cur_run);
2822           pfile->cur_token = pfile->cur_run->base;
2823         }
2824       /* We assume that the current token is somewhere in the current
2825          run.  */
2826       if (pfile->cur_token < pfile->cur_run->base
2827           || pfile->cur_token >= pfile->cur_run->limit)
2828         abort ();
2829
2830       if (pfile->lookaheads)
2831         {
2832           pfile->lookaheads--;
2833           result = pfile->cur_token++;
2834         }
2835       else
2836         result = _cpp_lex_direct (pfile);
2837
2838       if (result->flags & BOL)
2839         {
2840           /* Is this a directive.  If _cpp_handle_directive returns
2841              false, it is an assembler #.  */
2842           if (result->type == CPP_HASH
2843               /* 6.10.3 p 11: Directives in a list of macro arguments
2844                  gives undefined behavior.  This implementation
2845                  handles the directive as normal.  */
2846               && pfile->state.parsing_args != 1)
2847             {
2848               if (_cpp_handle_directive (pfile, result->flags & PREV_WHITE))
2849                 {
2850                   if (pfile->directive_result.type == CPP_PADDING)
2851                     continue;
2852                   result = &pfile->directive_result;
2853                 }
2854             }
2855           else if (pfile->state.in_deferred_pragma)
2856             result = &pfile->directive_result;
2857           else if (result->type == CPP_NAME
2858                    && (result->val.node.node->flags & NODE_MODULE)
2859                    && !pfile->state.skipping
2860                    /* Unlike regular directives, we do not deal with
2861                       tokenizing module directives as macro arguments.
2862                       That's not permitted.  */
2863                    && !pfile->state.parsing_args)
2864             {
2865               /* P1857.  Before macro expansion, At start of logical
2866                  line ... */
2867               /* We don't have to consider lookaheads at this point.  */
2868               gcc_checking_assert (!pfile->lookaheads);
2869
2870               cpp_maybe_module_directive (pfile, result);
2871             }
2872
2873           if (pfile->cb.line_change && !pfile->state.skipping)
2874             pfile->cb.line_change (pfile, result, pfile->state.parsing_args);
2875         }
2876
2877       /* We don't skip tokens in directives.  */
2878       if (pfile->state.in_directive || pfile->state.in_deferred_pragma)
2879         break;
2880
2881       /* Outside a directive, invalidate controlling macros.  At file
2882          EOF, _cpp_lex_direct takes care of popping the buffer, so we never
2883          get here and MI optimization works.  */
2884       pfile->mi_valid = false;
2885
2886       if (!pfile->state.skipping || result->type == CPP_EOF)
2887         break;
2888     }
2889
2890   return result;
2891 }
2892
2893 /* Returns true if a fresh line has been loaded.  */
2894 bool
2895 _cpp_get_fresh_line (cpp_reader *pfile)
2896 {
2897   /* We can't get a new line until we leave the current directive.  */
2898   if (pfile->state.in_directive)
2899     return false;
2900
2901   for (;;)
2902     {
2903       cpp_buffer *buffer = pfile->buffer;
2904
2905       if (!buffer->need_line)
2906         return true;
2907
2908       if (buffer->next_line < buffer->rlimit)
2909         {
2910           _cpp_clean_line (pfile);
2911           return true;
2912         }
2913
2914       /* First, get out of parsing arguments state.  */
2915       if (pfile->state.parsing_args)
2916         return false;
2917
2918       /* End of buffer.  Non-empty files should end in a newline.  */
2919       if (buffer->buf != buffer->rlimit
2920           && buffer->next_line > buffer->rlimit
2921           && !buffer->from_stage3)
2922         {
2923           /* Clip to buffer size.  */
2924           buffer->next_line = buffer->rlimit;
2925         }
2926
2927       if (buffer->prev && !buffer->return_at_eof)
2928         _cpp_pop_buffer (pfile);
2929       else
2930         {
2931           /* End of translation.  Do not pop the buffer yet. Increment
2932              line number so that the EOF token is on a line of its own
2933              (_cpp_lex_direct doesn't increment in that case, because
2934              it's hard for it to distinguish this special case). */
2935           CPP_INCREMENT_LINE (pfile, 0);
2936           return false;
2937         }
2938     }
2939 }
2940
/* Helper for _cpp_lex_direct's two-character operators.  It relies on
   the locals `buffer' and `result' of the expanding function: when
   the next unread character equals CHAR it is consumed and the token
   type becomes THEN_TYPE, otherwise the type becomes ELSE_TYPE.  */
#define IF_NEXT_IS(CHAR, THEN_TYPE, ELSE_TYPE)          \
  do                                                    \
    {                                                   \
      if (*buffer->cur == CHAR)                         \
        {                                               \
          buffer->cur++;                                \
          result->type = THEN_TYPE;                     \
        }                                               \
      else                                              \
        result->type = ELSE_TYPE;                       \
    }                                                   \
  while (0)
2949
2950 /* Lex a token into pfile->cur_token, which is also incremented, to
2951    get diagnostics pointing to the correct location.
2952
2953    Does not handle issues such as token lookahead, multiple-include
2954    optimization, directives, skipping etc.  This function is only
2955    suitable for use by _cpp_lex_token, and in special cases like
2956    lex_expansion_token which doesn't care for any of these issues.
2957
2958    When meeting a newline, returns CPP_EOF if parsing a directive,
2959    otherwise returns to the start of the token buffer if permissible.
2960    Returns the location of the lexed token.  */
     /* Control-flow summary.  A token slot is claimed from
        pfile->cur_token on entry.  The three labels are re-entry points:
        `fresh_line' after a newline was consumed outside deferred-pragma
        mode, `update_tokens_line' after a comment that is not being
        saved, and `skipped_white' after whitespace has been skipped.  A
        token that leaves the switch with `break' has its src_loc widened
        into a range at the bottom of the function; the CPP_EOF and
        CPP_PRAGMA_EOL returns happen before that step.  The value
        returned is the cpp_token that was filled in.  */
2961 cpp_token *
2962 _cpp_lex_direct (cpp_reader *pfile)
2963 {
2964   cppchar_t c;
2965   cpp_buffer *buffer;
2966   const unsigned char *comment_start;
2967   bool fallthrough_comment = false;
       /* Claim the next slot of the token run before lexing anything.  */
2968   cpp_token *result = pfile->cur_token++;
2969
2970  fresh_line:
2971   result->flags = 0;
2972   buffer = pfile->buffer;
2973   if (buffer->need_line)
2974     {
2975       gcc_assert (!pfile->state.in_deferred_pragma);
2976       if (!_cpp_get_fresh_line (pfile))
2977         {
2978           result->type = CPP_EOF;
2979           /* Not a real EOF in a directive or arg parsing -- we refuse
2980              to advance to the next file now, and will once we're out
2981              of those modes.  */
2982           if (!pfile->state.in_directive && !pfile->state.parsing_args)
2983             {
2984               /* Tell the compiler the line number of the EOF token.  */
2985               result->src_loc = pfile->line_table->highest_line;
2986               result->flags = BOL;
2987               /* Now pop the buffer that _cpp_get_fresh_line did not.  */
2988               _cpp_pop_buffer (pfile);
2989             }
2990           return result;
2991         }
           /* _cpp_get_fresh_line can pop buffers, so pfile->buffer may no
              longer be the buffer sampled above; a pending fallthrough
              comment is not carried over into a different buffer.  */
2992       if (buffer != pfile->buffer)
2993         fallthrough_comment = false;
           /* Unless tokens are being kept, reuse the base run from its
              first slot for the new line.  */
2994       if (!pfile->keep_tokens)
2995         {
2996           pfile->cur_run = &pfile->base_run;
2997           result = pfile->base_run.base;
2998           pfile->cur_token = result + 1;
2999         }
3000       result->flags = BOL;
3001       if (pfile->state.parsing_args == 2)
3002         result->flags |= PREV_WHITE;
3003     }
3004   buffer = pfile->buffer;
3005  update_tokens_line:
3006   result->src_loc = pfile->line_table->highest_line;
3007
3008  skipped_white:
3009   if (buffer->cur >= buffer->notes[buffer->cur_note].pos
3010       && !pfile->overlaid_buffer)
3011     {
3012       _cpp_process_line_notes (pfile, false);
3013       result->src_loc = pfile->line_table->highest_line;
3014     }
       /* Fetch the next character.  buffer->cur now points just past it,
          which is why the cases below pass buffer->cur - 1 as the start
          of the token.  */
3015   c = *buffer->cur++;
3016
3017   if (pfile->forced_token_location)
3018     result->src_loc = pfile->forced_token_location;
3019   else
3020     result->src_loc = linemap_position_for_column (pfile->line_table,
3021                                           CPP_BUF_COLUMN (buffer, buffer->cur));
3022
3023   switch (c)
3024     {
3025     case ' ': case '\t': case '\f': case '\v': case '\0':
3026       result->flags |= PREV_WHITE;
3027       skip_whitespace (pfile, c);
3028       goto skipped_white;
3029
3030     case '\n':
3031       /* Increment the line, unless this is the last line ...  */
3032       if (buffer->cur < buffer->rlimit
3033           /* ... or this is a #include, (where _cpp_stack_file needs to
3034              unwind by one line) ...  */
3035           || (pfile->state.in_directive > 1
3036               /* ... except traditional-cpp increments this elsewhere.  */
3037               && !CPP_OPTION (pfile, traditional)))
3038         CPP_INCREMENT_LINE (pfile, 0);
3039       buffer->need_line = true;
3040       if (pfile->state.in_deferred_pragma)
3041         {
3042           /* Produce the PRAGMA_EOL on this line.  File reading
3043              ensures there is always a \n at end of the buffer, thus
3044              in a deferred pragma we always see CPP_PRAGMA_EOL before
3045              any CPP_EOF.  */
3046           result->type = CPP_PRAGMA_EOL;
3047           result->flags &= ~PREV_WHITE;
3048           pfile->state.in_deferred_pragma = false;
3049           if (!pfile->state.pragma_allow_expansion)
3050             pfile->state.prevent_expansion--;
3051           return result;
3052         }
3053       goto fresh_line;
3054
3055     case '0': case '1': case '2': case '3': case '4':
3056     case '5': case '6': case '7': case '8': case '9':
3057       {
3058         struct normalize_state nst = INITIAL_NORMALIZE_STATE;
3059         result->type = CPP_NUMBER;
3060         lex_number (pfile, &result->val.str, &nst);
3061         warn_about_normalization (pfile, result, &nst);
3062         break;
3063       }
3064
3065     case 'L':
3066     case 'u':
3067     case 'U':
3068     case 'R':
3069       /* 'L', 'u', 'U', 'u8' or 'R' may introduce wide characters,
3070          wide strings or raw strings.  */
3071       if (c == 'L' || CPP_OPTION (pfile, rliterals)
3072           || (c != 'R' && CPP_OPTION (pfile, uliterals)))
3073         {
3074           if ((*buffer->cur == '\'' && c != 'R')
3075               || *buffer->cur == '"'
3076               || (*buffer->cur == 'R'
3077                   && c != 'R'
3078                   && buffer->cur[1] == '"'
3079                   && CPP_OPTION (pfile, rliterals))
3080               || (*buffer->cur == '8'
3081                   && c == 'u'
3082                   && ((buffer->cur[1] == '"' || (buffer->cur[1] == '\''
3083                                 && CPP_OPTION (pfile, utf8_char_literals)))
3084                       || (buffer->cur[1] == 'R' && buffer->cur[2] == '"'
3085                           && CPP_OPTION (pfile, rliterals)))))
3086             {
3087               lex_string (pfile, result, buffer->cur - 1);
3088               break;
3089             }
3090         }
3091       /* Fall through.  */
3092
         /* The label list below omits 'L', 'R', 'U' and 'u': those letters
            are matched by the case above and reach the identifier code by
            falling through when they do not start a literal.  */
3093     case '_':
3094     case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
3095     case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
3096     case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
3097     case 's': case 't':           case 'v': case 'w': case 'x':
3098     case 'y': case 'z':
3099     case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
3100     case 'G': case 'H': case 'I': case 'J': case 'K':
3101     case 'M': case 'N': case 'O': case 'P': case 'Q':
3102     case 'S': case 'T':           case 'V': case 'W': case 'X':
3103     case 'Y': case 'Z':
3104       result->type = CPP_NAME;
3105       {
3106         struct normalize_state nst = INITIAL_NORMALIZE_STATE;
3107         result->val.node.node = lex_identifier (pfile, buffer->cur - 1, false,
3108                                                 &nst,
3109                                                 &result->val.node.spelling);
3110         warn_about_normalization (pfile, result, &nst);
3111       }
3112
3113       /* Convert named operators to their proper types.  */
3114       if (result->val.node.node->flags & NODE_OPERATOR)
3115         {
3116           result->flags |= NAMED_OP;
3117           result->type = (enum cpp_ttype) result->val.node.node->directive_index;
3118         }
3119
3120       /* Signal FALLTHROUGH comment followed by another token.  */
3121       if (fallthrough_comment)
3122         result->flags |= PREV_FALLTHROUGH;
3123       break;
3124
3125     case '\'':
3126     case '"':
3127       lex_string (pfile, result, buffer->cur - 1);
3128       break;
3129
3130     case '/':
3131       /* A potential block or line comment.  */
3132       comment_start = buffer->cur;
           /* C is reused for the character that follows the slash; it is
              also what save_comment receives further down.  */
3133       c = *buffer->cur;
3134
3135       if (c == '*')
3136         {
3137           if (_cpp_skip_block_comment (pfile))
3138             cpp_error (pfile, CPP_DL_ERROR, "unterminated comment");
3139         }
3140       else if (c == '/' && ! CPP_OPTION (pfile, traditional))
3141         {
3142           /* Don't warn for system headers.  */
3143           if (_cpp_in_system_header (pfile))
3144             ;
3145           /* Warn about comments if pedantically GNUC89, and not
3146              in system headers.  */
3147           else if (CPP_OPTION (pfile, lang) == CLK_GNUC89
3148                    && CPP_PEDANTIC (pfile)
3149                    && ! buffer->warned_cplusplus_comments)
3150             {
3151               if (cpp_error (pfile, CPP_DL_PEDWARN,
3152                              "C++ style comments are not allowed in ISO C90"))
3153                 cpp_error (pfile, CPP_DL_NOTE,
3154                            "(this will be reported only once per input file)");
3155               buffer->warned_cplusplus_comments = 1;
3156             }
3157           /* Or if specifically desired via -Wc90-c99-compat.  */
3158           else if (CPP_OPTION (pfile, cpp_warn_c90_c99_compat) > 0
3159                    && ! CPP_OPTION (pfile, cplusplus)
3160                    && ! buffer->warned_cplusplus_comments)
3161             {
3162               if (cpp_error (pfile, CPP_DL_WARNING,
3163                              "C++ style comments are incompatible with C90"))
3164                 cpp_error (pfile, CPP_DL_NOTE,
3165                            "(this will be reported only once per input file)");
3166               buffer->warned_cplusplus_comments = 1;
3167             }
3168           /* In C89/C94, C++ style comments are forbidden.  */
3169           else if ((CPP_OPTION (pfile, lang) == CLK_STDC89
3170                     || CPP_OPTION (pfile, lang) == CLK_STDC94))
3171             {
3172               /* But don't be confused about valid code such as
3173                  - // immediately followed by *,
3174                  - // in a preprocessing directive,
3175                  - // in an #if 0 block.  */
3176               if (buffer->cur[1] == '*'
3177                   || pfile->state.in_directive
3178                   || pfile->state.skipping)
3179                 {
3180                   result->type = CPP_DIV;
3181                   break;
3182                 }
3183               else if (! buffer->warned_cplusplus_comments)
3184                 {
3185                   if (cpp_error (pfile, CPP_DL_ERROR,
3186                                  "C++ style comments are not allowed in "
3187                                  "ISO C90"))
3188                     cpp_error (pfile, CPP_DL_NOTE,
3189                                "(this will be reported only once per input "
3190                                "file)");
3191                   buffer->warned_cplusplus_comments = 1;
3192                 }
3193             }
3194           if (skip_line_comment (pfile) && CPP_OPTION (pfile, warn_comments))
3195             cpp_warning (pfile, CPP_W_COMMENTS, "multi-line comment");
3196         }
3197       else if (c == '=')
3198         {
3199           buffer->cur++;
3200           result->type = CPP_DIV_EQ;
3201           break;
3202         }
3203       else
3204         {
3205           result->type = CPP_DIV;
3206           break;
3207         }
3208
           /* Only the two comment branches get this far; the `/=' and plain
              `/' branches (and the C90 `//' special case) have already
              left the switch.  */
3209       if (fallthrough_comment_p (pfile, comment_start))
3210         fallthrough_comment = true;
3211
3212       if (pfile->cb.comment)
3213         {
3214           size_t len = pfile->buffer->cur - comment_start;
3215           pfile->cb.comment (pfile, result->src_loc, comment_start - 1,
3216                              len + 1);
3217         }
3218
3219       if (!pfile->state.save_comments)
3220         {
3221           result->flags |= PREV_WHITE;
3222           goto update_tokens_line;
3223         }
3224
3225       if (fallthrough_comment)
3226         result->flags |= PREV_FALLTHROUGH;
3227
3228       /* Save the comment as a token in its own right.  */
3229       save_comment (pfile, result, comment_start, c);
3230       break;
3231
3232     case '<':
           /* When angled header-names are enabled, lex_string gets the
              first try.  NOTE(review): it presumably leaves the type as
              CPP_LESS when no header-name could be formed -- confirm in
              lex_string.  In that case `<' is lexed as an operator
              below.  */
3233       if (pfile->state.angled_headers)
3234         {
3235           lex_string (pfile, result, buffer->cur - 1);
3236           if (result->type != CPP_LESS)
3237             break;
3238         }
3239
3240       result->type = CPP_LESS;
3241       if (*buffer->cur == '=')
3242         {
3243           buffer->cur++, result->type = CPP_LESS_EQ;
3244           if (*buffer->cur == '>'
3245               && CPP_OPTION (pfile, cplusplus)
3246               && CPP_OPTION (pfile, lang) >= CLK_GNUCXX20)
3247             buffer->cur++, result->type = CPP_SPACESHIP;
3248         }
3249       else if (*buffer->cur == '<')
3250         {
3251           buffer->cur++;
3252           IF_NEXT_IS ('=', CPP_LSHIFT_EQ, CPP_LSHIFT);
3253         }
3254       else if (CPP_OPTION (pfile, digraphs))
3255         {
3256           if (*buffer->cur == ':')
3257             {
3258               /* C++11 [2.5/3 lex.pptoken], "Otherwise, if the next
3259                  three characters are <:: and the subsequent character
3260                  is neither : nor >, the < is treated as a preprocessor
3261                  token by itself".  */
3262               if (CPP_OPTION (pfile, cplusplus)
3263                   && CPP_OPTION (pfile, lang) != CLK_CXX98
3264                   && CPP_OPTION (pfile, lang) != CLK_GNUCXX
3265                   && buffer->cur[1] == ':'
3266                   && buffer->cur[2] != ':' && buffer->cur[2] != '>')
3267                 break;
3268
3269               buffer->cur++;
3270               result->flags |= DIGRAPH;
3271               result->type = CPP_OPEN_SQUARE;
3272             }
3273           else if (*buffer->cur == '%')
3274             {
3275               buffer->cur++;
3276               result->flags |= DIGRAPH;
3277               result->type = CPP_OPEN_BRACE;
3278             }
3279         }
3280       break;
3281
3282     case '>':
3283       result->type = CPP_GREATER;
3284       if (*buffer->cur == '=')
3285         buffer->cur++, result->type = CPP_GREATER_EQ;
3286       else if (*buffer->cur == '>')
3287         {
3288           buffer->cur++;
3289           IF_NEXT_IS ('=', CPP_RSHIFT_EQ, CPP_RSHIFT);
3290         }
3291       break;
3292
3293     case '%':
3294       result->type = CPP_MOD;
3295       if (*buffer->cur == '=')
3296         buffer->cur++, result->type = CPP_MOD_EQ;
3297       else if (CPP_OPTION (pfile, digraphs))
3298         {
3299           if (*buffer->cur == ':')
3300             {
3301               buffer->cur++;
3302               result->flags |= DIGRAPH;
3303               result->type = CPP_HASH;
3304               if (*buffer->cur == '%' && buffer->cur[1] == ':')
3305                 buffer->cur += 2, result->type = CPP_PASTE, result->val.token_no = 0;
3306             }
3307           else if (*buffer->cur == '>')
3308             {
3309               buffer->cur++;
3310               result->flags |= DIGRAPH;
3311               result->type = CPP_CLOSE_BRACE;
3312             }
3313         }
3314       break;
3315
3316     case '.':
3317       result->type = CPP_DOT;
3318       if (ISDIGIT (*buffer->cur))
3319         {
3320           struct normalize_state nst = INITIAL_NORMALIZE_STATE;
3321           result->type = CPP_NUMBER;
3322           lex_number (pfile, &result->val.str, &nst);
3323           warn_about_normalization (pfile, result, &nst);
3324         }
3325       else if (*buffer->cur == '.' && buffer->cur[1] == '.')
3326         buffer->cur += 2, result->type = CPP_ELLIPSIS;
3327       else if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
3328         buffer->cur++, result->type = CPP_DOT_STAR;
3329       break;
3330
3331     case '+':
3332       result->type = CPP_PLUS;
3333       if (*buffer->cur == '+')
3334         buffer->cur++, result->type = CPP_PLUS_PLUS;
3335       else if (*buffer->cur == '=')
3336         buffer->cur++, result->type = CPP_PLUS_EQ;
3337       break;
3338
3339     case '-':
3340       result->type = CPP_MINUS;
3341       if (*buffer->cur == '>')
3342         {
3343           buffer->cur++;
3344           result->type = CPP_DEREF;
3345           if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
3346             buffer->cur++, result->type = CPP_DEREF_STAR;
3347         }
3348       else if (*buffer->cur == '-')
3349         buffer->cur++, result->type = CPP_MINUS_MINUS;
3350       else if (*buffer->cur == '=')
3351         buffer->cur++, result->type = CPP_MINUS_EQ;
3352       break;
3353
3354     case '&':
3355       result->type = CPP_AND;
3356       if (*buffer->cur == '&')
3357         buffer->cur++, result->type = CPP_AND_AND;
3358       else if (*buffer->cur == '=')
3359         buffer->cur++, result->type = CPP_AND_EQ;
3360       break;
3361
3362     case '|':
3363       result->type = CPP_OR;
3364       if (*buffer->cur == '|')
3365         buffer->cur++, result->type = CPP_OR_OR;
3366       else if (*buffer->cur == '=')
3367         buffer->cur++, result->type = CPP_OR_EQ;
3368       break;
3369
3370     case ':':
3371       result->type = CPP_COLON;
3372       if (*buffer->cur == ':' && CPP_OPTION (pfile, scope))
3373         buffer->cur++, result->type = CPP_SCOPE;
3374       else if (*buffer->cur == '>' && CPP_OPTION (pfile, digraphs))
3375         {
3376           buffer->cur++;
3377           result->flags |= DIGRAPH;
3378           result->type = CPP_CLOSE_SQUARE;
3379         }
3380       break;
3381
3382     case '*': IF_NEXT_IS ('=', CPP_MULT_EQ, CPP_MULT); break;
3383     case '=': IF_NEXT_IS ('=', CPP_EQ_EQ, CPP_EQ); break;
3384     case '!': IF_NEXT_IS ('=', CPP_NOT_EQ, CPP_NOT); break;
3385     case '^': IF_NEXT_IS ('=', CPP_XOR_EQ, CPP_XOR); break;
3386     case '#': IF_NEXT_IS ('#', CPP_PASTE, CPP_HASH); result->val.token_no = 0; break;
3387
3388     case '?': result->type = CPP_QUERY; break;
3389     case '~': result->type = CPP_COMPL; break;
3390     case ',': result->type = CPP_COMMA; break;
3391     case '(': result->type = CPP_OPEN_PAREN; break;
3392     case ')': result->type = CPP_CLOSE_PAREN; break;
3393     case '[': result->type = CPP_OPEN_SQUARE; break;
3394     case ']': result->type = CPP_CLOSE_SQUARE; break;
3395     case '{': result->type = CPP_OPEN_BRACE; break;
3396     case '}': result->type = CPP_CLOSE_BRACE; break;
3397     case ';': result->type = CPP_SEMICOLON; break;
3398
3399       /* @ is a punctuator in Objective-C.  */
3400     case '@': result->type = CPP_ATSIGN; break;
3401
3402     default:
3403       {
             /* Step back onto the character just read so that BASE marks
                the start of the token.  */
3404         const uchar *base = --buffer->cur;
3405
3406         /* Check for an extended identifier ($ or UCN or UTF-8).  */
3407         struct normalize_state nst = INITIAL_NORMALIZE_STATE;
3408         if (forms_identifier_p (pfile, true, &nst))
3409           {
3410             result->type = CPP_NAME;
3411             result->val.node.node = lex_identifier (pfile, base, true, &nst,
3412                                                     &result->val.node.spelling);
3413             warn_about_normalization (pfile, result, &nst);
3414             break;
3415           }
3416
3417         /* Otherwise this will form a CPP_OTHER token.  Parse valid UTF-8 as a
3418            single token.  */
3419         buffer->cur++;
3420         if (c >= utf8_signifier)
3421           {
3422             const uchar *pstr = base;
3423             cppchar_t s;
3424             if (_cpp_valid_utf8 (pfile, &pstr, buffer->rlimit, 0, NULL, &s))
3425               buffer->cur = pstr;
3426           }
3427         create_literal (pfile, result, base, buffer->cur - base, CPP_OTHER);
3428         break;
3429       }
3430
3431     }
3432
3433   /* Potentially convert the location of the token to a range.  */
3434   if (result->src_loc >= RESERVED_LOCATION_COUNT
3435       && result->type != CPP_EOF)
3436     {
3437       /* Ensure that any line notes are processed, so that we have the
3438          correct physical line/column for the end-point of the token even
3439          when a logical line is split via one or more backslashes.  */
3440       if (buffer->cur >= buffer->notes[buffer->cur_note].pos
3441           && !pfile->overlaid_buffer)
3442         _cpp_process_line_notes (pfile, false);
3443
           /* The range starts at the location recorded when the token's
              first character was read and finishes at the column of the
              current buffer position.  */
3444       source_range tok_range;
3445       tok_range.m_start = result->src_loc;
3446       tok_range.m_finish
3447         = linemap_position_for_column (pfile->line_table,
3448                                        CPP_BUF_COLUMN (buffer, buffer->cur));
3449
3450       result->src_loc = COMBINE_LOCATION_DATA (pfile->line_table,
3451                                                result->src_loc,
3452                                                tok_range, NULL);
3453     }
3454
3455   return result;
3456 }
3457
3458 /* An upper bound on the number of bytes needed to spell TOKEN.
3459    Does not include preceding whitespace.  */
3460 unsigned int
3461 cpp_token_len (const cpp_token *token)
3462 {
3463   unsigned int len;
3464
3465   switch (TOKEN_SPELL (token))
3466     {
3467     default:            len = 6;                                break;
3468     case SPELL_LITERAL: len = token->val.str.len;               break;
3469     case SPELL_IDENT:   len = NODE_LEN (token->val.node.node) * 10;     break;
3470     }
3471
3472   return len;
3473 }
3474
/* Decode the UTF-8 sequence starting at NAME and store the matching
   \UXXXXXXXX escape in BUFFER.  Exactly 10 bytes are written to BUFFER
   and no terminating NUL is added.  Returns the number of bytes
   consumed from NAME.  Calls abort if a continuation byte does not
   have the form 10xxxxxx.  */

static size_t
utf8_to_ucn (unsigned char *buffer, const unsigned char *name)
{
  static const char hex_digits[] = "0123456789abcdef";
  const unsigned char *src = name;
  unsigned long code_point;
  unsigned lead;
  int seq_len = 0;
  int k;
  int shift;

  /* The number of leading one bits in the first byte is the length of
     the whole sequence.  */
  for (lead = *src; (lead & 0x80) != 0; lead <<= 1)
    seq_len++;

  /* The payload bits of the first byte are those below the length
     marker.  */
  code_point = *src & (0x7F >> seq_len);

  /* Each following byte contributes its low six bits.  */
  for (k = 1; k < seq_len; k++)
    {
      src++;
      code_point = (code_point << 6) | (*src & 0x3F);

      /* Ill-formed UTF-8.  */
      if ((*src & ~0x3F) != 0x80)
        abort ();
    }

  /* Emit "\U" followed by eight lower-case hex digits, most
     significant nibble first.  */
  buffer[0] = '\\';
  buffer[1] = 'U';
  for (k = 2, shift = 28; shift >= 0; k++, shift -= 4)
    buffer[k] = hex_digits[(code_point >> shift) & 0xF];

  return seq_len;
}
3508
3509 /* Given a token TYPE corresponding to a digraph, return a pointer to
3510    the spelling of the digraph.  */
3511 static const unsigned char *
3512 cpp_digraph2name (enum cpp_ttype type)
3513 {
3514   return digraph_spellings[(int) type - (int) CPP_FIRST_DIGRAPH];
3515 }
3516
3517 /* Write the spelling of an identifier IDENT, using UCNs, to BUFFER.
3518    The buffer must already contain the enough space to hold the
3519    token's spelling.  Returns a pointer to the character after the
3520    last character written.  */
3521 unsigned char *
3522 _cpp_spell_ident_ucns (unsigned char *buffer, cpp_hashnode *ident)
3523 {
3524   size_t i;
3525   const unsigned char *name = NODE_NAME (ident);
3526
3527   for (i = 0; i < NODE_LEN (ident); i++)
3528     if (name[i] & ~0x7F)
3529       {
3530         i += utf8_to_ucn (buffer, name + i) - 1;
3531         buffer += 10;
3532       }
3533     else
3534       *buffer++ = name[i];
3535
3536   return buffer;
3537 }
3538
3539 /* Write the spelling of a token TOKEN to BUFFER.  The buffer must
3540    already contain the enough space to hold the token's spelling.
3541    Returns a pointer to the character after the last character written.
3542    FORSTRING is true if this is to be the spelling after translation
3543    phase 1 (with the original spelling of extended identifiers), false
3544    if extended identifiers should always be written using UCNs (there is
3545    no option for always writing them in the internal UTF-8 form).
3546    FIXME: Would be nice if we didn't need the PFILE argument.  */
3547 unsigned char *
3548 cpp_spell_token (cpp_reader *pfile, const cpp_token *token,
3549                  unsigned char *buffer, bool forstring)
3550 {
3551   switch (TOKEN_SPELL (token))
3552     {
3553     case SPELL_OPERATOR:
3554       {
3555         const unsigned char *spelling;
3556         unsigned char c;
3557
3558         if (token->flags & DIGRAPH)
3559           spelling = cpp_digraph2name (token->type);
3560         else if (token->flags & NAMED_OP)
3561           goto spell_ident;
3562         else
3563           spelling = TOKEN_NAME (token);
3564
3565         while ((c = *spelling++) != '\0')
3566           *buffer++ = c;
3567       }
3568       break;
3569
3570     spell_ident:
3571     case SPELL_IDENT:
3572       if (forstring)
3573         {
3574           memcpy (buffer, NODE_NAME (token->val.node.spelling),
3575                   NODE_LEN (token->val.node.spelling));
3576           buffer += NODE_LEN (token->val.node.spelling);
3577         }
3578       else
3579         buffer = _cpp_spell_ident_ucns (buffer, token->val.node.node);
3580       break;
3581
3582     case SPELL_LITERAL:
3583       memcpy (buffer, token->val.str.text, token->val.str.len);
3584       buffer += token->val.str.len;
3585       break;
3586
3587     case SPELL_NONE:
3588       cpp_error (pfile, CPP_DL_ICE,
3589                  "unspellable token %s", TOKEN_NAME (token));
3590       break;
3591     }
3592
3593   return buffer;
3594 }
3595
3596 /* Returns TOKEN spelt as a null-terminated string.  The string is
3597    freed when the reader is destroyed.  Useful for diagnostics.  */
3598 unsigned char *
3599 cpp_token_as_text (cpp_reader *pfile, const cpp_token *token)
3600 {
3601   unsigned int len = cpp_token_len (token) + 1;
3602   unsigned char *start = _cpp_unaligned_alloc (pfile, len), *end;
3603
3604   end = cpp_spell_token (pfile, token, start, false);
3605   end[0] = '\0';
3606
3607   return start;
3608 }
3609
3610 /* Returns a pointer to a string which spells the token defined by
3611    TYPE and FLAGS.  Used by C front ends, which really should move to
3612    using cpp_token_as_text.  */
3613 const char *
3614 cpp_type2name (enum cpp_ttype type, unsigned char flags)
3615 {
3616   if (flags & DIGRAPH)
3617     return (const char *) cpp_digraph2name (type);
3618   else if (flags & NAMED_OP)
3619     return cpp_named_operator2name (type);
3620
3621   return (const char *) token_spellings[type].name;
3622 }
3623
3624 /* Writes the spelling of token to FP, without any preceding space.
3625    Separated from cpp_spell_token for efficiency - to avoid stdio
3626    double-buffering.  */
3627 void
3628 cpp_output_token (const cpp_token *token, FILE *fp)
3629 {
3630   switch (TOKEN_SPELL (token))
3631     {
3632     case SPELL_OPERATOR:
3633       {
3634         const unsigned char *spelling;
3635         int c;
3636
3637         if (token->flags & DIGRAPH)
3638           spelling = cpp_digraph2name (token->type);
3639         else if (token->flags & NAMED_OP)
3640           goto spell_ident;
3641         else
3642           spelling = TOKEN_NAME (token);
3643
3644         c = *spelling;
3645         do
3646           putc (c, fp);
3647         while ((c = *++spelling) != '\0');
3648       }
3649       break;
3650
3651     spell_ident:
3652     case SPELL_IDENT:
3653       {
3654         size_t i;
3655         const unsigned char * name = NODE_NAME (token->val.node.node);
3656
3657         for (i = 0; i < NODE_LEN (token->val.node.node); i++)
3658           if (name[i] & ~0x7F)
3659             {
3660               unsigned char buffer[10];
3661               i += utf8_to_ucn (buffer, name + i) - 1;
3662               fwrite (buffer, 1, 10, fp);
3663             }
3664           else
3665             fputc (NODE_NAME (token->val.node.node)[i], fp);
3666       }
3667       break;
3668
3669     case SPELL_LITERAL:
3670       if (token->type == CPP_HEADER_NAME)
3671         fputc ('"', fp);
3672       fwrite (token->val.str.text, 1, token->val.str.len, fp);
3673       if (token->type == CPP_HEADER_NAME)
3674         fputc ('"', fp);
3675       break;
3676
3677     case SPELL_NONE:
3678       /* An error, most probably.  */
3679       break;
3680     }
3681 }
3682
3683 /* Compare two tokens.  */
3684 int
3685 _cpp_equiv_tokens (const cpp_token *a, const cpp_token *b)
3686 {
3687   if (a->type == b->type && a->flags == b->flags)
3688     switch (TOKEN_SPELL (a))
3689       {
3690       default:                  /* Keep compiler happy.  */
3691       case SPELL_OPERATOR:
3692         /* token_no is used to track where multiple consecutive ##
3693            tokens were originally located.  */
3694         return (a->type != CPP_PASTE || a->val.token_no == b->val.token_no);
3695       case SPELL_NONE:
3696         return (a->type != CPP_MACRO_ARG
3697                 || (a->val.macro_arg.arg_no == b->val.macro_arg.arg_no
3698                     && a->val.macro_arg.spelling == b->val.macro_arg.spelling));
3699       case SPELL_IDENT:
3700         return (a->val.node.node == b->val.node.node
3701                 && a->val.node.spelling == b->val.node.spelling);
3702       case SPELL_LITERAL:
3703         return (a->val.str.len == b->val.str.len
3704                 && !memcmp (a->val.str.text, b->val.str.text,
3705                             a->val.str.len));
3706       }
3707
3708   return 0;
3709 }
3710
3711 /* Returns nonzero if a space should be inserted to avoid an
3712    accidental token paste for output.  For simplicity, it is
3713    conservative, and occasionally advises a space where one is not
3714    needed, e.g. "." and ".2".  */
3715 int
3716 cpp_avoid_paste (cpp_reader *pfile, const cpp_token *token1,
3717                  const cpp_token *token2)
3718 {
3719   enum cpp_ttype a = token1->type, b = token2->type;
3720   cppchar_t c;
3721
3722   if (token1->flags & NAMED_OP)
3723     a = CPP_NAME;
3724   if (token2->flags & NAMED_OP)
3725     b = CPP_NAME;
3726
3727   c = EOF;
3728   if (token2->flags & DIGRAPH)
3729     c = digraph_spellings[(int) b - (int) CPP_FIRST_DIGRAPH][0];
3730   else if (token_spellings[b].category == SPELL_OPERATOR)
3731     c = token_spellings[b].name[0];
3732
3733   /* Quickly get everything that can paste with an '='.  */
3734   if ((int) a <= (int) CPP_LAST_EQ && c == '=')
3735     return 1;
3736
3737   switch (a)
3738     {
3739     case CPP_GREATER:   return c == '>';
3740     case CPP_LESS:      return c == '<' || c == '%' || c == ':';
3741     case CPP_PLUS:      return c == '+';
3742     case CPP_MINUS:     return c == '-' || c == '>';
3743     case CPP_DIV:       return c == '/' || c == '*'; /* Comments.  */
3744     case CPP_MOD:       return c == ':' || c == '>';
3745     case CPP_AND:       return c == '&';
3746     case CPP_OR:        return c == '|';
3747     case CPP_COLON:     return c == ':' || c == '>';
3748     case CPP_DEREF:     return c == '*';
3749     case CPP_DOT:       return c == '.' || c == '%' || b == CPP_NUMBER;
3750     case CPP_HASH:      return c == '#' || c == '%'; /* Digraph form.  */
3751     case CPP_PRAGMA:
3752     case CPP_NAME:      return ((b == CPP_NUMBER
3753                                  && name_p (pfile, &token2->val.str))
3754                                 || b == CPP_NAME
3755                                 || b == CPP_CHAR || b == CPP_STRING); /* L */
3756     case CPP_NUMBER:    return (b == CPP_NUMBER || b == CPP_NAME
3757                                 || b == CPP_CHAR
3758                                 || c == '.' || c == '+' || c == '-');
3759                                       /* UCNs */
3760     case CPP_OTHER:     return ((token1->val.str.text[0] == '\\'
3761                                  && b == CPP_NAME)
3762                                 || (CPP_OPTION (pfile, objc)
3763                                     && token1->val.str.text[0] == '@'
3764                                     && (b == CPP_NAME || b == CPP_STRING)));
3765     case CPP_LESS_EQ:   return c == '>';
3766     case CPP_STRING:
3767     case CPP_WSTRING:
3768     case CPP_UTF8STRING:
3769     case CPP_STRING16:
3770     case CPP_STRING32:  return (CPP_OPTION (pfile, user_literals)
3771                                 && (b == CPP_NAME
3772                                     || (TOKEN_SPELL (token2) == SPELL_LITERAL
3773                                         && ISIDST (token2->val.str.text[0]))));
3774
3775     default:            break;
3776     }
3777
3778   return 0;
3779 }
3780
3781 /* Output all the remaining tokens on the current line, and a newline
3782    character, to FP.  Leading whitespace is removed.  If there are
3783    macros, special token padding is not performed.  */
3784 void
3785 cpp_output_line (cpp_reader *pfile, FILE *fp)
3786 {
3787   const cpp_token *token;
3788
3789   token = cpp_get_token (pfile);
3790   while (token->type != CPP_EOF)
3791     {
3792       cpp_output_token (token, fp);
3793       token = cpp_get_token (pfile);
3794       if (token->flags & PREV_WHITE)
3795         putc (' ', fp);
3796     }
3797
3798   putc ('\n', fp);
3799 }
3800
3801 /* Return a string representation of all the remaining tokens on the
3802    current line.  The result is allocated using xmalloc and must be
3803    freed by the caller.  */
3804 unsigned char *
3805 cpp_output_line_to_string (cpp_reader *pfile, const unsigned char *dir_name)
3806 {
3807   const cpp_token *token;
3808   unsigned int out = dir_name ? ustrlen (dir_name) : 0;
3809   unsigned int alloced = 120 + out;
3810   unsigned char *result = (unsigned char *) xmalloc (alloced);
3811
3812   /* If DIR_NAME is empty, there are no initial contents.  */
3813   if (dir_name)
3814     {
3815       sprintf ((char *) result, "#%s ", dir_name);
3816       out += 2;
3817     }
3818
3819   token = cpp_get_token (pfile);
3820   while (token->type != CPP_EOF)
3821     {
3822       unsigned char *last;
3823       /* Include room for a possible space and the terminating nul.  */
3824       unsigned int len = cpp_token_len (token) + 2;
3825
3826       if (out + len > alloced)
3827         {
3828           alloced *= 2;
3829           if (out + len > alloced)
3830             alloced = out + len;
3831           result = (unsigned char *) xrealloc (result, alloced);
3832         }
3833
3834       last = cpp_spell_token (pfile, token, &result[out], 0);
3835       out = last - result;
3836
3837       token = cpp_get_token (pfile);
3838       if (token->flags & PREV_WHITE)
3839         result[out++] = ' ';
3840     }
3841
3842   result[out] = '\0';
3843   return result;
3844 }
3845
/* Memory buffers.  Changing these three constants can have a dramatic
   effect on performance.  The values here are reasonable defaults,
   but might be tuned.  If you adjust them, be sure to test across a
   range of uses of cpplib, including heavy nested function-like macro
   expansion.  Also check the change in peak memory usage (NJAMD is a
   good tool for this).

   MIN_BUFF_SIZE is the smallest buffer new_buff will create.
   BUFF_SIZE_UPPER_BOUND (MIN_SIZE) is the largest free buffer that
   _cpp_get_buff will hand out for a request of MIN_SIZE bytes.
   EXTENDED_BUFF_SIZE (BUFF, MIN_EXTRA) is the size requested when
   extending BUFF: MIN_EXTRA plus twice the room left in BUFF.  Every
   macro argument is parenthesized so that an argument containing a
   low-precedence operator is not regrouped by the expansion.  */
#define MIN_BUFF_SIZE 8000
#define BUFF_SIZE_UPPER_BOUND(MIN_SIZE) (MIN_BUFF_SIZE + (MIN_SIZE) * 3 / 2)
#define EXTENDED_BUFF_SIZE(BUFF, MIN_EXTRA) \
        ((MIN_EXTRA) + ((BUFF)->limit - (BUFF)->cur) * 2)

#if MIN_BUFF_SIZE > BUFF_SIZE_UPPER_BOUND (0)
  #error BUFF_SIZE_UPPER_BOUND must be at least as large as MIN_BUFF_SIZE!
#endif
3860
3861 /* Create a new allocation buffer.  Place the control block at the end
3862    of the buffer, so that buffer overflows will cause immediate chaos.  */
3863 static _cpp_buff *
3864 new_buff (size_t len)
3865 {
3866   _cpp_buff *result;
3867   unsigned char *base;
3868
3869   if (len < MIN_BUFF_SIZE)
3870     len = MIN_BUFF_SIZE;
3871   len = CPP_ALIGN (len);
3872
3873 #ifdef ENABLE_VALGRIND_ANNOTATIONS
3874   /* Valgrind warns about uses of interior pointers, so put _cpp_buff
3875      struct first.  */
3876   size_t slen = CPP_ALIGN2 (sizeof (_cpp_buff), 2 * DEFAULT_ALIGNMENT);
3877   base = XNEWVEC (unsigned char, len + slen);
3878   result = (_cpp_buff *) base;
3879   base += slen;
3880 #else
3881   base = XNEWVEC (unsigned char, len + sizeof (_cpp_buff));
3882   result = (_cpp_buff *) (base + len);
3883 #endif
3884   result->base = base;
3885   result->cur = base;
3886   result->limit = base + len;
3887   result->next = NULL;
3888   return result;
3889 }
3890
3891 /* Place a chain of unwanted allocation buffers on the free list.  */
3892 void
3893 _cpp_release_buff (cpp_reader *pfile, _cpp_buff *buff)
3894 {
3895   _cpp_buff *end = buff;
3896
3897   while (end->next)
3898     end = end->next;
3899   end->next = pfile->free_buffs;
3900   pfile->free_buffs = buff;
3901 }
3902
3903 /* Return a free buffer of size at least MIN_SIZE.  */
3904 _cpp_buff *
3905 _cpp_get_buff (cpp_reader *pfile, size_t min_size)
3906 {
3907   _cpp_buff *result, **p;
3908
3909   for (p = &pfile->free_buffs;; p = &(*p)->next)
3910     {
3911       size_t size;
3912
3913       if (*p == NULL)
3914         return new_buff (min_size);
3915       result = *p;
3916       size = result->limit - result->base;
3917       /* Return a buffer that's big enough, but don't waste one that's
3918          way too big.  */
3919       if (size >= min_size && size <= BUFF_SIZE_UPPER_BOUND (min_size))
3920         break;
3921     }
3922
3923   *p = result->next;
3924   result->next = NULL;
3925   result->cur = result->base;
3926   return result;
3927 }
3928
3929 /* Creates a new buffer with enough space to hold the uncommitted
3930    remaining bytes of BUFF, and at least MIN_EXTRA more bytes.  Copies
3931    the excess bytes to the new buffer.  Chains the new buffer after
3932    BUFF, and returns the new buffer.  */
3933 _cpp_buff *
3934 _cpp_append_extend_buff (cpp_reader *pfile, _cpp_buff *buff, size_t min_extra)
3935 {
3936   size_t size = EXTENDED_BUFF_SIZE (buff, min_extra);
3937   _cpp_buff *new_buff = _cpp_get_buff (pfile, size);
3938
3939   buff->next = new_buff;
3940   memcpy (new_buff->base, buff->cur, BUFF_ROOM (buff));
3941   return new_buff;
3942 }
3943
3944 /* Creates a new buffer with enough space to hold the uncommitted
3945    remaining bytes of the buffer pointed to by BUFF, and at least
3946    MIN_EXTRA more bytes.  Copies the excess bytes to the new buffer.
3947    Chains the new buffer before the buffer pointed to by BUFF, and
3948    updates the pointer to point to the new buffer.  */
3949 void
3950 _cpp_extend_buff (cpp_reader *pfile, _cpp_buff **pbuff, size_t min_extra)
3951 {
3952   _cpp_buff *new_buff, *old_buff = *pbuff;
3953   size_t size = EXTENDED_BUFF_SIZE (old_buff, min_extra);
3954
3955   new_buff = _cpp_get_buff (pfile, size);
3956   memcpy (new_buff->base, old_buff->cur, BUFF_ROOM (old_buff));
3957   new_buff->next = old_buff;
3958   *pbuff = new_buff;
3959 }
3960
3961 /* Free a chain of buffers starting at BUFF.  */
3962 void
3963 _cpp_free_buff (_cpp_buff *buff)
3964 {
3965   _cpp_buff *next;
3966
3967   for (; buff; buff = next)
3968     {
3969       next = buff->next;
3970 #ifdef ENABLE_VALGRIND_ANNOTATIONS
3971       free (buff);
3972 #else
3973       free (buff->base);
3974 #endif
3975     }
3976 }
3977
3978 /* Allocate permanent, unaligned storage of length LEN.  */
3979 unsigned char *
3980 _cpp_unaligned_alloc (cpp_reader *pfile, size_t len)
3981 {
3982   _cpp_buff *buff = pfile->u_buff;
3983   unsigned char *result = buff->cur;
3984
3985   if (len > (size_t) (buff->limit - result))
3986     {
3987       buff = _cpp_get_buff (pfile, len);
3988       buff->next = pfile->u_buff;
3989       pfile->u_buff = buff;
3990       result = buff->cur;
3991     }
3992
3993   buff->cur = result + len;
3994   return result;
3995 }
3996
3997 /* Allocate permanent, unaligned storage of length LEN from a_buff.
3998    That buffer is used for growing allocations when saving macro
3999    replacement lists in a #define, and when parsing an answer to an
4000    assertion in #assert, #unassert or #if (and therefore possibly
4001    whilst expanding macros).  It therefore must not be used by any
4002    code that they might call: specifically the lexer and the guts of
4003    the macro expander.
4004
4005    All existing other uses clearly fit this restriction: storing
4006    registered pragmas during initialization.  */
4007 unsigned char *
4008 _cpp_aligned_alloc (cpp_reader *pfile, size_t len)
4009 {
4010   _cpp_buff *buff = pfile->a_buff;
4011   unsigned char *result = buff->cur;
4012
4013   if (len > (size_t) (buff->limit - result))
4014     {
4015       buff = _cpp_get_buff (pfile, len);
4016       buff->next = pfile->a_buff;
4017       pfile->a_buff = buff;
4018       result = buff->cur;
4019     }
4020
4021   buff->cur = result + len;
4022   return result;
4023 }
4024
4025 /* Commit or allocate storage from a buffer.  */
4026
4027 void *
4028 _cpp_commit_buff (cpp_reader *pfile, size_t size)
4029 {
4030   void *ptr = BUFF_FRONT (pfile->a_buff);
4031
4032   if (pfile->hash_table->alloc_subobject)
4033     {
4034       void *copy = pfile->hash_table->alloc_subobject (size);
4035       memcpy (copy, ptr, size);
4036       ptr = copy;
4037     }
4038   else
4039     BUFF_FRONT (pfile->a_buff) += size;
4040
4041   return ptr;
4042 }
4043
4044 /* Say which field of TOK is in use.  */
4045
4046 enum cpp_token_fld_kind
4047 cpp_token_val_index (const cpp_token *tok)
4048 {
4049   switch (TOKEN_SPELL (tok))
4050     {
4051     case SPELL_IDENT:
4052       return CPP_TOKEN_FLD_NODE;
4053     case SPELL_LITERAL:
4054       return CPP_TOKEN_FLD_STR;
4055     case SPELL_OPERATOR:
4056       /* Operands which were originally spelled as ident keep around
4057          the node for the exact spelling.  */
4058       if (tok->flags & NAMED_OP)
4059         return CPP_TOKEN_FLD_NODE;
4060       else if (tok->type == CPP_PASTE)
4061         return CPP_TOKEN_FLD_TOKEN_NO;
4062       else
4063         return CPP_TOKEN_FLD_NONE;
4064     case SPELL_NONE:
4065       if (tok->type == CPP_MACRO_ARG)
4066         return CPP_TOKEN_FLD_ARG_NO;
4067       else if (tok->type == CPP_PADDING)
4068         return CPP_TOKEN_FLD_SOURCE;
4069       else if (tok->type == CPP_PRAGMA)
4070         return CPP_TOKEN_FLD_PRAGMA;
4071       /* fall through */
4072     default:
4073       return CPP_TOKEN_FLD_NONE;
4074     }
4075 }
4076
4077 /* All tokens lexed in R after calling this function will be forced to
4078    have their location_t to be P, until
4079    cpp_stop_forcing_token_locations is called for R.  */
4080
4081 void
4082 cpp_force_token_locations (cpp_reader *r, location_t loc)
4083 {
4084   r->forced_token_location = loc;
4085 }
4086
4087 /* Go back to assigning locations naturally for lexed tokens.  */
4088
4089 void
4090 cpp_stop_forcing_token_locations (cpp_reader *r)
4091 {
4092   r->forced_token_location = 0;
4093 }
4094
/* PEEK points at a backslash.  If the backslash escapes an end of
   line ("\n", "\r" or "\r\n"), return a pointer to the character
   after the line ending, repeating for as long as that character is
   itself a backslash escaping an end of line.  If the character after
   a line ending would not lie before LIMIT, do not advance past that
   line ending.  Returns PEEK unchanged if no line ending is escaped.  */

static const unsigned char *
do_peek_backslash (const unsigned char *peek, const unsigned char *limit)
{
  for (;;)
    {
      const unsigned char *after;

      if (__builtin_expect (peek[1] == '\n', true))
        after = peek + 2;
      else if (__builtin_expect (peek[1] == '\r', false))
        /* A "\r\n" pair counts as a single line ending.  */
        after = peek + (peek[2] == '\n' ? 3 : 2);
      else
        /* Not an escaped line ending.  */
        return peek;

      if (!__builtin_expect (after < limit, true))
        return peek;

      peek = after;

      /* The user might be perverse and continue the line again.  */
      if (*peek != '\\')
        return peek;
    }
}
4124
/* If PEEK points at a backslash, return the result of
   do_peek_backslash for it; otherwise return PEEK unchanged.  */

static const unsigned char *
do_peek_next (const unsigned char *peek, const unsigned char *limit)
{
  return (__builtin_expect (*peek == '\\', false)
          ? do_peek_backslash (peek, limit)
          : peek);
}
4132
/* Step back one character from PEEK without going before BOUND.
   Returns NULL if PEEK is already at BOUND.  Otherwise the result
   points at the preceding character, unless that character is a line
   ending ("\n", "\r" or "\r\n") immediately preceded by a backslash:
   in that case the escaped line ending is stepped over and the search
   repeats from the backslash.  */

static const unsigned char *
do_peek_prev (const unsigned char *peek, const unsigned char *bound)
{
  if (peek == bound)
    return NULL;

  unsigned char c = *--peek;
  /* Both newline and carriage return end a line.  The carriage return
     must be tested as '\r'; testing the letter 'r' would miss a bare
     CR ending and misread an ordinary 'r' after a backslash.  */
  if (__builtin_expect (c == '\n', false)
      || __builtin_expect (c == '\r', false))
    {
      /* Nothing before the line ending to inspect.  */
      if (peek == bound)
        return peek;
      int ix = -1;
      if (c == '\n' && peek[ix] == '\r')
        {
          /* "\r\n" pair: look one character further back, unless the
             '\r' is the first character available.  */
          if (peek + ix == bound)
            return peek;
          ix--;
        }

      /* An escaped line ending is transparent; continue from the
         backslash.  */
      if (peek[ix] == '\\')
        return do_peek_prev (peek + ix, bound);

      return peek;
    }
  else
    return peek;
}
4161
4162 /* If PEEK[-1] is identifier MATCH, scan past it and trailing white
4163    space.  Otherwise return NULL.  */
4164
4165 static const unsigned char *
4166 do_peek_ident (const char *match, const unsigned char *peek,
4167                const unsigned char *limit)
4168 {
4169   for (; *++match; peek++)
4170     if (*peek != *match)
4171       {
4172         peek = do_peek_next (peek, limit);
4173         if (*peek != *match)
4174           return NULL;
4175       }
4176
4177   /* Must now not be looking at an identifier char.  */
4178   peek = do_peek_next (peek, limit);
4179   if (ISIDNUM (*peek))
4180     return NULL;
4181
4182   /* Skip control-line whitespace.  */
4183  ws:
4184   while (*peek == ' ' || *peek == '\t')
4185     peek++;
4186   if (__builtin_expect (*peek == '\\', false))
4187     {
4188       peek = do_peek_backslash (peek, limit);
4189       if (*peek != '\\')
4190         goto ws;
4191     }
4192
4193   return peek;
4194 }
4195
/* Are we looking at a module control line starting at PEEK - 1?

   C is the first character of the candidate keyword and PEEK points
   at the text after it; do_peek_ident then compares the rest of the
   keyword.  The keywords recognized are "export" (which must be
   followed by "import" or "module"), "import", "__import" and
   "module".  After the keyword and any whitespace, the next character
   must be able to start one of the tokens permitted there.  LIMIT is
   passed through to the peeking helpers.  Returns true if the line
   looks like a module control line, false otherwise.  */

static bool
do_peek_module (cpp_reader *pfile, unsigned char c,
                const unsigned char *peek, const unsigned char *limit)
{
  /* Set when the keyword is import or __import; some following
     tokens are only acceptable for one of import and module.  */
  bool import = false;

  if (__builtin_expect (c == 'e', false))
    {
      /* The second character is checked first (it may also be a
         backslash starting an escaped line ending) before the full
         comparison is made.  */
      if (!((peek[0] == 'x' || peek[0] == '\\')
            && (peek = do_peek_ident ("export", peek, limit))))
        return false;

      /* export, peek for import or module.  No need to peek __import
         here.  */
      if (peek[0] == 'i')
        {
          if (!((peek[1] == 'm' || peek[1] == '\\')
                && (peek = do_peek_ident ("import", peek + 1, limit))))
            return false;
          import = true;
        }
      else if (peek[0] == 'm')
        {
          if (!((peek[1] == 'o' || peek[1] == '\\')
                && (peek = do_peek_ident ("module", peek + 1, limit))))
            return false;
        }
      else
        return false;
    }
  else if (__builtin_expect (c == 'i', false))
    {
      if (!((peek[0] == 'm' || peek[0] == '\\')
            && (peek = do_peek_ident ("import", peek, limit))))
        return false;
      import = true;
    }
  else if (__builtin_expect (c == '_', false))
    {
      /* Needed for translated includes.   */
      if (!((peek[0] == '_' || peek[0] == '\\')
            && (peek = do_peek_ident ("__import", peek, limit))))
        return false;
      import = true;
    }
  else if (__builtin_expect (c == 'm', false))
    {
      if (!((peek[0] == 'o' || peek[0] == '\\')
            && (peek = do_peek_ident ("module", peek, limit))))
        return false;
    }
  else
    return false;

  /* Peek the next character to see if it's good enough.  We'll be at
     the first non-whitespace char, including skipping an escaped
     newline.  */
  /* ... import followed by identifier, ':', '<' or header-name
     preprocessing tokens, or module followed by identifier, ':' or
     ';' preprocessing tokens.  */
  unsigned char p = *peek++;

  /* A character literal is ... single quotes, ... optionally preceded
     by u8, u, U, or L */
  /* A string-literal is a ... double quotes, optionally prefixed by
     R, u8, u8R, u, uR, U, UR, L, or LR */
  /* The branches below reject text that would lex as a prefixed
     character or string literal rather than as an identifier.  */
  if (p == 'u')
    {
      peek = do_peek_next (peek, limit);
      if (*peek == '8')
        {
          /* u8 prefix: step over the '8' and look at what follows.  */
          peek++;
          goto peek_u8;
        }
      goto peek_u;
    }
  else if (p == 'U' || p == 'L')
    {
    peek_u8:
      peek = do_peek_next (peek, limit);
    peek_u:
      /* A quote directly after the prefix makes this a literal.  */
      if (*peek == '\"' || *peek == '\'')
        return false;

      if (*peek == 'R')
        goto peek_R;
      /* Identifier. Ok.  */
    }
  else if (p == 'R')
    {
    peek_R:
      /* Only a raw string literal if the rliterals option is set.  */
      if (CPP_OPTION (pfile, rliterals))
        {
          peek = do_peek_next (peek, limit);
          if (*peek == '\"')
            return false;
        }
      /* Identifier. Ok.  */
    }
  /* When the upper-case letters are contiguous in the execution
     character set, use direct range checks; otherwise use ISIDST.  */
  else if ('Z' - 'A' == 25
           ? ((p >= 'A' && p <= 'Z') || (p >= 'a' && p <= 'z') || p == '_')
           : ISIDST (p))
    {
      /* Identifier.  Ok. */
    }
  else if (p == '<')
    {
      /* Maybe angle header, ok for import.  Reject
         '<=', '<<' digraph:'<:'.  */
      if (!import)
        return false;
      peek = do_peek_next (peek, limit);
      if (*peek == '=' || *peek == '<'
          || (*peek == ':' && CPP_OPTION (pfile, digraphs)))
        return false;
    }
  else if (p == ';')
    {
      /* SEMICOLON, ok for module.  */
      if (import)
        return false;
    }
  else if (p == '"')
    {
      /* STRING, ok for import.  */
      if (!import)
        return false;
    }
  else if (p == ':')
    {
      /* Maybe COLON, ok.  Reject '::', digraph:':>'.  */
      peek = do_peek_next (peek, limit);
      if (*peek == ':' || (*peek == '>' && CPP_OPTION (pfile, digraphs)))
        return false;
    }
  else
    /* FIXME: Detect a unicode character, excluding those not
       permitted as the initial character. [lex.name]/1.  I presume
       we need to check the \[uU] spellings, and directly using
       Unicode in say UTF8 form?  Or perhaps we do the phase-1
       conversion of UTF8 to universal-character-names?  */
    return false;

  return true;
}
4343
4344 /* Directives-only scanning.  Somewhat more relaxed than correct
4345    parsing -- some ill-formed programs will not be rejected.  */
4346
4347 void
4348 cpp_directive_only_process (cpp_reader *pfile,
4349                             void *data,
4350                             void (*cb) (cpp_reader *, CPP_DO_task, void *, ...))
4351 {
4352   bool module_p = CPP_OPTION (pfile, module_directives);
4353
4354   do
4355     {
4356     restart:
4357       /* Buffer initialization, but no line cleaning. */
4358       cpp_buffer *buffer = pfile->buffer;
4359       buffer->cur_note = buffer->notes_used = 0;
4360       buffer->cur = buffer->line_base = buffer->next_line;
4361       buffer->need_line = false;
4362       /* Files always end in a newline or carriage return.  We rely on this for
4363          character peeking safety.  */
4364       gcc_assert (buffer->rlimit[0] == '\n' || buffer->rlimit[0] == '\r');
4365
4366       const unsigned char *base = buffer->cur;
4367       unsigned line_count = 0;
4368       const unsigned char *line_start = base;
4369
4370       bool bol = true;
4371       bool raw = false;
4372
4373       const unsigned char *lwm = base;
4374       for (const unsigned char *pos = base, *limit = buffer->rlimit;
4375            pos < limit;)
4376         {
4377           unsigned char c = *pos++;
4378           /* This matches the switch in _cpp_lex_direct.  */
4379           switch (c)
4380             {
4381             case ' ': case '\t': case '\f': case '\v':
4382               /* Whitespace, do nothing.  */
4383               break;
4384
4385             case '\r': /* MAC line ending, or Windows \r\n  */
4386               if (*pos == '\n')
4387                 pos++;
4388               /* FALLTHROUGH */
4389
4390             case '\n':
4391               bol = true;
4392
4393             next_line:
4394               CPP_INCREMENT_LINE (pfile, 0);
4395               line_count++;
4396               line_start = pos;
4397               break;
4398
4399             case '\\':
4400               /* <backslash><newline> is removed, and doesn't undo any
4401                  preceeding escape or whatnot.  */
4402               if (*pos == '\n')
4403                 {
4404                   pos++;
4405                   goto next_line;
4406                 }
4407               else if (*pos == '\r')
4408                 {
4409                   if (pos[1] == '\n')
4410                     pos++;
4411                   pos++;
4412                   goto next_line;
4413                 }
4414               goto dflt;
4415
4416             case '#':
4417               if (bol)
4418                 {
4419                   /* Line directive.  */
4420                   if (pos - 1 > base && !pfile->state.skipping)
4421                     cb (pfile, CPP_DO_print, data,
4422                         line_count, base, pos - 1 - base);
4423
4424                   /* Prep things for directive handling. */
4425                   buffer->next_line = pos;
4426                   buffer->need_line = true;
4427                   bool ok = _cpp_get_fresh_line (pfile);
4428                   gcc_checking_assert (ok);
4429
4430                   /* Ensure proper column numbering for generated
4431                      error messages. */
4432                   buffer->line_base -= pos - line_start;
4433
4434                   _cpp_handle_directive (pfile, line_start + 1 != pos);
4435
4436                   /* Sanitize the line settings.  Duplicate #include's can
4437                      mess things up. */
4438                   // FIXME: Necessary?
4439                   pfile->line_table->highest_location
4440                     = pfile->line_table->highest_line;
4441
4442                   if (!pfile->state.skipping
4443                       && pfile->buffer->next_line < pfile->buffer->rlimit)
4444                     cb (pfile, CPP_DO_location, data,
4445                         pfile->line_table->highest_line);
4446
4447                   goto restart;
4448                 }
4449               goto dflt;
4450
4451             case '/':
4452               {
4453                 const unsigned char *peek = do_peek_next (pos, limit);
4454                 if (!(*peek == '/' || *peek == '*'))
4455                   goto dflt;
4456
4457                 /* Line or block comment  */
4458                 bool is_block = *peek == '*';
4459                 bool star = false;
4460                 bool esc = false;
4461                 location_t sloc
4462                   = linemap_position_for_column (pfile->line_table,
4463                                                  pos - line_start);
4464
4465                 while (pos < limit)
4466                   {
4467                     char c = *pos++;
4468                     switch (c)
4469                       {
4470                       case '\\':
4471                         esc = true;
4472                         break;
4473
4474                       case '\r':
4475                         if (*pos == '\n')
4476                           pos++;
4477                         /* FALLTHROUGH  */
4478
4479                       case '\n':
4480                         {
4481                           CPP_INCREMENT_LINE (pfile, 0);
4482                           line_count++;
4483                           line_start = pos;
4484                           if (!esc && !is_block)
4485                             {
4486                               bol = true;
4487                               goto done_comment;
4488                             }
4489                         }
4490                         if (!esc)
4491                           star = false;
4492                         esc = false;
4493                         break;
4494
4495                       case '*':
4496                         if (pos > peek && !esc)
4497                           star = is_block;
4498                         esc = false;
4499                         break;
4500
4501                       case '/':
4502                         if (star)
4503                           goto done_comment;
4504                         /* FALLTHROUGH  */
4505
4506                       default:
4507                         star = false;
4508                         esc = false;
4509                         break;
4510                       }
4511                   }
4512                 if (pos < limit || is_block)
4513                   cpp_error_with_line (pfile, CPP_DL_ERROR, sloc, 0,
4514                                        "unterminated comment");
4515               done_comment:
4516                 lwm = pos;
4517                 break;
4518               }
4519
4520             case '\'':
4521               if (!CPP_OPTION (pfile, digit_separators))
4522                 goto delimited_string;
4523
4524               /* Possibly a number punctuator.  */
4525               if (!ISIDNUM (*do_peek_next (pos, limit)))
4526                 goto delimited_string;
4527
4528               goto quote_peek;
4529
4530             case '\"':
4531               if (!CPP_OPTION (pfile, rliterals))
4532                 goto delimited_string;
4533
4534             quote_peek:
4535               {
4536                 /* For ' see if it's a number punctuator
4537                    \.?<digit>(<digit>|<identifier-nondigit>
4538                    |'<digit>|'<nondigit>|[eEpP]<sign>|\.)* */
4539                 /* For " see if it's a raw string
4540                    {U,L,u,u8}R.  This includes CPP_NUMBER detection,
4541                    because that could be 0e+R.  */
4542                 const unsigned char *peek = pos - 1;
4543                 bool quote_first = c == '"';
4544                 bool quote_eight = false;
4545                 bool maybe_number_start = false;
4546                 bool want_number = false;
4547
4548                 while ((peek = do_peek_prev (peek, lwm)))
4549                   {
4550                     unsigned char p = *peek;
4551                     if (quote_first)
4552                       {
4553                         if (!raw)
4554                           {
4555                             if (p != 'R')
4556                               break;
4557                             raw = true;
4558                             continue;
4559                           }
4560
4561                         quote_first = false;
4562                         if (p == 'L' || p == 'U' || p == 'u')
4563                           ;
4564                         else if (p == '8')
4565                           quote_eight = true;
4566                         else
4567                           goto second_raw;
4568                       }
4569                     else if (quote_eight)
4570                       {
4571                         if (p != 'u')
4572                           {
4573                             raw = false;
4574                             break;
4575                           }
4576                         quote_eight = false;
4577                       }
4578                     else if (c == '"')
4579                       {
4580                       second_raw:;
4581                         if (!want_number && ISIDNUM (p))
4582                           {
4583                             raw = false;
4584                             break;
4585                           }
4586                       }
4587
4588                     if (ISDIGIT (p))
4589                       maybe_number_start = true;
4590                     else if (p == '.')
4591                       want_number = true;
4592                     else if (ISIDNUM (p))
4593                       maybe_number_start = false;
4594                     else if (p == '+' || p == '-')
4595                       {
4596                         if (const unsigned char *peek_prev
4597                             = do_peek_prev (peek, lwm))
4598                           {
4599                             p = *peek_prev;
4600                             if (p == 'e' || p == 'E'
4601                                 || p == 'p' || p == 'P')
4602                               {
4603                                 want_number = true;
4604                                 maybe_number_start = false;
4605                               }
4606                             else
4607                               break;
4608                           }
4609                         else
4610                           break;
4611                       }
4612                     else if (p == '\'' || p == '\"')
4613                       {
4614                         /* If this is lwm, this must be the end of a
4615                            previous string.  So this is a trailing
4616                            literal type, (a) if those are allowed,
4617                              and (b) maybe_start is false.  Otherwise
4618                              this must be a CPP_NUMBER because we've
4619                              met another ', and we'd have checked that
4620                              in its own right.  */
4621                         if (peek == lwm && CPP_OPTION (pfile, uliterals))
4622                           {
4623                             if  (!maybe_number_start && !want_number)
4624                               /* Must be a literal type.  */
4625                               raw = false;
4626                           }
4627                         else if (p == '\''
4628                                  && CPP_OPTION (pfile, digit_separators))
4629                           maybe_number_start = true;
4630                         break;
4631                       }
4632                     else if (c == '\'')
4633                       break;
4634                     else if (!quote_first && !quote_eight)
4635                       break;
4636                   }
4637
4638                 if (maybe_number_start)
4639                   {
4640                     if (c == '\'')
4641                       /* A CPP NUMBER.  */
4642                       goto dflt;
4643                     raw = false;
4644                   }
4645
4646                 goto delimited_string;
4647               }
4648
4649             delimited_string:
4650               {
4651                 /* (Possibly raw) string or char literal.  */
4652                 unsigned char end = c;
4653                 int delim_len = -1;
4654                 const unsigned char *delim = NULL;
4655                 location_t sloc = linemap_position_for_column (pfile->line_table,
4656                                                                pos - line_start);
4657                 int esc = 0;
4658
4659                 if (raw)
4660                   {
4661                     /* There can be no line breaks in the delimiter.  */
4662                     delim = pos;
4663                     for (delim_len = 0; (c = *pos++) != '('; delim_len++)
4664                       {
4665                         if (delim_len == 16)
4666                           {
4667                             cpp_error_with_line (pfile, CPP_DL_ERROR,
4668                                                  sloc, 0,
4669                                                  "raw string delimiter"
4670                                                  " longer than %d"
4671                                                  " characters",
4672                                                  delim_len);
4673                             raw = false;
4674                             pos = delim;
4675                             break;
4676                           }
4677                         if (strchr (") \\\t\v\f\n", c))
4678                           {
4679                             cpp_error_with_line (pfile, CPP_DL_ERROR,
4680                                                  sloc, 0,
4681                                                  "invalid character '%c'"
4682                                                  " in raw string"
4683                                                  " delimiter", c);
4684                             raw = false;
4685                             pos = delim;
4686                             break;
4687                           }
4688                         if (pos >= limit)
4689                           goto bad_string;
4690                       }
4691                   }
4692
4693                 while (pos < limit)
4694                   {
4695                     char c = *pos++;
4696                     switch (c)
4697                       {
4698                       case '\\':
4699                         if (!raw)
4700                           esc++;
4701                         break;
4702
4703                       case '\r':
4704                         if (*pos == '\n')
4705                           pos++;
4706                         /* FALLTHROUGH  */
4707
4708                       case '\n':
4709                         {
4710                           CPP_INCREMENT_LINE (pfile, 0);
4711                           line_count++;
4712                           line_start = pos;
4713                         }
4714                         if (esc)
4715                           esc--;
4716                         break;
4717
4718                       case ')':
4719                         if (raw
4720                             && pos + delim_len + 1 < limit
4721                             && pos[delim_len] == end
4722                             && !memcmp (delim, pos, delim_len))
4723                           {
4724                             pos += delim_len + 1;
4725                             raw = false;
4726                             goto done_string;
4727                           }
4728                         break;
4729
4730                       default:
4731                         if (!raw && !(esc & 1) && c == end)
4732                           goto done_string;
4733                         esc = 0;
4734                         break;
4735                       }
4736                   }
4737               bad_string:
4738                 cpp_error_with_line (pfile, CPP_DL_ERROR, sloc, 0,
4739                                      "unterminated literal");
4740
4741               done_string:
4742                 raw = false;
4743                 lwm = pos - 1;
4744               }
4745               goto dflt;
4746
4747             case '_':
4748             case 'e':
4749             case 'i':
4750             case 'm':
4751               if (bol && module_p && !pfile->state.skipping
4752                   && do_peek_module (pfile, c, pos, limit))
4753                 {
4754                   /* We've seen the start of a module control line.
4755                      Start up the tokenizer.  */
4756                   pos--; /* Backup over the first character.  */
4757
4758                   /* Backup over whitespace to start of line.  */
4759                   while (pos > line_start
4760                          && (pos[-1] == ' ' || pos[-1] == '\t'))
4761                     pos--;
4762
4763                   if (pos > base)
4764                     cb (pfile, CPP_DO_print, data, line_count, base, pos - base);
4765
4766                   /* Prep things for directive handling. */
4767                   buffer->next_line = pos;
4768                   buffer->need_line = true;
4769
4770                   /* Now get tokens until the PRAGMA_EOL.  */
4771                   do
4772                     {
4773                       location_t spelling;
4774                       const cpp_token *tok
4775                         = cpp_get_token_with_location (pfile, &spelling);
4776
4777                       gcc_assert (pfile->state.in_deferred_pragma
4778                                   || tok->type == CPP_PRAGMA_EOL);
4779                       cb (pfile, CPP_DO_token, data, tok, spelling);
4780                     }
4781                   while (pfile->state.in_deferred_pragma);
4782
4783                   if (pfile->buffer->next_line < pfile->buffer->rlimit)
4784                     cb (pfile, CPP_DO_location, data,
4785                         pfile->line_table->highest_line);
4786
4787                   pfile->mi_valid = false;
4788                   goto restart;
4789                 }
4790               goto dflt;
4791
4792             default:
4793             dflt:
4794               bol = false;
4795               pfile->mi_valid = false;
4796               break;
4797             }
4798         }
4799
4800       if (buffer->rlimit > base && !pfile->state.skipping)
4801         {
4802           const unsigned char *limit = buffer->rlimit;
4803           /* If the file was not newline terminated, add rlimit, which is
4804              guaranteed to point to a newline, to the end of our range.  */
4805           if (limit[-1] != '\n')
4806             {
4807               limit++;
4808               CPP_INCREMENT_LINE (pfile, 0);
4809               line_count++;
4810             }
4811           cb (pfile, CPP_DO_print, data, line_count, base, limit - base);
4812         }
4813
4814       _cpp_pop_buffer (pfile);
4815     }
4816   while (pfile->buffer);
4817 }