libcpp/lex.c

   1 /* CPP Library - lexical analysis.
   2    Copyright (C) 2000-2021 Free Software Foundation, Inc.
   3    Contributed by Per Bothner, 1994-95.
   4    Based on CCCP program by Paul Rubin, June 1986
   5    Adapted to ANSI C, Richard Stallman, Jan 1987
   6    Broken out to separate file, Zack Weinberg, Mar 2000
   7
   8 This program is free software; you can redistribute it and/or modify it
   9 under the terms of the GNU General Public License as published by the
  10 Free Software Foundation; either version 3, or (at your option) any
  11 later version.
  12
  13 This program is distributed in the hope that it will be useful,
  14 but WITHOUT ANY WARRANTY; without even the implied warranty of
  15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  16 GNU General Public License for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with this program; see the file COPYING3.  If not see
  20 <http://www.gnu.org/licenses/>.  */
  21
  22 #include "config.h"
  23 #include "system.h"
  24 #include "cpplib.h"
  25 #include "internal.h"
  26
  27 enum spell_type   /* How token_spellings categorises the spelling of a token type.  */
  28 {
  29   SPELL_OPERATOR = 0,   /* Given to every OP() entry; its NAME holds the operator text.  */
  30   SPELL_IDENT,          /* Selectable by TK() entries through SPELL_ ## s.  */
  31   SPELL_LITERAL,        /* Likewise selectable by TK() entries.  */
  32   SPELL_NONE            /* Likewise; presumably "no stored spelling" -- confirm against TTYPE_TABLE.  */
  33 };
  34
  35 struct token_spelling   /* One entry of the token_spellings table.  */
  36 {
  37   enum spell_type category;   /* Spelling category of the token type.  */
  38   const unsigned char *name;  /* OP(): the operator text; TK(): the stringified enumerator name.  */
  39 };
  40
  41 static const unsigned char *const digraph_spellings[] =   /* Source spellings of the six digraphs.  */
  42 { UC"%:", UC"%:%:", UC"<:", UC":>", UC"<%", UC"%>" };   /* NOTE(review): how this table is indexed is not visible here.  */
  43
  44 #define OP(e, s) { SPELL_OPERATOR, UC s  },   /* Operator entry: the spelling is the string S.  */
  45 #define TK(e, s) { SPELL_ ## s,    UC #e },   /* Other entry: category SPELL_<S>, name is the stringified E.  */
  46 static const struct token_spelling token_spellings[N_TTYPES] = { TTYPE_TABLE };   /* TTYPE_TABLE expanded with the OP/TK definitions above.  */
  47 #undef OP
  48 #undef TK
  49
  50 #define TOKEN_SPELL(token) (token_spellings[(token)->type].category)   /* Spelling category of TOKEN, looked up by its type.  */
  51 #define TOKEN_NAME(token) (token_spellings[(token)->type].name)   /* Table spelling of TOKEN, looked up by its type.  */
  52
  53 static void add_line_note (cpp_buffer *, const uchar *, unsigned int);   /* Args: buffer, byte position, note type.  */
  54 static int skip_line_comment (cpp_reader *);
  55 static void skip_whitespace (cpp_reader *, cppchar_t);
  56 static void lex_string (cpp_reader *, cpp_token *, const uchar *);
  57 static void save_comment (cpp_reader *, cpp_token *, const uchar *, cppchar_t);
  58 static void store_comment (cpp_reader *, cpp_token *);
  59 static void create_literal (cpp_reader *, cpp_token *, const uchar *,
  60                             unsigned int, enum cpp_ttype);
  61 static bool warn_in_comment (cpp_reader *, _cpp_line_note *);
  62 static int name_p (cpp_reader *, const cpp_string *);
  63 static tokenrun *next_tokenrun (tokenrun *);
  64
  65 static _cpp_buff *new_buff (size_t);   /* Being static, all of these helpers are private to this file.  */
  66
  67
  68 /* Utility routine:
  69
  70    Compares, the token TOKEN to the NUL-terminated string STRING.
  71    TOKEN must be a CPP_NAME.  Returns 1 for equal, 0 for unequal.  */
  72 int
  73 cpp_ideq (const cpp_token *token, const char *string)
  74 {
  75   if (token->type != CPP_NAME)
  76     return 0;
  77
  78   return !ustrcmp (NODE_NAME (token->val.node.node), (const uchar *) string);
  79 }
  80
  81 /* Record a note TYPE at byte POS into the current cleaned logical
  82    line.  */
  83 static void
  84 add_line_note (cpp_buffer *buffer, const uchar *pos, unsigned int type)
  85 {
  86   if (buffer->notes_used == buffer->notes_cap)
  87     {
  88       buffer->notes_cap = buffer->notes_cap * 2 + 200;
  89       buffer->notes = XRESIZEVEC (_cpp_line_note, buffer->notes,
  90                                   buffer->notes_cap);
  91     }
  92
  93   buffer->notes[buffer->notes_used].pos = pos;
  94   buffer->notes[buffer->notes_used].type = type;
  95   buffer->notes_used++;
  96 }
  97
  98 \f
  99 /* Fast path to find line special characters using optimized character
 100    scanning algorithms.  Anything complicated falls back to the slow
 101    path below.  Since this loop is very hot it's worth doing these kinds
 102    of optimizations.
 103
 104    One of the paths through the ifdefs should provide
 105
 106      const uchar *search_line_fast (const uchar *s, const uchar *end);
 107
 108    Between S and END, search for \n, \r, \\, ?.  Return a pointer to
 109    the found character.
 110
 111    Note that the last character of the buffer is *always* a newline,
 112    as forced by _cpp_convert_input.  This fact can be used to avoid
 113    explicitly looking for the end of the buffer.  */
 114
 115 /* Configure gives us an ifdef test; default it to 0 so it can also be used in a plain `if'.  */
 116 #ifndef WORDS_BIGENDIAN
 117 #define WORDS_BIGENDIAN 0
 118 #endif
 119
 120 /* We'd like the largest integer that fits into a register.  There's nothing
 121    in <stdint.h> that gives us that.  For most hosts this is unsigned long,
 122    but MS decided on an LLP64 model.  Thankfully when building with GCC we
 123    can get the "real" word size.  */
 124 #ifdef __GNUC__
 125 typedef unsigned int word_type __attribute__((__mode__(__word__)));
 126 #else
 127 typedef unsigned long word_type;   /* Fallback for compilers without the GCC mode attribute.  */
 128 #endif
 129
 130 /* The code below is only expecting sizes 4 or 8.
 131    Die at compile-time if this expectation is violated: the array below then gets size -1 instead of 1.  */
 132 typedef char check_word_type_size
 133   [(sizeof(word_type) == 8 || sizeof(word_type) == 4) * 2 - 1];
 134
 135 /* Return X with the first N bytes forced to values that won't match one
 136    of the interesting characters.  Note that NUL is not interesting.  */
 137
 138 static inline word_type
 139 acc_char_mask_misalign (word_type val, unsigned int n)
 140 {
 141   word_type mask = -1;
 142   if (WORDS_BIGENDIAN)
 143     mask >>= n * 8;
 144   else
 145     mask <<= n * 8;
 146   return val & mask;
 147 }
 148
 149 /* Return X replicated to all byte positions within WORD_TYPE.  */
 150
 151 static inline word_type
 152 acc_char_replicate (uchar x)
 153 {
 154   word_type ret;
 155
 156   ret = (x << 24) | (x << 16) | (x << 8) | x;
 157   if (sizeof(word_type) == 8)
 158     ret = (ret << 16 << 16) | ret;
 159   return ret;
 160 }
 161
 162 /* Return non-zero if some byte of VAL is (probably) C.  */
 163
 164 static inline word_type
 165 acc_char_cmp (word_type val, word_type c)
 166 {
 167 #if defined(__GNUC__) && defined(__alpha__)
 168   /* We can get exact results using a compare-bytes instruction.
 169      Get (val == c) via (0 >= (val ^ c)).  */
 170   return __builtin_alpha_cmpbge (0, val ^ c);
 171 #else
 172   word_type magic = 0x7efefefeU;
 173   if (sizeof(word_type) == 8)
 174     magic = (magic << 16 << 16) | 0xfefefefeU;
 175   magic |= 1;
 176
 177   val ^= c;
 178   return ((val + magic) ^ ~val) & ~magic;
 179 #endif
 180 }
 181
 182 /* Given the result of acc_char_cmp is non-zero, return the index of
 183    the found character.  If this was a false positive, return -1.  */
 184
 185 static inline int
 186 acc_char_index (word_type cmp ATTRIBUTE_UNUSED,
 187                 word_type val ATTRIBUTE_UNUSED)
 188 {
 189 #if defined(__GNUC__) && defined(__alpha__) && !WORDS_BIGENDIAN
 190   /* The cmpbge instruction sets *bits* of the result corresponding to
 191      matches in the bytes with no false positives.  */
 192   return __builtin_ctzl (cmp);
 193 #else
 194   unsigned int i;
 195
 196   /* ??? It would be nice to force unrolling here,
 197      and have all of these constants folded.  */
 198   for (i = 0; i < sizeof(word_type); ++i)
 199     {
 200       uchar c;
 201       if (WORDS_BIGENDIAN)
 202         c = (val >> (sizeof(word_type) - i - 1) * 8) & 0xff;
 203       else
 204         c = (val >> i * 8) & 0xff;
 205
 206       if (c == '\n' || c == '\r' || c == '\\' || c == '?')
 207         return i;
 208     }
 209
 210   return -1;
 211 #endif
 212 }
 213
 214 /* A version of the fast scanner using bit fiddling techniques.
 215
 216    For 32-bit words, one would normally perform 16 comparisons and
 217    16 branches.  With this algorithm one performs 24 arithmetic
 218    operations and one branch.  Whether this is faster with a 32-bit
 219    word size is going to be somewhat system dependent.
 220
 221    For 64-bit words, we eliminate twice the number of comparisons
 222    and branches without increasing the number of arithmetic operations.
 223    It's almost certainly going to be a win with 64-bit word size.  */
 224
 225 static const uchar * search_line_acc_char (const uchar *, const uchar *)
 226   ATTRIBUTE_UNUSED;
 227
 228 static const uchar *
 229 search_line_acc_char (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 230 {
 231   const word_type repl_nl = acc_char_replicate ('\n');
 232   const word_type repl_cr = acc_char_replicate ('\r');
 233   const word_type repl_bs = acc_char_replicate ('\\');
 234   const word_type repl_qm = acc_char_replicate ('?');
 235
 236   unsigned int misalign;
 237   const word_type *p;
 238   word_type val, t;
 239
 240   /* Align the buffer.  Mask out any bytes from before the beginning.  */
 241   p = (word_type *)((uintptr_t)s & -sizeof(word_type));
 242   val = *p;
 243   misalign = (uintptr_t)s & (sizeof(word_type) - 1);
 244   if (misalign)
 245     val = acc_char_mask_misalign (val, misalign);
 246
 247   /* Main loop.  */
 248   while (1)
 249     {
 250       t  = acc_char_cmp (val, repl_nl);
 251       t |= acc_char_cmp (val, repl_cr);
 252       t |= acc_char_cmp (val, repl_bs);
 253       t |= acc_char_cmp (val, repl_qm);
 254
 255       if (__builtin_expect (t != 0, 0))
 256         {
 257           int i = acc_char_index (t, val);
 258           if (i >= 0)
 259             return (const uchar *)p + i;
 260         }
 261
 262       val = *++p;
 263     }
 264 }
 265
 266 /* Disable on Solaris 2/x86 until the following problem can be properly
 267    autoconfed:
 268
 269    The Solaris 10+ assembler tags objects with the instruction set
 270    extensions used, so SSE4.2 executables cannot run on machines that
 271    don't support that extension.  */
 272
 273 #if (GCC_VERSION >= 4005) && (__GNUC__ >= 5 || !defined(__PIC__)) && (defined(__i386__) || defined(__x86_64__)) && !(defined(__sun__) && defined(__svr4__))
 274
 275 /* Replicated character data to be shared between implementations.
 276    Recall that outside of a context with vector support we can't
 277    define compatible vector types, therefore these are all defined
 278    in terms of raw characters.  */
 279 static const char repl_chars[4][16] __attribute__((aligned(16))) = {   /* Rows: \n, \r, \\, ?; each repeated 16 times.  */
 280   { '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
 281     '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n' },
 282   { '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
 283     '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r' },
 284   { '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
 285     '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\' },
 286   { '?', '?', '?', '?', '?', '?', '?', '?',
 287     '?', '?', '?', '?', '?', '?', '?', '?' },
 288 };
 289
 290 /* A version of the fast scanner using MMX vectorized byte compare insns.
 291
 292    This uses the PMOVMSKB instruction which was introduced with "MMX2",
 293    which was packaged into SSE1; it is also present in the AMD MMX
 294    extension.  Mark the function as using "sse" so that we emit a real
 295    "emms" instruction, rather than the 3dNOW "femms" instruction.  */
 296
 297 static const uchar *
 298 #ifndef __SSE__
 299 __attribute__((__target__("sse")))
 300 #endif
 301 search_line_mmx (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 302 {
 303   typedef char v8qi __attribute__ ((__vector_size__ (8)));
 304   typedef int __m64 __attribute__ ((__vector_size__ (8), __may_alias__));
 305
 306   const v8qi repl_nl = *(const v8qi *)repl_chars[0];
 307   const v8qi repl_cr = *(const v8qi *)repl_chars[1];
 308   const v8qi repl_bs = *(const v8qi *)repl_chars[2];
 309   const v8qi repl_qm = *(const v8qi *)repl_chars[3];
 310
 311   unsigned int misalign, found, mask;
 312   const v8qi *p;
 313   v8qi data, t, c;
 314
 315   /* Align the source pointer.  While MMX doesn't generate unaligned data
 316      faults, this allows us to safely scan to the end of the buffer without
 317      reading beyond the end of the last page.  */
 318   misalign = (uintptr_t)s & 7;
 319   p = (const v8qi *)((uintptr_t)s & -8);
 320   data = *p;
 321
 322   /* Create a mask for the bytes that are valid within the first
 323      16-byte block.  The Idea here is that the AND with the mask
 324      within the loop is "free", since we need some AND or TEST
 325      insn in order to set the flags for the branch anyway.  */
 326   mask = -1u << misalign;
 327
 328   /* Main loop processing 8 bytes at a time.  */
 329   goto start;
 330   do
 331     {
 332       data = *++p;
 333       mask = -1;
 334
 335     start:
 336       t = __builtin_ia32_pcmpeqb(data, repl_nl);
 337       c = __builtin_ia32_pcmpeqb(data, repl_cr);
 338       t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
 339       c = __builtin_ia32_pcmpeqb(data, repl_bs);
 340       t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
 341       c = __builtin_ia32_pcmpeqb(data, repl_qm);
 342       t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
 343       found = __builtin_ia32_pmovmskb (t);
 344       found &= mask;
 345     }
 346   while (!found);
 347
 348   __builtin_ia32_emms ();
 349
 350   /* FOUND contains 1 in bits for which we matched a relevant
 351      character.  Conversion to the byte index is trivial.  */
 352   found = __builtin_ctz(found);
 353   return (const uchar *)p + found;
 354 }
 355
 356 /* A version of the fast scanner using SSE2 vectorized byte compare insns.  */
 357
 358 static const uchar *
 359 #ifndef __SSE2__
 360 __attribute__((__target__("sse2")))
 361 #endif
 362 search_line_sse2 (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 363 {
 364   typedef char v16qi __attribute__ ((__vector_size__ (16)));
 365
 366   const v16qi repl_nl = *(const v16qi *)repl_chars[0];
 367   const v16qi repl_cr = *(const v16qi *)repl_chars[1];
 368   const v16qi repl_bs = *(const v16qi *)repl_chars[2];
 369   const v16qi repl_qm = *(const v16qi *)repl_chars[3];
 370
 371   unsigned int misalign, found, mask;
 372   const v16qi *p;
 373   v16qi data, t;
 374
 375   /* Align the source pointer.  */
 376   misalign = (uintptr_t)s & 15;
 377   p = (const v16qi *)((uintptr_t)s & -16);
 378   data = *p;
 379
 380   /* Create a mask for the bytes that are valid within the first
 381      16-byte block.  The Idea here is that the AND with the mask
 382      within the loop is "free", since we need some AND or TEST
 383      insn in order to set the flags for the branch anyway.  */
 384   mask = -1u << misalign;
 385
 386   /* Main loop processing 16 bytes at a time.  */
 387   goto start;
 388   do
 389     {
 390       data = *++p;
 391       mask = -1;
 392
 393     start:
 394       t  = data == repl_nl;
 395       t |= data == repl_cr;
 396       t |= data == repl_bs;
 397       t |= data == repl_qm;
 398       found = __builtin_ia32_pmovmskb128 (t);
 399       found &= mask;
 400     }
 401   while (!found);
 402
 403   /* FOUND contains 1 in bits for which we matched a relevant
 404      character.  Conversion to the byte index is trivial.  */
 405   found = __builtin_ctz(found);
 406   return (const uchar *)p + found;
 407 }
 408
 409 #ifdef HAVE_SSE4
 410 /* A version of the fast scanner using SSE 4.2 vectorized string insns.  Returns a pointer to the first \n, \r, ? or \\ at or after S.  */
 411
 412 static const uchar *
 413 #ifndef __SSE4_2__
 414 __attribute__((__target__("sse4.2")))
 415 #endif
 416 search_line_sse42 (const uchar *s, const uchar *end)
 417 {
 418   typedef char v16qi __attribute__ ((__vector_size__ (16)));
 419   static const v16qi search = { '\n', '\r', '?', '\\' };   /* Only the first 4 bytes are used; see the length 4 passed below.  */
 420
 421   uintptr_t si = (uintptr_t)s;
 422   uintptr_t index;
 423
 424   /* Check for unaligned input.  */
 425   if (si & 15)
 426     {
 427       v16qi sv;
 428
 429       if (__builtin_expect (end - s < 16, 0)
 430           && __builtin_expect ((si & 0xfff) > 0xff0, 0))
 431         {
 432           /* There are fewer than 16 bytes left in the buffer, and fewer
 433              than 16 bytes left on the page.  Reading 16 bytes at this
 434              point might generate a spurious page fault.  Defer to the
 435              SSE2 implementation, which already handles alignment.  */
 436           return search_line_sse2 (s, end);
 437         }
 438
 439       /* ??? The builtin doesn't understand that the PCMPESTRI read from
 440          memory need not be aligned.  */
 441       sv = __builtin_ia32_loaddqu ((const char *) s);
 442       index = __builtin_ia32_pcmpestri128 (search, 4, sv, 16, 0);
 443
 444       if (__builtin_expect (index < 16, 0))   /* An index below 16 is treated as a match in this block.  */
 445         goto found;
 446
 447       /* Advance the pointer to an aligned address.  We will re-scan a
 448          few bytes, but we no longer need to care about reading past the
 449          end of a page, since we're guaranteed a match.  */
 450       s = (const uchar *)((si + 15) & -16);
 451     }
 452
 453   /* Main loop, processing 16 bytes at a time.  */
 454 #ifdef __GCC_ASM_FLAG_OUTPUTS__
 455   while (1)
 456     {
 457       char f;
 458
 459       /* By using inline assembly instead of the builtin,
 460          we can use the result, as well as the flags set.  */
 461       __asm ("%vpcmpestri\t$0, %2, %3"
 462              : "=c"(index), "=@ccc"(f)
 463              : "m"(*s), "x"(search), "a"(4), "d"(16));
 464       if (f)   /* F receives the carry flag; a set flag ends the scan.  */
 465         break;
 466
 467       s += 16;
 468     }
 469 #else
 470   s -= 16;   /* Pre-bias S: the asm loop adds 16 before every compare.  */
 471   /* By doing the whole loop in inline assembly,
 472      we can make proper use of the flags set.  */
 473   __asm (      ".balign 16\n"
 474         "0:     add $16, %1\n"
 475         "       %vpcmpestri\t$0, (%1), %2\n"
 476         "       jnc 0b"
 477         : "=&c"(index), "+r"(s)
 478         : "x"(search), "a"(4), "d"(16));
 479 #endif
 480
 481  found:
 482   return s + index;   /* INDEX is the offset of the match within the block at S.  */
 483 }
 484
 485 #else
 486 /* Work around out-dated assemblers without sse4 support: fall back to the SSE2 scanner under the SSE4.2 name.  */
 487 #define search_line_sse42 search_line_sse2
 488 #endif
 489
 490 /* Check the CPU capabilities.  */
 491
 492 #include "../gcc/config/i386/cpuid.h"
 493
 494 typedef const uchar * (*search_line_fast_type) (const uchar *, const uchar *);
 495 static search_line_fast_type search_line_fast;
 496
 497 #define HAVE_init_vectorized_lexer 1
 498 static inline void
 499 init_vectorized_lexer (void)
 500 {
 501   unsigned dummy, ecx = 0, edx = 0;
 502   search_line_fast_type impl = search_line_acc_char;
 503   int minimum = 0;
 504
 505 #if defined(__SSE4_2__)
 506   minimum = 3;
 507 #elif defined(__SSE2__)
 508   minimum = 2;
 509 #elif defined(__SSE__)
 510   minimum = 1;
 511 #endif
 512
 513   if (minimum == 3)
 514     impl = search_line_sse42;
 515   else if (__get_cpuid (1, &dummy, &dummy, &ecx, &edx) || minimum == 2)
 516     {
 517       if (minimum == 3 || (ecx & bit_SSE4_2))
 518         impl = search_line_sse42;
 519       else if (minimum == 2 || (edx & bit_SSE2))
 520         impl = search_line_sse2;
 521       else if (minimum == 1 || (edx & bit_SSE))
 522         impl = search_line_mmx;
 523     }
 524   else if (__get_cpuid (0x80000001, &dummy, &dummy, &dummy, &edx))
 525     {
 526       if (minimum == 1
 527           || (edx & (bit_MMXEXT | bit_CMOV)) == (bit_MMXEXT | bit_CMOV))
 528         impl = search_line_mmx;
 529     }
 530
 531   search_line_fast = impl;
 532 }
 533
 534 #elif (GCC_VERSION >= 4005) && defined(_ARCH_PWR8) && defined(__ALTIVEC__)
 535
 536 /* A version of the fast scanner using AltiVec vectorized byte compares
 537    and VSX unaligned loads (when VSX is available).  This is otherwise
 538    the same as the AltiVec version.  */
 539
 540 ATTRIBUTE_NO_SANITIZE_UNDEFINED
 541 static const uchar *
 542 search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 543 {
 544   typedef __attribute__((altivec(vector))) unsigned char vc;
 545
 546   const vc repl_nl = {
 547     '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
 548     '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n'
 549   };
 550   const vc repl_cr = {
 551     '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
 552     '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r'
 553   };
 554   const vc repl_bs = {
 555     '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
 556     '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\'
 557   };
 558   const vc repl_qm = {
 559     '?', '?', '?', '?', '?', '?', '?', '?',
 560     '?', '?', '?', '?', '?', '?', '?', '?',
 561   };
 562   const vc zero = { 0 };
 563
 564   vc data, t;
 565
 566   /* Main loop processing 16 bytes at a time.  */
 567   do
 568     {
 569       vc m_nl, m_cr, m_bs, m_qm;
 570
 571       data = __builtin_vec_vsx_ld (0, s);   /* Load 16 bytes straight from S, whatever its alignment.  */
 572       s += 16;
 573
 574       m_nl = (vc) __builtin_vec_cmpeq(data, repl_nl);
 575       m_cr = (vc) __builtin_vec_cmpeq(data, repl_cr);
 576       m_bs = (vc) __builtin_vec_cmpeq(data, repl_bs);
 577       m_qm = (vc) __builtin_vec_cmpeq(data, repl_qm);
 578       t = (m_nl | m_cr) | (m_bs | m_qm);
 579
 580       /* T now contains 0xff in bytes for which we matched one of the relevant
 581          characters.  We want to exit the loop if any byte in T is non-zero.
 582          Below is the expansion of vec_any_ne(t, zero).  */
 583     }
 584   while (!__builtin_vec_vcmpeq_p(/*__CR6_LT_REV*/3, t, zero));
 585
 586   /* Restore s to point to the 16 bytes we just processed.  */
 587   s -= 16;
 588
 589   {
 590 #define N  (sizeof(vc) / sizeof(long))
 591
 592     union {
 593       vc v;
 594       /* Statically assert that N is 2 or 4.  */
 595       unsigned long l[(N == 2 || N == 4) ? N : -1];
 596     } u;
 597     unsigned long l, i = 0;
 598
 599     u.v = t;
 600
 601     /* Find the first word of T that is non-zero.  S is advanced past every all-zero word skipped on the way.  */
 602     switch (N)
 603       {
 604       case 4:
 605         l = u.l[i++];
 606         if (l != 0)
 607           break;
 608         s += sizeof(unsigned long);
 609         l = u.l[i++];
 610         if (l != 0)
 611           break;
 612         s += sizeof(unsigned long);
 613         /* FALLTHRU */
 614       case 2:
 615         l = u.l[i++];
 616         if (l != 0)
 617           break;
 618         s += sizeof(unsigned long);
 619         l = u.l[i];   /* Last word: it must hold the match, since the loop above only exits when T is non-zero.  */
 620       }
 621
 622     /* L now contains 0xff in bytes for which we matched one of the
 623        relevant characters.  We can find the byte index by finding
 624        its bit index and dividing by 8.  */
 625 #ifdef __BIG_ENDIAN__
 626     l = __builtin_clzl(l) >> 3;
 627 #else
 628     l = __builtin_ctzl(l) >> 3;
 629 #endif
 630     return s + l;
 631
 632 #undef N
 633   }
 634 }
 635
 636 #elif (GCC_VERSION >= 4005) && defined(__ALTIVEC__) && defined (__BIG_ENDIAN__)
 637
 638 /* A version of the fast scanner using AltiVec vectorized byte compares.
 639    This cannot be used for little endian because vec_lvsl/lvsr are
 640    deprecated for little endian and the code won't work properly.  */
 641 /* ??? Unfortunately, attribute(target("altivec")) is not yet supported,
 642    so we can't compile this function without -maltivec on the command line
 643    (or implied by some other switch).  */
 644
 645 static const uchar *
 646 search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 647 {
 648   typedef __attribute__((altivec(vector))) unsigned char vc;
 649
 650   const vc repl_nl = {
 651     '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
 652     '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n'
 653   };
 654   const vc repl_cr = {
 655     '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
 656     '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r'
 657   };
 658   const vc repl_bs = {
 659     '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
 660     '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\'
 661   };
 662   const vc repl_qm = {
 663     '?', '?', '?', '?', '?', '?', '?', '?',
 664     '?', '?', '?', '?', '?', '?', '?', '?',
 665   };
 666   const vc ones = {
 667     -1, -1, -1, -1, -1, -1, -1, -1,
 668     -1, -1, -1, -1, -1, -1, -1, -1,
 669   };
 670   const vc zero = { 0 };
 671
 672   vc data, mask, t;
 673
 674   /* Altivec loads automatically mask addresses with -16.  This lets us
 675      issue the first load as early as possible.  */
 676   data = __builtin_vec_ld(0, (const vc *)s);
 677
 678   /* Discard bytes before the beginning of the buffer.  Do this by
 679      beginning with all ones and shifting in zeros according to the
 680      mis-alignment.  The LVSR instruction pulls the exact shift we
 681      want from the address.  */
 682   mask = __builtin_vec_lvsr(0, s);
 683   mask = __builtin_vec_perm(zero, ones, mask);
 684   data &= mask;
 685
 686   /* While altivec loads mask addresses, we still need to align S so
 687      that the offset we compute at the end is correct.  */
 688   s = (const uchar *)((uintptr_t)s & -16);
 689
 690   /* Main loop processing 16 bytes at a time.  The first pass enters at START, because DATA is already loaded and masked.  */
 691   goto start;
 692   do
 693     {
 694       vc m_nl, m_cr, m_bs, m_qm;
 695
 696       s += 16;
 697       data = __builtin_vec_ld(0, (const vc *)s);
 698
 699     start:
 700       m_nl = (vc) __builtin_vec_cmpeq(data, repl_nl);
 701       m_cr = (vc) __builtin_vec_cmpeq(data, repl_cr);
 702       m_bs = (vc) __builtin_vec_cmpeq(data, repl_bs);
 703       m_qm = (vc) __builtin_vec_cmpeq(data, repl_qm);
 704       t = (m_nl | m_cr) | (m_bs | m_qm);
 705
 706       /* T now contains 0xff in bytes for which we matched one of the relevant
 707          characters.  We want to exit the loop if any byte in T is non-zero.
 708          Below is the expansion of vec_any_ne(t, zero).  */
 709     }
 710   while (!__builtin_vec_vcmpeq_p(/*__CR6_LT_REV*/3, t, zero));
 711
 712   {
 713 #define N  (sizeof(vc) / sizeof(long))
 714
 715     union {
 716       vc v;
 717       /* Statically assert that N is 2 or 4.  */
 718       unsigned long l[(N == 2 || N == 4) ? N : -1];
 719     } u;
 720     unsigned long l, i = 0;
 721
 722     u.v = t;
 723
 724     /* Find the first word of T that is non-zero.  S is advanced past every all-zero word skipped on the way.  */
 725     switch (N)
 726       {
 727       case 4:
 728         l = u.l[i++];
 729         if (l != 0)
 730           break;
 731         s += sizeof(unsigned long);
 732         l = u.l[i++];
 733         if (l != 0)
 734           break;
 735         s += sizeof(unsigned long);
 736         /* FALLTHROUGH */
 737       case 2:
 738         l = u.l[i++];
 739         if (l != 0)
 740           break;
 741         s += sizeof(unsigned long);
 742         l = u.l[i];   /* Last word: it must hold the match, since the loop above only exits when T is non-zero.  */
 743       }
 744
 745     /* L now contains 0xff in bytes for which we matched one of the
 746        relevant characters.  We can find the byte index by finding
 747        its bit index and dividing by 8.  This branch is big-endian only, hence the leading-zero count.  */
 748     l = __builtin_clzl(l) >> 3;
 749     return s + l;
 750
 751 #undef N
 752   }
 753 }
 754
 755 #elif defined (__ARM_NEON) && defined (__ARM_64BIT_STATE)
 756 #include "arm_neon.h"
 757
 758 /* This doesn't have to be the exact page size, but no system may use
 759    a size smaller than this.  ARMv8 requires a minimum page size of
 760    4k.  The impact of being conservative here is a small number of
 761    cases will take the slightly slower entry path into the main
 762    loop.  */
 763
 764 #define AARCH64_MIN_PAGE_SIZE 4096
 765
 766 static const uchar *
 767 search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 768 {
 769   const uint8x16_t repl_nl = vdupq_n_u8 ('\n');
 770   const uint8x16_t repl_cr = vdupq_n_u8 ('\r');
 771   const uint8x16_t repl_bs = vdupq_n_u8 ('\\');
 772   const uint8x16_t repl_qm = vdupq_n_u8 ('?');
 773   const uint8x16_t xmask = (uint8x16_t) vdupq_n_u64 (0x8040201008040201ULL);   /* A distinct bit for each byte of an 8-byte half.  */
 774
 775 #ifdef __ARM_BIG_ENDIAN
 776   const int16x8_t shift = {8, 8, 8, 8, 0, 0, 0, 0};
 777 #else
 778   const int16x8_t shift = {0, 0, 0, 0, 8, 8, 8, 8};
 779 #endif
 780
 781   unsigned int found;
 782   const uint8_t *p;
 783   uint8x16_t data;
 784   uint8x16_t t;
 785   uint16x8_t m;
 786   uint8x16_t u, v, w;
 787
 788   /* Align the source pointer.  */
 789   p = (const uint8_t *)((uintptr_t)s & -16);
 790
 791   /* Assuming random string start positions, with a 4k page size we'll take
 792      the slow path about 0.37% of the time.  */
 793   if (__builtin_expect ((AARCH64_MIN_PAGE_SIZE
 794                          - (((uintptr_t) s) & (AARCH64_MIN_PAGE_SIZE - 1)))
 795                         < 16, 0))
 796     {
 797       /* Slow path: the string starts near a possible page boundary.  Load from the aligned address P and ignore the bytes before S.  */
 798       uint32_t misalign, mask;
 799
 800       misalign = (uintptr_t)s & 15;
 801       mask = (-1u << misalign) & 0xffff;
 802       data = vld1q_u8 (p);
 803       t = vceqq_u8 (data, repl_nl);
 804       u = vceqq_u8 (data, repl_cr);
 805       v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
 806       w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
 807       t = vorrq_u8 (v, w);
 808       t = vandq_u8 (t, xmask);
 809       m = vpaddlq_u8 (t);
 810       m = vshlq_u16 (m, shift);
 811       found = vaddvq_u16 (m);
 812       found &= mask;
 813       if (found)
 814         return (const uchar*)p + __builtin_ctz (found);
 815     }
 816   else
 817     {
 818       data = vld1q_u8 ((const uint8_t *) s);   /* Fast path: unaligned load directly from S.  */
 819       t = vceqq_u8 (data, repl_nl);
 820       u = vceqq_u8 (data, repl_cr);
 821       v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
 822       w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
 823       t = vorrq_u8 (v, w);
 824       if (__builtin_expect (vpaddd_u64 ((uint64x2_t)t) != 0, 0))
 825         goto done;
 826     }
 827
 828   do
 829     {
 830       p += 16;   /* Continue from the aligned pointer; after the fast path this re-scans a few bytes.  */
 831       data = vld1q_u8 (p);
 832       t = vceqq_u8 (data, repl_nl);
 833       u = vceqq_u8 (data, repl_cr);
 834       v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
 835       w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
 836       t = vorrq_u8 (v, w);
 837     } while (!vpaddd_u64 ((uint64x2_t)t));
 838
 839 done:
 840   /* Now that we've found the terminating substring, work out precisely where
 841      we need to stop.  */
 842   t = vandq_u8 (t, xmask);
 843   m = vpaddlq_u8 (t);
 844   m = vshlq_u16 (m, shift);
 845   found = vaddvq_u16 (m);
 846   return (((((uintptr_t) p) < (uintptr_t) s) ? s : (const uchar *)p)
 847           + __builtin_ctz (found));   /* P is below S only when the match came from the unaligned first load at S.  */
 848 }
 849
 850 #elif defined (__ARM_NEON)
 851 #include "arm_neon.h"
 852
/* ARM NEON implementation of search_line_fast.  Scan forward from S,
   16 bytes at a time, and return a pointer to the first '\n', '\r',
   '\\' or '?' found at or after S.  END is not consulted (it is marked
   ATTRIBUTE_UNUSED) and the loop has no other bound, so it only stops
   once one of those four characters has been seen.
   NOTE(review): this presumably relies on the caller guaranteeing that
   such a character exists in the buffer, and on it being safe to load
   the whole aligned 16-byte block that contains S -- confirm against
   the code that sets up the buffer.  */
static const uchar *
search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
{
  /* One vector per character of interest, replicated into all 16 lanes.  */
  const uint8x16_t repl_nl = vdupq_n_u8 ('\n');
  const uint8x16_t repl_cr = vdupq_n_u8 ('\r');
  const uint8x16_t repl_bs = vdupq_n_u8 ('\\');
  const uint8x16_t repl_qm = vdupq_n_u8 ('?');
  /* Gives byte lane I the single bit 1 << (I % 8), so that a compare
     result ANDed with it keeps one distinct bit per lane within each
     8-byte half.  */
  const uint8x16_t xmask = (uint8x16_t) vdupq_n_u64 (0x8040201008040201ULL);

  unsigned int misalign, found, mask;
  const uint8_t *p;
  uint8x16_t data;

  /* Align the source pointer.  The first load therefore starts at or
     before S; MISALIGN is the number of leading bytes that precede S.  */
  misalign = (uintptr_t)s & 15;
  p = (const uint8_t *)((uintptr_t)s & -16);
  data = vld1q_u8 (p);

  /* Create a mask for the bytes that are valid within the first
     16-byte block.  The idea here is that the AND with the mask
     within the loop is "free", since we need some AND or TEST
     insn in order to set the flags for the branch anyway.  */
  mask = (-1u << misalign) & 0xffff;

  /* Main loop, processing 16 bytes at a time.  The first block has
     already been loaded, so jump past the load at the top of the loop
     body.  */
  goto start;

  do
    {
      uint8x8_t l;
      uint16x4_t m;
      uint32x2_t n;
      uint8x16_t t, u, v, w;

      /* Every block after the first is wholly valid.  */
      p += 16;
      data = vld1q_u8 (p);
      mask = 0xffff;

    start:
      /* Compare against each of the four characters and OR the results
         together; a matching lane becomes all-ones.  */
      t = vceqq_u8 (data, repl_nl);
      u = vceqq_u8 (data, repl_cr);
      v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
      w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
      /* Reduce each matching lane to its private bit, then fold the
         lanes with pairwise adds.  Since no two lanes in the same
         8-byte half share a bit, the additions behave like ORs: after
         the three steps N holds the bitmask of the low eight bytes in
         lane 0 and that of the high eight bytes in lane 1.  */
      t = vandq_u8 (vorrq_u8 (v, w), xmask);
      l = vpadd_u8 (vget_low_u8 (t), vget_high_u8 (t));
      m = vpaddl_u8 (l);
      n = vpaddl_u16 (m);

      /* Viewing N as one 64-bit value and ORing it with itself shifted
         right by 24 moves the high-half mask to bits 8..15 of lane 0,
         giving a 16-bit mask with bit I set when byte I matched.  */
      found = vget_lane_u32 ((uint32x2_t) vorr_u64 ((uint64x1_t) n,
              vshr_n_u64 ((uint64x1_t) n, 24)), 0);
      /* Discard matches in bytes that precede S in the first block.  */
      found &= mask;
    }
  while (!found);

  /* FOUND contains 1 in bits for which we matched a relevant
     character.  Conversion to the byte index is trivial.  */
  found = __builtin_ctz (found);
  return (const uchar *)p + found;
}
 912
 913 #else
 914
 915 /* We only have one accelerated alternative.  Use a direct call so that
 916    we encourage inlining.  */
 917
 918 #define search_line_fast  search_line_acc_char
 919
 920 #endif
 921
/* Initialize the lexer if needed.  The only work done here is a call
   to init_vectorized_lexer, and only when HAVE_init_vectorized_lexer
   is defined; otherwise this function does nothing.  */

void
_cpp_init_lexer (void)
{
#ifdef HAVE_init_vectorized_lexer
  init_vectorized_lexer ();
#endif
}
 931
/* Returns with a logical line that contains no escaped newlines or
   trigraphs.  This is a time-critical inner loop.

   The line starts at BUFFER->next_line.  On entry the note list is
   reset and BUFFER->cur and BUFFER->line_base are pointed at the start
   of the line.  On exit the cleaned line is terminated in place with
   '\n', a sentinel note is added just past that terminator, and
   BUFFER->next_line points just past the last character consumed.

   Unless BUFFER->from_stage3 is set, backslash-newline sequences are
   spliced out (a '\\' note, or a ' ' note when whitespace separated
   the backslash from the newline, is recorded for each) and trigraphs
   get a note and, when the trigraphs option is on, are replaced in
   place.  With from_stage3 set only the line ending is located.  */
void
_cpp_clean_line (cpp_reader *pfile)
{
  cpp_buffer *buffer;
  /* S is the read cursor; D is the write cursor, which trails S once
     anything has been spliced out; P is scratch for backward scans.  */
  const uchar *s;
  uchar c, *d, *p;

  buffer = pfile->buffer;
  buffer->cur_note = buffer->notes_used = 0;
  buffer->cur = buffer->line_base = buffer->next_line;
  buffer->need_line = false;
  s = buffer->next_line;

  if (!buffer->from_stage3)
    {
      /* Most recent backslash seen on the fast path, if any.  */
      const uchar *pbackslash = NULL;

      /* Fast path.  This is the common case of an un-escaped line with
         no trigraphs.  The primary win here is by not writing any
         data back to memory until we have to.  */
      while (1)
        {
          /* Perform an optimized search for \n, \r, \\, ?.  */
          s = search_line_fast (s, buffer->rlimit);

          c = *s;
          if (c == '\\')
            {
              /* Record the location of the backslash and continue.  */
              pbackslash = s++;
            }
          else if (__builtin_expect (c == '?', 0))
            {
              if (__builtin_expect (s[1] == '?', false)
                   && _cpp_trigraph_map[s[2]])
                {
                  /* Have a trigraph.  We may or may not have to convert
                     it.  Add a line note regardless, for -Wtrigraphs.  */
                  add_line_note (buffer, s, s[2]);
                  if (CPP_OPTION (pfile, trigraphs))
                    {
                      /* We do, and that means we have to switch to the
                         slow path.  The replacement character is stored
                         over the first '?'; S is left on the last
                         character of the trigraph so that the slow
                         path's pre-increment resumes right after it.  */
                      d = (uchar *) s;
                      *d = _cpp_trigraph_map[s[2]];
                      s += 2;
                      goto slow_path;
                    }
                }
              /* Not a trigraph.  Continue on fast-path.  */
              s++;
            }
          else
            break;
        }

      /* This must be \r or \n.  We're either done, or we'll be forced
         to write back to the buffer and continue on the slow path.  */
      d = (uchar *) s;

      if (__builtin_expect (s == buffer->rlimit, false))
        goto done;

      /* DOS line ending? */
      if (__builtin_expect (c == '\r', false) && s[1] == '\n')
        {
          s++;
          if (s == buffer->rlimit)
            goto done;
        }

      /* No backslash anywhere on the line, so it cannot end in an
         escaped newline.  */
      if (__builtin_expect (pbackslash == NULL, true))
        goto done;

      /* Check for escaped newline: walk back from the line ending over
         non-vertical whitespace and see whether the character before
         it is the last backslash recorded above.  */
      p = d;
      while (is_nvspace (p[-1]))
        p--;
      if (p - 1 != pbackslash)
        goto done;

      /* Have an escaped newline; process it and proceed to
         the slow path.  The note type is ' ' when whitespace separated
         the backslash from the newline.  D is set one before the
         backslash so that the slow path's first write lands on the
         backslash itself, and next_line temporarily records the splice
         point, which the slow path uses as the lower bound for its own
         backward scans.  */
      add_line_note (buffer, p - 1, p != d ? ' ' : '\\');
      d = p - 2;
      buffer->next_line = p - 1;

    slow_path:
      /* Copy characters down from S to D one at a time, handling any
         further escaped newlines and trigraphs on the way.  */
      while (1)
        {
          c = *++s;
          *++d = c;

          if (c == '\n' || c == '\r')
            {
              /* Handle DOS line endings.  */
              if (c == '\r' && s != buffer->rlimit && s[1] == '\n')
                s++;
              if (s == buffer->rlimit)
                break;

              /* Escaped?  Scan back over trailing whitespace, without
                 going past the previous splice point.  */
              p = d;
              while (p != buffer->next_line && is_nvspace (p[-1]))
                p--;
              if (p == buffer->next_line || p[-1] != '\\')
                break;

              add_line_note (buffer, p - 1, p != d ? ' ': '\\');
              d = p - 2;
              buffer->next_line = p - 1;
            }
          else if (c == '?' && s[1] == '?' && _cpp_trigraph_map[s[2]])
            {
              /* Add a note regardless, for the benefit of -Wtrigraphs.  */
              add_line_note (buffer, d, s[2]);
              if (CPP_OPTION (pfile, trigraphs))
                {
                  /* Overwrite the '?' just copied with the replacement
                     and skip the rest of the trigraph.  */
                  *d = _cpp_trigraph_map[s[2]];
                  s += 2;
                }
            }
        }
    }
  else
    {
      /* No splicing or trigraph handling here; just find the end of
         the line.  */
      while (*s != '\n' && *s != '\r')
        s++;
      d = (uchar *) s;

      /* Handle DOS line endings.  */
      if (*s == '\r' && s + 1 != buffer->rlimit && s[1] == '\n')
        s++;
    }

 done:
  /* Terminate the cleaned line; D may be well before S if anything was
     spliced out.  */
  *d = '\n';
  /* A sentinel note that should never be processed.  */
  add_line_note (buffer, d + 1, '\n');
  buffer->next_line = s + 1;
}
1075
1076 /* Return true if the trigraph indicated by NOTE should be warned
1077    about in a comment.  */
1078 static bool
1079 warn_in_comment (cpp_reader *pfile, _cpp_line_note *note)
1080 {
1081   const uchar *p;
1082
1083   /* Within comments we don't warn about trigraphs, unless the
1084      trigraph forms an escaped newline, as that may change
1085      behavior.  */
1086   if (note->type != '/')
1087     return false;
1088
1089   /* If -trigraphs, then this was an escaped newline iff the next note
1090      is coincident.  */
1091   if (CPP_OPTION (pfile, trigraphs))
1092     return note[1].pos == note->pos;
1093
1094   /* Otherwise, see if this forms an escaped newline.  */
1095   p = note->pos + 3;
1096   while (is_nvspace (*p))
1097     p++;
1098
1099   /* There might have been escaped newlines between the trigraph and the
1100      newline we found.  Hence the position test.  */
1101   return (*p == '\n' && p < note[1].pos);
1102 }
1103
1104 /* Process the notes created by add_line_note as far as the current
1105    location.  */
1106 void
1107 _cpp_process_line_notes (cpp_reader *pfile, int in_comment)
1108 {
1109   cpp_buffer *buffer = pfile->buffer;
1110
1111   for (;;)
1112     {
1113       _cpp_line_note *note = &buffer->notes[buffer->cur_note];
1114       unsigned int col;
1115
1116       if (note->pos > buffer->cur)
1117         break;
1118
1119       buffer->cur_note++;
1120       col = CPP_BUF_COLUMN (buffer, note->pos + 1);
1121
1122       if (note->type == '\\' || note->type == ' ')
1123         {
1124           if (note->type == ' ' && !in_comment)
1125             cpp_error_with_line (pfile, CPP_DL_WARNING, pfile->line_table->highest_line, col,
1126                                  "backslash and newline separated by space");
1127
1128           if (buffer->next_line > buffer->rlimit)
1129             {
1130               cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line, col,
1131                                    "backslash-newline at end of file");
1132               /* Prevent "no newline at end of file" warning.  */
1133               buffer->next_line = buffer->rlimit;
1134             }
1135
1136           buffer->line_base = note->pos;
1137           CPP_INCREMENT_LINE (pfile, 0);
1138         }
1139       else if (_cpp_trigraph_map[note->type])
1140         {
1141           if (CPP_OPTION (pfile, warn_trigraphs)
1142               && (!in_comment || warn_in_comment (pfile, note)))
1143             {
1144               if (CPP_OPTION (pfile, trigraphs))
1145                 cpp_warning_with_line (pfile, CPP_W_TRIGRAPHS,
1146                                        pfile->line_table->highest_line, col,
1147                                        "trigraph ??%c converted to %c",
1148                                        note->type,
1149                                        (int) _cpp_trigraph_map[note->type]);
1150               else
1151                 {
1152                   cpp_warning_with_line
1153                     (pfile, CPP_W_TRIGRAPHS,
1154                      pfile->line_table->highest_line, col,
1155                      "trigraph ??%c ignored, use -trigraphs to enable",
1156                      note->type);
1157                 }
1158             }
1159         }
1160       else if (note->type == 0)
1161         /* Already processed in lex_raw_string.  */;
1162       else
1163         abort ();
1164     }
1165 }
1166
1167 /* Skip a C-style block comment.  We find the end of the comment by
1168    seeing if an asterisk is before every '/' we encounter.  Returns
1169    nonzero if comment terminated by EOF, zero otherwise.
1170
1171    Buffer->cur points to the initial asterisk of the comment.  */
1172 bool
1173 _cpp_skip_block_comment (cpp_reader *pfile)
1174 {
1175   cpp_buffer *buffer = pfile->buffer;
1176   const uchar *cur = buffer->cur;
1177   uchar c;
1178
1179   cur++;
1180   if (*cur == '/')
1181     cur++;
1182
1183   for (;;)
1184     {
1185       /* People like decorating comments with '*', so check for '/'
1186          instead for efficiency.  */
1187       c = *cur++;
1188
1189       if (c == '/')
1190         {
1191           if (cur[-2] == '*')
1192             break;
1193
1194           /* Warn about potential nested comments, but not if the '/'
1195              comes immediately before the true comment delimiter.
1196              Don't bother to get it right across escaped newlines.  */
1197           if (CPP_OPTION (pfile, warn_comments)
1198               && cur[0] == '*' && cur[1] != '/')
1199             {
1200               buffer->cur = cur;
1201               cpp_warning_with_line (pfile, CPP_W_COMMENTS,
1202                                      pfile->line_table->highest_line,
1203                                      CPP_BUF_COL (buffer),
1204                                      "\"/*\" within comment");
1205             }
1206         }
1207       else if (c == '\n')
1208         {
1209           unsigned int cols;
1210           buffer->cur = cur - 1;
1211           _cpp_process_line_notes (pfile, true);
1212           if (buffer->next_line >= buffer->rlimit)
1213             return true;
1214           _cpp_clean_line (pfile);
1215
1216           cols = buffer->next_line - buffer->line_base;
1217           CPP_INCREMENT_LINE (pfile, cols);
1218
1219           cur = buffer->cur;
1220         }
1221     }
1222
1223   buffer->cur = cur;
1224   _cpp_process_line_notes (pfile, true);
1225   return false;
1226 }
1227
1228 /* Skip a C++ line comment, leaving buffer->cur pointing to the
1229    terminating newline.  Handles escaped newlines.  Returns nonzero
1230    if a multiline comment.  */
1231 static int
1232 skip_line_comment (cpp_reader *pfile)
1233 {
1234   cpp_buffer *buffer = pfile->buffer;
1235   location_t orig_line = pfile->line_table->highest_line;
1236
1237   while (*buffer->cur != '\n')
1238     buffer->cur++;
1239
1240   _cpp_process_line_notes (pfile, true);
1241   return orig_line != pfile->line_table->highest_line;
1242 }
1243
1244 /* Skips whitespace, saving the next non-whitespace character.  */
1245 static void
1246 skip_whitespace (cpp_reader *pfile, cppchar_t c)
1247 {
1248   cpp_buffer *buffer = pfile->buffer;
1249   bool saw_NUL = false;
1250
1251   do
1252     {
1253       /* Horizontal space always OK.  */
1254       if (c == ' ' || c == '\t')
1255         ;
1256       /* Just \f \v or \0 left.  */
1257       else if (c == '\0')
1258         saw_NUL = true;
1259       else if (pfile->state.in_directive && CPP_PEDANTIC (pfile))
1260         cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line,
1261                              CPP_BUF_COL (buffer),
1262                              "%s in preprocessing directive",
1263                              c == '\f' ? "form feed" : "vertical tab");
1264
1265       c = *buffer->cur++;
1266     }
1267   /* We only want non-vertical space, i.e. ' ' \t \f \v \0.  */
1268   while (is_nvspace (c));
1269
1270   if (saw_NUL)
1271     cpp_error (pfile, CPP_DL_WARNING, "null character(s) ignored");
1272
1273   buffer->cur--;
1274 }
1275
1276 /* See if the characters of a number token are valid in a name (no
1277    '.', '+' or '-').  */
1278 static int
1279 name_p (cpp_reader *pfile, const cpp_string *string)
1280 {
1281   unsigned int i;
1282
1283   for (i = 0; i < string->len; i++)
1284     if (!is_idchar (string->text[i]))
1285       return 0;
1286
1287   return 1;
1288 }
1289
1290 /* After parsing an identifier or other sequence, produce a warning about
1291    sequences not in NFC/NFKC.  */
1292 static void
1293 warn_about_normalization (cpp_reader *pfile,
1294                           const cpp_token *token,
1295                           const struct normalize_state *s)
1296 {
1297   if (CPP_OPTION (pfile, warn_normalize) < NORMALIZE_STATE_RESULT (s)
1298       && !pfile->state.skipping)
1299     {
1300       /* Make sure that the token is printed using UCNs, even
1301          if we'd otherwise happily print UTF-8.  */
1302       unsigned char *buf = XNEWVEC (unsigned char, cpp_token_len (token));
1303       size_t sz;
1304
1305       sz = cpp_spell_token (pfile, token, buf, false) - buf;
1306       if (NORMALIZE_STATE_RESULT (s) == normalized_C)
1307         cpp_warning_with_line (pfile, CPP_W_NORMALIZE, token->src_loc, 0,
1308                                "`%.*s' is not in NFKC", (int) sz, buf);
1309       else if (CPP_OPTION (pfile, cxx23_identifiers))
1310         cpp_pedwarning_with_line (pfile, CPP_W_NORMALIZE, token->src_loc, 0,
1311                                   "`%.*s' is not in NFC", (int) sz, buf);
1312       else
1313         cpp_warning_with_line (pfile, CPP_W_NORMALIZE, token->src_loc, 0,
1314                                "`%.*s' is not in NFC", (int) sz, buf);
1315       free (buf);
1316     }
1317 }
1318
/* Smallest byte value (binary 11000000) that forms_identifier_p treats
   as the possible start of a UTF-8 multi-byte sequence; bytes at or
   above it are handed to _cpp_valid_utf8.  */
static const cppchar_t utf8_signifier = 0xC0;
1320
1321 /* Returns TRUE if the sequence starting at buffer->cur is valid in
1322    an identifier.  FIRST is TRUE if this starts an identifier.  */
1323 static bool
1324 forms_identifier_p (cpp_reader *pfile, int first,
1325                     struct normalize_state *state)
1326 {
1327   cpp_buffer *buffer = pfile->buffer;
1328
1329   if (*buffer->cur == '$')
1330     {
1331       if (!CPP_OPTION (pfile, dollars_in_ident))
1332         return false;
1333
1334       buffer->cur++;
1335       if (CPP_OPTION (pfile, warn_dollars) && !pfile->state.skipping)
1336         {
1337           CPP_OPTION (pfile, warn_dollars) = 0;
1338           cpp_error (pfile, CPP_DL_PEDWARN, "'$' in identifier or number");
1339         }
1340
1341       return true;
1342     }
1343
1344   /* Is this a syntactically valid UCN or a valid UTF-8 char?  */
1345   if (CPP_OPTION (pfile, extended_identifiers))
1346     {
1347       cppchar_t s;
1348       if (*buffer->cur >= utf8_signifier)
1349         {
1350           if (_cpp_valid_utf8 (pfile, &buffer->cur, buffer->rlimit, 1 + !first,
1351                                state, &s))
1352             return true;
1353         }
1354       else if (*buffer->cur == '\\'
1355                && (buffer->cur[1] == 'u' || buffer->cur[1] == 'U'))
1356         {
1357           buffer->cur += 2;
1358           if (_cpp_valid_ucn (pfile, &buffer->cur, buffer->rlimit, 1 + !first,
1359                               state, &s, NULL, NULL))
1360             return true;
1361           buffer->cur -= 2;
1362         }
1363     }
1364
1365   return false;
1366 }
1367
1368 /* Helper function to issue error about improper __VA_OPT__ use.  */
1369 static void
1370 maybe_va_opt_error (cpp_reader *pfile)
1371 {
1372   if (CPP_PEDANTIC (pfile) && !CPP_OPTION (pfile, va_opt))
1373     {
1374       /* __VA_OPT__ should not be accepted at all, but allow it in
1375          system headers.  */
1376       if (!_cpp_in_system_header (pfile))
1377         cpp_error (pfile, CPP_DL_PEDWARN,
1378                    "__VA_OPT__ is not available until C++20");
1379     }
1380   else if (!pfile->state.va_args_ok)
1381     {
1382       /* __VA_OPT__ should only appear in the replacement list of a
1383          variadic macro.  */
1384       cpp_error (pfile, CPP_DL_PEDWARN,
1385                  "__VA_OPT__ can only appear in the expansion"
1386                  " of a C++20 variadic macro");
1387     }
1388 }
1389
1390 /* Helper function to get the cpp_hashnode of the identifier BASE.  */
1391 static cpp_hashnode *
1392 lex_identifier_intern (cpp_reader *pfile, const uchar *base)
1393 {
1394   cpp_hashnode *result;
1395   const uchar *cur;
1396   unsigned int len;
1397   unsigned int hash = HT_HASHSTEP (0, *base);
1398
1399   cur = base + 1;
1400   while (ISIDNUM (*cur))
1401     {
1402       hash = HT_HASHSTEP (hash, *cur);
1403       cur++;
1404     }
1405   len = cur - base;
1406   hash = HT_HASHFINISH (hash, len);
1407   result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
1408                                               base, len, hash, HT_ALLOC));
1409
1410   /* Rarely, identifiers require diagnostics when lexed.  */
1411   if (__builtin_expect ((result->flags & NODE_DIAGNOSTIC)
1412                         && !pfile->state.skipping, 0))
1413     {
1414       /* It is allowed to poison the same identifier twice.  */
1415       if ((result->flags & NODE_POISONED) && !pfile->state.poisoned_ok)
1416         cpp_error (pfile, CPP_DL_ERROR, "attempt to use poisoned \"%s\"",
1417                    NODE_NAME (result));
1418
1419       /* Constraint 6.10.3.5: __VA_ARGS__ should only appear in the
1420          replacement list of a variadic macro.  */
1421       if (result == pfile->spec_nodes.n__VA_ARGS__
1422           && !pfile->state.va_args_ok)
1423         {
1424           if (CPP_OPTION (pfile, cplusplus))
1425             cpp_error (pfile, CPP_DL_PEDWARN,
1426                        "__VA_ARGS__ can only appear in the expansion"
1427                        " of a C++11 variadic macro");
1428           else
1429             cpp_error (pfile, CPP_DL_PEDWARN,
1430                        "__VA_ARGS__ can only appear in the expansion"
1431                        " of a C99 variadic macro");
1432         }
1433
1434       if (result == pfile->spec_nodes.n__VA_OPT__)
1435         maybe_va_opt_error (pfile);
1436
1437       /* For -Wc++-compat, warn about use of C++ named operators.  */
1438       if (result->flags & NODE_WARN_OPERATOR)
1439         cpp_warning (pfile, CPP_W_CXX_OPERATOR_NAMES,
1440                      "identifier \"%s\" is a special operator name in C++",
1441                      NODE_NAME (result));
1442     }
1443
1444   return result;
1445 }
1446
1447 /* Get the cpp_hashnode of an identifier specified by NAME in
1448    the current cpp_reader object.  If none is found, NULL is returned.  */
1449 cpp_hashnode *
1450 _cpp_lex_identifier (cpp_reader *pfile, const char *name)
1451 {
1452   cpp_hashnode *result;
1453   result = lex_identifier_intern (pfile, (uchar *) name);
1454   return result;
1455 }
1456
1457 /* Lex an identifier starting at BUFFER->CUR - 1.  */
1458 static cpp_hashnode *
1459 lex_identifier (cpp_reader *pfile, const uchar *base, bool starts_ucn,
1460                 struct normalize_state *nst, cpp_hashnode **spelling)
1461 {
1462   cpp_hashnode *result;
1463   const uchar *cur;
1464   unsigned int len;
1465   unsigned int hash = HT_HASHSTEP (0, *base);
1466
1467   cur = pfile->buffer->cur;
1468   if (! starts_ucn)
1469     {
1470       while (ISIDNUM (*cur))
1471         {
1472           hash = HT_HASHSTEP (hash, *cur);
1473           cur++;
1474         }
1475       NORMALIZE_STATE_UPDATE_IDNUM (nst, *(cur - 1));
1476     }
1477   pfile->buffer->cur = cur;
1478   if (starts_ucn || forms_identifier_p (pfile, false, nst))
1479     {
1480       /* Slower version for identifiers containing UCNs
1481          or extended chars (including $).  */
1482       do {
1483         while (ISIDNUM (*pfile->buffer->cur))
1484           {
1485             NORMALIZE_STATE_UPDATE_IDNUM (nst, *pfile->buffer->cur);
1486             pfile->buffer->cur++;
1487           }
1488       } while (forms_identifier_p (pfile, false, nst));
1489       result = _cpp_interpret_identifier (pfile, base,
1490                                           pfile->buffer->cur - base);
1491       *spelling = cpp_lookup (pfile, base, pfile->buffer->cur - base);
1492     }
1493   else
1494     {
1495       len = cur - base;
1496       hash = HT_HASHFINISH (hash, len);
1497
1498       result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
1499                                                   base, len, hash, HT_ALLOC));
1500       *spelling = result;
1501     }
1502
1503   /* Rarely, identifiers require diagnostics when lexed.  */
1504   if (__builtin_expect ((result->flags & NODE_DIAGNOSTIC)
1505                         && !pfile->state.skipping, 0))
1506     {
1507       /* It is allowed to poison the same identifier twice.  */
1508       if ((result->flags & NODE_POISONED) && !pfile->state.poisoned_ok)
1509         cpp_error (pfile, CPP_DL_ERROR, "attempt to use poisoned \"%s\"",
1510                    NODE_NAME (result));
1511
1512       /* Constraint 6.10.3.5: __VA_ARGS__ should only appear in the
1513          replacement list of a variadic macro.  */
1514       if (result == pfile->spec_nodes.n__VA_ARGS__
1515           && !pfile->state.va_args_ok)
1516         {
1517           if (CPP_OPTION (pfile, cplusplus))
1518             cpp_error (pfile, CPP_DL_PEDWARN,
1519                        "__VA_ARGS__ can only appear in the expansion"
1520                        " of a C++11 variadic macro");
1521           else
1522             cpp_error (pfile, CPP_DL_PEDWARN,
1523                        "__VA_ARGS__ can only appear in the expansion"
1524                        " of a C99 variadic macro");
1525         }
1526
1527       /* __VA_OPT__ should only appear in the replacement list of a
1528          variadic macro.  */
1529       if (result == pfile->spec_nodes.n__VA_OPT__)
1530         maybe_va_opt_error (pfile);
1531
1532       /* For -Wc++-compat, warn about use of C++ named operators.  */
1533       if (result->flags & NODE_WARN_OPERATOR)
1534         cpp_warning (pfile, CPP_W_CXX_OPERATOR_NAMES,
1535                      "identifier \"%s\" is a special operator name in C++",
1536                      NODE_NAME (result));
1537     }
1538
1539   return result;
1540 }
1541
1542 /* Lex a number to NUMBER starting at BUFFER->CUR - 1.  */
1543 static void
1544 lex_number (cpp_reader *pfile, cpp_string *number,
1545             struct normalize_state *nst)
1546 {
1547   const uchar *cur;
1548   const uchar *base;
1549   uchar *dest;
1550
1551   base = pfile->buffer->cur - 1;
1552   do
1553     {
1554       const uchar *adj_digit_sep = NULL;
1555       cur = pfile->buffer->cur;
1556
1557       /* N.B. ISIDNUM does not include $.  */
1558       while (ISIDNUM (*cur)
1559              || (*cur == '.' && !DIGIT_SEP (cur[-1]))
1560              || DIGIT_SEP (*cur)
1561              || (VALID_SIGN (*cur, cur[-1]) && !DIGIT_SEP (cur[-2])))
1562         {
1563           NORMALIZE_STATE_UPDATE_IDNUM (nst, *cur);
1564           /* Adjacent digit separators do not form part of the pp-number syntax.
1565              However, they can safely be diagnosed here as an error, since '' is
1566              not a valid preprocessing token.  */
1567           if (DIGIT_SEP (*cur) && DIGIT_SEP (cur[-1]) && !adj_digit_sep)
1568             adj_digit_sep = cur;
1569           cur++;
1570         }
1571       /* A number can't end with a digit separator.  */
1572       while (cur > pfile->buffer->cur && DIGIT_SEP (cur[-1]))
1573         --cur;
1574       if (adj_digit_sep && adj_digit_sep < cur)
1575         cpp_error (pfile, CPP_DL_ERROR, "adjacent digit separators");
1576
1577       pfile->buffer->cur = cur;
1578     }
1579   while (forms_identifier_p (pfile, false, nst));
1580
1581   number->len = cur - base;
1582   dest = _cpp_unaligned_alloc (pfile, number->len + 1);
1583   memcpy (dest, base, number->len);
1584   dest[number->len] = '\0';
1585   number->text = dest;
1586 }
1587
1588 /* Create a token of type TYPE with a literal spelling.  */
1589 static void
1590 create_literal (cpp_reader *pfile, cpp_token *token, const uchar *base,
1591                 unsigned int len, enum cpp_ttype type)
1592 {
1593   token->type = type;
1594   token->val.str.len = len;
1595   token->val.str.text = cpp_alloc_token_string (pfile, base, len);
1596 }
1597
1598 const uchar *
1599 cpp_alloc_token_string (cpp_reader *pfile,
1600                         const unsigned char *ptr, unsigned len)
1601 {
1602   uchar *dest = _cpp_unaligned_alloc (pfile, len + 1);
1603
1604   dest[len] = 0;
1605   memcpy (dest, ptr, len);
1606   return dest;
1607 }
1608
1609 /* A pair of raw buffer pointers.  The currently open one is [1], the
1610    first one is [0].  Used for string literal lexing.  */
1611 struct lit_accum {
1612   _cpp_buff *first;
1613   _cpp_buff *last;
1614   const uchar *rpos;
1615   size_t accum;
1616
1617   lit_accum ()
1618     : first (NULL), last (NULL), rpos (0), accum (0)
1619   {
1620   }
1621
1622   void append (cpp_reader *, const uchar *, size_t);
1623
1624   void read_begin (cpp_reader *);
1625   bool reading_p () const
1626   {
1627     return rpos != NULL;
1628   }
1629   char read_char ()
1630   {
1631     char c = *rpos++;
1632     if (rpos == BUFF_FRONT (last))
1633       rpos = NULL;
1634     return c;
1635   }
1636 };
1637
1638 /* Subroutine of lex_raw_string: Append LEN chars from BASE to the buffer
1639    sequence from *FIRST_BUFF_P to LAST_BUFF_P.  */
1640
1641 void
1642 lit_accum::append (cpp_reader *pfile, const uchar *base, size_t len)
1643 {
1644   if (!last)
1645     /* Starting.  */
1646     first = last = _cpp_get_buff (pfile, len);
1647   else if (len > BUFF_ROOM (last))
1648     {
1649       /* There is insufficient room in the buffer.  Copy what we can,
1650          and then either extend or create a new one.  */
1651       size_t room = BUFF_ROOM (last);
1652       memcpy (BUFF_FRONT (last), base, room);
1653       BUFF_FRONT (last) += room;
1654       base += room;
1655       len -= room;
1656       accum += room;
1657
1658       gcc_checking_assert (!rpos);
1659
1660       last = _cpp_append_extend_buff (pfile, last, len);
1661     }
1662
1663   memcpy (BUFF_FRONT (last), base, len);
1664   BUFF_FRONT (last) += len;
1665   accum += len;
1666 }
1667
1668 void
1669 lit_accum::read_begin (cpp_reader *pfile)
1670 {
1671   /* We never accumulate more than 4 chars to read.  */
1672   if (BUFF_ROOM (last) < 4)
1673
1674     last = _cpp_append_extend_buff (pfile, last, 4);
1675   rpos = BUFF_FRONT (last);
1676 }
1677
1678 /* Returns true if a macro has been defined.
1679    This might not work if compile with -save-temps,
1680    or preprocess separately from compilation.  */
1681
1682 static bool
1683 is_macro(cpp_reader *pfile, const uchar *base)
1684 {
1685   const uchar *cur = base;
1686   if (! ISIDST (*cur))
1687     return false;
1688   unsigned int hash = HT_HASHSTEP (0, *cur);
1689   ++cur;
1690   while (ISIDNUM (*cur))
1691     {
1692       hash = HT_HASHSTEP (hash, *cur);
1693       ++cur;
1694     }
1695   hash = HT_HASHFINISH (hash, cur - base);
1696
1697   cpp_hashnode *result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
1698                                         base, cur - base, hash, HT_NO_INSERT));
1699
1700   return result && cpp_macro_p (result);
1701 }
1702
1703 /* Returns true if a literal suffix does not have the expected form
1704    and is defined as a macro.  */
1705
1706 static bool
1707 is_macro_not_literal_suffix(cpp_reader *pfile, const uchar *base)
1708 {
1709   /* User-defined literals outside of namespace std must start with a single
1710      underscore, so assume anything of that form really is a UDL suffix.
1711      We don't need to worry about UDLs defined inside namespace std because
1712      their names are reserved, so cannot be used as macro names in valid
1713      programs.  */
1714   if (base[0] == '_' && base[1] != '_')
1715     return false;
1716   return is_macro (pfile, base);
1717 }
1718
1719 /* Lexes a raw string.  The stored string contains the spelling,
1720    including double quotes, delimiter string, '(' and ')', any leading
1721    'L', 'u', 'U' or 'u8' and 'R' modifier.  The created token contains
1722    the type of the literal, or CPP_OTHER if it was not properly
1723    terminated.
1724
1725    BASE is the start of the token.  Updates pfile->buffer->cur to just
1726    after the lexed string.
1727
1728    The spelling is NUL-terminated, but it is not guaranteed that this
1729    is the first NUL since embedded NULs are preserved.  */
1730
1731 static void
1732 lex_raw_string (cpp_reader *pfile, cpp_token *token, const uchar *base)
1733 {
1734   const uchar *pos = base;
1735
1736   /* 'tis a pity this information isn't passed down from the lexer's
1737      initial categorization of the token.  */
1738   enum cpp_ttype type = CPP_STRING;
1739
1740   if (*pos == 'L')
1741     {
1742       type = CPP_WSTRING;
1743       pos++;
1744     }
1745   else if (*pos == 'U')
1746     {
1747       type = CPP_STRING32;
1748       pos++;
1749     }
1750   else if (*pos == 'u')
1751     {
1752       if (pos[1] == '8')
1753         {
1754           type = CPP_UTF8STRING;
1755           pos++;
1756         }
1757       else
1758         type = CPP_STRING16;
1759       pos++;
1760     }
1761
1762   gcc_checking_assert (pos[0] == 'R' && pos[1] == '"');
1763   pos += 2;
1764
1765   _cpp_line_note *note = &pfile->buffer->notes[pfile->buffer->cur_note];
1766
1767   /* Skip notes before the ".  */
1768   while (note->pos < pos)
1769     ++note;
1770
1771   lit_accum accum;
1772
1773   uchar prefix[17];
1774   unsigned prefix_len = 0;
1775   enum Phase
1776   {
1777    PHASE_PREFIX = -2,
1778    PHASE_NONE = -1,
1779    PHASE_SUFFIX = 0
1780   } phase = PHASE_PREFIX;
1781
1782   for (;;)
1783     {
1784       gcc_checking_assert (note->pos >= pos);
1785
1786       /* Undo any escaped newlines and trigraphs.  */
1787       if (!accum.reading_p () && note->pos == pos)
1788         switch (note->type)
1789           {
1790           case '\\':
1791           case ' ':
1792             /* Restore backslash followed by newline.  */
1793             accum.append (pfile, base, pos - base);
1794             base = pos;
1795             accum.read_begin (pfile);
1796             accum.append (pfile, UC"\\", 1);
1797
1798           after_backslash:
1799             if (note->type == ' ')
1800               /* GNU backslash whitespace newline extension.  FIXME
1801                  could be any sequence of non-vertical space.  When we
1802                  can properly restore any such sequence, we should
1803                  mark this note as handled so _cpp_process_line_notes
1804                  doesn't warn.  */
1805               accum.append (pfile, UC" ", 1);
1806
1807             accum.append (pfile, UC"\n", 1);
1808             note++;
1809             break;
1810
1811           case '\n':
1812             /* This can happen for ??/<NEWLINE> when trigraphs are not
1813                being interpretted.  */
1814             gcc_checking_assert (!CPP_OPTION (pfile, trigraphs));
1815             note->type = 0;
1816             note++;
1817             break;
1818
1819           default:
1820             gcc_checking_assert (_cpp_trigraph_map[note->type]);
1821
1822             /* Don't warn about this trigraph in
1823                _cpp_process_line_notes, since trigraphs show up as
1824                trigraphs in raw strings.  */
1825             uchar type = note->type;
1826             note->type = 0;
1827
1828             if (CPP_OPTION (pfile, trigraphs))
1829               {
1830                 accum.append (pfile, base, pos - base);
1831                 base = pos;
1832                 accum.read_begin (pfile);
1833                 accum.append (pfile, UC"??", 2);
1834                 accum.append (pfile, &type, 1);
1835
1836                 /* ??/ followed by newline gets two line notes, one for
1837                    the trigraph and one for the backslash/newline.  */
1838                 if (type == '/' && note[1].pos == pos)
1839                   {
1840                     note++;
1841                     gcc_assert (note->type == '\\' || note->type == ' ');
1842                     goto after_backslash;
1843                   }
1844                 /* Skip the replacement character.  */
1845                 base = ++pos;
1846               }
1847
1848             note++;
1849             break;
1850           }
1851
1852       /* Now get a char to process.  Either from an expanded note, or
1853          from the line buffer.  */
1854       bool read_note = accum.reading_p ();
1855       char c = read_note ? accum.read_char () : *pos++;
1856
1857       if (phase == PHASE_PREFIX)
1858         {
1859           if (c == '(')
1860             {
1861               /* Done.  */
1862               phase = PHASE_NONE;
1863               prefix[prefix_len++] = '"';
1864             }
1865           else if (prefix_len < 16
1866                    /* Prefix chars are any of the basic character set,
1867                       [lex.charset] except for '
1868                       ()\\\t\v\f\n'. Optimized for a contiguous
1869                       alphabet.  */
1870                    /* Unlike a switch, this collapses down to one or
1871                       two shift and bitmask operations on an ASCII
1872                       system, with an outlier or two.   */
1873                    && (('Z' - 'A' == 25
1874                         ? ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z'))
1875                         : ISIDST (c))
1876                        || (c >= '0' && c <= '9')
1877                        || c == '_' || c == '{' || c == '}'
1878                        || c == '[' || c == ']' || c == '#'
1879                        || c == '<' || c == '>' || c == '%'
1880                        || c == ':' || c == ';' || c == '.' || c == '?'
1881                        || c == '*' || c == '+' || c == '-' || c == '/'
1882                        || c == '^' || c == '&' || c == '|' || c == '~'
1883                        || c == '!' || c == '=' || c == ','
1884                        || c == '"' || c == '\''))
1885             prefix[prefix_len++] = c;
1886           else
1887             {
1888               /* Something is wrong.  */
1889               int col = CPP_BUF_COLUMN (pfile->buffer, pos) + read_note;
1890               if (prefix_len == 16)
1891                 cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc,
1892                                      col, "raw string delimiter longer "
1893                                      "than 16 characters");
1894               else if (c == '\n')
1895                 cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc,
1896                                      col, "invalid new-line in raw "
1897                                      "string delimiter");
1898               else
1899                 cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc,
1900                                      col, "invalid character '%c' in "
1901                                      "raw string delimiter", c);
1902               type = CPP_OTHER;
1903               phase = PHASE_NONE;
1904               /* Continue until we get a close quote, that's probably
1905                  the best failure mode.  */
1906               prefix_len = 0;
1907             }
1908           if (c != '\n')
1909             continue;
1910         }
1911
1912       if (phase != PHASE_NONE)
1913         {
1914           if (prefix[phase] != c)
1915             phase = PHASE_NONE;
1916           else if (unsigned (phase + 1) == prefix_len)
1917             break;
1918           else
1919             {
1920               phase = Phase (phase + 1);
1921               continue;
1922             }
1923         }
1924
1925       if (!prefix_len && c == '"')
1926         /* Failure mode lexing.  */
1927         goto out;
1928       else if (prefix_len && c == ')')
1929         phase = PHASE_SUFFIX;
1930       else if (!read_note && c == '\n')
1931         {
1932           pos--;
1933           pfile->buffer->cur = pos;
1934           if (pfile->state.in_directive
1935               || (pfile->state.parsing_args
1936                   && pfile->buffer->next_line >= pfile->buffer->rlimit))
1937             {
1938               cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc, 0,
1939                                    "unterminated raw string");
1940               type = CPP_OTHER;
1941               goto out;
1942             }
1943
1944           accum.append (pfile, base, pos - base + 1);
1945           _cpp_process_line_notes (pfile, false);
1946
1947           if (pfile->buffer->next_line < pfile->buffer->rlimit)
1948             CPP_INCREMENT_LINE (pfile, 0);
1949           pfile->buffer->need_line = true;
1950
1951           if (!_cpp_get_fresh_line (pfile))
1952             {
1953               /* We ran out of file and failed to get a line.  */
1954               location_t src_loc = token->src_loc;
1955               token->type = CPP_EOF;
1956               /* Tell the compiler the line number of the EOF token.  */
1957               token->src_loc = pfile->line_table->highest_line;
1958               token->flags = BOL;
1959               if (accum.first)
1960                 _cpp_release_buff (pfile, accum.first);
1961               cpp_error_with_line (pfile, CPP_DL_ERROR, src_loc, 0,
1962                                    "unterminated raw string");
1963               /* Now pop the buffer that _cpp_get_fresh_line did not.  */
1964               _cpp_pop_buffer (pfile);
1965               return;
1966             }
1967
1968           pos = base = pfile->buffer->cur;
1969           note = &pfile->buffer->notes[pfile->buffer->cur_note];
1970         }
1971     }
1972
1973   if (CPP_OPTION (pfile, user_literals))
1974     {
1975       /* If a string format macro, say from inttypes.h, is placed touching
1976          a string literal it could be parsed as a C++11 user-defined string
1977          literal thus breaking the program.  */
1978       if (is_macro_not_literal_suffix (pfile, pos))
1979         {
1980           /* Raise a warning, but do not consume subsequent tokens.  */
1981           if (CPP_OPTION (pfile, warn_literal_suffix) && !pfile->state.skipping)
1982             cpp_warning_with_line (pfile, CPP_W_LITERAL_SUFFIX,
1983                                    token->src_loc, 0,
1984                                    "invalid suffix on literal; C++11 requires "
1985                                    "a space between literal and string macro");
1986         }
1987       /* Grab user defined literal suffix.  */
1988       else if (ISIDST (*pos))
1989         {
1990           type = cpp_userdef_string_add_type (type);
1991           ++pos;
1992
1993           while (ISIDNUM (*pos))
1994             ++pos;
1995         }
1996     }
1997
1998  out:
1999   pfile->buffer->cur = pos;
2000   if (!accum.accum)
2001     create_literal (pfile, token, base, pos - base, type);
2002   else
2003     {
2004       size_t extra_len = pos - base;
2005       uchar *dest = _cpp_unaligned_alloc (pfile, accum.accum + extra_len + 1);
2006
2007       token->type = type;
2008       token->val.str.len = accum.accum + extra_len;
2009       token->val.str.text = dest;
2010       for (_cpp_buff *buf = accum.first; buf; buf = buf->next)
2011         {
2012           size_t len = BUFF_FRONT (buf) - buf->base;
2013           memcpy (dest, buf->base, len);
2014           dest += len;
2015         }
2016       _cpp_release_buff (pfile, accum.first);
2017       memcpy (dest, base, extra_len);
2018       dest[extra_len] = '\0';
2019     }
2020 }
2021
2022 /* Lexes a string, character constant, or angle-bracketed header file
2023    name.  The stored string contains the spelling, including opening
2024    quote and any leading 'L', 'u', 'U' or 'u8' and optional
2025    'R' modifier.  It returns the type of the literal, or CPP_OTHER
2026    if it was not properly terminated, or CPP_LESS for an unterminated
2027    header name which must be relexed as normal tokens.
2028
2029    The spelling is NUL-terminated, but it is not guaranteed that this
2030    is the first NUL since embedded NULs are preserved.  */
2031 static void
2032 lex_string (cpp_reader *pfile, cpp_token *token, const uchar *base)
2033 {
2034   bool saw_NUL = false;
2035   const uchar *cur;
2036   cppchar_t terminator;
2037   enum cpp_ttype type;
2038
2039   cur = base;
2040   terminator = *cur++;
2041   if (terminator == 'L' || terminator == 'U')
2042     terminator = *cur++;
2043   else if (terminator == 'u')
2044     {
2045       terminator = *cur++;
2046       if (terminator == '8')
2047         terminator = *cur++;
2048     }
2049   if (terminator == 'R')
2050     {
2051       lex_raw_string (pfile, token, base);
2052       return;
2053     }
2054   if (terminator == '"')
2055     type = (*base == 'L' ? CPP_WSTRING :
2056             *base == 'U' ? CPP_STRING32 :
2057             *base == 'u' ? (base[1] == '8' ? CPP_UTF8STRING : CPP_STRING16)
2058                          : CPP_STRING);
2059   else if (terminator == '\'')
2060     type = (*base == 'L' ? CPP_WCHAR :
2061             *base == 'U' ? CPP_CHAR32 :
2062             *base == 'u' ? (base[1] == '8' ? CPP_UTF8CHAR : CPP_CHAR16)
2063                          : CPP_CHAR);
2064   else
2065     terminator = '>', type = CPP_HEADER_NAME;
2066
2067   for (;;)
2068     {
2069       cppchar_t c = *cur++;
2070
2071       /* In #include-style directives, terminators are not escapable.  */
2072       if (c == '\\' && !pfile->state.angled_headers && *cur != '\n')
2073         cur++;
2074       else if (c == terminator)
2075         break;
2076       else if (c == '\n')
2077         {
2078           cur--;
2079           /* Unmatched quotes always yield undefined behavior, but
2080              greedy lexing means that what appears to be an unterminated
2081              header name may actually be a legitimate sequence of tokens.  */
2082           if (terminator == '>')
2083             {
2084               token->type = CPP_LESS;
2085               return;
2086             }
2087           type = CPP_OTHER;
2088           break;
2089         }
2090       else if (c == '\0')
2091         saw_NUL = true;
2092     }
2093
2094   if (saw_NUL && !pfile->state.skipping)
2095     cpp_error (pfile, CPP_DL_WARNING,
2096                "null character(s) preserved in literal");
2097
2098   if (type == CPP_OTHER && CPP_OPTION (pfile, lang) != CLK_ASM)
2099     cpp_error (pfile, CPP_DL_PEDWARN, "missing terminating %c character",
2100                (int) terminator);
2101
2102   if (CPP_OPTION (pfile, user_literals))
2103     {
2104       /* If a string format macro, say from inttypes.h, is placed touching
2105          a string literal it could be parsed as a C++11 user-defined string
2106          literal thus breaking the program.  */
2107       if (is_macro_not_literal_suffix (pfile, cur))
2108         {
2109           /* Raise a warning, but do not consume subsequent tokens.  */
2110           if (CPP_OPTION (pfile, warn_literal_suffix) && !pfile->state.skipping)
2111             cpp_warning_with_line (pfile, CPP_W_LITERAL_SUFFIX,
2112                                    token->src_loc, 0,
2113                                    "invalid suffix on literal; C++11 requires "
2114                                    "a space between literal and string macro");
2115         }
2116       /* Grab user defined literal suffix.  */
2117       else if (ISIDST (*cur))
2118         {
2119           type = cpp_userdef_char_add_type (type);
2120           type = cpp_userdef_string_add_type (type);
2121           ++cur;
2122
2123           while (ISIDNUM (*cur))
2124             ++cur;
2125         }
2126     }
2127   else if (CPP_OPTION (pfile, cpp_warn_cxx11_compat)
2128            && is_macro (pfile, cur)
2129            && !pfile->state.skipping)
2130     cpp_warning_with_line (pfile, CPP_W_CXX11_COMPAT,
2131                            token->src_loc, 0, "C++11 requires a space "
2132                            "between string literal and macro");
2133
2134   pfile->buffer->cur = cur;
2135   create_literal (pfile, token, base, cur - base, type);
2136 }
2137
2138 /* Return the comment table. The client may not make any assumption
2139    about the ordering of the table.  */
2140 cpp_comment_table *
2141 cpp_get_comments (cpp_reader *pfile)
2142 {
2143   return &pfile->comments;
2144 }
2145
2146 /* Append a comment to the end of the comment table. */
2147 static void
2148 store_comment (cpp_reader *pfile, cpp_token *token)
2149 {
2150   int len;
2151
2152   if (pfile->comments.allocated == 0)
2153     {
2154       pfile->comments.allocated = 256;
2155       pfile->comments.entries = (cpp_comment *) xmalloc
2156         (pfile->comments.allocated * sizeof (cpp_comment));
2157     }
2158
2159   if (pfile->comments.count == pfile->comments.allocated)
2160     {
2161       pfile->comments.allocated *= 2;
2162       pfile->comments.entries = (cpp_comment *) xrealloc
2163         (pfile->comments.entries,
2164          pfile->comments.allocated * sizeof (cpp_comment));
2165     }
2166
2167   len = token->val.str.len;
2168
2169   /* Copy comment. Note, token may not be NULL terminated. */
2170   pfile->comments.entries[pfile->comments.count].comment =
2171     (char *) xmalloc (sizeof (char) * (len + 1));
2172   memcpy (pfile->comments.entries[pfile->comments.count].comment,
2173           token->val.str.text, len);
2174   pfile->comments.entries[pfile->comments.count].comment[len] = '\0';
2175
2176   /* Set source location. */
2177   pfile->comments.entries[pfile->comments.count].sloc = token->src_loc;
2178
2179   /* Increment the count of entries in the comment table. */
2180   pfile->comments.count++;
2181 }
2182
2183 /* The stored comment includes the comment start and any terminator.  */
2184 static void
2185 save_comment (cpp_reader *pfile, cpp_token *token, const unsigned char *from,
2186               cppchar_t type)
2187 {
2188   unsigned char *buffer;
2189   unsigned int len, clen, i;
2190
2191   len = pfile->buffer->cur - from + 1; /* + 1 for the initial '/'.  */
2192
2193   /* C++ comments probably (not definitely) have moved past a new
2194      line, which we don't want to save in the comment.  */
2195   if (is_vspace (pfile->buffer->cur[-1]))
2196     len--;
2197
2198   /* If we are currently in a directive or in argument parsing, then
2199      we need to store all C++ comments as C comments internally, and
2200      so we need to allocate a little extra space in that case.
2201
2202      Note that the only time we encounter a directive here is
2203      when we are saving comments in a "#define".  */
2204   clen = ((pfile->state.in_directive || pfile->state.parsing_args)
2205           && type == '/') ? len + 2 : len;
2206
2207   buffer = _cpp_unaligned_alloc (pfile, clen);
2208
2209   token->type = CPP_COMMENT;
2210   token->val.str.len = clen;
2211   token->val.str.text = buffer;
2212
2213   buffer[0] = '/';
2214   memcpy (buffer + 1, from, len - 1);
2215
2216   /* Finish conversion to a C comment, if necessary.  */
2217   if ((pfile->state.in_directive || pfile->state.parsing_args) && type == '/')
2218     {
2219       buffer[1] = '*';
2220       buffer[clen - 2] = '*';
2221       buffer[clen - 1] = '/';
2222       /* As there can be in a C++ comments illegal sequences for C comments
2223          we need to filter them out.  */
2224       for (i = 2; i < (clen - 2); i++)
2225         if (buffer[i] == '/' && (buffer[i - 1] == '*' || buffer[i + 1] == '*'))
2226           buffer[i] = '|';
2227     }
2228
2229   /* Finally store this comment for use by clients of libcpp. */
2230   store_comment (pfile, token);
2231 }
2232
2233 /* Returns true if comment at COMMENT_START is a recognized FALLTHROUGH
2234    comment.  */
2235
2236 static bool
2237 fallthrough_comment_p (cpp_reader *pfile, const unsigned char *comment_start)
2238 {
2239   const unsigned char *from = comment_start + 1;
2240
2241   switch (CPP_OPTION (pfile, cpp_warn_implicit_fallthrough))
2242     {
2243       /* For both -Wimplicit-fallthrough=0 and -Wimplicit-fallthrough=5 we
2244          don't recognize any comments.  The latter only checks attributes,
2245          the former doesn't warn.  */
2246     case 0:
2247     default:
2248       return false;
2249       /* -Wimplicit-fallthrough=1 considers any comment, no matter what
2250          content it has.  */
2251     case 1:
2252       return true;
2253     case 2:
2254       /* -Wimplicit-fallthrough=2 looks for (case insensitive)
2255          .*falls?[ \t-]*thr(u|ough).* regex.  */
2256       for (; (size_t) (pfile->buffer->cur - from) >= sizeof "fallthru" - 1;
2257            from++)
2258         {
2259           /* Is there anything like strpbrk with upper boundary, or
2260              memchr looking for 2 characters rather than just one?  */
2261           if (from[0] != 'f' && from[0] != 'F')
2262             continue;
2263           if (from[1] != 'a' && from[1] != 'A')
2264             continue;
2265           if (from[2] != 'l' && from[2] != 'L')
2266             continue;
2267           if (from[3] != 'l' && from[3] != 'L')
2268             continue;
2269           from += sizeof "fall" - 1;
2270           if (from[0] == 's' || from[0] == 'S')
2271             from++;
2272           while (*from == ' ' || *from == '\t' || *from == '-')
2273             from++;
2274           if (from[0] != 't' && from[0] != 'T')
2275             continue;
2276           if (from[1] != 'h' && from[1] != 'H')
2277             continue;
2278           if (from[2] != 'r' && from[2] != 'R')
2279             continue;
2280           if (from[3] == 'u' || from[3] == 'U')
2281             return true;
2282           if (from[3] != 'o' && from[3] != 'O')
2283             continue;
2284           if (from[4] != 'u' && from[4] != 'U')
2285             continue;
2286           if (from[5] != 'g' && from[5] != 'G')
2287             continue;
2288           if (from[6] != 'h' && from[6] != 'H')
2289             continue;
2290           return true;
2291         }
2292       return false;
2293     case 3:
2294     case 4:
2295       break;
2296     }
2297
2298   /* Whole comment contents:
2299      -fallthrough
2300      @fallthrough@
2301    */
2302   if (*from == '-' || *from == '@')
2303     {
2304       size_t len = sizeof "fallthrough" - 1;
2305       if ((size_t) (pfile->buffer->cur - from - 1) < len)
2306         return false;
2307       if (memcmp (from + 1, "fallthrough", len))
2308         return false;
2309       if (*from == '@')
2310         {
2311           if (from[len + 1] != '@')
2312             return false;
2313           len++;
2314         }
2315       from += 1 + len;
2316     }
2317   /* Whole comment contents (regex):
2318      lint -fallthrough[ \t]*
2319    */
2320   else if (*from == 'l')
2321     {
2322       size_t len = sizeof "int -fallthrough" - 1;
2323       if ((size_t) (pfile->buffer->cur - from - 1) < len)
2324         return false;
2325       if (memcmp (from + 1, "int -fallthrough", len))
2326         return false;
2327       from += 1 + len;
2328       while (*from == ' ' || *from == '\t')
2329         from++;
2330     }
2331   /* Whole comment contents (regex):
2332      [ \t]*FALLTHR(U|OUGH)[ \t]*
2333    */
2334   else if (CPP_OPTION (pfile, cpp_warn_implicit_fallthrough) == 4)
2335     {
2336       while (*from == ' ' || *from == '\t')
2337         from++;
2338       if ((size_t) (pfile->buffer->cur - from)  < sizeof "FALLTHRU" - 1)
2339         return false;
2340       if (memcmp (from, "FALLTHR", sizeof "FALLTHR" - 1))
2341         return false;
2342       from += sizeof "FALLTHR" - 1;
2343       if (*from == 'U')
2344         from++;
2345       else if ((size_t) (pfile->buffer->cur - from)  < sizeof "OUGH" - 1)
2346         return false;
2347       else if (memcmp (from, "OUGH", sizeof "OUGH" - 1))
2348         return false;
2349       else
2350         from += sizeof "OUGH" - 1;
2351       while (*from == ' ' || *from == '\t')
2352         from++;
2353     }
2354   /* Whole comment contents (regex):
2355      [ \t.!]*(ELSE,? |INTENTIONAL(LY)? )?FALL(S | |-)?THR(OUGH|U)[ \t.!]*(-[^\n\r]*)?
2356      [ \t.!]*(Else,? |Intentional(ly)? )?Fall((s | |-)[Tt]|t)hr(ough|u)[ \t.!]*(-[^\n\r]*)?
2357      [ \t.!]*([Ee]lse,? |[Ii]ntentional(ly)? )?fall(s | |-)?thr(ough|u)[ \t.!]*(-[^\n\r]*)?
2358    */
2359   else
2360     {
2361       while (*from == ' ' || *from == '\t' || *from == '.' || *from == '!')
2362         from++;
2363       unsigned char f = *from;
2364       bool all_upper = false;
2365       if (f == 'E' || f == 'e')
2366         {
2367           if ((size_t) (pfile->buffer->cur - from)
2368               < sizeof "else fallthru" - 1)
2369             return false;
2370           if (f == 'E' && memcmp (from + 1, "LSE", sizeof "LSE" - 1) == 0)
2371             all_upper = true;
2372           else if (memcmp (from + 1, "lse", sizeof "lse" - 1))
2373             return false;
2374           from += sizeof "else" - 1;
2375           if (*from == ',')
2376             from++;
2377           if (*from != ' ')
2378             return false;
2379           from++;
2380           if (all_upper && *from == 'f')
2381             return false;
2382           if (f == 'e' && *from == 'F')
2383             return false;
2384           f = *from;
2385         }
2386       else if (f == 'I' || f == 'i')
2387         {
2388           if ((size_t) (pfile->buffer->cur - from)
2389               < sizeof "intentional fallthru" - 1)
2390             return false;
2391           if (f == 'I' && memcmp (from + 1, "NTENTIONAL",
2392                                   sizeof "NTENTIONAL" - 1) == 0)
2393             all_upper = true;
2394           else if (memcmp (from + 1, "ntentional",
2395                            sizeof "ntentional" - 1))
2396             return false;
2397           from += sizeof "intentional" - 1;
2398           if (*from == ' ')
2399             {
2400               from++;
2401               if (all_upper && *from == 'f')
2402                 return false;
2403             }
2404           else if (all_upper)
2405             {
2406               if (memcmp (from, "LY F", sizeof "LY F" - 1))
2407                 return false;
2408               from += sizeof "LY " - 1;
2409             }
2410           else
2411             {
2412               if (memcmp (from, "ly ", sizeof "ly " - 1))
2413                 return false;
2414               from += sizeof "ly " - 1;
2415             }
2416           if (f == 'i' && *from == 'F')
2417             return false;
2418           f = *from;
2419         }
2420       if (f != 'F' && f != 'f')
2421         return false;
2422       if ((size_t) (pfile->buffer->cur - from) < sizeof "fallthru" - 1)
2423         return false;
2424       if (f == 'F' && memcmp (from + 1, "ALL", sizeof "ALL" - 1) == 0)
2425         all_upper = true;
2426       else if (all_upper)
2427         return false;
2428       else if (memcmp (from + 1, "all", sizeof "all" - 1))
2429         return false;
2430       from += sizeof "fall" - 1;
2431       if (*from == (all_upper ? 'S' : 's') && from[1] == ' ')
2432         from += 2;
2433       else if (*from == ' ' || *from == '-')
2434         from++;
2435       else if (*from != (all_upper ? 'T' : 't'))
2436         return false;
2437       if ((f == 'f' || *from != 'T') && (all_upper || *from != 't'))
2438         return false;
2439       if ((size_t) (pfile->buffer->cur - from) < sizeof "thru" - 1)
2440         return false;
2441       if (memcmp (from + 1, all_upper ? "HRU" : "hru", sizeof "hru" - 1))
2442         {
2443           if ((size_t) (pfile->buffer->cur - from) < sizeof "through" - 1)
2444             return false;
2445           if (memcmp (from + 1, all_upper ? "HROUGH" : "hrough",
2446                       sizeof "hrough" - 1))
2447             return false;
2448           from += sizeof "through" - 1;
2449         }
2450       else
2451         from += sizeof "thru" - 1;
2452       while (*from == ' ' || *from == '\t' || *from == '.' || *from == '!')
2453         from++;
2454       if (*from == '-')
2455         {
2456           from++;
2457           if (*comment_start == '*')
2458             {
2459               do
2460                 {
2461                   while (*from && *from != '*'
2462                          && *from != '\n' && *from != '\r')
2463                     from++;
2464                   if (*from != '*' || from[1] == '/')
2465                     break;
2466                   from++;
2467                 }
2468               while (1);
2469             }
2470           else
2471             while (*from && *from != '\n' && *from != '\r')
2472               from++;
2473         }
2474     }
2475   /* C block comment.  */
2476   if (*comment_start == '*')
2477     {
2478       if (*from != '*' || from[1] != '/')
2479         return false;
2480     }
2481   /* C++ line comment.  */
2482   else if (*from != '\n')
2483     return false;
2484
2485   return true;
2486 }
2487
2488 /* Allocate COUNT tokens for RUN.  */
2489 void
2490 _cpp_init_tokenrun (tokenrun *run, unsigned int count)
2491 {
2492   run->base = XNEWVEC (cpp_token, count);
2493   run->limit = run->base + count;
2494   run->next = NULL;
2495 }
2496
2497 /* Returns the next tokenrun, or creates one if there is none.  */
2498 static tokenrun *
2499 next_tokenrun (tokenrun *run)
2500 {
2501   if (run->next == NULL)
2502     {
2503       run->next = XNEW (tokenrun);
2504       run->next->prev = run;
2505       _cpp_init_tokenrun (run->next, 250);
2506     }
2507
2508   return run->next;
2509 }
2510
2511 /* Return the number of not yet processed token in a given
2512    context.  */
2513 int
2514 _cpp_remaining_tokens_num_in_context (cpp_context *context)
2515 {
2516   if (context->tokens_kind == TOKENS_KIND_DIRECT)
2517     return (LAST (context).token - FIRST (context).token);
2518   else if (context->tokens_kind == TOKENS_KIND_INDIRECT
2519            || context->tokens_kind == TOKENS_KIND_EXTENDED)
2520     return (LAST (context).ptoken - FIRST (context).ptoken);
2521   else
2522       abort ();
2523 }
2524
2525 /* Returns the token present at index INDEX in a given context.  If
2526    INDEX is zero, the next token to be processed is returned.  */
2527 static const cpp_token*
2528 _cpp_token_from_context_at (cpp_context *context, int index)
2529 {
2530   if (context->tokens_kind == TOKENS_KIND_DIRECT)
2531     return &(FIRST (context).token[index]);
2532   else if (context->tokens_kind == TOKENS_KIND_INDIRECT
2533            || context->tokens_kind == TOKENS_KIND_EXTENDED)
2534     return FIRST (context).ptoken[index];
2535  else
2536    abort ();
2537 }
2538
2539 /* Look ahead in the input stream.  */
2540 const cpp_token *
2541 cpp_peek_token (cpp_reader *pfile, int index)
2542 {
2543   cpp_context *context = pfile->context;
2544   const cpp_token *peektok;
2545   int count;
2546
2547   /* First, scan through any pending cpp_context objects.  */
2548   while (context->prev)
2549     {
2550       ptrdiff_t sz = _cpp_remaining_tokens_num_in_context (context);
2551
2552       if (index < (int) sz)
2553         return _cpp_token_from_context_at (context, index);
2554       index -= (int) sz;
2555       context = context->prev;
2556     }
2557
2558   /* We will have to read some new tokens after all (and do so
2559      without invalidating preceding tokens).  */
2560   count = index;
2561   pfile->keep_tokens++;
2562
2563   /* For peeked tokens temporarily disable line_change reporting,
2564      until the tokens are parsed for real.  */
2565   void (*line_change) (cpp_reader *, const cpp_token *, int)
2566     = pfile->cb.line_change;
2567   pfile->cb.line_change = NULL;
2568
2569   do
2570     {
2571       peektok = _cpp_lex_token (pfile);
2572       if (peektok->type == CPP_EOF)
2573         {
2574           index--;
2575           break;
2576         }
2577       else if (peektok->type == CPP_PRAGMA)
2578         {
2579           /* Don't peek past a pragma.  */
2580           if (peektok == &pfile->directive_result)
2581             /* Save the pragma in the buffer.  */
2582             *pfile->cur_token++ = *peektok;
2583           index--;
2584           break;
2585         }
2586     }
2587   while (index--);
2588
2589   _cpp_backup_tokens_direct (pfile, count - index);
2590   pfile->keep_tokens--;
2591   pfile->cb.line_change = line_change;
2592
2593   return peektok;
2594 }
2595
2596 /* Allocate a single token that is invalidated at the same time as the
2597    rest of the tokens on the line.  Has its line and col set to the
2598    same as the last lexed token, so that diagnostics appear in the
2599    right place.  */
2600 cpp_token *
2601 _cpp_temp_token (cpp_reader *pfile)
2602 {
2603   cpp_token *old, *result;
2604   ptrdiff_t sz = pfile->cur_run->limit - pfile->cur_token;
2605   ptrdiff_t la = (ptrdiff_t) pfile->lookaheads;
2606
2607   old = pfile->cur_token - 1;
2608   /* Any pre-existing lookaheads must not be clobbered.  */
2609   if (la)
2610     {
2611       if (sz <= la)
2612         {
2613           tokenrun *next = next_tokenrun (pfile->cur_run);
2614
2615           if (sz < la)
2616             memmove (next->base + 1, next->base,
2617                      (la - sz) * sizeof (cpp_token));
2618
2619           next->base[0] = pfile->cur_run->limit[-1];
2620         }
2621
2622       if (sz > 1)
2623         memmove (pfile->cur_token + 1, pfile->cur_token,
2624                  MIN (la, sz - 1) * sizeof (cpp_token));
2625     }
2626
2627   if (!sz && pfile->cur_token == pfile->cur_run->limit)
2628     {
2629       pfile->cur_run = next_tokenrun (pfile->cur_run);
2630       pfile->cur_token = pfile->cur_run->base;
2631     }
2632
2633   result = pfile->cur_token++;
2634   result->src_loc = old->src_loc;
2635   return result;
2636 }
2637
2638 /* We're at the beginning of a logical line (so not in
2639   directives-mode) and RESULT is a CPP_NAME with NODE_MODULE set.  See
2640   if we should enter deferred_pragma mode to tokenize the rest of the
2641   line as a module control-line.

   Up to two further tokens are peeked with _cpp_lex_direct.  When the
   line is accepted, the leading keyword tokens are marked NO_EXPAND
   and remapped to the [1] entries of spec_nodes.n_modules, and
   in_deferred_pragma is left set.  Otherwise save_comments,
   in_deferred_pragma and angled_headers are reset.  In both cases the
   peeked tokens are handed to _cpp_backup_tokens_direct, except that a
   peeked CPP_PRAGMA_EOL is discarded.  */
2642
2643 static void
2644 cpp_maybe_module_directive (cpp_reader *pfile, cpp_token *result)
2645 {
2646   unsigned backup = 0; /* Tokens we peeked.  */
2647   cpp_hashnode *node = result->val.node.node;
2648   cpp_token *peek = result;
2649   cpp_token *keyword = peek;
2650   cpp_hashnode *(&n_modules)[spec_nodes::M_HWM][2] = pfile->spec_nodes.n_modules;
2651   int header_count = 0; /* Nonzero only for the import keywords.  */
2652
2653   /* Make sure the incoming state is as we expect it.  This way we
2654      can restore it using constants.  */
2655   gcc_checking_assert (!pfile->state.in_deferred_pragma
2656                        && !pfile->state.skipping
2657                        && !pfile->state.parsing_args
2658                        && !pfile->state.angled_headers
2659                        && (pfile->state.save_comments
2660                            == !CPP_OPTION (pfile, discard_comments)));
2661
2662   /* Enter directives mode sufficiently for peeking.  We don't have
2663      to actually set in_directive.  */
2664   pfile->state.in_deferred_pragma = true;
2665
2666   /* These two fields are needed to process tokenization in deferred
2667      pragma mode.  They are not used outside deferred pragma mode or
2668      directives mode.  */
2669   pfile->state.pragma_allow_expansion = true;
2670   pfile->directive_line = result->src_loc;
2671
2672   /* Saving comments is incompatible with directives mode.   */
2673   pfile->state.save_comments = 0;
2674
  /* A leading export must be followed by another NODE_MODULE name,
     which then becomes KEYWORD.  */
2675   if (node == n_modules[spec_nodes::M_EXPORT][0])
2676     {
2677       peek = _cpp_lex_direct (pfile);
2678       keyword = peek;
2679       backup++;
2680       if (keyword->type != CPP_NAME)
2681         goto not_module;
2682       node = keyword->val.node.node;
2683       if (!(node->flags & NODE_MODULE))
2684         goto not_module;
2685     }
2686
  /* NOTE(review): header_count is later stored in
     state.directive_file_token; the meaning of its + 2 and + 16
     components is decided by the reader of that field and is not
     visible in this function -- confirm there.  */
2687   if (node == n_modules[spec_nodes::M__IMPORT][0])
2688     /* __import  */
2689     header_count = backup + 2 + 16;
2690   else if (node == n_modules[spec_nodes::M_IMPORT][0])
2691     /* import  */
2692     header_count = backup + 2 + (CPP_OPTION (pfile, preprocessed) ? 16 : 0);
2693   else if (node == n_modules[spec_nodes::M_MODULE][0])
2694     ; /* module  */
2695   else
2696     goto not_module;
2697
2698   /* We've seen [export] {module|import|__import}.  Check the next token.  */
2699   if (header_count)
2700     /* After '{,__}import' a header name may appear.  */
2701     pfile->state.angled_headers = true;
2702   peek = _cpp_lex_direct (pfile);
2703   backup++;
2704
2705   /* ... import followed by identifier, ':', '<' or
2706      header-name preprocessing tokens, or module
2707      followed by cpp-identifier, ':' or ';' preprocessing
2708      tokens.  C++ keywords are not yet relevant.  */
2709   if (peek->type == CPP_NAME
2710       || peek->type == CPP_COLON
2711       ||  (header_count
2712            ? (peek->type == CPP_LESS
2713               || (peek->type == CPP_STRING && peek->val.str.text[0] != 'R')
2714               || peek->type == CPP_HEADER_NAME)
2715            : peek->type == CPP_SEMICOLON))
2716     {
      /* Macro expansion on the rest of the line is allowed unless the
         input is already preprocessed.  */
2717       pfile->state.pragma_allow_expansion = !CPP_OPTION (pfile, preprocessed);
2718       if (!pfile->state.pragma_allow_expansion)
2719         pfile->state.prevent_expansion++;
2720
2721       if (!header_count && linemap_included_from
2722           (LINEMAPS_LAST_ORDINARY_MAP (pfile->line_table)))
2723         cpp_error_with_line (pfile, CPP_DL_ERROR, keyword->src_loc, 0,
2724                              "module control-line cannot be in included file");
2725
2726       /* The first one or two tokens cannot be macro names.  */
      /* Visits KEYWORD for every nonzero IX and RESULT for IX == 0;
         without a leading export the two are the same token.  */
2727       for (int ix = backup; ix--;)
2728         {
2729           cpp_token *tok = ix ? keyword : result;
2730           cpp_hashnode *node = tok->val.node.node;
2731
2732           /* Don't attempt to expand the token.  */
2733           tok->flags |= NO_EXPAND;
2734           if (_cpp_defined_macro_p (node)
2735               && _cpp_maybe_notify_macro_use (pfile, node, tok->src_loc)
2736               && !cpp_fun_like_macro_p (node))
2737             cpp_error_with_line (pfile, CPP_DL_ERROR, tok->src_loc, 0,
2738                                  "module control-line \"%s\" cannot be"
2739                                  " an object-like macro",
2740                                  NODE_NAME (node));
2741         }
2742
2743       /* Map to underbar variants.  */
2744       keyword->val.node.node = n_modules[header_count
2745                                          ? spec_nodes::M_IMPORT
2746                                          : spec_nodes::M_MODULE][1];
2747       if (backup != 1)
2748         result->val.node.node = n_modules[spec_nodes::M_EXPORT][1];
2749
2750       /* Maybe tell the tokenizer we expect a header-name down the
2751          road.  */
2752       pfile->state.directive_file_token = header_count;
2753     }
2754   else
2755     {
      /* The gotos above land here too, so every rejection shares this
         restore path.  */
2756     not_module:
2757       /* Drop out of directive mode.  */
2758       /* We asserted save_comments had this value upon entry.  */
2759       pfile->state.save_comments
2760         = !CPP_OPTION (pfile, discard_comments);
2761       pfile->state.in_deferred_pragma = false;
2762       /* Do not let this remain on.  */
2763       pfile->state.angled_headers = false;
2764     }
2765
2766   /* In either case we want to backup the peeked tokens.  */
2767   if (backup)
2768     {
2769       /* If we saw EOL, we should drop it, because this isn't a module
2770          control-line after all.  */
2771       bool eol = peek->type == CPP_PRAGMA_EOL;
2772       if (!eol || backup > 1)
2773         {
2774           /* Put the peeked tokens back.  */
2775           _cpp_backup_tokens_direct (pfile, backup);
2776           /* But if the last one was an EOL, forget it.  */
2777           if (eol)
2778             pfile->lookaheads--;
2779         }
2780     }
2781 }
2782
2783 /* Lex a token into RESULT (external interface).  Takes care of issues
2784    like directive handling, token lookahead, multiple include
2785    optimization and skipping.

   The pointer returned is either a slot in the current token run (a
   pending lookahead, or a token just produced by _cpp_lex_direct) or
   the address of pfile->directive_result.  */
2786 const cpp_token *
2787 _cpp_lex_token (cpp_reader *pfile)
2788 {
2789   cpp_token *result;
2790
2791   for (;;)
2792     {
      /* Step to the next token run once the current one is used up.  */
2793       if (pfile->cur_token == pfile->cur_run->limit)
2794         {
2795           pfile->cur_run = next_tokenrun (pfile->cur_run);
2796           pfile->cur_token = pfile->cur_run->base;
2797         }
2798       /* We assume that the current token is somewhere in the current
2799          run.  */
2800       if (pfile->cur_token < pfile->cur_run->base
2801           || pfile->cur_token >= pfile->cur_run->limit)
2802         abort ();
2803
      /* Hand out a backed-up token before lexing anything new.  */
2804       if (pfile->lookaheads)
2805         {
2806           pfile->lookaheads--;
2807           result = pfile->cur_token++;
2808         }
2809       else
2810         result = _cpp_lex_direct (pfile);
2811
2812       if (result->flags & BOL)
2813         {
2814           /* Is this a directive.  If _cpp_handle_directive returns
2815              false, it is an assembler #.  */
2816           if (result->type == CPP_HASH
2817               /* 6.10.3 p 11: Directives in a list of macro arguments
2818                  gives undefined behavior.  This implementation
2819                  handles the directive as normal.  */
2820               && pfile->state.parsing_args != 1)
2821             {
2822               if (_cpp_handle_directive (pfile, result->flags & PREV_WHITE))
2823                 {
                  /* A CPP_PADDING directive result carries no token
                     for the caller, so go round again.  */
2824                   if (pfile->directive_result.type == CPP_PADDING)
2825                     continue;
2826                   result = &pfile->directive_result;
2827                 }
2828             }
2829           else if (pfile->state.in_deferred_pragma)
2830             result = &pfile->directive_result;
2831           else if (result->type == CPP_NAME
2832                    && (result->val.node.node->flags & NODE_MODULE)
2833                    && !pfile->state.skipping
2834                    /* Unlike regular directives, we do not deal with
2835                       tokenizing module directives as macro arguments.
2836                       That's not permitted.  */
2837                    && !pfile->state.parsing_args)
2838             {
2839               /* P1857.  Before macro expansion, At start of logical
2840                  line ... */
2841               /* We don't have to consider lookaheads at this point.  */
2842               gcc_checking_assert (!pfile->lookaheads);
2843
2844               cpp_maybe_module_directive (pfile, result);
2845             }
2846
          /* Report the start of a new line to the client, unless the
             tokens are being skipped.  */
2847           if (pfile->cb.line_change && !pfile->state.skipping)
2848             pfile->cb.line_change (pfile, result, pfile->state.parsing_args);
2849         }
2850
2851       /* We don't skip tokens in directives.  */
2852       if (pfile->state.in_directive || pfile->state.in_deferred_pragma)
2853         break;
2854
2855       /* Outside a directive, invalidate controlling macros.  At file
2856          EOF, _cpp_lex_direct takes care of popping the buffer, so we never
2857          get here and MI optimization works.  */
2858       pfile->mi_valid = false;
2859
      /* While skipping, drop every token except CPP_EOF and lex
         again.  */
2860       if (!pfile->state.skipping || result->type == CPP_EOF)
2861         break;
2862     }
2863
2864   return result;
2865 }
2866
2867 /* Returns true if a fresh line has been loaded.  */
2868 bool
2869 _cpp_get_fresh_line (cpp_reader *pfile)
2870 {
2871   /* We can't get a new line until we leave the current directive.  */
2872   if (pfile->state.in_directive)
2873     return false;
2874
2875   for (;;)
2876     {
2877       cpp_buffer *buffer = pfile->buffer;
2878
2879       if (!buffer->need_line)
2880         return true;
2881
2882       if (buffer->next_line < buffer->rlimit)
2883         {
2884           _cpp_clean_line (pfile);
2885           return true;
2886         }
2887
2888       /* First, get out of parsing arguments state.  */
2889       if (pfile->state.parsing_args)
2890         return false;
2891
2892       /* End of buffer.  Non-empty files should end in a newline.  */
2893       if (buffer->buf != buffer->rlimit
2894           && buffer->next_line > buffer->rlimit
2895           && !buffer->from_stage3)
2896         {
2897           /* Clip to buffer size.  */
2898           buffer->next_line = buffer->rlimit;
2899         }
2900
2901       if (buffer->prev && !buffer->return_at_eof)
2902         _cpp_pop_buffer (pfile);
2903       else
2904         {
2905           /* End of translation.  Do not pop the buffer yet. Increment
2906              line number so that the EOF token is on a line of its own
2907              (_cpp_lex_direct doesn't increment in that case, because
2908              it's hard for it to distinguish this special case). */
2909           CPP_INCREMENT_LINE (pfile, 0);
2910           return false;
2911         }
2912     }
2913 }
2914
/* Classify a one- or two-character operator.  If the next unread
   character is CHAR, consume it and give the token THEN_TYPE;
   otherwise leave the input position alone and give it ELSE_TYPE.
   Expands to a single statement and relies on variables named buffer
   and result being in scope where it is used.  Every parameter is
   evaluated at most once and is parenthesized, so an arbitrary
   expression may safely be passed for any of them.  */
#define IF_NEXT_IS(CHAR, THEN_TYPE, ELSE_TYPE)          \
  do                                                    \
    {                                                   \
      result->type = (ELSE_TYPE);                       \
      if (*buffer->cur == (CHAR))                       \
        buffer->cur++, result->type = (THEN_TYPE);      \
    }                                                   \
  while (0)
2923
2924 /* Lex a token into pfile->cur_token, which is also incremented, to
2925    get diagnostics pointing to the correct location.
2926
2927    Does not handle issues such as token lookahead, multiple-include
2928    optimization, directives, skipping etc.  This function is only
2929    suitable for use by _cpp_lex_token, and in special cases like
2930    lex_expansion_token which doesn't care for any of these issues.
2931
2932    When meeting a newline, returns CPP_EOF if parsing a directive,
2933    otherwise returns to the start of the token buffer if permissible.
2934    Returns a pointer to the lexed token.  That is the slot taken from
   pfile->cur_token on entry, unless the token run was rewound to
   pfile->base_run when a fresh line was started.  */
2935 cpp_token *
2936 _cpp_lex_direct (cpp_reader *pfile)
2937 {
2938   cppchar_t c;
2939   cpp_buffer *buffer;
2940   const unsigned char *comment_start;
2941   bool fallthrough_comment = false;
2942   cpp_token *result = pfile->cur_token++;
2943
  /* Reached on entry and again after every newline that does not end
     a deferred pragma.  */
2944  fresh_line:
2945   result->flags = 0;
2946   buffer = pfile->buffer;
2947   if (buffer->need_line)
2948     {
2949       gcc_assert (!pfile->state.in_deferred_pragma);
2950       if (!_cpp_get_fresh_line (pfile))
2951         {
2952           result->type = CPP_EOF;
2953           /* Not a real EOF in a directive or arg parsing -- we refuse
2954              to advance to the next file now, and will once we're out
2955              of those modes.  */
2956           if (!pfile->state.in_directive && !pfile->state.parsing_args)
2957             {
2958               /* Tell the compiler the line number of the EOF token.  */
2959               result->src_loc = pfile->line_table->highest_line;
2960               result->flags = BOL;
2961               /* Now pop the buffer that _cpp_get_fresh_line did not.  */
2962               _cpp_pop_buffer (pfile);
2963             }
2964           return result;
2965         }
      /* A FALLTHROUGH comment seen in a previous buffer does not carry
         over into a different one.  */
2966       if (buffer != pfile->buffer)
2967         fallthrough_comment = false;
      /* Unless tokens must be kept, reuse the token storage from the
         start of base_run for the new line.  */
2968       if (!pfile->keep_tokens)
2969         {
2970           pfile->cur_run = &pfile->base_run;
2971           result = pfile->base_run.base;
2972           pfile->cur_token = result + 1;
2973         }
2974       result->flags = BOL;
2975       if (pfile->state.parsing_args == 2)
2976         result->flags |= PREV_WHITE;
2977     }
2978   buffer = pfile->buffer;
  /* Also re-entered after a comment that is not being saved.  */
2979  update_tokens_line:
2980   result->src_loc = pfile->line_table->highest_line;
2981
  /* Re-entered after each call to skip_whitespace below.  */
2982  skipped_white:
2983   if (buffer->cur >= buffer->notes[buffer->cur_note].pos
2984       && !pfile->overlaid_buffer)
2985     {
2986       _cpp_process_line_notes (pfile, false);
2987       result->src_loc = pfile->line_table->highest_line;
2988     }
2989   c = *buffer->cur++;
2990
2991   if (pfile->forced_token_location)
2992     result->src_loc = pfile->forced_token_location;
2993   else
2994     result->src_loc = linemap_position_for_column (pfile->line_table,
2995                                           CPP_BUF_COLUMN (buffer, buffer->cur));
2996
  /* Dispatch on the first character of the token.  */
2997   switch (c)
2998     {
2999     case ' ': case '\t': case '\f': case '\v': case '\0':
3000       result->flags |= PREV_WHITE;
3001       skip_whitespace (pfile, c);
3002       goto skipped_white;
3003
3004     case '\n':
3005       /* Increment the line, unless this is the last line ...  */
3006       if (buffer->cur < buffer->rlimit
3007           /* ... or this is a #include, (where _cpp_stack_file needs to
3008              unwind by one line) ...  */
3009           || (pfile->state.in_directive > 1
3010               /* ... except traditional-cpp increments this elsewhere.  */
3011               && !CPP_OPTION (pfile, traditional)))
3012         CPP_INCREMENT_LINE (pfile, 0);
3013       buffer->need_line = true;
3014       if (pfile->state.in_deferred_pragma)
3015         {
3016           /* Produce the PRAGMA_EOL on this line.  File reading
3017              ensures there is always a \n at end of the buffer, thus
3018              in a deferred pragma we always see CPP_PRAGMA_EOL before
3019              any CPP_EOF.  */
3020           result->type = CPP_PRAGMA_EOL;
3021           result->flags &= ~PREV_WHITE;
3022           pfile->state.in_deferred_pragma = false;
3023           if (!pfile->state.pragma_allow_expansion)
3024             pfile->state.prevent_expansion--;
3025           return result;
3026         }
3027       goto fresh_line;
3028
3029     case '0': case '1': case '2': case '3': case '4':
3030     case '5': case '6': case '7': case '8': case '9':
3031       {
3032         struct normalize_state nst = INITIAL_NORMALIZE_STATE;
3033         result->type = CPP_NUMBER;
3034         lex_number (pfile, &result->val.str, &nst);
3035         warn_about_normalization (pfile, result, &nst);
3036         break;
3037       }
3038
3039     case 'L':
3040     case 'u':
3041     case 'U':
3042     case 'R':
3043       /* 'L', 'u', 'U', 'u8' or 'R' may introduce wide characters,
3044          wide strings or raw strings.  */
3045       if (c == 'L' || CPP_OPTION (pfile, rliterals)
3046           || (c != 'R' && CPP_OPTION (pfile, uliterals)))
3047         {
3048           if ((*buffer->cur == '\'' && c != 'R')
3049               || *buffer->cur == '"'
3050               || (*buffer->cur == 'R'
3051                   && c != 'R'
3052                   && buffer->cur[1] == '"'
3053                   && CPP_OPTION (pfile, rliterals))
3054               || (*buffer->cur == '8'
3055                   && c == 'u'
3056                   && ((buffer->cur[1] == '"' || (buffer->cur[1] == '\''
3057                                 && CPP_OPTION (pfile, utf8_char_literals)))
3058                       || (buffer->cur[1] == 'R' && buffer->cur[2] == '"'
3059                           && CPP_OPTION (pfile, rliterals)))))
3060             {
3061               lex_string (pfile, result, buffer->cur - 1);
3062               break;
3063             }
3064         }
3065       /* Fall through.  */
3066
3067     case '_':
3068     case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
3069     case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
3070     case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
3071     case 's': case 't':           case 'v': case 'w': case 'x':
3072     case 'y': case 'z':
3073     case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
3074     case 'G': case 'H': case 'I': case 'J': case 'K':
3075     case 'M': case 'N': case 'O': case 'P': case 'Q':
3076     case 'S': case 'T':           case 'V': case 'W': case 'X':
3077     case 'Y': case 'Z':
3078       result->type = CPP_NAME;
3079       {
3080         struct normalize_state nst = INITIAL_NORMALIZE_STATE;
3081         result->val.node.node = lex_identifier (pfile, buffer->cur - 1, false,
3082                                                 &nst,
3083                                                 &result->val.node.spelling);
3084         warn_about_normalization (pfile, result, &nst);
3085       }
3086
3087       /* Convert named operators to their proper types.  */
3088       if (result->val.node.node->flags & NODE_OPERATOR)
3089         {
3090           result->flags |= NAMED_OP;
3091           result->type = (enum cpp_ttype) result->val.node.node->directive_index;
3092         }
3093
3094       /* Signal FALLTHROUGH comment followed by another token.  */
3095       if (fallthrough_comment)
3096         result->flags |= PREV_FALLTHROUGH;
3097       break;
3098
3099     case '\'':
3100     case '"':
3101       lex_string (pfile, result, buffer->cur - 1);
3102       break;
3103
3104     case '/':
3105       /* A potential block or line comment.  */
3106       comment_start = buffer->cur;
      /* From here on C holds the character after the slash; it selects
         the branch below and is later passed to save_comment.  */
3107       c = *buffer->cur;
3108
3109       if (c == '*')
3110         {
3111           if (_cpp_skip_block_comment (pfile))
3112             cpp_error (pfile, CPP_DL_ERROR, "unterminated comment");
3113         }
3114       else if (c == '/' && ! CPP_OPTION (pfile, traditional))
3115         {
3116           /* Don't warn for system headers.  */
3117           if (_cpp_in_system_header (pfile))
3118             ;
3119           /* Warn about comments if pedantically GNUC89, and not
3120              in system headers.  */
3121           else if (CPP_OPTION (pfile, lang) == CLK_GNUC89
3122                    && CPP_PEDANTIC (pfile)
3123                    && ! buffer->warned_cplusplus_comments)
3124             {
3125               if (cpp_error (pfile, CPP_DL_PEDWARN,
3126                              "C++ style comments are not allowed in ISO C90"))
3127                 cpp_error (pfile, CPP_DL_NOTE,
3128                            "(this will be reported only once per input file)");
3129               buffer->warned_cplusplus_comments = 1;
3130             }
3131           /* Or if specifically desired via -Wc90-c99-compat.  */
3132           else if (CPP_OPTION (pfile, cpp_warn_c90_c99_compat) > 0
3133                    && ! CPP_OPTION (pfile, cplusplus)
3134                    && ! buffer->warned_cplusplus_comments)
3135             {
3136               if (cpp_error (pfile, CPP_DL_WARNING,
3137                              "C++ style comments are incompatible with C90"))
3138                 cpp_error (pfile, CPP_DL_NOTE,
3139                            "(this will be reported only once per input file)");
3140               buffer->warned_cplusplus_comments = 1;
3141             }
3142           /* In C89/C94, C++ style comments are forbidden.  */
3143           else if ((CPP_OPTION (pfile, lang) == CLK_STDC89
3144                     || CPP_OPTION (pfile, lang) == CLK_STDC94))
3145             {
3146               /* But don't be confused about valid code such as
3147                  - // immediately followed by *,
3148                  - // in a preprocessing directive,
3149                  - // in an #if 0 block.  */
3150               if (buffer->cur[1] == '*'
3151                   || pfile->state.in_directive
3152                   || pfile->state.skipping)
3153                 {
3154                   result->type = CPP_DIV;
3155                   break;
3156                 }
3157               else if (! buffer->warned_cplusplus_comments)
3158                 {
3159                   if (cpp_error (pfile, CPP_DL_ERROR,
3160                                  "C++ style comments are not allowed in "
3161                                  "ISO C90"))
3162                     cpp_error (pfile, CPP_DL_NOTE,
3163                                "(this will be reported only once per input "
3164                                "file)");
3165                   buffer->warned_cplusplus_comments = 1;
3166                 }
3167             }
3168           if (skip_line_comment (pfile) && CPP_OPTION (pfile, warn_comments))
3169             cpp_warning (pfile, CPP_W_COMMENTS, "multi-line comment");
3170         }
3171       else if (c == '=')
3172         {
3173           buffer->cur++;
3174           result->type = CPP_DIV_EQ;
3175           break;
3176         }
3177       else
3178         {
3179           result->type = CPP_DIV;
3180           break;
3181         }
3182
      /* Only the two comment branches reach this point; every other
         branch above has already left the switch.  */
3183       if (fallthrough_comment_p (pfile, comment_start))
3184         fallthrough_comment = true;
3185
3186       if (pfile->cb.comment)
3187         {
3188           size_t len = pfile->buffer->cur - comment_start;
3189           pfile->cb.comment (pfile, result->src_loc, comment_start - 1,
3190                              len + 1);
3191         }
3192
      /* A comment that is not saved counts as whitespace: restart
         lexing in the same token slot.  */
3193       if (!pfile->state.save_comments)
3194         {
3195           result->flags |= PREV_WHITE;
3196           goto update_tokens_line;
3197         }
3198
3199       if (fallthrough_comment)
3200         result->flags |= PREV_FALLTHROUGH;
3201
3202       /* Save the comment as a token in its own right.  */
3203       save_comment (pfile, result, comment_start, c);
3204       break;
3205
3206     case '<':
      /* With angled_headers set, let lex_string try first; carry on
         with operator lexing only if it left the type as CPP_LESS.  */
3207       if (pfile->state.angled_headers)
3208         {
3209           lex_string (pfile, result, buffer->cur - 1);
3210           if (result->type != CPP_LESS)
3211             break;
3212         }
3213
3214       result->type = CPP_LESS;
3215       if (*buffer->cur == '=')
3216         {
3217           buffer->cur++, result->type = CPP_LESS_EQ;
3218           if (*buffer->cur == '>'
3219               && CPP_OPTION (pfile, cplusplus)
3220               && CPP_OPTION (pfile, lang) >= CLK_GNUCXX20)
3221             buffer->cur++, result->type = CPP_SPACESHIP;
3222         }
3223       else if (*buffer->cur == '<')
3224         {
3225           buffer->cur++;
3226           IF_NEXT_IS ('=', CPP_LSHIFT_EQ, CPP_LSHIFT);
3227         }
3228       else if (CPP_OPTION (pfile, digraphs))
3229         {
3230           if (*buffer->cur == ':')
3231             {
3232               /* C++11 [2.5/3 lex.pptoken], "Otherwise, if the next
3233                  three characters are <:: and the subsequent character
3234                  is neither : nor >, the < is treated as a preprocessor
3235                  token by itself".  */
3236               if (CPP_OPTION (pfile, cplusplus)
3237                   && CPP_OPTION (pfile, lang) != CLK_CXX98
3238                   && CPP_OPTION (pfile, lang) != CLK_GNUCXX
3239                   && buffer->cur[1] == ':'
3240                   && buffer->cur[2] != ':' && buffer->cur[2] != '>')
3241                 break;
3242
3243               buffer->cur++;
3244               result->flags |= DIGRAPH;
3245               result->type = CPP_OPEN_SQUARE;
3246             }
3247           else if (*buffer->cur == '%')
3248             {
3249               buffer->cur++;
3250               result->flags |= DIGRAPH;
3251               result->type = CPP_OPEN_BRACE;
3252             }
3253         }
3254       break;
3255
3256     case '>':
3257       result->type = CPP_GREATER;
3258       if (*buffer->cur == '=')
3259         buffer->cur++, result->type = CPP_GREATER_EQ;
3260       else if (*buffer->cur == '>')
3261         {
3262           buffer->cur++;
3263           IF_NEXT_IS ('=', CPP_RSHIFT_EQ, CPP_RSHIFT);
3264         }
3265       break;
3266
3267     case '%':
3268       result->type = CPP_MOD;
3269       if (*buffer->cur == '=')
3270         buffer->cur++, result->type = CPP_MOD_EQ;
3271       else if (CPP_OPTION (pfile, digraphs))
3272         {
3273           if (*buffer->cur == ':')
3274             {
3275               buffer->cur++;
3276               result->flags |= DIGRAPH;
3277               result->type = CPP_HASH;
3278               if (*buffer->cur == '%' && buffer->cur[1] == ':')
3279                 buffer->cur += 2, result->type = CPP_PASTE, result->val.token_no = 0;
3280             }
3281           else if (*buffer->cur == '>')
3282             {
3283               buffer->cur++;
3284               result->flags |= DIGRAPH;
3285               result->type = CPP_CLOSE_BRACE;
3286             }
3287         }
3288       break;
3289
3290     case '.':
3291       result->type = CPP_DOT;
3292       if (ISDIGIT (*buffer->cur))
3293         {
3294           struct normalize_state nst = INITIAL_NORMALIZE_STATE;
3295           result->type = CPP_NUMBER;
3296           lex_number (pfile, &result->val.str, &nst);
3297           warn_about_normalization (pfile, result, &nst);
3298         }
3299       else if (*buffer->cur == '.' && buffer->cur[1] == '.')
3300         buffer->cur += 2, result->type = CPP_ELLIPSIS;
3301       else if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
3302         buffer->cur++, result->type = CPP_DOT_STAR;
3303       break;
3304
3305     case '+':
3306       result->type = CPP_PLUS;
3307       if (*buffer->cur == '+')
3308         buffer->cur++, result->type = CPP_PLUS_PLUS;
3309       else if (*buffer->cur == '=')
3310         buffer->cur++, result->type = CPP_PLUS_EQ;
3311       break;
3312
3313     case '-':
3314       result->type = CPP_MINUS;
3315       if (*buffer->cur == '>')
3316         {
3317           buffer->cur++;
3318           result->type = CPP_DEREF;
3319           if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
3320             buffer->cur++, result->type = CPP_DEREF_STAR;
3321         }
3322       else if (*buffer->cur == '-')
3323         buffer->cur++, result->type = CPP_MINUS_MINUS;
3324       else if (*buffer->cur == '=')
3325         buffer->cur++, result->type = CPP_MINUS_EQ;
3326       break;
3327
3328     case '&':
3329       result->type = CPP_AND;
3330       if (*buffer->cur == '&')
3331         buffer->cur++, result->type = CPP_AND_AND;
3332       else if (*buffer->cur == '=')
3333         buffer->cur++, result->type = CPP_AND_EQ;
3334       break;
3335
3336     case '|':
3337       result->type = CPP_OR;
3338       if (*buffer->cur == '|')
3339         buffer->cur++, result->type = CPP_OR_OR;
3340       else if (*buffer->cur == '=')
3341         buffer->cur++, result->type = CPP_OR_EQ;
3342       break;
3343
3344     case ':':
3345       result->type = CPP_COLON;
3346       if (*buffer->cur == ':' && CPP_OPTION (pfile, scope))
3347         buffer->cur++, result->type = CPP_SCOPE;
3348       else if (*buffer->cur == '>' && CPP_OPTION (pfile, digraphs))
3349         {
3350           buffer->cur++;
3351           result->flags |= DIGRAPH;
3352           result->type = CPP_CLOSE_SQUARE;
3353         }
3354       break;
3355
3356     case '*': IF_NEXT_IS ('=', CPP_MULT_EQ, CPP_MULT); break;
3357     case '=': IF_NEXT_IS ('=', CPP_EQ_EQ, CPP_EQ); break;
3358     case '!': IF_NEXT_IS ('=', CPP_NOT_EQ, CPP_NOT); break;
3359     case '^': IF_NEXT_IS ('=', CPP_XOR_EQ, CPP_XOR); break;
3360     case '#': IF_NEXT_IS ('#', CPP_PASTE, CPP_HASH); result->val.token_no = 0; break;
3361
3362     case '?': result->type = CPP_QUERY; break;
3363     case '~': result->type = CPP_COMPL; break;
3364     case ',': result->type = CPP_COMMA; break;
3365     case '(': result->type = CPP_OPEN_PAREN; break;
3366     case ')': result->type = CPP_CLOSE_PAREN; break;
3367     case '[': result->type = CPP_OPEN_SQUARE; break;
3368     case ']': result->type = CPP_CLOSE_SQUARE; break;
3369     case '{': result->type = CPP_OPEN_BRACE; break;
3370     case '}': result->type = CPP_CLOSE_BRACE; break;
3371     case ';': result->type = CPP_SEMICOLON; break;
3372
3373       /* @ is a punctuator in Objective-C.  */
3374     case '@': result->type = CPP_ATSIGN; break;
3375
3376     default:
3377       {
        /* Step back so that BASE points at the character just read.  */
3378         const uchar *base = --buffer->cur;
3379
3380         /* Check for an extended identifier ($ or UCN or UTF-8).  */
3381         struct normalize_state nst = INITIAL_NORMALIZE_STATE;
3382         if (forms_identifier_p (pfile, true, &nst))
3383           {
3384             result->type = CPP_NAME;
3385             result->val.node.node = lex_identifier (pfile, base, true, &nst,
3386                                                     &result->val.node.spelling);
3387             warn_about_normalization (pfile, result, &nst);
3388             break;
3389           }
3390
3391         /* Otherwise this will form a CPP_OTHER token.  Parse valid UTF-8 as a
3392            single token.  */
3393         buffer->cur++;
3394         if (c >= utf8_signifier)
3395           {
3396             const uchar *pstr = base;
3397             cppchar_t s;
3398             if (_cpp_valid_utf8 (pfile, &pstr, buffer->rlimit, 0, NULL, &s))
3399               buffer->cur = pstr;
3400           }
3401         create_literal (pfile, result, base, buffer->cur - base, CPP_OTHER);
3402         break;
3403       }
3404
3405     }
3406
3407   /* Potentially convert the location of the token to a range.  */
3408   if (result->src_loc >= RESERVED_LOCATION_COUNT
3409       && result->type != CPP_EOF)
3410     {
3411       /* Ensure that any line notes are processed, so that we have the
3412          correct physical line/column for the end-point of the token even
3413          when a logical line is split via one or more backslashes.  */
3414       if (buffer->cur >= buffer->notes[buffer->cur_note].pos
3415           && !pfile->overlaid_buffer)
3416         _cpp_process_line_notes (pfile, false);
3417
      /* The range runs from the location recorded for the token to the
         column of the current read position.  */
3418       source_range tok_range;
3419       tok_range.m_start = result->src_loc;
3420       tok_range.m_finish
3421         = linemap_position_for_column (pfile->line_table,
3422                                        CPP_BUF_COLUMN (buffer, buffer->cur));
3423
3424       result->src_loc = COMBINE_LOCATION_DATA (pfile->line_table,
3425                                                result->src_loc,
3426                                                tok_range, NULL);
3427     }
3428
3429   return result;
3430 }
3431
3432 /* An upper bound on the number of bytes needed to spell TOKEN.
3433    Does not include preceding whitespace.  */
3434 unsigned int
3435 cpp_token_len (const cpp_token *token)
3436 {
3437   unsigned int len;
3438
3439   switch (TOKEN_SPELL (token))
3440     {
3441     default:            len = 6;                                break;
3442     case SPELL_LITERAL: len = token->val.str.len;               break;
3443     case SPELL_IDENT:   len = NODE_LEN (token->val.node.node) * 10;     break;
3444     }
3445
3446   return len;
3447 }
3448
/* Decode the UTF-8 sequence that starts at NAME and store it at
   BUFFER as a \U escape: a backslash, 'U' and eight lower-case hex
   digits, so exactly ten bytes are written.  Returns the number of
   bytes of NAME that made up the sequence.  Aborts if a continuation
   byte is ill-formed.  */

static size_t
utf8_to_ucn (unsigned char *buffer, const unsigned char *name)
{
  static const char hex_digits[] = "0123456789abcdef";
  const unsigned char *cursor = name;
  unsigned lead_bits = *name;
  unsigned long code_point;
  int seq_len = 0;
  int remaining;
  int shift;

  /* The count of leading one-bits in the first byte gives the length
     of the whole sequence.  */
  while (lead_bits & 0x80)
    {
      seq_len++;
      lead_bits <<= 1;
    }

  /* Start with the payload bits of the first byte, then append six
     bits from each byte that follows.  */
  code_point = *name & (0x7F >> seq_len);
  for (remaining = seq_len - 1; remaining > 0; remaining--)
    {
      const unsigned char cont = *++cursor;

      /* Ill-formed UTF-8.  */
      if ((cont & 0xC0) != 0x80)
        abort ();

      code_point = (code_point << 6) | (cont & 0x3F);
    }

  *buffer++ = '\\';
  *buffer++ = 'U';
  for (shift = 28; shift >= 0; shift -= 4)
    *buffer++ = hex_digits[(code_point >> shift) & 0xF];

  return seq_len;
}
3482
3483 /* Given a token TYPE corresponding to a digraph, return a pointer to
3484    the spelling of the digraph.  */
3485 static const unsigned char *
3486 cpp_digraph2name (enum cpp_ttype type)
3487 {
3488   return digraph_spellings[(int) type - (int) CPP_FIRST_DIGRAPH];
3489 }
3490
3491 /* Write the spelling of an identifier IDENT, using UCNs, to BUFFER.
3492    The buffer must already contain the enough space to hold the
3493    token's spelling.  Returns a pointer to the character after the
3494    last character written.  */
3495 unsigned char *
3496 _cpp_spell_ident_ucns (unsigned char *buffer, cpp_hashnode *ident)
3497 {
3498   size_t i;
3499   const unsigned char *name = NODE_NAME (ident);
3500
3501   for (i = 0; i < NODE_LEN (ident); i++)
3502     if (name[i] & ~0x7F)
3503       {
3504         i += utf8_to_ucn (buffer, name + i) - 1;
3505         buffer += 10;
3506       }
3507     else
3508       *buffer++ = name[i];
3509
3510   return buffer;
3511 }
3512
3513 /* Write the spelling of a token TOKEN to BUFFER.  The buffer must
3514    already contain the enough space to hold the token's spelling.
3515    Returns a pointer to the character after the last character written.
3516    FORSTRING is true if this is to be the spelling after translation
3517    phase 1 (with the original spelling of extended identifiers), false
3518    if extended identifiers should always be written using UCNs (there is
3519    no option for always writing them in the internal UTF-8 form).
3520    FIXME: Would be nice if we didn't need the PFILE argument.  */
3521 unsigned char *
3522 cpp_spell_token (cpp_reader *pfile, const cpp_token *token,
3523                  unsigned char *buffer, bool forstring)
3524 {
3525   switch (TOKEN_SPELL (token))
3526     {
3527     case SPELL_OPERATOR:
3528       {
3529         const unsigned char *spelling;
3530         unsigned char c;
3531
3532         if (token->flags & DIGRAPH)
3533           spelling = cpp_digraph2name (token->type);
3534         else if (token->flags & NAMED_OP)
3535           goto spell_ident;
3536         else
3537           spelling = TOKEN_NAME (token);
3538
3539         while ((c = *spelling++) != '\0')
3540           *buffer++ = c;
3541       }
3542       break;
3543
3544     spell_ident:
3545     case SPELL_IDENT:
3546       if (forstring)
3547         {
3548           memcpy (buffer, NODE_NAME (token->val.node.spelling),
3549                   NODE_LEN (token->val.node.spelling));
3550           buffer += NODE_LEN (token->val.node.spelling);
3551         }
3552       else
3553         buffer = _cpp_spell_ident_ucns (buffer, token->val.node.node);
3554       break;
3555
3556     case SPELL_LITERAL:
3557       memcpy (buffer, token->val.str.text, token->val.str.len);
3558       buffer += token->val.str.len;
3559       break;
3560
3561     case SPELL_NONE:
3562       cpp_error (pfile, CPP_DL_ICE,
3563                  "unspellable token %s", TOKEN_NAME (token));
3564       break;
3565     }
3566
3567   return buffer;
3568 }
3569
3570 /* Returns TOKEN spelt as a null-terminated string.  The string is
3571    freed when the reader is destroyed.  Useful for diagnostics.  */
3572 unsigned char *
3573 cpp_token_as_text (cpp_reader *pfile, const cpp_token *token)
3574 {
3575   unsigned int len = cpp_token_len (token) + 1;
3576   unsigned char *start = _cpp_unaligned_alloc (pfile, len), *end;
3577
3578   end = cpp_spell_token (pfile, token, start, false);
3579   end[0] = '\0';
3580
3581   return start;
3582 }
3583
3584 /* Returns a pointer to a string which spells the token defined by
3585    TYPE and FLAGS.  Used by C front ends, which really should move to
3586    using cpp_token_as_text.  */
3587 const char *
3588 cpp_type2name (enum cpp_ttype type, unsigned char flags)
3589 {
3590   if (flags & DIGRAPH)
3591     return (const char *) cpp_digraph2name (type);
3592   else if (flags & NAMED_OP)
3593     return cpp_named_operator2name (type);
3594
3595   return (const char *) token_spellings[type].name;
3596 }
3597
3598 /* Writes the spelling of token to FP, without any preceding space.
3599    Separated from cpp_spell_token for efficiency - to avoid stdio
3600    double-buffering.  */
3601 void
3602 cpp_output_token (const cpp_token *token, FILE *fp)
3603 {
3604   switch (TOKEN_SPELL (token))
3605     {
3606     case SPELL_OPERATOR:
3607       {
3608         const unsigned char *spelling;
3609         int c;
3610
3611         if (token->flags & DIGRAPH)
3612           spelling = cpp_digraph2name (token->type);
3613         else if (token->flags & NAMED_OP)
3614           goto spell_ident;
3615         else
3616           spelling = TOKEN_NAME (token);
3617
3618         c = *spelling;
3619         do
3620           putc (c, fp);
3621         while ((c = *++spelling) != '\0');
3622       }
3623       break;
3624
3625     spell_ident:
3626     case SPELL_IDENT:
3627       {
3628         size_t i;
3629         const unsigned char * name = NODE_NAME (token->val.node.node);
3630
3631         for (i = 0; i < NODE_LEN (token->val.node.node); i++)
3632           if (name[i] & ~0x7F)
3633             {
3634               unsigned char buffer[10];
3635               i += utf8_to_ucn (buffer, name + i) - 1;
3636               fwrite (buffer, 1, 10, fp);
3637             }
3638           else
3639             fputc (NODE_NAME (token->val.node.node)[i], fp);
3640       }
3641       break;
3642
3643     case SPELL_LITERAL:
3644       if (token->type == CPP_HEADER_NAME)
3645         fputc ('"', fp);
3646       fwrite (token->val.str.text, 1, token->val.str.len, fp);
3647       if (token->type == CPP_HEADER_NAME)
3648         fputc ('"', fp);
3649       break;
3650
3651     case SPELL_NONE:
3652       /* An error, most probably.  */
3653       break;
3654     }
3655 }
3656
3657 /* Compare two tokens.  */
3658 int
3659 _cpp_equiv_tokens (const cpp_token *a, const cpp_token *b)
3660 {
3661   if (a->type == b->type && a->flags == b->flags)
3662     switch (TOKEN_SPELL (a))
3663       {
3664       default:                  /* Keep compiler happy.  */
3665       case SPELL_OPERATOR:
3666         /* token_no is used to track where multiple consecutive ##
3667            tokens were originally located.  */
3668         return (a->type != CPP_PASTE || a->val.token_no == b->val.token_no);
3669       case SPELL_NONE:
3670         return (a->type != CPP_MACRO_ARG
3671                 || (a->val.macro_arg.arg_no == b->val.macro_arg.arg_no
3672                     && a->val.macro_arg.spelling == b->val.macro_arg.spelling));
3673       case SPELL_IDENT:
3674         return (a->val.node.node == b->val.node.node
3675                 && a->val.node.spelling == b->val.node.spelling);
3676       case SPELL_LITERAL:
3677         return (a->val.str.len == b->val.str.len
3678                 && !memcmp (a->val.str.text, b->val.str.text,
3679                             a->val.str.len));
3680       }
3681
3682   return 0;
3683 }
3684
3685 /* Returns nonzero if a space should be inserted to avoid an
3686    accidental token paste for output.  For simplicity, it is
3687    conservative, and occasionally advises a space where one is not
3688    needed, e.g. "." and ".2".  */
3689 int
3690 cpp_avoid_paste (cpp_reader *pfile, const cpp_token *token1,
3691                  const cpp_token *token2)
3692 {
3693   enum cpp_ttype a = token1->type, b = token2->type;
3694   cppchar_t c;
3695
3696   if (token1->flags & NAMED_OP)
3697     a = CPP_NAME;
3698   if (token2->flags & NAMED_OP)
3699     b = CPP_NAME;
3700
3701   c = EOF;
3702   if (token2->flags & DIGRAPH)
3703     c = digraph_spellings[(int) b - (int) CPP_FIRST_DIGRAPH][0];
3704   else if (token_spellings[b].category == SPELL_OPERATOR)
3705     c = token_spellings[b].name[0];
3706
3707   /* Quickly get everything that can paste with an '='.  */
3708   if ((int) a <= (int) CPP_LAST_EQ && c == '=')
3709     return 1;
3710
3711   switch (a)
3712     {
3713     case CPP_GREATER:   return c == '>';
3714     case CPP_LESS:      return c == '<' || c == '%' || c == ':';
3715     case CPP_PLUS:      return c == '+';
3716     case CPP_MINUS:     return c == '-' || c == '>';
3717     case CPP_DIV:       return c == '/' || c == '*'; /* Comments.  */
3718     case CPP_MOD:       return c == ':' || c == '>';
3719     case CPP_AND:       return c == '&';
3720     case CPP_OR:        return c == '|';
3721     case CPP_COLON:     return c == ':' || c == '>';
3722     case CPP_DEREF:     return c == '*';
3723     case CPP_DOT:       return c == '.' || c == '%' || b == CPP_NUMBER;
3724     case CPP_HASH:      return c == '#' || c == '%'; /* Digraph form.  */
3725     case CPP_PRAGMA:
3726     case CPP_NAME:      return ((b == CPP_NUMBER
3727                                  && name_p (pfile, &token2->val.str))
3728                                 || b == CPP_NAME
3729                                 || b == CPP_CHAR || b == CPP_STRING); /* L */
3730     case CPP_NUMBER:    return (b == CPP_NUMBER || b == CPP_NAME
3731                                 || b == CPP_CHAR
3732                                 || c == '.' || c == '+' || c == '-');
3733                                       /* UCNs */
3734     case CPP_OTHER:     return ((token1->val.str.text[0] == '\\'
3735                                  && b == CPP_NAME)
3736                                 || (CPP_OPTION (pfile, objc)
3737                                     && token1->val.str.text[0] == '@'
3738                                     && (b == CPP_NAME || b == CPP_STRING)));
3739     case CPP_LESS_EQ:   return c == '>';
3740     case CPP_STRING:
3741     case CPP_WSTRING:
3742     case CPP_UTF8STRING:
3743     case CPP_STRING16:
3744     case CPP_STRING32:  return (CPP_OPTION (pfile, user_literals)
3745                                 && (b == CPP_NAME
3746                                     || (TOKEN_SPELL (token2) == SPELL_LITERAL
3747                                         && ISIDST (token2->val.str.text[0]))));
3748
3749     default:            break;
3750     }
3751
3752   return 0;
3753 }
3754
3755 /* Output all the remaining tokens on the current line, and a newline
3756    character, to FP.  Leading whitespace is removed.  If there are
3757    macros, special token padding is not performed.  */
3758 void
3759 cpp_output_line (cpp_reader *pfile, FILE *fp)
3760 {
3761   const cpp_token *token;
3762
3763   token = cpp_get_token (pfile);
3764   while (token->type != CPP_EOF)
3765     {
3766       cpp_output_token (token, fp);
3767       token = cpp_get_token (pfile);
3768       if (token->flags & PREV_WHITE)
3769         putc (' ', fp);
3770     }
3771
3772   putc ('\n', fp);
3773 }
3774
3775 /* Return a string representation of all the remaining tokens on the
3776    current line.  The result is allocated using xmalloc and must be
3777    freed by the caller.  */
3778 unsigned char *
3779 cpp_output_line_to_string (cpp_reader *pfile, const unsigned char *dir_name)
3780 {
3781   const cpp_token *token;
3782   unsigned int out = dir_name ? ustrlen (dir_name) : 0;
3783   unsigned int alloced = 120 + out;
3784   unsigned char *result = (unsigned char *) xmalloc (alloced);
3785
3786   /* If DIR_NAME is empty, there are no initial contents.  */
3787   if (dir_name)
3788     {
3789       sprintf ((char *) result, "#%s ", dir_name);
3790       out += 2;
3791     }
3792
3793   token = cpp_get_token (pfile);
3794   while (token->type != CPP_EOF)
3795     {
3796       unsigned char *last;
3797       /* Include room for a possible space and the terminating nul.  */
3798       unsigned int len = cpp_token_len (token) + 2;
3799
3800       if (out + len > alloced)
3801         {
3802           alloced *= 2;
3803           if (out + len > alloced)
3804             alloced = out + len;
3805           result = (unsigned char *) xrealloc (result, alloced);
3806         }
3807
3808       last = cpp_spell_token (pfile, token, &result[out], 0);
3809       out = last - result;
3810
3811       token = cpp_get_token (pfile);
3812       if (token->flags & PREV_WHITE)
3813         result[out++] = ' ';
3814     }
3815
3816   result[out] = '\0';
3817   return result;
3818 }
3819
/* Memory buffers.  Changing these three constants can have a dramatic
   effect on performance.  The values here are reasonable defaults,
   but might be tuned.  If you adjust them, be sure to test across a
   range of uses of cpplib, including heavy nested function-like macro
   expansion.  Also check the change in peak memory usage (NJAMD is a
   good tool for this).

   MIN_BUFF_SIZE is the smallest buffer new_buff creates.
   BUFF_SIZE_UPPER_BOUND (MIN_SIZE) is the largest free buffer that
   _cpp_get_buff will hand out for a request of MIN_SIZE bytes.
   EXTENDED_BUFF_SIZE (BUFF, MIN_EXTRA) is the size requested when
   BUFF is extended: MIN_EXTRA plus twice the distance from BUFF's cur
   pointer to its limit.  Every macro argument is parenthesized so
   that an argument containing operators expands as a single operand.  */
#define MIN_BUFF_SIZE 8000
#define BUFF_SIZE_UPPER_BOUND(MIN_SIZE) (MIN_BUFF_SIZE + (MIN_SIZE) * 3 / 2)
#define EXTENDED_BUFF_SIZE(BUFF, MIN_EXTRA) \
        ((MIN_EXTRA) + ((BUFF)->limit - (BUFF)->cur) * 2)

#if MIN_BUFF_SIZE > BUFF_SIZE_UPPER_BOUND (0)
  #error BUFF_SIZE_UPPER_BOUND must be at least as large as MIN_BUFF_SIZE!
#endif
3834
3835 /* Create a new allocation buffer.  Place the control block at the end
3836    of the buffer, so that buffer overflows will cause immediate chaos.  */
3837 static _cpp_buff *
3838 new_buff (size_t len)
3839 {
3840   _cpp_buff *result;
3841   unsigned char *base;
3842
3843   if (len < MIN_BUFF_SIZE)
3844     len = MIN_BUFF_SIZE;
3845   len = CPP_ALIGN (len);
3846
3847 #ifdef ENABLE_VALGRIND_ANNOTATIONS
3848   /* Valgrind warns about uses of interior pointers, so put _cpp_buff
3849      struct first.  */
3850   size_t slen = CPP_ALIGN2 (sizeof (_cpp_buff), 2 * DEFAULT_ALIGNMENT);
3851   base = XNEWVEC (unsigned char, len + slen);
3852   result = (_cpp_buff *) base;
3853   base += slen;
3854 #else
3855   base = XNEWVEC (unsigned char, len + sizeof (_cpp_buff));
3856   result = (_cpp_buff *) (base + len);
3857 #endif
3858   result->base = base;
3859   result->cur = base;
3860   result->limit = base + len;
3861   result->next = NULL;
3862   return result;
3863 }
3864
3865 /* Place a chain of unwanted allocation buffers on the free list.  */
3866 void
3867 _cpp_release_buff (cpp_reader *pfile, _cpp_buff *buff)
3868 {
3869   _cpp_buff *end = buff;
3870
3871   while (end->next)
3872     end = end->next;
3873   end->next = pfile->free_buffs;
3874   pfile->free_buffs = buff;
3875 }
3876
3877 /* Return a free buffer of size at least MIN_SIZE.  */
3878 _cpp_buff *
3879 _cpp_get_buff (cpp_reader *pfile, size_t min_size)
3880 {
3881   _cpp_buff *result, **p;
3882
3883   for (p = &pfile->free_buffs;; p = &(*p)->next)
3884     {
3885       size_t size;
3886
3887       if (*p == NULL)
3888         return new_buff (min_size);
3889       result = *p;
3890       size = result->limit - result->base;
3891       /* Return a buffer that's big enough, but don't waste one that's
3892          way too big.  */
3893       if (size >= min_size && size <= BUFF_SIZE_UPPER_BOUND (min_size))
3894         break;
3895     }
3896
3897   *p = result->next;
3898   result->next = NULL;
3899   result->cur = result->base;
3900   return result;
3901 }
3902
3903 /* Creates a new buffer with enough space to hold the uncommitted
3904    remaining bytes of BUFF, and at least MIN_EXTRA more bytes.  Copies
3905    the excess bytes to the new buffer.  Chains the new buffer after
3906    BUFF, and returns the new buffer.  */
3907 _cpp_buff *
3908 _cpp_append_extend_buff (cpp_reader *pfile, _cpp_buff *buff, size_t min_extra)
3909 {
3910   size_t size = EXTENDED_BUFF_SIZE (buff, min_extra);
3911   _cpp_buff *new_buff = _cpp_get_buff (pfile, size);
3912
3913   buff->next = new_buff;
3914   memcpy (new_buff->base, buff->cur, BUFF_ROOM (buff));
3915   return new_buff;
3916 }
3917
3918 /* Creates a new buffer with enough space to hold the uncommitted
3919    remaining bytes of the buffer pointed to by BUFF, and at least
3920    MIN_EXTRA more bytes.  Copies the excess bytes to the new buffer.
3921    Chains the new buffer before the buffer pointed to by BUFF, and
3922    updates the pointer to point to the new buffer.  */
3923 void
3924 _cpp_extend_buff (cpp_reader *pfile, _cpp_buff **pbuff, size_t min_extra)
3925 {
3926   _cpp_buff *new_buff, *old_buff = *pbuff;
3927   size_t size = EXTENDED_BUFF_SIZE (old_buff, min_extra);
3928
3929   new_buff = _cpp_get_buff (pfile, size);
3930   memcpy (new_buff->base, old_buff->cur, BUFF_ROOM (old_buff));
3931   new_buff->next = old_buff;
3932   *pbuff = new_buff;
3933 }
3934
3935 /* Free a chain of buffers starting at BUFF.  */
3936 void
3937 _cpp_free_buff (_cpp_buff *buff)
3938 {
3939   _cpp_buff *next;
3940
3941   for (; buff; buff = next)
3942     {
3943       next = buff->next;
3944 #ifdef ENABLE_VALGRIND_ANNOTATIONS
3945       free (buff);
3946 #else
3947       free (buff->base);
3948 #endif
3949     }
3950 }
3951
3952 /* Allocate permanent, unaligned storage of length LEN.  */
3953 unsigned char *
3954 _cpp_unaligned_alloc (cpp_reader *pfile, size_t len)
3955 {
3956   _cpp_buff *buff = pfile->u_buff;
3957   unsigned char *result = buff->cur;
3958
3959   if (len > (size_t) (buff->limit - result))
3960     {
3961       buff = _cpp_get_buff (pfile, len);
3962       buff->next = pfile->u_buff;
3963       pfile->u_buff = buff;
3964       result = buff->cur;
3965     }
3966
3967   buff->cur = result + len;
3968   return result;
3969 }
3970
3971 /* Allocate permanent, unaligned storage of length LEN from a_buff.
3972    That buffer is used for growing allocations when saving macro
3973    replacement lists in a #define, and when parsing an answer to an
3974    assertion in #assert, #unassert or #if (and therefore possibly
3975    whilst expanding macros).  It therefore must not be used by any
3976    code that they might call: specifically the lexer and the guts of
3977    the macro expander.
3978
3979    All existing other uses clearly fit this restriction: storing
3980    registered pragmas during initialization.  */
3981 unsigned char *
3982 _cpp_aligned_alloc (cpp_reader *pfile, size_t len)
3983 {
3984   _cpp_buff *buff = pfile->a_buff;
3985   unsigned char *result = buff->cur;
3986
3987   if (len > (size_t) (buff->limit - result))
3988     {
3989       buff = _cpp_get_buff (pfile, len);
3990       buff->next = pfile->a_buff;
3991       pfile->a_buff = buff;
3992       result = buff->cur;
3993     }
3994
3995   buff->cur = result + len;
3996   return result;
3997 }
3998
3999 /* Commit or allocate storage from a buffer.  */
4000
4001 void *
4002 _cpp_commit_buff (cpp_reader *pfile, size_t size)
4003 {
4004   void *ptr = BUFF_FRONT (pfile->a_buff);
4005
4006   if (pfile->hash_table->alloc_subobject)
4007     {
4008       void *copy = pfile->hash_table->alloc_subobject (size);
4009       memcpy (copy, ptr, size);
4010       ptr = copy;
4011     }
4012   else
4013     BUFF_FRONT (pfile->a_buff) += size;
4014
4015   return ptr;
4016 }
4017
4018 /* Say which field of TOK is in use.  */
4019
4020 enum cpp_token_fld_kind
4021 cpp_token_val_index (const cpp_token *tok)
4022 {
4023   switch (TOKEN_SPELL (tok))
4024     {
4025     case SPELL_IDENT:
4026       return CPP_TOKEN_FLD_NODE;
4027     case SPELL_LITERAL:
4028       return CPP_TOKEN_FLD_STR;
4029     case SPELL_OPERATOR:
4030       /* Operands which were originally spelled as ident keep around
4031          the node for the exact spelling.  */
4032       if (tok->flags & NAMED_OP)
4033         return CPP_TOKEN_FLD_NODE;
4034       else if (tok->type == CPP_PASTE)
4035         return CPP_TOKEN_FLD_TOKEN_NO;
4036       else
4037         return CPP_TOKEN_FLD_NONE;
4038     case SPELL_NONE:
4039       if (tok->type == CPP_MACRO_ARG)
4040         return CPP_TOKEN_FLD_ARG_NO;
4041       else if (tok->type == CPP_PADDING)
4042         return CPP_TOKEN_FLD_SOURCE;
4043       else if (tok->type == CPP_PRAGMA)
4044         return CPP_TOKEN_FLD_PRAGMA;
4045       /* fall through */
4046     default:
4047       return CPP_TOKEN_FLD_NONE;
4048     }
4049 }
4050
4051 /* All tokens lexed in R after calling this function will be forced to
4052    have their location_t to be P, until
4053    cpp_stop_forcing_token_locations is called for R.  */
4054
4055 void
4056 cpp_force_token_locations (cpp_reader *r, location_t loc)
4057 {
4058   r->forced_token_location = loc;
4059 }
4060
4061 /* Go back to assigning locations naturally for lexed tokens.  */
4062
4063 void
4064 cpp_stop_forcing_token_locations (cpp_reader *r)
4065 {
4066   r->forced_token_location = 0;
4067 }
4068
/* PEEK points at a backslash.  If the backslash escapes an end of
   line ("\n", "\r" or "\r\n"), look past it, and keep going while the
   character found there is another backslash that escapes an end of
   line.  Never advance to LIMIT or beyond: if the character after an
   escaped end of line would not be below LIMIT, stay on the backslash
   that was being examined.  Returns the resulting position, which is
   PEEK itself when the backslash escapes nothing.  */

static const unsigned char *
do_peek_backslash (const unsigned char *peek, const unsigned char *limit)
{
  for (;;)
    {
      const unsigned char *after;

      if (__builtin_expect (peek[1] == '\n', true))
        after = peek + 2;
      else if (__builtin_expect (peek[1] == '\r', false))
        /* A carriage return may be followed by a line feed.  */
        after = peek + (peek[2] == '\n' ? 3 : 2);
      else
        return peek;

      if (__builtin_expect (!(after < limit), false))
        return peek;

      peek = after;

      /* The user might be perverse.  */
      if (*peek != '\\')
        return peek;
    }
}
4098
/* Return the position to examine at PEEK: PEEK itself unless it
   points at a backslash, in which case do_peek_backslash decides
   whether an escaped end of line can be looked past.  */

static const unsigned char *
do_peek_next (const unsigned char *peek, const unsigned char *limit)
{
  if (__builtin_expect (*peek != '\\', true))
    return peek;

  return do_peek_backslash (peek, limit);
}
4106
/* Step back one character from PEEK without going below BOUND.
   Returns NULL if PEEK is already at BOUND.  Otherwise returns a
   pointer to the preceding character, except that when that character
   is an end of line ("\n", "\r" or "\r\n") immediately preceded by a
   backslash, the escaped end of line is skipped and the step is
   repeated from the backslash.  An end of line that starts at BOUND,
   or that is not preceded by a backslash, is returned as is.  */

static const unsigned char *
do_peek_prev (const unsigned char *peek, const unsigned char *bound)
{
  if (peek == bound)
    return NULL;

  unsigned char c = *--peek;

  /* Test for a carriage return, not the letter 'r': a backslash
     followed by the letter is not a line continuation.  */
  if (__builtin_expect (c == '\n', false)
      || __builtin_expect (c == '\r', false))
    {
      if (peek == bound)
        return peek;

      /* Offset from PEEK of the character before the end of line.  */
      int ix = -1;
      if (c == '\n' && peek[ix] == '\r')
        {
          if (peek + ix == bound)
            return peek;
          ix--;
        }

      if (peek[ix] == '\\')
        return do_peek_prev (peek + ix, bound);

      return peek;
    }
  else
    return peek;
}
4135
/* The first character of the identifier MATCH has already been
   matched at PEEK[-1].  Check that the rest of MATCH follows at PEEK,
   allowing escaped newlines within it, and that the character after
   it is not an identifier character.  On success, scan past trailing
   spaces, tabs and escaped newlines and return the position reached.
   Otherwise return NULL.  */

static const unsigned char *
do_peek_ident (const char *match, const unsigned char *peek,
               const unsigned char *limit)
{
  for (match++; *match != '\0'; match++, peek++)
    {
      if (*peek == *match)
        continue;

      /* The mismatch may just be an escaped newline in the way.  */
      peek = do_peek_next (peek, limit);
      if (*peek != *match)
        return NULL;
    }

  /* Must now not be looking at an identifier char.  */
  peek = do_peek_next (peek, limit);
  if (ISIDNUM (*peek))
    return NULL;

  /* Skip control-line whitespace.  */
  for (;;)
    {
      while (*peek == ' ' || *peek == '\t')
        peek++;

      if (__builtin_expect (*peek != '\\', true))
        break;

      /* Stop on a backslash that could not be looked past.  */
      peek = do_peek_backslash (peek, limit);
      if (*peek == '\\')
        break;
    }

  return peek;
}
4169
4170 /* Are we looking at a module control line starting as PEEK - 1?  */
4171
4172 static bool
4173 do_peek_module (cpp_reader *pfile, unsigned char c,
4174                 const unsigned char *peek, const unsigned char *limit)
4175 {
4176   bool import = false;
4177
4178   if (__builtin_expect (c == 'e', false))
4179     {
4180       if (!((peek[0] == 'x' || peek[0] == '\\')
4181             && (peek = do_peek_ident ("export", peek, limit))))
4182         return false;
4183
4184       /* export, peek for import or module.  No need to peek __import
4185          here.  */
4186       if (peek[0] == 'i')
4187         {
4188           if (!((peek[1] == 'm' || peek[1] == '\\')
4189                 && (peek = do_peek_ident ("import", peek + 1, limit))))
4190             return false;
4191           import = true;
4192         }
4193       else if (peek[0] == 'm')
4194         {
4195           if (!((peek[1] == 'o' || peek[1] == '\\')
4196                 && (peek = do_peek_ident ("module", peek + 1, limit))))
4197             return false;
4198         }
4199       else
4200         return false;
4201     }
4202   else if (__builtin_expect (c == 'i', false))
4203     {
4204       if (!((peek[0] == 'm' || peek[0] == '\\')
4205             && (peek = do_peek_ident ("import", peek, limit))))
4206         return false;
4207       import = true;
4208     }
4209   else if (__builtin_expect (c == '_', false))
4210     {
4211       /* Needed for translated includes.   */
4212       if (!((peek[0] == '_' || peek[0] == '\\')
4213             && (peek = do_peek_ident ("__import", peek, limit))))
4214         return false;
4215       import = true;
4216     }
4217   else if (__builtin_expect (c == 'm', false))
4218     {
4219       if (!((peek[0] == 'o' || peek[0] == '\\')
4220             && (peek = do_peek_ident ("module", peek, limit))))
4221         return false;
4222     }
4223   else
4224     return false;
4225
4226   /* Peek the next character to see if it's good enough.  We'll be at
4227      the first non-whitespace char, including skipping an escaped
4228      newline.  */
4229   /* ... import followed by identifier, ':', '<' or header-name
4230      preprocessing tokens, or module followed by identifier, ':' or
4231      ';' preprocessing tokens.  */
4232   unsigned char p = *peek++;
4233
4234   /* A character literal is ... single quotes, ... optionally preceded
4235      by u8, u, U, or L */
4236   /* A string-literal is a ... double quotes, optionally prefixed by
4237      R, u8, u8R, u, uR, U, UR, L, or LR */
4238   if (p == 'u')
4239     {
4240       peek = do_peek_next (peek, limit);
4241       if (*peek == '8')
4242         {
4243           peek++;
4244           goto peek_u8;
4245         }
4246       goto peek_u;
4247     }
4248   else if (p == 'U' || p == 'L')
4249     {
4250     peek_u8:
4251       peek = do_peek_next (peek, limit);
4252     peek_u:
4253       if (*peek == '\"' || *peek == '\'')
4254         return false;
4255
4256       if (*peek == 'R')
4257         goto peek_R;
4258       /* Identifier. Ok.  */
4259     }
4260   else if (p == 'R')
4261     {
4262     peek_R:
4263       if (CPP_OPTION (pfile, rliterals))
4264         {
4265           peek = do_peek_next (peek, limit);
4266           if (*peek == '\"')
4267             return false;
4268         }
4269       /* Identifier. Ok.  */
4270     }
4271   else if ('Z' - 'A' == 25
4272            ? ((p >= 'A' && p <= 'Z') || (p >= 'a' && p <= 'z') || p == '_')
4273            : ISIDST (p))
4274     {
4275       /* Identifier.  Ok. */
4276     }
4277   else if (p == '<')
4278     {
4279       /* Maybe angle header, ok for import.  Reject
4280          '<=', '<<' digraph:'<:'.  */
4281       if (!import)
4282         return false;
4283       peek = do_peek_next (peek, limit);
4284       if (*peek == '=' || *peek == '<'
4285           || (*peek == ':' && CPP_OPTION (pfile, digraphs)))
4286         return false;
4287     }
4288   else if (p == ';')
4289     {
4290       /* SEMICOLON, ok for module.  */
4291       if (import)
4292         return false;
4293     }
4294   else if (p == '"')
4295     {
4296       /* STRING, ok for import.  */
4297       if (!import)
4298         return false;
4299     }
4300   else if (p == ':')
4301     {
4302       /* Maybe COLON, ok.  Reject '::', digraph:':>'.  */
4303       peek = do_peek_next (peek, limit);
4304       if (*peek == ':' || (*peek == '>' && CPP_OPTION (pfile, digraphs)))
4305         return false;
4306     }
4307   else
4308     /* FIXME: Detect a unicode character, excluding those not
4309        permitted as the initial character. [lex.name]/1.  I presume
4310        we need to check the \[uU] spellings, and directly using
4311        Unicode in say UTF8 form?  Or perhaps we do the phase-1
4312        conversion of UTF8 to universal-character-names?  */
4313     return false;
4314
4315   return true;
4316 }
4317
4318 /* Directives-only scanning.  Somewhat more relaxed than correct
4319    parsing -- some ill-formed programs will not be rejected.  */
4320
4321 void
4322 cpp_directive_only_process (cpp_reader *pfile,
4323                             void *data,
4324                             void (*cb) (cpp_reader *, CPP_DO_task, void *, ...))
4325 {
4326   bool module_p = CPP_OPTION (pfile, module_directives);
4327
4328   do
4329     {
4330     restart:
4331       /* Buffer initialization, but no line cleaning. */
4332       cpp_buffer *buffer = pfile->buffer;
4333       buffer->cur_note = buffer->notes_used = 0;
4334       buffer->cur = buffer->line_base = buffer->next_line;
4335       buffer->need_line = false;
4336       /* Files always end in a newline or carriage return.  We rely on this for
4337          character peeking safety.  */
4338       gcc_assert (buffer->rlimit[0] == '\n' || buffer->rlimit[0] == '\r');
4339
4340       const unsigned char *base = buffer->cur;
4341       unsigned line_count = 0;
4342       const unsigned char *line_start = base;
4343
4344       bool bol = true;
4345       bool raw = false;
4346
4347       const unsigned char *lwm = base;
4348       for (const unsigned char *pos = base, *limit = buffer->rlimit;
4349            pos < limit;)
4350         {
4351           unsigned char c = *pos++;
4352           /* This matches the switch in _cpp_lex_direct.  */
4353           switch (c)
4354             {
4355             case ' ': case '\t': case '\f': case '\v':
4356               /* Whitespace, do nothing.  */
4357               break;
4358
4359             case '\r': /* MAC line ending, or Windows \r\n  */
4360               if (*pos == '\n')
4361                 pos++;
4362               /* FALLTHROUGH */
4363
4364             case '\n':
4365               bol = true;
4366
4367             next_line:
4368               CPP_INCREMENT_LINE (pfile, 0);
4369               line_count++;
4370               line_start = pos;
4371               break;
4372
4373             case '\\':
4374               /* <backslash><newline> is removed, and doesn't undo any
4375                  preceeding escape or whatnot.  */
4376               if (*pos == '\n')
4377                 {
4378                   pos++;
4379                   goto next_line;
4380                 }
4381               else if (*pos == '\r')
4382                 {
4383                   if (pos[1] == '\n')
4384                     pos++;
4385                   pos++;
4386                   goto next_line;
4387                 }
4388               goto dflt;
4389
4390             case '#':
4391               if (bol)
4392                 {
4393                   /* Line directive.  */
4394                   if (pos - 1 > base && !pfile->state.skipping)
4395                     cb (pfile, CPP_DO_print, data,
4396                         line_count, base, pos - 1 - base);
4397
4398                   /* Prep things for directive handling. */
4399                   buffer->next_line = pos;
4400                   buffer->need_line = true;
4401                   bool ok = _cpp_get_fresh_line (pfile);
4402                   gcc_checking_assert (ok);
4403
4404                   /* Ensure proper column numbering for generated
4405                      error messages. */
4406                   buffer->line_base -= pos - line_start;
4407
4408                   _cpp_handle_directive (pfile, line_start + 1 != pos);
4409
4410                   /* Sanitize the line settings.  Duplicate #include's can
4411                      mess things up. */
4412                   // FIXME: Necessary?
4413                   pfile->line_table->highest_location
4414                     = pfile->line_table->highest_line;
4415
4416                   if (!pfile->state.skipping
4417                       && pfile->buffer->next_line < pfile->buffer->rlimit)
4418                     cb (pfile, CPP_DO_location, data,
4419                         pfile->line_table->highest_line);
4420
4421                   goto restart;
4422                 }
4423               goto dflt;
4424
4425             case '/':
4426               {
4427                 const unsigned char *peek = do_peek_next (pos, limit);
4428                 if (!(*peek == '/' || *peek == '*'))
4429                   goto dflt;
4430
4431                 /* Line or block comment  */
4432                 bool is_block = *peek == '*';
4433                 bool star = false;
4434                 bool esc = false;
4435                 location_t sloc
4436                   = linemap_position_for_column (pfile->line_table,
4437                                                  pos - line_start);
4438
4439                 while (pos < limit)
4440                   {
4441                     char c = *pos++;
4442                     switch (c)
4443                       {
4444                       case '\\':
4445                         esc = true;
4446                         break;
4447
4448                       case '\r':
4449                         if (*pos == '\n')
4450                           pos++;
4451                         /* FALLTHROUGH  */
4452
4453                       case '\n':
4454                         {
4455                           CPP_INCREMENT_LINE (pfile, 0);
4456                           line_count++;
4457                           line_start = pos;
4458                           if (!esc && !is_block)
4459                             {
4460                               bol = true;
4461                               goto done_comment;
4462                             }
4463                         }
4464                         if (!esc)
4465                           star = false;
4466                         esc = false;
4467                         break;
4468
4469                       case '*':
4470                         if (pos > peek && !esc)
4471                           star = is_block;
4472                         esc = false;
4473                         break;
4474
4475                       case '/':
4476                         if (star)
4477                           goto done_comment;
4478                         /* FALLTHROUGH  */
4479
4480                       default:
4481                         star = false;
4482                         esc = false;
4483                         break;
4484                       }
4485                   }
4486                 if (pos < limit || is_block)
4487                   cpp_error_with_line (pfile, CPP_DL_ERROR, sloc, 0,
4488                                        "unterminated comment");
4489               done_comment:
4490                 lwm = pos;
4491                 break;
4492               }
4493
4494             case '\'':
4495               if (!CPP_OPTION (pfile, digit_separators))
4496                 goto delimited_string;
4497
4498               /* Possibly a number punctuator.  */
4499               if (!ISIDNUM (*do_peek_next (pos, limit)))
4500                 goto delimited_string;
4501
4502               goto quote_peek;
4503
4504             case '\"':
4505               if (!CPP_OPTION (pfile, rliterals))
4506                 goto delimited_string;
4507
4508             quote_peek:
4509               {
4510                 /* For ' see if it's a number punctuator
4511                    \.?<digit>(<digit>|<identifier-nondigit>
4512                    |'<digit>|'<nondigit>|[eEpP]<sign>|\.)* */
4513                 /* For " see if it's a raw string
4514                    {U,L,u,u8}R.  This includes CPP_NUMBER detection,
4515                    because that could be 0e+R.  */
4516                 const unsigned char *peek = pos - 1;
4517                 bool quote_first = c == '"';
4518                 bool quote_eight = false;
4519                 bool maybe_number_start = false;
4520                 bool want_number = false;
4521
4522                 while ((peek = do_peek_prev (peek, lwm)))
4523                   {
4524                     unsigned char p = *peek;
4525                     if (quote_first)
4526                       {
4527                         if (!raw)
4528                           {
4529                             if (p != 'R')
4530                               break;
4531                             raw = true;
4532                             continue;
4533                           }
4534
4535                         quote_first = false;
4536                         if (p == 'L' || p == 'U' || p == 'u')
4537                           ;
4538                         else if (p == '8')
4539                           quote_eight = true;
4540                         else
4541                           goto second_raw;
4542                       }
4543                     else if (quote_eight)
4544                       {
4545                         if (p != 'u')
4546                           {
4547                             raw = false;
4548                             break;
4549                           }
4550                         quote_eight = false;
4551                       }
4552                     else if (c == '"')
4553                       {
4554                       second_raw:;
4555                         if (!want_number && ISIDNUM (p))
4556                           {
4557                             raw = false;
4558                             break;
4559                           }
4560                       }
4561
4562                     if (ISDIGIT (p))
4563                       maybe_number_start = true;
4564                     else if (p == '.')
4565                       want_number = true;
4566                     else if (ISIDNUM (p))
4567                       maybe_number_start = false;
4568                     else if (p == '+' || p == '-')
4569                       {
4570                         if (const unsigned char *peek_prev
4571                             = do_peek_prev (peek, lwm))
4572                           {
4573                             p = *peek_prev;
4574                             if (p == 'e' || p == 'E'
4575                                 || p == 'p' || p == 'P')
4576                               {
4577                                 want_number = true;
4578                                 maybe_number_start = false;
4579                               }
4580                             else
4581                               break;
4582                           }
4583                         else
4584                           break;
4585                       }
4586                     else if (p == '\'' || p == '\"')
4587                       {
4588                         /* If this is lwm, this must be the end of a
4589                            previous string.  So this is a trailing
4590                            literal type, (a) if those are allowed,
4591                            and (b) maybe_number_start and want_number
4592                            are both false.  Otherwise this must be a
4593                            CPP_NUMBER because we've met another ', and
4594                            we'd have checked that in its own right.  */
4595                         if (peek == lwm && CPP_OPTION (pfile, uliterals))
4596                           {
4597                             if  (!maybe_number_start && !want_number)
4598                               /* Must be a literal type.  */
4599                               raw = false;
4600                           }
4601                         else if (p == '\''
4602                                  && CPP_OPTION (pfile, digit_separators))
4603                           maybe_number_start = true;
4604                         break;
4605                       }
4606                     else if (c == '\'')
4607                       break;
4608                     else if (!quote_first && !quote_eight)
4609                       break;
4610                   }
4611
4612                 if (maybe_number_start)
4613                   {
4614                     if (c == '\'')
4615                       /* A CPP NUMBER.  */
4616                       goto dflt;
4617                     raw = false;
4618                   }
4619
4620                 goto delimited_string;
4621               }
4622
4623             delimited_string:
4624               {
4625                 /* (Possibly raw) string or char literal.  */
4626                 unsigned char end = c;
4627                 int delim_len = -1;
4628                 const unsigned char *delim = NULL;
4629                 location_t sloc = linemap_position_for_column (pfile->line_table,
4630                                                                pos - line_start);
4631                 int esc = 0;
4632
4633                 if (raw)
4634                   {
4635                     /* There can be no line breaks in the delimiter.  */
4636                     delim = pos;
4637                     for (delim_len = 0; (c = *pos++) != '('; delim_len++)
4638                       {
4639                         if (delim_len == 16)
4640                           {
4641                             cpp_error_with_line (pfile, CPP_DL_ERROR,
4642                                                  sloc, 0,
4643                                                  "raw string delimiter"
4644                                                  " longer than %d"
4645                                                  " characters",
4646                                                  delim_len);
4647                             raw = false;
4648                             pos = delim;
4649                             break;
4650                           }
4651                         if (strchr (") \\\t\v\f\n", c))
4652                           {
4653                             cpp_error_with_line (pfile, CPP_DL_ERROR,
4654                                                  sloc, 0,
4655                                                  "invalid character '%c'"
4656                                                  " in raw string"
4657                                                  " delimiter", c);
4658                             raw = false;
4659                             pos = delim;
4660                             break;
4661                           }
4662                         if (pos >= limit)
4663                           goto bad_string;
4664                       }
4665                   }
4666
4667                 while (pos < limit)
4668                   {
4669                     char c = *pos++;
4670                     switch (c)
4671                       {
4672                       case '\\':
4673                         if (!raw)
4674                           esc++;
4675                         break;
4676
4677                       case '\r':
4678                         if (*pos == '\n')
4679                           pos++;
4680                         /* FALLTHROUGH  */
4681
4682                       case '\n':
4683                         {
4684                           CPP_INCREMENT_LINE (pfile, 0);
4685                           line_count++;
4686                           line_start = pos;
4687                         }
4688                         if (esc)
4689                           esc--;
4690                         break;
4691
4692                       case ')':
4693                         if (raw
4694                             && pos + delim_len + 1 < limit
4695                             && pos[delim_len] == end
4696                             && !memcmp (delim, pos, delim_len))
4697                           {
4698                             pos += delim_len + 1;
4699                             raw = false;
4700                             goto done_string;
4701                           }
4702                         break;
4703
4704                       default:
4705                         if (!raw && !(esc & 1) && c == end)
4706                           goto done_string;
4707                         esc = 0;
4708                         break;
4709                       }
4710                   }
4711               bad_string:
4712                 cpp_error_with_line (pfile, CPP_DL_ERROR, sloc, 0,
4713                                      "unterminated literal");
4714
4715               done_string:
4716                 raw = false;
4717                 lwm = pos - 1;
4718               }
4719               goto dflt;
4720
4721             case '_':
4722             case 'e':
4723             case 'i':
4724             case 'm':
4725               if (bol && module_p && !pfile->state.skipping
4726                   && do_peek_module (pfile, c, pos, limit))
4727                 {
4728                   /* We've seen the start of a module control line.
4729                      Start up the tokenizer.  */
4730                   pos--; /* Backup over the first character.  */
4731
4732                   /* Backup over whitespace to start of line.  */
4733                   while (pos > line_start
4734                          && (pos[-1] == ' ' || pos[-1] == '\t'))
4735                     pos--;
4736
4737                   if (pos > base)
4738                     cb (pfile, CPP_DO_print, data, line_count, base, pos - base);
4739
4740                   /* Prep things for directive handling. */
4741                   buffer->next_line = pos;
4742                   buffer->need_line = true;
4743
4744                   /* Now get tokens until the PRAGMA_EOL.  */
4745                   do
4746                     {
4747                       location_t spelling;
4748                       const cpp_token *tok
4749                         = cpp_get_token_with_location (pfile, &spelling);
4750
4751                       gcc_assert (pfile->state.in_deferred_pragma
4752                                   || tok->type == CPP_PRAGMA_EOL);
4753                       cb (pfile, CPP_DO_token, data, tok, spelling);
4754                     }
4755                   while (pfile->state.in_deferred_pragma);
4756
4757                   if (pfile->buffer->next_line < pfile->buffer->rlimit)
4758                     cb (pfile, CPP_DO_location, data,
4759                         pfile->line_table->highest_line);
4760
4761                   pfile->mi_valid = false;
4762                   goto restart;
4763                 }
4764               goto dflt;
4765
4766             default:
4767             dflt:
4768               bol = false;
4769               pfile->mi_valid = false;
4770               break;
4771             }
4772         }
4773
4774       if (buffer->rlimit > base && !pfile->state.skipping)
4775         {
4776           const unsigned char *limit = buffer->rlimit;
4777           /* If the file was not newline terminated, add rlimit, which is
4778              guaranteed to point to a newline, to the end of our range.  */
4779           if (limit[-1] != '\n')
4780             {
4781               limit++;
4782               CPP_INCREMENT_LINE (pfile, 0);
4783               line_count++;
4784             }
4785           cb (pfile, CPP_DO_print, data, line_count, base, limit - base);
4786         }
4787
4788       _cpp_pop_buffer (pfile);
4789     }
4790   while (pfile->buffer);
4791 }