libcpp/lex.cc

   1 /* CPP Library - lexical analysis.
   2    Copyright (C) 2000-2024 Free Software Foundation, Inc.
   3    Contributed by Per Bothner, 1994-95.
   4    Based on CCCP program by Paul Rubin, June 1986
   5    Adapted to ANSI C, Richard Stallman, Jan 1987
   6    Broken out to separate file, Zack Weinberg, Mar 2000
   7
   8 This program is free software; you can redistribute it and/or modify it
   9 under the terms of the GNU General Public License as published by the
  10 Free Software Foundation; either version 3, or (at your option) any
  11 later version.
  12
  13 This program is distributed in the hope that it will be useful,
  14 but WITHOUT ANY WARRANTY; without even the implied warranty of
  15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  16 GNU General Public License for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with this program; see the file COPYING3.  If not see
  20 <http://www.gnu.org/licenses/>.  */
  21
  22 #include "config.h"
  23 #include "system.h"
  24 #include "cpplib.h"
  25 #include "internal.h"
  26 /* Spelling category stored for every token type in token_spellings.  */
  27 enum spell_type
  28 {
  29   SPELL_OPERATOR = 0,   /* Given to every OP() entry of TTYPE_TABLE.  */
  30   SPELL_IDENT,          /* This and the following categories are picked  */
  31   SPELL_LITERAL,        /* by the second argument of a TK() entry, which  */
  32   SPELL_NONE            /* is pasted onto the SPELL_ prefix.  */
  33 };
  34 /* One token_spellings entry: a token type's category and its name string.  */
  35 struct token_spelling
  36 {
  37   enum spell_type category;
  38   const unsigned char *name;  /* Operator text (OP) or enumerator name (TK).  */
  39 };
  40 /* Digraph spellings.  NOTE(review): presumably indexed by users elsewhere.  */
  41 static const unsigned char *const digraph_spellings[] =
  42 { UC"%:", UC"%:%:", UC"<:", UC":>", UC"<%", UC"%>" };
  43 /* Expand TTYPE_TABLE into an array of N_TTYPES token_spelling entries.  */
  44 #define OP(e, s) { SPELL_OPERATOR, UC s  },
  45 #define TK(e, s) { SPELL_ ## s,    UC #e },
  46 static const struct token_spelling token_spellings[N_TTYPES] = { TTYPE_TABLE };
  47 #undef OP
  48 #undef TK
  49 /* Category and name recorded in token_spellings for TOKEN's type.  */
  50 #define TOKEN_SPELL(token) (token_spellings[(token)->type].category)
  51 #define TOKEN_NAME(token) (token_spellings[(token)->type].name)
  52
  53 /* ISO 10646 defines the UCS codespace as the range 0-0x10FFFF inclusive.  */
  54 #define UCS_LIMIT 0x10FFFF
  55 /* Prototypes for the static helper functions of this translation unit.  */
  56 static void add_line_note (cpp_buffer *, const uchar *, unsigned int);
  57 static int skip_line_comment (cpp_reader *);
  58 static void skip_whitespace (cpp_reader *, cppchar_t);
  59 static void lex_string (cpp_reader *, cpp_token *, const uchar *);
  60 static void save_comment (cpp_reader *, cpp_token *, const uchar *, cppchar_t);
  61 static void store_comment (cpp_reader *, cpp_token *);
  62 static void create_literal (cpp_reader *, cpp_token *, const uchar *,
  63                             unsigned int, enum cpp_ttype);
  64 static bool warn_in_comment (cpp_reader *, _cpp_line_note *);
  65 static int name_p (cpp_reader *, const cpp_string *);
  66 static tokenrun *next_tokenrun (tokenrun *);
  67
  68 static _cpp_buff *new_buff (size_t);
  69
  70
  71 /* Utility routine: report whether TOKEN is the identifier STRING.
  72
  73    STRING is NUL-terminated.  A TOKEN whose type is not CPP_NAME never
  74    matches.  Returns 1 when the spellings are equal, 0 otherwise.  */
  75 int
  76 cpp_ideq (const cpp_token *token, const char *string)
  77 {
  78   const uchar *wanted = (const uchar *) string;
  79   /* && short-circuits, so the node is only read for CPP_NAME tokens.  */
  80   return (token->type == CPP_NAME
  81           && !ustrcmp (NODE_NAME (token->val.node.node), wanted));
  82 }
  83
  84 /* Append a note of kind TYPE, located at byte POS, to BUFFER's array of
  85    line notes.  The array is enlarged first if it is already full.  */
  86 static void
  87 add_line_note (cpp_buffer *buffer, const uchar *pos, unsigned int type)
  88 {
  89   _cpp_line_note *note;
  90   if (buffer->notes_cap == buffer->notes_used)
  91     {
  92       buffer->notes_cap = 200 + 2 * buffer->notes_cap;
  93       buffer->notes
  94         = XRESIZEVEC (_cpp_line_note, buffer->notes, buffer->notes_cap);
  95     }
  96   note = &buffer->notes[buffer->notes_used++];
  97   note->pos = pos;
  98   note->type = type;
  99 }
 100
 101 \f
 102 /* Fast path to find line special characters using optimized character
 103    scanning algorithms.  Anything complicated falls back to the slow
 104    path below.  Since this loop is very hot it's worth doing these kinds
 105    of optimizations.
 106
 107    One of the paths through the ifdefs should provide
 108
 109      const uchar *search_line_fast (const uchar *s, const uchar *end);
 110
 111    Between S and END, search for \n, \r, \\, ?.  Return a pointer to
 112    the found character.
 113
 114    Note that the last character of the buffer is *always* a newline,
 115    as forced by _cpp_convert_input.  This fact can be used to avoid
 116    explicitly looking for the end of the buffer.  */
 117
 118 /* Configure gives us an ifdef test; default it to 0 for use in plain `if'.  */
 119 #ifndef WORDS_BIGENDIAN
 120 #define WORDS_BIGENDIAN 0
 121 #endif
 122
 123 /* We'd like the largest integer that fits into a register.  There's nothing
 124    in <stdint.h> that gives us that.  For most hosts this is unsigned long,
 125    but MS decided on an LLP64 model.  Thankfully when building with GCC we
 126    can get the "real" word size.  */
 127 #ifdef __GNUC__
 128 typedef unsigned int word_type __attribute__((__mode__(__word__)));
 129 #else
 130 typedef unsigned long word_type;
 131 #endif
 132
 133 /* The code below is only expecting sizes 4 or 8.  Die at compile-time if
 134    this expectation is violated: the array size is then -1 instead of 1.  */
 135 typedef char check_word_type_size
 136   [(sizeof(word_type) == 8 || sizeof(word_type) == 4) * 2 - 1];
 137
 138 /* Return VAL with its first N bytes (in memory order) cleared to NUL, a
 139    value that matches none of the interesting characters.  */
 140
 141 static inline word_type
 142 acc_char_mask_misalign (word_type val, unsigned int n)
 143 {
 144   const unsigned int nbits = n * 8;
 145   const word_type ones = -1;
 146
 147   /* The first bytes in memory are the high-order ones on big-endian
 148      hosts and the low-order ones on little-endian hosts.  */
 149   return val & (WORDS_BIGENDIAN ? ones >> nbits : ones << nbits);
 150 }
 151
 152 /* Return X replicated to all byte positions within WORD_TYPE.  X is widened
 153    before shifting so values >= 0x80 cannot overflow int or sign-extend.  */
 154 static inline word_type
 155 acc_char_replicate (uchar x)
 156 {
 157   word_type ret = x;
 158   ret |= ret << 8;
 159   ret |= ret << 16;
 160   if (sizeof(word_type) == 8)
 161     ret |= ret << 16 << 16;
 162   return ret;
 163 }
 164
 165 /* Return non-zero if some byte of VAL is (probably) C.  C is expected to
 166    hold the wanted character in every byte (see acc_char_replicate).  */
 167 static inline word_type
 168 acc_char_cmp (word_type val, word_type c)
 169 {
 170 #if defined(__GNUC__) && defined(__alpha__)
 171   /* We can get exact results using a compare-bytes instruction.
 172      Get (val == c) via (0 >= (val ^ c)).  */
 173   return __builtin_alpha_cmpbge (0, val ^ c);
 174 #else
 175   word_type magic = 0x7efefefeU;
 176   if (sizeof(word_type) == 8)
 177     magic = (magic << 16 << 16) | 0xfefefefeU;
 178   magic |= 1;   /* Now 0x7efefeff, or 0x7efefefefefefeff for 8 bytes.  */
 179   /* Bytes equal to C become zero; the result keeps only MAGIC's zero bits.  */
 180   val ^= c;
 181   return ((val + magic) ^ ~val) & ~magic;
 182 #endif
 183 }
 184
 185 /* Given that acc_char_cmp returned a non-zero CMP for VAL, return the
 186    index of the matching byte, or -1 if CMP was a false positive.  */
 187
 188 static inline int
 189 acc_char_index (word_type cmp ATTRIBUTE_UNUSED,
 190                 word_type val ATTRIBUTE_UNUSED)
 191 {
 192 #if defined(__GNUC__) && defined(__alpha__) && !WORDS_BIGENDIAN
 193   /* The cmpbge instruction sets *bits* of the result corresponding to
 194      matches in the bytes with no false positives.  */
 195   return __builtin_ctzl (cmp);
 196 #else
 197   const unsigned int nbytes = sizeof(word_type);
 198   unsigned int pos, shift;
 199
 200   /* Inspect the bytes of VAL in memory order; the first match wins.  */
 201   for (pos = 0; pos < nbytes; pos++)
 202     {
 203       shift = 8 * (WORDS_BIGENDIAN ? nbytes - 1 - pos : pos);
 204       switch ((val >> shift) & 0xff)
 205         {
 206         case '\n': case '\r': case '\\': case '?':
 207           return pos;
 208         default:
 209           break;
 210         }
 211     }
 212
 213   return -1;
 214 #endif
 215 }
 216
 217 /* A version of the fast scanner using bit fiddling techniques.
 218
 219    For 32-bit words, one would normally perform 16 comparisons and
 220    16 branches.  With this algorithm one performs 24 arithmetic
 221    operations and one branch.  Whether this is faster with a 32-bit
 222    word size is going to be somewhat system dependent.
 223
 224    For 64-bit words, we eliminate twice the number of comparisons
 225    and branches without increasing the number of arithmetic operations.
 226    It's almost certainly going to be a win with 64-bit word size.
 227    END is unused: the loop below only ever stops at a match.  */
 228 static const uchar * search_line_acc_char (const uchar *, const uchar *)
 229   ATTRIBUTE_UNUSED;
 230
 231 static const uchar *
 232 search_line_acc_char (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 233 {
 234   const word_type repl_nl = acc_char_replicate ('\n');
 235   const word_type repl_cr = acc_char_replicate ('\r');
 236   const word_type repl_bs = acc_char_replicate ('\\');
 237   const word_type repl_qm = acc_char_replicate ('?');
 238
 239   unsigned int misalign;
 240   const word_type *p;
 241   word_type val, t;
 242
 243   /* Align the buffer.  Mask out any bytes from before the beginning.  */
 244   p = (word_type *)((uintptr_t)s & -sizeof(word_type));
 245   val = *p;
 246   misalign = (uintptr_t)s & (sizeof(word_type) - 1);
 247   if (misalign)
 248     val = acc_char_mask_misalign (val, misalign);
 249
 250   /* Main loop: test one aligned word per iteration.  */
 251   while (1)
 252     {
 253       t  = acc_char_cmp (val, repl_nl);
 254       t |= acc_char_cmp (val, repl_cr);
 255       t |= acc_char_cmp (val, repl_bs);
 256       t |= acc_char_cmp (val, repl_qm);
 257       /* T != 0 is a probable hit; acc_char_index rejects false ones.  */
 258       if (__builtin_expect (t != 0, 0))
 259         {
 260           int i = acc_char_index (t, val);
 261           if (i >= 0)
 262             return (const uchar *)p + i;
 263         }
 264       /* Nothing found in this word; fetch the next one.  */
 265       val = *++p;
 266     }
 267 }
 268
 269 /* Disable on Solaris 2/x86 until the following problem can be properly
 270    autoconfed:
 271
 272    The Solaris 10+ assembler tags objects with the instruction set
 273    extensions used, so SSE4.2 executables cannot run on machines that
 274    don't support that extension.  */
 275
 276 #if (GCC_VERSION >= 4005) && (__GNUC__ >= 5 || !defined(__PIC__)) && (defined(__i386__) || defined(__x86_64__)) && !(defined(__sun__) && defined(__svr4__))
 277
 278 /* Replicated character data to be shared between implementations.
 279    Recall that outside of a context with vector support we can't
 280    define compatible vector types, therefore these are all defined
 281    in terms of raw characters.  Rows, in order: '\n', '\r', '\\', '?'.  */
 282 static const char repl_chars[4][16] __attribute__((aligned(16))) = {
 283   { '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
 284     '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n' },
 285   { '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
 286     '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r' },
 287   { '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
 288     '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\' },
 289   { '?', '?', '?', '?', '?', '?', '?', '?',
 290     '?', '?', '?', '?', '?', '?', '?', '?' },
 291 };
 292
 293 /* A version of the fast scanner using MMX vectorized byte compare insns.
 294
 295    This uses the PMOVMSKB instruction which was introduced with "MMX2",
 296    which was packaged into SSE1; it is also present in the AMD MMX
 297    extension.  Mark the function as using "sse" so that we emit a real
 298    "emms" instruction, rather than the 3dNOW "femms" instruction.
 299    END is unused; the scan stops only when a character is found.  */
 300 static const uchar *
 301 #ifndef __SSE__
 302 __attribute__((__target__("sse")))
 303 #endif
 304 search_line_mmx (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 305 {
 306   typedef char v8qi __attribute__ ((__vector_size__ (8)));
 307   typedef int __m64 __attribute__ ((__vector_size__ (8), __may_alias__));
 308
 309   const v8qi repl_nl = *(const v8qi *)repl_chars[0];
 310   const v8qi repl_cr = *(const v8qi *)repl_chars[1];
 311   const v8qi repl_bs = *(const v8qi *)repl_chars[2];
 312   const v8qi repl_qm = *(const v8qi *)repl_chars[3];
 313
 314   unsigned int misalign, found, mask;
 315   const v8qi *p;
 316   v8qi data, t, c;
 317
 318   /* Align the source pointer.  While MMX doesn't generate unaligned data
 319      faults, this allows us to safely scan to the end of the buffer without
 320      reading beyond the end of the last page.  */
 321   misalign = (uintptr_t)s & 7;
 322   p = (const v8qi *)((uintptr_t)s & -8);
 323   data = *p;
 324
 325   /* Create a mask for the bytes that are valid within the first
 326      8-byte block.  The idea here is that the AND with the mask
 327      within the loop is "free", since we need some AND or TEST
 328      insn in order to set the flags for the branch anyway.  */
 329   mask = -1u << misalign;
 330
 331   /* Main loop processing 8 bytes at a time; the first block enters at START.  */
 332   goto start;
 333   do
 334     {
 335       data = *++p;
 336       mask = -1;
 337
 338     start:
 339       t = __builtin_ia32_pcmpeqb(data, repl_nl);
 340       c = __builtin_ia32_pcmpeqb(data, repl_cr);
 341       t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
 342       c = __builtin_ia32_pcmpeqb(data, repl_bs);
 343       t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
 344       c = __builtin_ia32_pcmpeqb(data, repl_qm);
 345       t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
 346       found = __builtin_ia32_pmovmskb (t);
 347       found &= mask;
 348     }
 349   while (!found);
 350
 351   __builtin_ia32_emms ();
 352
 353   /* FOUND contains 1 in bits for which we matched a relevant
 354      character.  Conversion to the byte index is trivial.  */
 355   found = __builtin_ctz(found);
 356   return (const uchar *)p + found;
 357 }
 358
 359 /* A version of the fast scanner using SSE2 vectorized byte compare insns.
 360    END is unused; the scan stops only when a character is found.  */
 361 static const uchar *
 362 #ifndef __SSE2__
 363 __attribute__((__target__("sse2")))
 364 #endif
 365 search_line_sse2 (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 366 {
 367   typedef char v16qi __attribute__ ((__vector_size__ (16)));
 368
 369   const v16qi repl_nl = *(const v16qi *)repl_chars[0];
 370   const v16qi repl_cr = *(const v16qi *)repl_chars[1];
 371   const v16qi repl_bs = *(const v16qi *)repl_chars[2];
 372   const v16qi repl_qm = *(const v16qi *)repl_chars[3];
 373
 374   unsigned int misalign, found, mask;
 375   const v16qi *p;
 376   v16qi data, t;
 377
 378   /* Align the source pointer.  */
 379   misalign = (uintptr_t)s & 15;
 380   p = (const v16qi *)((uintptr_t)s & -16);
 381   data = *p;
 382
 383   /* Create a mask for the bytes that are valid within the first
 384      16-byte block.  The idea here is that the AND with the mask
 385      within the loop is "free", since we need some AND or TEST
 386      insn in order to set the flags for the branch anyway.  */
 387   mask = -1u << misalign;
 388
 389   /* Main loop processing 16 bytes at a time; the first block enters at START.  */
 390   goto start;
 391   do
 392     {
 393       data = *++p;
 394       mask = -1;
 395
 396     start:
 397       t  = data == repl_nl;
 398       t |= data == repl_cr;
 399       t |= data == repl_bs;
 400       t |= data == repl_qm;
 401       found = __builtin_ia32_pmovmskb128 (t);
 402       found &= mask;
 403     }
 404   while (!found);
 405
 406   /* FOUND contains 1 in bits for which we matched a relevant
 407      character.  Conversion to the byte index is trivial.  */
 408   found = __builtin_ctz(found);
 409   return (const uchar *)p + found;
 410 }
 411
 412 #ifdef HAVE_SSE4
 413 /* A version of the fast scanner using SSE 4.2 vectorized string insns.
 414    END is only used to detect a short tail close to a 4k page boundary.  */
 415 static const uchar *
 416 #ifndef __SSE4_2__
 417 __attribute__((__target__("sse4.2")))
 418 #endif
 419 search_line_sse42 (const uchar *s, const uchar *end)
 420 {
 421   typedef char v16qi __attribute__ ((__vector_size__ (16)));
 422   static const v16qi search = { '\n', '\r', '?', '\\' };
 423   /* Only the first 4 lanes of SEARCH are used; see the length operand 4.  */
 424   uintptr_t si = (uintptr_t)s;
 425   uintptr_t index;
 426
 427   /* Check for unaligned input.  */
 428   if (si & 15)
 429     {
 430       v16qi sv;
 431
 432       if (__builtin_expect (end - s < 16, 0)
 433           && __builtin_expect ((si & 0xfff) > 0xff0, 0))
 434         {
 435           /* There are fewer than 16 bytes left in the buffer, and fewer
 436              than 16 bytes left on the page.  Reading 16 bytes at this
 437              point might generate a spurious page fault.  Defer to the
 438              SSE2 implementation, which already handles alignment.  */
 439           return search_line_sse2 (s, end);
 440         }
 441
 442       /* ??? The builtin doesn't understand that the PCMPESTRI read from
 443          memory need not be aligned.  */
 444       sv = __builtin_ia32_loaddqu ((const char *) s);
 445       index = __builtin_ia32_pcmpestri128 (search, 4, sv, 16, 0);
 446
 447       if (__builtin_expect (index < 16, 0))
 448         goto found;
 449
 450       /* Advance the pointer to an aligned address.  We will re-scan a
 451          few bytes, but we no longer need care for reading past the
 452          end of a page, since we're guaranteed a match.  */
 453       s = (const uchar *)((si + 15) & -16);
 454     }
 455
 456   /* Main loop, processing 16 bytes at a time.  */
 457 #ifdef __GCC_ASM_FLAG_OUTPUTS__
 458   while (1)
 459     {
 460       char f;
 461
 462       /* By using inline assembly instead of the builtin,
 463          we can use the result, as well as the flags set.  */
 464       __asm ("%vpcmpestri\t$0, %2, %3"
 465              : "=c"(index), "=@ccc"(f)
 466              : "m"(*s), "x"(search), "a"(4), "d"(16));
 467       if (f)
 468         break;
 469
 470       s += 16;
 471     }
 472 #else
 473   s -= 16;   /* The asm loop below adds 16 before its first compare.  */
 474   /* By doing the whole loop in inline assembly,
 475      we can make proper use of the flags set.  */
 476   __asm (      ".balign 16\n"
 477         "0:     add $16, %1\n"
 478         "       %vpcmpestri\t$0, (%1), %2\n"
 479         "       jnc 0b"
 480         : "=&c"(index), "+r"(s)
 481         : "x"(search), "a"(4), "d"(16));
 482 #endif
 483
 484  found:
 485   return s + index;
 486 }
 487
 488 #else
 489 /* Work around out-dated assemblers without sse4 support.  */
 490 #define search_line_sse42 search_line_sse2
 491 #endif
 492
 493 /* Check the CPU capabilities.  */
 494
 495 #include "../gcc/config/i386/cpuid.h"
 496 /* Scanner signature, and the pointer set by init_vectorized_lexer.  */
 497 typedef const uchar * (*search_line_fast_type) (const uchar *, const uchar *);
 498 static search_line_fast_type search_line_fast;
 499 /* Point search_line_fast at the best scanner: SSE4.2, SSE2, MMX or plain C.  */
 500 #define HAVE_init_vectorized_lexer 1
 501 static inline void
 502 init_vectorized_lexer (void)
 503 {
 504   unsigned dummy, ecx = 0, edx = 0;
 505   search_line_fast_type impl = search_line_acc_char;
 506   int minimum = 0;
 507   /* MINIMUM is the ISA level the compiler already assumes (3, 2, 1 or 0).  */
 508 #if defined(__SSE4_2__)
 509   minimum = 3;
 510 #elif defined(__SSE2__)
 511   minimum = 2;
 512 #elif defined(__SSE__)
 513   minimum = 1;
 514 #endif
 515   /* If CPUID leaf 1 fails, ECX/EDX stay 0 and only MINIMUM can select.  */
 516   if (minimum == 3)
 517     impl = search_line_sse42;
 518   else if (__get_cpuid (1, &dummy, &dummy, &ecx, &edx) || minimum == 2)
 519     {
 520       if (minimum == 3 || (ecx & bit_SSE4_2))
 521         impl = search_line_sse42;
 522       else if (minimum == 2 || (edx & bit_SSE2))
 523         impl = search_line_sse2;
 524       else if (minimum == 1 || (edx & bit_SSE))
 525         impl = search_line_mmx;
 526     }
 527   else if (__get_cpuid (0x80000001, &dummy, &dummy, &dummy, &edx))
 528     {
 529       if (minimum == 1
 530           || (edx & (bit_MMXEXT | bit_CMOV)) == (bit_MMXEXT | bit_CMOV))
 531         impl = search_line_mmx;
 532     }
 533
 534   search_line_fast = impl;
 535 }
 536
 537 #elif (GCC_VERSION >= 4005) && defined(_ARCH_PWR8) && defined(__ALTIVEC__)
 538
 539 /* A version of the fast scanner using AltiVec vectorized byte compares
 540    and VSX unaligned loads (when VSX is available).  This is otherwise
 541    the same as the AltiVec version.  */
 542
 543 ATTRIBUTE_NO_SANITIZE_UNDEFINED
 544 static const uchar *
 545 search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 546 {
 547   typedef __attribute__((altivec(vector))) unsigned char vc;
 548
 549   const vc repl_nl = {
 550     '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
 551     '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n'
 552   };
 553   const vc repl_cr = {
 554     '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
 555     '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r'
 556   };
 557   const vc repl_bs = {
 558     '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
 559     '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\'
 560   };
 561   const vc repl_qm = {
 562     '?', '?', '?', '?', '?', '?', '?', '?',
 563     '?', '?', '?', '?', '?', '?', '?', '?',
 564   };
 565   const vc zero = { 0 };
 566
 567   vc data, t;
 568
 569   /* Main loop processing 16 bytes at a time, loaded directly from S.  */
 570   do
 571     {
 572       vc m_nl, m_cr, m_bs, m_qm;
 573
 574       data = __builtin_vec_vsx_ld (0, s);
 575       s += 16;
 576
 577       m_nl = (vc) __builtin_vec_cmpeq(data, repl_nl);
 578       m_cr = (vc) __builtin_vec_cmpeq(data, repl_cr);
 579       m_bs = (vc) __builtin_vec_cmpeq(data, repl_bs);
 580       m_qm = (vc) __builtin_vec_cmpeq(data, repl_qm);
 581       t = (m_nl | m_cr) | (m_bs | m_qm);
 582
 583       /* T now contains 0xff in bytes for which we matched one of the relevant
 584          characters.  We want to exit the loop if any byte in T is non-zero.
 585          Below is the expansion of vec_any_ne(t, zero).  */
 586     }
 587   while (!__builtin_vec_vcmpeq_p(/*__CR6_LT_REV*/3, t, zero));
 588
 589   /* Restore s to point to the 16 bytes we just processed.  */
 590   s -= 16;
 591
 592   {
 593 #define N  (sizeof(vc) / sizeof(long))
 594
 595     union {
 596       vc v;
 597       /* Statically assert that N is 2 or 4.  */
 598       unsigned long l[(N == 2 || N == 4) ? N : -1];
 599     } u;
 600     unsigned long l, i = 0;
 601
 602     u.v = t;
 603
 604     /* Find the first word of T that is non-zero.  */
 605     switch (N)
 606       {
 607       case 4:
 608         l = u.l[i++];
 609         if (l != 0)
 610           break;
 611         s += sizeof(unsigned long);
 612         l = u.l[i++];
 613         if (l != 0)
 614           break;
 615         s += sizeof(unsigned long);
 616         /* FALLTHRU */
 617       case 2:
 618         l = u.l[i++];
 619         if (l != 0)
 620           break;
 621         s += sizeof(unsigned long);
 622         l = u.l[i];
 623       }
 624
 625     /* L now contains 0xff in bytes for which we matched one of the
 626        relevant characters.  We can find the byte index by finding
 627        its bit index and dividing by 8.  */
 628 #ifdef __BIG_ENDIAN__
 629     l = __builtin_clzl(l) >> 3;
 630 #else
 631     l = __builtin_ctzl(l) >> 3;
 632 #endif
 633     return s + l;
 634
 635 #undef N
 636   }
 637 }
 638
 639 #elif (GCC_VERSION >= 4005) && defined(__ALTIVEC__) && defined (__BIG_ENDIAN__)
 640
 641 /* A version of the fast scanner using AltiVec vectorized byte compares.
 642    This cannot be used for little endian because vec_lvsl/lvsr are
 643    deprecated for little endian and the code won't work properly.  */
 644 /* ??? Unfortunately, attribute(target("altivec")) is not yet supported,
 645    so we can't compile this function without -maltivec on the command line
 646    (or implied by some other switch).  */
 647
 648 static const uchar *
 649 search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 650 {
 651   typedef __attribute__((altivec(vector))) unsigned char vc;
 652
 653   const vc repl_nl = {
 654     '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
 655     '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n'
 656   };
 657   const vc repl_cr = {
 658     '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
 659     '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r'
 660   };
 661   const vc repl_bs = {
 662     '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
 663     '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\'
 664   };
 665   const vc repl_qm = {
 666     '?', '?', '?', '?', '?', '?', '?', '?',
 667     '?', '?', '?', '?', '?', '?', '?', '?',
 668   };
 669   const vc ones = {
 670     -1, -1, -1, -1, -1, -1, -1, -1,
 671     -1, -1, -1, -1, -1, -1, -1, -1,
 672   };
 673   const vc zero = { 0 };
 674
 675   vc data, mask, t;
 676
 677   /* Altivec loads automatically mask addresses with -16.  This lets us
 678      issue the first load as early as possible.  */
 679   data = __builtin_vec_ld(0, (const vc *)s);
 680
 681   /* Discard bytes before the beginning of the buffer.  Do this by
 682      beginning with all ones and shifting in zeros according to the
 683      mis-alignment.  The LVSR instruction pulls the exact shift we
 684      want from the address.  */
 685   mask = __builtin_vec_lvsr(0, s);
 686   mask = __builtin_vec_perm(zero, ones, mask);
 687   data &= mask;
 688
 689   /* While altivec loads mask addresses, we still need to align S so
 690      that the offset we compute at the end is correct.  */
 691   s = (const uchar *)((uintptr_t)s & -16);
 692
 693   /* Main loop processing 16 bytes at a time; the first block enters at START.  */
 694   goto start;
 695   do
 696     {
 697       vc m_nl, m_cr, m_bs, m_qm;
 698
 699       s += 16;
 700       data = __builtin_vec_ld(0, (const vc *)s);
 701
 702     start:
 703       m_nl = (vc) __builtin_vec_cmpeq(data, repl_nl);
 704       m_cr = (vc) __builtin_vec_cmpeq(data, repl_cr);
 705       m_bs = (vc) __builtin_vec_cmpeq(data, repl_bs);
 706       m_qm = (vc) __builtin_vec_cmpeq(data, repl_qm);
 707       t = (m_nl | m_cr) | (m_bs | m_qm);
 708
 709       /* T now contains 0xff in bytes for which we matched one of the relevant
 710          characters.  We want to exit the loop if any byte in T is non-zero.
 711          Below is the expansion of vec_any_ne(t, zero).  */
 712     }
 713   while (!__builtin_vec_vcmpeq_p(/*__CR6_LT_REV*/3, t, zero));
 714
 715   {
 716 #define N  (sizeof(vc) / sizeof(long))
 717
 718     union {
 719       vc v;
 720       /* Statically assert that N is 2 or 4.  */
 721       unsigned long l[(N == 2 || N == 4) ? N : -1];
 722     } u;
 723     unsigned long l, i = 0;
 724
 725     u.v = t;
 726
 727     /* Find the first word of T that is non-zero.  */
 728     switch (N)
 729       {
 730       case 4:
 731         l = u.l[i++];
 732         if (l != 0)
 733           break;
 734         s += sizeof(unsigned long);
 735         l = u.l[i++];
 736         if (l != 0)
 737           break;
 738         s += sizeof(unsigned long);
 739         /* FALLTHROUGH */
 740       case 2:
 741         l = u.l[i++];
 742         if (l != 0)
 743           break;
 744         s += sizeof(unsigned long);
 745         l = u.l[i];
 746       }
 747
 748     /* L now contains 0xff in bytes for which we matched one of the
 749        relevant characters.  We can find the byte index by finding
 750        its bit index and dividing by 8.  */
 751     l = __builtin_clzl(l) >> 3;
 752     return s + l;
 753
 754 #undef N
 755   }
 756 }
 757
 758 #elif defined (__ARM_NEON) && defined (__ARM_64BIT_STATE)
 759 #include "arm_neon.h"
 760
 761 /* This doesn't have to be the exact page size, but no system may use
 762    a size smaller than this.  ARMv8 requires a minimum page size of
 763    4k.  The impact of being conservative here is a small number of
 764    cases will take the slightly slower entry path into the main
 765    loop.  */
 766
 767 #define AARCH64_MIN_PAGE_SIZE 4096
 768 /* AArch64 NEON scanner.  END is unused; the scan stops only at a match.  */
 769 static const uchar *
 770 search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 771 {
 772   const uint8x16_t repl_nl = vdupq_n_u8 ('\n');
 773   const uint8x16_t repl_cr = vdupq_n_u8 ('\r');
 774   const uint8x16_t repl_bs = vdupq_n_u8 ('\\');
 775   const uint8x16_t repl_qm = vdupq_n_u8 ('?');
 776   const uint8x16_t xmask = (uint8x16_t) vdupq_n_u64 (0x8040201008040201ULL);
 777   /* SHIFT moves the sums for one 8-byte half into bits 8-15 of the mask.  */
 778 #ifdef __ARM_BIG_ENDIAN
 779   const int16x8_t shift = {8, 8, 8, 8, 0, 0, 0, 0};
 780 #else
 781   const int16x8_t shift = {0, 0, 0, 0, 8, 8, 8, 8};
 782 #endif
 783
 784   unsigned int found;
 785   const uint8_t *p;
 786   uint8x16_t data;
 787   uint8x16_t t;
 788   uint16x8_t m;
 789   uint8x16_t u, v, w;
 790
 791   /* Align the source pointer.  */
 792   p = (const uint8_t *)((uintptr_t)s & -16);
 793
 794   /* Assuming random string start positions, with a 4k page size we'll take
 795      the slow path about 0.37% of the time.  */
 796   if (__builtin_expect ((AARCH64_MIN_PAGE_SIZE
 797                          - (((uintptr_t) s) & (AARCH64_MIN_PAGE_SIZE - 1)))
 798                         < 16, 0))
 799     {
 800       /* Slow path: the string starts near a possible page boundary.  */
 801       uint32_t misalign, mask;
 802
 803       misalign = (uintptr_t)s & 15;
 804       mask = (-1u << misalign) & 0xffff;
 805       data = vld1q_u8 (p);
 806       t = vceqq_u8 (data, repl_nl);
 807       u = vceqq_u8 (data, repl_cr);
 808       v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
 809       w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
 810       t = vorrq_u8 (v, w);
 811       t = vandq_u8 (t, xmask);
 812       m = vpaddlq_u8 (t);
 813       m = vshlq_u16 (m, shift);
 814       found = vaddvq_u16 (m);
 815       found &= mask;
 816       if (found)
 817         return (const uchar*)p + __builtin_ctz (found);
 818     }
 819   else
 820     {
 821       data = vld1q_u8 ((const uint8_t *) s);
 822       t = vceqq_u8 (data, repl_nl);
 823       u = vceqq_u8 (data, repl_cr);
 824       v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
 825       w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
 826       t = vorrq_u8 (v, w);
 827       if (__builtin_expect (vpaddd_u64 ((uint64x2_t)t) != 0, 0))
 828         goto done;
 829     }
 830   /* No hit in the first block: scan the aligned 16-byte blocks after P.  */
 831   do
 832     {
 833       p += 16;
 834       data = vld1q_u8 (p);
 835       t = vceqq_u8 (data, repl_nl);
 836       u = vceqq_u8 (data, repl_cr);
 837       v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
 838       w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
 839       t = vorrq_u8 (v, w);
 840     } while (!vpaddd_u64 ((uint64x2_t)t));
 841
 842 done:
 843   /* Now that we've found the terminating substring, work out precisely where
 844      we need to stop.  P < S only if the unaligned first load (from S) hit.  */
 845   t = vandq_u8 (t, xmask);
 846   m = vpaddlq_u8 (t);
 847   m = vshlq_u16 (m, shift);
 848   found = vaddvq_u16 (m);
 849   return (((((uintptr_t) p) < (uintptr_t) s) ? s : (const uchar *)p)
 850           + __builtin_ctz (found));
 851 }
 852
 853 #elif defined (__ARM_NEON)
 854 #include "arm_neon.h"
 855 /* NEON scanner used when __ARM_64BIT_STATE is not defined.  END is unused.  */
 856 static const uchar *
 857 search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 858 {
 859   const uint8x16_t repl_nl = vdupq_n_u8 ('\n');
 860   const uint8x16_t repl_cr = vdupq_n_u8 ('\r');
 861   const uint8x16_t repl_bs = vdupq_n_u8 ('\\');
 862   const uint8x16_t repl_qm = vdupq_n_u8 ('?');
 863   const uint8x16_t xmask = (uint8x16_t) vdupq_n_u64 (0x8040201008040201ULL);
 864
 865   unsigned int misalign, found, mask;
 866   const uint8_t *p;
 867   uint8x16_t data;
 868
 869   /* Align the source pointer.  */
 870   misalign = (uintptr_t)s & 15;
 871   p = (const uint8_t *)((uintptr_t)s & -16);
 872   data = vld1q_u8 (p);
 873
 874   /* Create a mask for the bytes that are valid within the first
 875      16-byte block.  The idea here is that the AND with the mask
 876      within the loop is "free", since we need some AND or TEST
 877      insn in order to set the flags for the branch anyway.  */
 878   mask = (-1u << misalign) & 0xffff;
 879
 880   /* Main loop, processing 16 bytes at a time.  */
 881   goto start;
 882
 883   do
 884     {
 885       uint8x8_t l;
 886       uint16x4_t m;
 887       uint32x2_t n;
 888       uint8x16_t t, u, v, w;
 889
 890       p += 16;
 891       data = vld1q_u8 (p);
 892       mask = 0xffff;
 893
 894     start:
 895       t = vceqq_u8 (data, repl_nl);
 896       u = vceqq_u8 (data, repl_cr);
 897       v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
 898       w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
 899       t = vandq_u8 (vorrq_u8 (v, w), xmask);
 900       l = vpadd_u8 (vget_low_u8 (t), vget_high_u8 (t));
 901       m = vpaddl_u8 (l);
 902       n = vpaddl_u16 (m);
 903       /* Merge the two 32-bit sums in N; MASK then keeps the low 16 bits.  */
 904       found = vget_lane_u32 ((uint32x2_t) vorr_u64 ((uint64x1_t) n,
 905               vshr_n_u64 ((uint64x1_t) n, 24)), 0);
 906       found &= mask;
 907     }
 908   while (!found);
 909
 910   /* FOUND contains 1 in bits for which we matched a relevant
 911      character.  Conversion to the byte index is trivial.  */
 912   found = __builtin_ctz (found);
 913   return (const uchar *)p + found;
 914 }
 915
 916 #else
 917
 918 /* We only have one accelerated alternative.  Use a direct call so that
 919    we encourage inlining.  */
 920
 921 #define search_line_fast  search_line_acc_char
 922
 923 #endif
 924
 925 /* Initialize the lexer if needed.  */
 926
 927 void
 928 _cpp_init_lexer (void)
 929 {
 930 #ifdef HAVE_init_vectorized_lexer
 931   init_vectorized_lexer ();
 932 #endif
 933 }
 934
 935 /* Returns with a logical line that contains no escaped newlines or
 936    trigraphs.  This is a time-critical inner loop.  */
 937 void
 938 _cpp_clean_line (cpp_reader *pfile)
 939 {
 940   cpp_buffer *buffer;
 941   const uchar *s;
 942   uchar c, *d, *p;
 943
 944   buffer = pfile->buffer;
 945   buffer->cur_note = buffer->notes_used = 0;
 946   buffer->cur = buffer->line_base = buffer->next_line;
 947   buffer->need_line = false;
 948   s = buffer->next_line;
 949
 950   if (!buffer->from_stage3)
 951     {
 952       const uchar *pbackslash = NULL;
 953
 954       /* Fast path.  This is the common case of an un-escaped line with
 955          no trigraphs.  The primary win here is by not writing any
 956          data back to memory until we have to.  */
 957       while (1)
 958         {
 959           /* Perform an optimized search for \n, \r, \\, ?.  */
 960           s = search_line_fast (s, buffer->rlimit);
 961
 962           c = *s;
 963           if (c == '\\')
 964             {
 965               /* Record the location of the backslash and continue.  */
 966               pbackslash = s++;
 967             }
 968           else if (__builtin_expect (c == '?', 0))
 969             {
 970               if (__builtin_expect (s[1] == '?', false)
 971                    && _cpp_trigraph_map[s[2]])
 972                 {
 973                   /* Have a trigraph.  We may or may not have to convert
 974                      it.  Add a line note regardless, for -Wtrigraphs.  */
 975                   add_line_note (buffer, s, s[2]);
 976                   if (CPP_OPTION (pfile, trigraphs))
 977                     {
 978                       /* We do, and that means we have to switch to the
 979                          slow path.  */
 980                       d = (uchar *) s;
 981                       *d = _cpp_trigraph_map[s[2]];
 982                       s += 2;
 983                       goto slow_path;
 984                     }
 985                 }
 986               /* Not a trigraph.  Continue on fast-path.  */
 987               s++;
 988             }
 989           else
 990             break;
 991         }
 992
 993       /* This must be \r or \n.  We're either done, or we'll be forced
 994          to write back to the buffer and continue on the slow path.  */
 995       d = (uchar *) s;
 996
 997       if (__builtin_expect (s == buffer->rlimit, false))
 998         goto done;
 999
1000       /* DOS line ending? */
1001       if (__builtin_expect (c == '\r', false) && s[1] == '\n')
1002         {
1003           s++;
1004           if (s == buffer->rlimit)
1005             goto done;
1006         }
1007
1008       if (__builtin_expect (pbackslash == NULL, true))
1009         goto done;
1010
1011       /* Check for escaped newline.  */
1012       p = d;
1013       while (is_nvspace (p[-1]))
1014         p--;
1015       if (p - 1 != pbackslash)
1016         goto done;
1017
1018       /* Have an escaped newline; process it and proceed to
1019          the slow path.  */
1020       add_line_note (buffer, p - 1, p != d ? ' ' : '\\');
1021       d = p - 2;
1022       buffer->next_line = p - 1;
1023
1024     slow_path:
1025       while (1)
1026         {
1027           c = *++s;
1028           *++d = c;
1029
1030           if (c == '\n' || c == '\r')
1031             {
1032               /* Handle DOS line endings.  */
1033               if (c == '\r' && s != buffer->rlimit && s[1] == '\n')
1034                 s++;
1035               if (s == buffer->rlimit)
1036                 break;
1037
1038               /* Escaped?  */
1039               p = d;
1040               while (p != buffer->next_line && is_nvspace (p[-1]))
1041                 p--;
1042               if (p == buffer->next_line || p[-1] != '\\')
1043                 break;
1044
1045               add_line_note (buffer, p - 1, p != d ? ' ': '\\');
1046               d = p - 2;
1047               buffer->next_line = p - 1;
1048             }
1049           else if (c == '?' && s[1] == '?' && _cpp_trigraph_map[s[2]])
1050             {
1051               /* Add a note regardless, for the benefit of -Wtrigraphs.  */
1052               add_line_note (buffer, d, s[2]);
1053               if (CPP_OPTION (pfile, trigraphs))
1054                 {
1055                   *d = _cpp_trigraph_map[s[2]];
1056                   s += 2;
1057                 }
1058             }
1059         }
1060     }
1061   else
1062     {
1063       while (*s != '\n' && *s != '\r')
1064         s++;
1065       d = (uchar *) s;
1066
1067       /* Handle DOS line endings.  */
1068       if (*s == '\r' && s + 1 != buffer->rlimit && s[1] == '\n')
1069         s++;
1070     }
1071
1072  done:
1073   *d = '\n';
1074   /* A sentinel note that should never be processed.  */
1075   add_line_note (buffer, d + 1, '\n');
1076   buffer->next_line = s + 1;
1077 }
1078
1079 template <bool lexing_raw_string>
1080 static bool get_fresh_line_impl (cpp_reader *pfile);
1081
1082 /* Return true if the trigraph indicated by NOTE should be warned
1083    about in a comment.  */
1084 static bool
1085 warn_in_comment (cpp_reader *pfile, _cpp_line_note *note)
1086 {
1087   const uchar *p;
1088
1089   /* Within comments we don't warn about trigraphs, unless the
1090      trigraph forms an escaped newline, as that may change
1091      behavior.  */
1092   if (note->type != '/')
1093     return false;
1094
1095   /* If -trigraphs, then this was an escaped newline iff the next note
1096      is coincident.  */
1097   if (CPP_OPTION (pfile, trigraphs))
1098     return note[1].pos == note->pos;
1099
1100   /* Otherwise, see if this forms an escaped newline.  */
1101   p = note->pos + 3;
1102   while (is_nvspace (*p))
1103     p++;
1104
1105   /* There might have been escaped newlines between the trigraph and the
1106      newline we found.  Hence the position test.  */
1107   return (*p == '\n' && p < note[1].pos);
1108 }
1109
1110 /* Process the notes created by add_line_note as far as the current
1111    location.  */
1112 void
1113 _cpp_process_line_notes (cpp_reader *pfile, int in_comment)
1114 {
1115   cpp_buffer *buffer = pfile->buffer;
1116
1117   for (;;)
1118     {
1119       _cpp_line_note *note = &buffer->notes[buffer->cur_note];
1120       unsigned int col;
1121
1122       if (note->pos > buffer->cur)
1123         break;
1124
1125       buffer->cur_note++;
1126       col = CPP_BUF_COLUMN (buffer, note->pos + 1);
1127
1128       if (note->type == '\\' || note->type == ' ')
1129         {
1130           if (note->type == ' ' && !in_comment)
1131             cpp_error_with_line (pfile, CPP_DL_WARNING, pfile->line_table->highest_line, col,
1132                                  "backslash and newline separated by space");
1133
1134           if (buffer->next_line > buffer->rlimit)
1135             {
1136               cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line, col,
1137                                    "backslash-newline at end of file");
1138               /* Prevent "no newline at end of file" warning.  */
1139               buffer->next_line = buffer->rlimit;
1140             }
1141
1142           buffer->line_base = note->pos;
1143           CPP_INCREMENT_LINE (pfile, 0);
1144         }
1145       else if (_cpp_trigraph_map[note->type])
1146         {
1147           if (CPP_OPTION (pfile, warn_trigraphs)
1148               && (!in_comment || warn_in_comment (pfile, note)))
1149             {
1150               if (CPP_OPTION (pfile, trigraphs))
1151                 cpp_warning_with_line (pfile, CPP_W_TRIGRAPHS,
1152                                        pfile->line_table->highest_line, col,
1153                                        "trigraph ??%c converted to %c",
1154                                        note->type,
1155                                        (int) _cpp_trigraph_map[note->type]);
1156               else
1157                 {
1158                   cpp_warning_with_line
1159                     (pfile, CPP_W_TRIGRAPHS,
1160                      pfile->line_table->highest_line, col,
1161                      "trigraph ??%c ignored, use -trigraphs to enable",
1162                      note->type);
1163                 }
1164             }
1165         }
1166       else if (note->type == 0)
1167         /* Already processed in lex_raw_string.  */;
1168       else
1169         abort ();
1170     }
1171 }
1172
1173 namespace bidi {
1174   enum class kind {
1175     NONE, LRE, RLE, LRO, RLO, LRI, RLI, FSI, PDF, PDI, LTR, RTL
1176   };
1177
1178   /* All the UTF-8 encodings of bidi characters start with E2.  */
1179   constexpr uchar utf8_start = 0xe2;
1180
1181   struct context
1182   {
1183     context () {}
1184     context (location_t loc, kind k, bool pdf, bool ucn)
1185     : m_loc (loc), m_kind (k), m_pdf (pdf), m_ucn (ucn)
1186     {
1187     }
1188
1189     kind get_pop_kind () const
1190     {
1191       return m_pdf ? kind::PDF : kind::PDI;
1192     }
1193     bool ucn_p () const
1194     {
1195       return m_ucn;
1196     }
1197
1198     location_t m_loc;
1199     kind m_kind;
1200     unsigned m_pdf : 1;
1201     unsigned m_ucn : 1;
1202   };
1203
1204   /* A vector holding currently open bidi contexts.  We use a char for
1205      each context, its LSB is 1 if it represents a PDF context, 0 if it
1206      represents a PDI context.  The next bit is 1 if this context was open
1207      by a bidi character written as a UCN, and 0 when it was UTF-8.  */
1208   semi_embedded_vec <context, 16> vec;
1209
1210   /* Close the whole comment/identifier/string literal/character constant
1211      context.  */
1212   void on_close ()
1213   {
1214     vec.truncate (0);
1215   }
1216
1217   /* Pop the last element in the vector.  */
1218   void pop ()
1219   {
1220     unsigned int len = vec.count ();
1221     gcc_checking_assert (len > 0);
1222     vec.truncate (len - 1);
1223   }
1224
1225   /* Return the pop kind of the context of the Ith element.  */
1226   kind pop_kind_at (unsigned int i)
1227   {
1228     return vec[i].get_pop_kind ();
1229   }
1230
1231   /* Return the pop kind of the context that is currently opened.  */
1232   kind current_ctx ()
1233   {
1234     unsigned int len = vec.count ();
1235     if (len == 0)
1236       return kind::NONE;
1237     return vec[len - 1].get_pop_kind ();
1238   }
1239
1240   /* Return true if the current context comes from a UCN origin, that is,
1241      the bidi char which started this bidi context was written as a UCN.  */
1242   bool current_ctx_ucn_p ()
1243   {
1244     unsigned int len = vec.count ();
1245     gcc_checking_assert (len > 0);
1246     return vec[len - 1].m_ucn;
1247   }
1248
1249   location_t current_ctx_loc ()
1250   {
1251     unsigned int len = vec.count ();
1252     gcc_checking_assert (len > 0);
1253     return vec[len - 1].m_loc;
1254   }
1255
1256   /* We've read a bidi char, update the current vector as necessary.
1257      LOC is only valid when K is not kind::NONE.  */
1258   void on_char (kind k, bool ucn_p, location_t loc)
1259   {
1260     switch (k)
1261       {
1262       case kind::LRE:
1263       case kind::RLE:
1264       case kind::LRO:
1265       case kind::RLO:
1266         vec.push (context (loc, k, true, ucn_p));
1267         break;
1268       case kind::LRI:
1269       case kind::RLI:
1270       case kind::FSI:
1271         vec.push (context (loc, k, false, ucn_p));
1272         break;
1273       /* PDF terminates the scope of the last LRE, RLE, LRO, or RLO
1274          whose scope has not yet been terminated.  */
1275       case kind::PDF:
1276         if (current_ctx () == kind::PDF)
1277           pop ();
1278         break;
1279       /* PDI terminates the scope of the last LRI, RLI, or FSI whose
1280          scope has not yet been terminated, as well as the scopes of
1281          any subsequent LREs, RLEs, LROs, or RLOs whose scopes have not
1282          yet been terminated.  */
1283       case kind::PDI:
1284         for (int i = vec.count () - 1; i >= 0; --i)
1285           if (pop_kind_at (i) == kind::PDI)
1286             {
1287               vec.truncate (i);
1288               break;
1289             }
1290         break;
1291       case kind::LTR:
1292       case kind::RTL:
1293         /* These aren't popped by a PDF/PDI.  */
1294         break;
1295       ATTR_LIKELY case kind::NONE:
1296         break;
1297       default:
1298         abort ();
1299       }
1300   }
1301
1302   /* Return a descriptive string for K.  */
1303   const char *to_str (kind k)
1304   {
1305     switch (k)
1306       {
1307       case kind::LRE:
1308         return "U+202A (LEFT-TO-RIGHT EMBEDDING)";
1309       case kind::RLE:
1310         return "U+202B (RIGHT-TO-LEFT EMBEDDING)";
1311       case kind::LRO:
1312         return "U+202D (LEFT-TO-RIGHT OVERRIDE)";
1313       case kind::RLO:
1314         return "U+202E (RIGHT-TO-LEFT OVERRIDE)";
1315       case kind::LRI:
1316         return "U+2066 (LEFT-TO-RIGHT ISOLATE)";
1317       case kind::RLI:
1318         return "U+2067 (RIGHT-TO-LEFT ISOLATE)";
1319       case kind::FSI:
1320         return "U+2068 (FIRST STRONG ISOLATE)";
1321       case kind::PDF:
1322         return "U+202C (POP DIRECTIONAL FORMATTING)";
1323       case kind::PDI:
1324         return "U+2069 (POP DIRECTIONAL ISOLATE)";
1325       case kind::LTR:
1326         return "U+200E (LEFT-TO-RIGHT MARK)";
1327       case kind::RTL:
1328         return "U+200F (RIGHT-TO-LEFT MARK)";
1329       default:
1330         abort ();
1331       }
1332   }
1333 }
1334
1335 /* Get location_t for the range of bytes [START, START + NUM_BYTES)
1336    within the current line in FILE, with the caret at START.  */
1337
1338 static location_t
1339 get_location_for_byte_range_in_cur_line (cpp_reader *pfile,
1340                                          const unsigned char *const start,
1341                                          size_t num_bytes)
1342 {
1343   gcc_checking_assert (num_bytes > 0);
1344
1345   /* CPP_BUF_COLUMN and linemap_position_for_column both refer
1346      to offsets in bytes, but CPP_BUF_COLUMN is 0-based,
1347      whereas linemap_position_for_column is 1-based.  */
1348
1349   /* Get 0-based offsets within the line.  */
1350   size_t start_offset = CPP_BUF_COLUMN (pfile->buffer, start);
1351   size_t end_offset = start_offset + num_bytes - 1;
1352
1353   /* Now convert to location_t, where "columns" are 1-based byte offsets.  */
1354   location_t start_loc = linemap_position_for_column (pfile->line_table,
1355                                                       start_offset + 1);
1356   location_t end_loc = linemap_position_for_column (pfile->line_table,
1357                                                      end_offset + 1);
1358
1359   if (start_loc == end_loc)
1360     return start_loc;
1361
1362   source_range src_range;
1363   src_range.m_start = start_loc;
1364   src_range.m_finish = end_loc;
1365   location_t combined_loc
1366     = pfile->line_table->get_or_create_combined_loc (start_loc,
1367                                                      src_range,
1368                                                      nullptr,
1369                                                      0);
1370   return combined_loc;
1371 }
1372
1373 /* Parse a sequence of 3 bytes starting with P and return its bidi code.  */
1374
1375 static bidi::kind
1376 get_bidi_utf8_1 (const unsigned char *const p)
1377 {
1378   gcc_checking_assert (p[0] == bidi::utf8_start);
1379
1380   if (p[1] == 0x80)
1381     switch (p[2])
1382       {
1383       case 0xaa:
1384         return bidi::kind::LRE;
1385       case 0xab:
1386         return bidi::kind::RLE;
1387       case 0xac:
1388         return bidi::kind::PDF;
1389       case 0xad:
1390         return bidi::kind::LRO;
1391       case 0xae:
1392         return bidi::kind::RLO;
1393       case 0x8e:
1394         return bidi::kind::LTR;
1395       case 0x8f:
1396         return bidi::kind::RTL;
1397       default:
1398         break;
1399       }
1400   else if (p[1] == 0x81)
1401     switch (p[2])
1402       {
1403       case 0xa6:
1404         return bidi::kind::LRI;
1405       case 0xa7:
1406         return bidi::kind::RLI;
1407       case 0xa8:
1408         return bidi::kind::FSI;
1409       case 0xa9:
1410         return bidi::kind::PDI;
1411       default:
1412         break;
1413       }
1414
1415   return bidi::kind::NONE;
1416 }
1417
1418 /* Parse a sequence of 3 bytes starting with P and return its bidi code.
1419    If the kind is not NONE, write the location to *OUT.*/
1420
1421 static bidi::kind
1422 get_bidi_utf8 (cpp_reader *pfile, const unsigned char *const p, location_t *out)
1423 {
1424   bidi::kind result = get_bidi_utf8_1 (p);
1425   if (result != bidi::kind::NONE)
1426     {
1427       /* We have a sequence of 3 bytes starting at P.  */
1428       *out = get_location_for_byte_range_in_cur_line (pfile, p, 3);
1429     }
1430   return result;
1431 }
1432
1433 /* Parse a UCN where P points just past \u or \U and return its bidi code.  */
1434
1435 static bidi::kind
1436 get_bidi_ucn_1 (const unsigned char *p, bool is_U, const unsigned char **end)
1437 {
1438   /* 6.4.3 Universal Character Names
1439       \u hex-quad
1440       \U hex-quad hex-quad
1441       \u { simple-hexadecimal-digit-sequence }
1442      where \unnnn means \U0000nnnn.  */
1443
1444   *end = p + 4;
1445   if (is_U)
1446     {
1447       if (p[0] != '0' || p[1] != '0' || p[2] != '0' || p[3] != '0')
1448         return bidi::kind::NONE;
1449       /* Skip 4B so we can treat \u and \U the same below.  */
1450       p += 4;
1451       *end += 4;
1452     }
1453   else if (p[0] == '{')
1454     {
1455       p++;
1456       while (*p == '0')
1457         p++;
1458       if (p[0] != '2'
1459           || p[1] != '0'
1460           || !ISXDIGIT (p[2])
1461           || !ISXDIGIT (p[3])
1462           || p[4] != '}')
1463         return bidi::kind::NONE;
1464       *end = p + 5;
1465     }
1466
1467   /* All code points we are looking for start with 20xx.  */
1468   if (p[0] != '2' || p[1] != '0')
1469     return bidi::kind::NONE;
1470   else if (p[2] == '2')
1471     switch (p[3])
1472       {
1473       case 'a':
1474       case 'A':
1475         return bidi::kind::LRE;
1476       case 'b':
1477       case 'B':
1478         return bidi::kind::RLE;
1479       case 'c':
1480       case 'C':
1481         return bidi::kind::PDF;
1482       case 'd':
1483       case 'D':
1484         return bidi::kind::LRO;
1485       case 'e':
1486       case 'E':
1487         return bidi::kind::RLO;
1488       default:
1489         break;
1490       }
1491   else if (p[2] == '6')
1492     switch (p[3])
1493       {
1494       case '6':
1495         return bidi::kind::LRI;
1496       case '7':
1497         return bidi::kind::RLI;
1498       case '8':
1499         return bidi::kind::FSI;
1500       case '9':
1501         return bidi::kind::PDI;
1502       default:
1503         break;
1504       }
1505   else if (p[2] == '0')
1506     switch (p[3])
1507       {
1508       case 'e':
1509       case 'E':
1510         return bidi::kind::LTR;
1511       case 'f':
1512       case 'F':
1513         return bidi::kind::RTL;
1514       default:
1515         break;
1516       }
1517
1518   return bidi::kind::NONE;
1519 }
1520
1521 /* Parse a UCN where P points just past \u or \U and return its bidi code.
1522    If the kind is not NONE, write the location to *OUT.  */
1523
1524 static bidi::kind
1525 get_bidi_ucn (cpp_reader *pfile, const unsigned char *p, bool is_U,
1526               location_t *out)
1527 {
1528   const unsigned char *end;
1529   bidi::kind result = get_bidi_ucn_1 (p, is_U, &end);
1530   if (result != bidi::kind::NONE)
1531     {
1532       const unsigned char *start = p - 2;
1533       size_t num_bytes = end - start;
1534       *out = get_location_for_byte_range_in_cur_line (pfile, start, num_bytes);
1535     }
1536   return result;
1537 }
1538
1539 /* Parse a named universal character escape where P points just past \N and
1540    return its bidi code.  If the kind is not NONE, write the location to
1541    *OUT.  */
1542
1543 static bidi::kind
1544 get_bidi_named (cpp_reader *pfile, const unsigned char *p, location_t *out)
1545 {
1546   bidi::kind result = bidi::kind::NONE;
1547   if (*p != '{')
1548     return bidi::kind::NONE;
1549   if (strncmp ((const char *) (p + 1), "LEFT-TO-RIGHT ", 14) == 0)
1550     {
1551       if (strncmp ((const char *) (p + 15), "MARK}", 5) == 0)
1552         result = bidi::kind::LTR;
1553       else if (strncmp ((const char *) (p + 15), "EMBEDDING}", 10) == 0)
1554         result = bidi::kind::LRE;
1555       else if (strncmp ((const char *) (p + 15), "OVERRIDE}", 9) == 0)
1556         result = bidi::kind::LRO;
1557       else if (strncmp ((const char *) (p + 15), "ISOLATE}", 8) == 0)
1558         result = bidi::kind::LRI;
1559     }
1560   else if (strncmp ((const char *) (p + 1), "RIGHT-TO-LEFT ", 14) == 0)
1561     {
1562       if (strncmp ((const char *) (p + 15), "MARK}", 5) == 0)
1563         result = bidi::kind::RTL;
1564       else if (strncmp ((const char *) (p + 15), "EMBEDDING}", 10) == 0)
1565         result = bidi::kind::RLE;
1566       else if (strncmp ((const char *) (p + 15), "OVERRIDE}", 9) == 0)
1567         result = bidi::kind::RLO;
1568       else if (strncmp ((const char *) (p + 15), "ISOLATE}", 8) == 0)
1569         result = bidi::kind::RLI;
1570     }
1571   else if (strncmp ((const char *) (p + 1), "POP DIRECTIONAL ", 16) == 0)
1572     {
1573       if (strncmp ((const char *) (p + 16), "FORMATTING}", 11) == 0)
1574         result = bidi::kind::PDF;
1575       else if (strncmp ((const char *) (p + 16), "ISOLATE}", 8) == 0)
1576         result = bidi::kind::PDI;
1577     }
1578   else if (strncmp ((const char *) (p + 1), "FIRST STRONG ISOLATE}", 21) == 0)
1579     result = bidi::kind::FSI;
1580   if (result != bidi::kind::NONE)
1581     *out = get_location_for_byte_range_in_cur_line (pfile, p - 2,
1582                                                     (strchr ((const char *)
1583                                                              (p + 1), '}')
1584                                                      - (const char *) p)
1585                                                     + 3);
1586   return result;
1587 }
1588
1589 /* Subclass of rich_location for reporting on unpaired UTF-8
1590    bidirectional control character(s).
1591    Escape the source lines on output, and show all unclosed
1592    bidi context, labelling everything.  */
1593
1594 class unpaired_bidi_rich_location : public rich_location
1595 {
1596  public:
1597   class custom_range_label : public range_label
1598   {
1599    public:
1600      label_text get_text (unsigned range_idx) const final override
1601      {
1602        /* range 0 is the primary location; each subsequent range i + 1
1603           is for bidi::vec[i].  */
1604        if (range_idx > 0)
1605          {
1606            const bidi::context &ctxt (bidi::vec[range_idx - 1]);
1607            return label_text::borrow (bidi::to_str (ctxt.m_kind));
1608          }
1609        else
1610          return label_text::borrow (_("end of bidirectional context"));
1611      }
1612   };
1613
1614   unpaired_bidi_rich_location (cpp_reader *pfile, location_t loc)
1615   : rich_location (pfile->line_table, loc, &m_custom_label)
1616   {
1617     set_escape_on_output (true);
1618     for (unsigned i = 0; i < bidi::vec.count (); i++)
1619       add_range (bidi::vec[i].m_loc,
1620                  SHOW_RANGE_WITHOUT_CARET,
1621                  &m_custom_label);
1622   }
1623
1624  private:
1625    custom_range_label m_custom_label;
1626 };
1627
1628 /* We're closing a bidi context, that is, we've encountered a newline,
1629    are closing a C-style comment, or are at the end of a string literal,
1630    character constant, or identifier.  Warn if this context was not
1631    properly terminated by a PDI or PDF.  P points to the last character
1632    in this context.  */
1633
1634 static void
1635 maybe_warn_bidi_on_close (cpp_reader *pfile, const uchar *p)
1636 {
1637   const auto warn_bidi = CPP_OPTION (pfile, cpp_warn_bidirectional);
1638   if (bidi::vec.count () > 0
1639       && (warn_bidi & bidirectional_unpaired
1640           && (!bidi::current_ctx_ucn_p ()
1641               || (warn_bidi & bidirectional_ucn))))
1642     {
1643       const location_t loc
1644         = linemap_position_for_column (pfile->line_table,
1645                                        CPP_BUF_COLUMN (pfile->buffer, p));
1646       unpaired_bidi_rich_location rich_loc (pfile, loc);
1647       /* cpp_callbacks doesn't yet have a way to handle singular vs plural
1648          forms of a diagnostic, so fake it for now.  */
1649       if (bidi::vec.count () > 1)
1650         cpp_warning_at (pfile, CPP_W_BIDIRECTIONAL, &rich_loc,
1651                         "unpaired UTF-8 bidirectional control characters "
1652                         "detected");
1653       else
1654         cpp_warning_at (pfile, CPP_W_BIDIRECTIONAL, &rich_loc,
1655                         "unpaired UTF-8 bidirectional control character "
1656                         "detected");
1657     }
1658   /* We're done with this context.  */
1659   bidi::on_close ();
1660 }
1661
1662 /* We're at the beginning or in the middle of an identifier/comment/string
1663    literal/character constant.  Warn if we've encountered a bidi character.
1664    KIND says which bidi control character it was; UCN_P is true iff this bidi
1665    control character was written as a UCN.  LOC is the location of the
1666    character, but is only valid if KIND != bidi::kind::NONE.  */
1667
1668 static void
1669 maybe_warn_bidi_on_char (cpp_reader *pfile, bidi::kind kind,
1670                          bool ucn_p, location_t loc)
1671 {
1672   if (__builtin_expect (kind == bidi::kind::NONE, 1))
1673     return;
1674
1675   const auto warn_bidi = CPP_OPTION (pfile, cpp_warn_bidirectional);
1676
1677   if (warn_bidi & (bidirectional_unpaired|bidirectional_any))
1678     {
1679       rich_location rich_loc (pfile->line_table, loc);
1680       rich_loc.set_escape_on_output (true);
1681
1682       /* It seems excessive to warn about a PDI/PDF that is closing
1683          an opened context because we've already warned about the
1684          opening character.  Except warn when we have a UCN x UTF-8
1685          mismatch, if UCN checking is enabled.  */
1686       if (kind == bidi::current_ctx ())
1687         {
1688           if (warn_bidi == (bidirectional_unpaired|bidirectional_ucn)
1689               && bidi::current_ctx_ucn_p () != ucn_p)
1690             {
1691               rich_loc.add_range (bidi::current_ctx_loc ());
1692               cpp_warning_at (pfile, CPP_W_BIDIRECTIONAL, &rich_loc,
1693                               "UTF-8 vs UCN mismatch when closing "
1694                               "a context by \"%s\"", bidi::to_str (kind));
1695             }
1696         }
1697       else if (warn_bidi & bidirectional_any
1698                && (!ucn_p || (warn_bidi & bidirectional_ucn)))
1699         {
1700           if (kind == bidi::kind::PDF || kind == bidi::kind::PDI)
1701             cpp_warning_at (pfile, CPP_W_BIDIRECTIONAL, &rich_loc,
1702                             "\"%s\" is closing an unopened context",
1703                             bidi::to_str (kind));
1704           else
1705             cpp_warning_at (pfile, CPP_W_BIDIRECTIONAL, &rich_loc,
1706                             "found problematic Unicode character \"%s\"",
1707                             bidi::to_str (kind));
1708         }
1709     }
1710   /* We're done with this context.  */
1711   bidi::on_char (kind, ucn_p, loc);
1712 }
1713
/* Byte-value thresholds used when diagnosing invalid UTF-8: a byte in
   the range [utf8_continuation, utf8_signifier), i.e. 0x80-0xBF, is
   accepted as a trailing byte of a multi-byte sequence, and only a
   byte that is at least utf8_signifier can begin such a sequence.  */
static const cppchar_t utf8_continuation = 0x80;
static const cppchar_t utf8_signifier = 0xC0;
1716
1717 /* Emit -Winvalid-utf8 warning on invalid UTF-8 character starting
1718    at PFILE->buffer->cur.  Return a pointer after the diagnosed
1719    invalid character.  */
1720
1721 static const uchar *
1722 _cpp_warn_invalid_utf8 (cpp_reader *pfile)
1723 {
1724   cpp_buffer *buffer = pfile->buffer;
1725   const uchar *cur = buffer->cur;
1726   bool pedantic = (CPP_PEDANTIC (pfile)
1727                    && CPP_OPTION (pfile, cpp_warn_invalid_utf8) == 2);
1728
1729   if (cur[0] < utf8_signifier
1730       || cur[1] < utf8_continuation || cur[1] >= utf8_signifier)
1731     {
1732       if (pedantic)
1733         cpp_error_with_line (pfile, CPP_DL_PEDWARN,
1734                              pfile->line_table->highest_line,
1735                              CPP_BUF_COL (buffer),
1736                              "invalid UTF-8 character <%x>",
1737                              cur[0]);
1738       else
1739         cpp_warning_with_line (pfile, CPP_W_INVALID_UTF8,
1740                                pfile->line_table->highest_line,
1741                                CPP_BUF_COL (buffer),
1742                                "invalid UTF-8 character <%x>",
1743                                cur[0]);
1744       return cur + 1;
1745     }
1746   else if (cur[2] < utf8_continuation || cur[2] >= utf8_signifier)
1747     {
1748       if (pedantic)
1749         cpp_error_with_line (pfile, CPP_DL_PEDWARN,
1750                              pfile->line_table->highest_line,
1751                              CPP_BUF_COL (buffer),
1752                              "invalid UTF-8 character <%x><%x>",
1753                              cur[0], cur[1]);
1754       else
1755         cpp_warning_with_line (pfile, CPP_W_INVALID_UTF8,
1756                                pfile->line_table->highest_line,
1757                                CPP_BUF_COL (buffer),
1758                                "invalid UTF-8 character <%x><%x>",
1759                                cur[0], cur[1]);
1760       return cur + 2;
1761     }
1762   else if (cur[3] < utf8_continuation || cur[3] >= utf8_signifier)
1763     {
1764       if (pedantic)
1765         cpp_error_with_line (pfile, CPP_DL_PEDWARN,
1766                              pfile->line_table->highest_line,
1767                              CPP_BUF_COL (buffer),
1768                              "invalid UTF-8 character <%x><%x><%x>",
1769                              cur[0], cur[1], cur[2]);
1770       else
1771         cpp_warning_with_line (pfile, CPP_W_INVALID_UTF8,
1772                                pfile->line_table->highest_line,
1773                                CPP_BUF_COL (buffer),
1774                                "invalid UTF-8 character <%x><%x><%x>",
1775                                cur[0], cur[1], cur[2]);
1776       return cur + 3;
1777     }
1778   else
1779     {
1780       if (pedantic)
1781         cpp_error_with_line (pfile, CPP_DL_PEDWARN,
1782                              pfile->line_table->highest_line,
1783                              CPP_BUF_COL (buffer),
1784                              "invalid UTF-8 character <%x><%x><%x><%x>",
1785                              cur[0], cur[1], cur[2], cur[3]);
1786       else
1787         cpp_warning_with_line (pfile, CPP_W_INVALID_UTF8,
1788                                pfile->line_table->highest_line,
1789                                CPP_BUF_COL (buffer),
1790                                "invalid UTF-8 character <%x><%x><%x><%x>",
1791                                cur[0], cur[1], cur[2], cur[3]);
1792       return cur + 4;
1793     }
1794 }
1795
1796 /* Helper function of *skip_*_comment and lex*_string.  For C,
1797    character at CUR[-1] with MSB set handle -Wbidi-chars* and
1798    -Winvalid-utf8 diagnostics and return pointer to first character
1799    that should be processed next.  */
1800
1801 static inline const uchar *
1802 _cpp_handle_multibyte_utf8 (cpp_reader *pfile, uchar c,
1803                             const uchar *cur, bool warn_bidi_p,
1804                             bool warn_invalid_utf8_p)
1805 {
1806   /* If this is a beginning of a UTF-8 encoding, it might be
1807      a bidirectional control character.  */
1808   if (c == bidi::utf8_start && warn_bidi_p)
1809     {
1810       location_t loc;
1811       bidi::kind kind = get_bidi_utf8 (pfile, cur - 1, &loc);
1812       maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/false, loc);
1813     }
1814   if (!warn_invalid_utf8_p)
1815     return cur;
1816   if (c >= utf8_signifier)
1817     {
1818       cppchar_t s;
1819       const uchar *pstr = cur - 1;
1820       if (_cpp_valid_utf8 (pfile, &pstr, pfile->buffer->rlimit, 0, NULL, &s)
1821           && s <= UCS_LIMIT)
1822         return pstr;
1823     }
1824   pfile->buffer->cur = cur - 1;
1825   return _cpp_warn_invalid_utf8 (pfile);
1826 }
1827
1828 /* Skip a C-style block comment.  We find the end of the comment by
1829    seeing if an asterisk is before every '/' we encounter.  Returns
1830    nonzero if comment terminated by EOF, zero otherwise.
1831
1832    Buffer->cur points to the initial asterisk of the comment.  */
1833 bool
1834 _cpp_skip_block_comment (cpp_reader *pfile)
1835 {
1836   cpp_buffer *buffer = pfile->buffer;
1837   const uchar *cur = buffer->cur;
1838   uchar c;
1839   const bool warn_bidi_p = pfile->warn_bidi_p ();
1840   const bool warn_invalid_utf8_p = CPP_OPTION (pfile, cpp_warn_invalid_utf8);
1841   const bool warn_bidi_or_invalid_utf8_p = warn_bidi_p | warn_invalid_utf8_p;
1842
1843   cur++;
1844   if (*cur == '/')
1845     cur++;
1846
1847   for (;;)
1848     {
1849       /* People like decorating comments with '*', so check for '/'
1850          instead for efficiency.  */
1851       c = *cur++;
1852
1853       if (c == '/')
1854         {
1855           if (cur[-2] == '*')
1856             {
1857               if (warn_bidi_p)
1858                 maybe_warn_bidi_on_close (pfile, cur);
1859               break;
1860             }
1861
1862           /* Warn about potential nested comments, but not if the '/'
1863              comes immediately before the true comment delimiter.
1864              Don't bother to get it right across escaped newlines.  */
1865           if (CPP_OPTION (pfile, warn_comments)
1866               && cur[0] == '*' && cur[1] != '/')
1867             {
1868               buffer->cur = cur;
1869               cpp_warning_with_line (pfile, CPP_W_COMMENTS,
1870                                      pfile->line_table->highest_line,
1871                                      CPP_BUF_COL (buffer),
1872                                      "\"/*\" within comment");
1873             }
1874         }
1875       else if (c == '\n')
1876         {
1877           unsigned int cols;
1878           buffer->cur = cur - 1;
1879           if (warn_bidi_p)
1880             maybe_warn_bidi_on_close (pfile, cur);
1881           _cpp_process_line_notes (pfile, true);
1882           if (buffer->next_line >= buffer->rlimit)
1883             return true;
1884           _cpp_clean_line (pfile);
1885
1886           cols = buffer->next_line - buffer->line_base;
1887           CPP_INCREMENT_LINE (pfile, cols);
1888
1889           cur = buffer->cur;
1890         }
1891       else if (__builtin_expect (c >= utf8_continuation, 0)
1892                && warn_bidi_or_invalid_utf8_p)
1893         cur = _cpp_handle_multibyte_utf8 (pfile, c, cur, warn_bidi_p,
1894                                           warn_invalid_utf8_p);
1895     }
1896
1897   buffer->cur = cur;
1898   _cpp_process_line_notes (pfile, true);
1899   return false;
1900 }
1901
1902 /* Skip a C++ line comment, leaving buffer->cur pointing to the
1903    terminating newline.  Handles escaped newlines.  Returns nonzero
1904    if a multiline comment.  */
1905 static int
1906 skip_line_comment (cpp_reader *pfile)
1907 {
1908   cpp_buffer *buffer = pfile->buffer;
1909   location_t orig_line = pfile->line_table->highest_line;
1910   const bool warn_bidi_p = pfile->warn_bidi_p ();
1911   const bool warn_invalid_utf8_p = CPP_OPTION (pfile, cpp_warn_invalid_utf8);
1912   const bool warn_bidi_or_invalid_utf8_p = warn_bidi_p | warn_invalid_utf8_p;
1913
1914   if (!warn_bidi_or_invalid_utf8_p)
1915     while (*buffer->cur != '\n')
1916       buffer->cur++;
1917   else if (!warn_invalid_utf8_p)
1918     {
1919       while (*buffer->cur != '\n'
1920              && *buffer->cur != bidi::utf8_start)
1921         buffer->cur++;
1922       if (__builtin_expect (*buffer->cur == bidi::utf8_start, 0))
1923         {
1924           while (*buffer->cur != '\n')
1925             {
1926               if (__builtin_expect (*buffer->cur == bidi::utf8_start, 0))
1927                 {
1928                   location_t loc;
1929                   bidi::kind kind = get_bidi_utf8 (pfile, buffer->cur, &loc);
1930                   maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/false, loc);
1931                 }
1932               buffer->cur++;
1933             }
1934           maybe_warn_bidi_on_close (pfile, buffer->cur);
1935         }
1936     }
1937   else
1938     {
1939       while (*buffer->cur != '\n')
1940         {
1941           if (*buffer->cur < utf8_continuation)
1942             {
1943               buffer->cur++;
1944               continue;
1945             }
1946           buffer->cur
1947             = _cpp_handle_multibyte_utf8 (pfile, *buffer->cur, buffer->cur + 1,
1948                                           warn_bidi_p, warn_invalid_utf8_p);
1949         }
1950       if (warn_bidi_p)
1951         maybe_warn_bidi_on_close (pfile, buffer->cur);
1952     }
1953
1954   _cpp_process_line_notes (pfile, true);
1955   return orig_line != pfile->line_table->highest_line;
1956 }
1957
1958 /* Skips whitespace, saving the next non-whitespace character.  */
1959 static void
1960 skip_whitespace (cpp_reader *pfile, cppchar_t c)
1961 {
1962   cpp_buffer *buffer = pfile->buffer;
1963   bool saw_NUL = false;
1964
1965   do
1966     {
1967       /* Horizontal space always OK.  */
1968       if (c == ' ' || c == '\t')
1969         ;
1970       /* Just \f \v or \0 left.  */
1971       else if (c == '\0')
1972         saw_NUL = true;
1973       else if (pfile->state.in_directive && CPP_PEDANTIC (pfile))
1974         cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line,
1975                              CPP_BUF_COL (buffer),
1976                              "%s in preprocessing directive",
1977                              c == '\f' ? "form feed" : "vertical tab");
1978
1979       c = *buffer->cur++;
1980     }
1981   /* We only want non-vertical space, i.e. ' ' \t \f \v \0.  */
1982   while (is_nvspace (c));
1983
1984   if (saw_NUL)
1985     {
1986       encoding_rich_location rich_loc (pfile);
1987       cpp_error_at (pfile, CPP_DL_WARNING, &rich_loc,
1988                     "null character(s) ignored");
1989     }
1990
1991   buffer->cur--;
1992 }
1993
1994 /* See if the characters of a number token are valid in a name (no
1995    '.', '+' or '-').  */
1996 static int
1997 name_p (cpp_reader *pfile, const cpp_string *string)
1998 {
1999   unsigned int i;
2000
2001   for (i = 0; i < string->len; i++)
2002     if (!is_idchar (string->text[i]))
2003       return 0;
2004
2005   return 1;
2006 }
2007
2008 /* After parsing an identifier or other sequence, produce a warning about
2009    sequences not in NFC/NFKC.  */
2010 static void
2011 warn_about_normalization (cpp_reader *pfile,
2012                           const cpp_token *token,
2013                           const struct normalize_state *s,
2014                           bool identifier)
2015 {
2016   if (CPP_OPTION (pfile, warn_normalize) < NORMALIZE_STATE_RESULT (s)
2017       && !pfile->state.skipping)
2018     {
2019       location_t loc = token->src_loc;
2020
2021       /* If possible, create a location range for the token.  */
2022       if (loc >= RESERVED_LOCATION_COUNT
2023           && token->type != CPP_EOF
2024           /* There must be no line notes to process.  */
2025           && (!(pfile->buffer->cur
2026                 >= pfile->buffer->notes[pfile->buffer->cur_note].pos
2027                 && !pfile->overlaid_buffer)))
2028         {
2029           source_range tok_range;
2030           tok_range.m_start = loc;
2031           tok_range.m_finish
2032             = linemap_position_for_column (pfile->line_table,
2033                                            CPP_BUF_COLUMN (pfile->buffer,
2034                                                            pfile->buffer->cur));
2035           loc = pfile->line_table->get_or_create_combined_loc (loc, tok_range,
2036                                                                nullptr, 0);
2037         }
2038
2039       encoding_rich_location rich_loc (pfile, loc);
2040
2041       /* Make sure that the token is printed using UCNs, even
2042          if we'd otherwise happily print UTF-8.  */
2043       unsigned char *buf = XNEWVEC (unsigned char, cpp_token_len (token));
2044       size_t sz;
2045
2046       sz = cpp_spell_token (pfile, token, buf, false) - buf;
2047       if (NORMALIZE_STATE_RESULT (s) == normalized_C)
2048         cpp_warning_at (pfile, CPP_W_NORMALIZE, &rich_loc,
2049                         "`%.*s' is not in NFKC", (int) sz, buf);
2050       else if (identifier && CPP_OPTION (pfile, xid_identifiers))
2051         cpp_pedwarning_at (pfile, CPP_W_NORMALIZE, &rich_loc,
2052                                   "`%.*s' is not in NFC", (int) sz, buf);
2053       else
2054         cpp_warning_at (pfile, CPP_W_NORMALIZE, &rich_loc,
2055                         "`%.*s' is not in NFC", (int) sz, buf);
2056       free (buf);
2057     }
2058 }
2059
2060 /* Returns TRUE if the byte sequence starting at buffer->cur is a valid
2061    extended character in an identifier.  If FIRST is TRUE, then the character
2062    must be valid at the beginning of an identifier as well.  If the return
2063    value is TRUE, then pfile->buffer->cur has been moved to point to the next
2064    byte after the extended character.  */
2065
2066 static bool
2067 forms_identifier_p (cpp_reader *pfile, int first,
2068                     struct normalize_state *state)
2069 {
2070   cpp_buffer *buffer = pfile->buffer;
2071   const bool warn_bidi_p = pfile->warn_bidi_p ();
2072
2073   if (*buffer->cur == '$')
2074     {
2075       if (!CPP_OPTION (pfile, dollars_in_ident))
2076         return false;
2077
2078       buffer->cur++;
2079       if (CPP_OPTION (pfile, warn_dollars) && !pfile->state.skipping)
2080         {
2081           CPP_OPTION (pfile, warn_dollars) = 0;
2082           cpp_error (pfile, CPP_DL_PEDWARN, "'$' in identifier or number");
2083         }
2084
2085       return true;
2086     }
2087
2088   /* Is this a syntactically valid UCN or a valid UTF-8 char?  */
2089   if (CPP_OPTION (pfile, extended_identifiers))
2090     {
2091       cppchar_t s;
2092       if (*buffer->cur >= utf8_signifier)
2093         {
2094           if (__builtin_expect (*buffer->cur == bidi::utf8_start, 0)
2095               && warn_bidi_p)
2096             {
2097               location_t loc;
2098               bidi::kind kind = get_bidi_utf8 (pfile, buffer->cur, &loc);
2099               maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/false, loc);
2100             }
2101           if (_cpp_valid_utf8 (pfile, &buffer->cur, buffer->rlimit, 1 + !first,
2102                                state, &s))
2103             return true;
2104         }
2105       else if (*buffer->cur == '\\'
2106                && (buffer->cur[1] == 'u'
2107                    || buffer->cur[1] == 'U'
2108                    || buffer->cur[1] == 'N'))
2109         {
2110           buffer->cur += 2;
2111           if (warn_bidi_p)
2112             {
2113               location_t loc;
2114               bidi::kind kind;
2115               if (buffer->cur[-1] == 'N')
2116                 kind = get_bidi_named (pfile, buffer->cur, &loc);
2117               else
2118                 kind = get_bidi_ucn (pfile, buffer->cur,
2119                                      buffer->cur[-1] == 'U', &loc);
2120               maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/true, loc);
2121             }
2122           if (_cpp_valid_ucn (pfile, &buffer->cur, buffer->rlimit, 1 + !first,
2123                               state, &s, NULL, NULL))
2124             return true;
2125           buffer->cur -= 2;
2126         }
2127     }
2128
2129   return false;
2130 }
2131
2132 /* Helper function to issue error about improper __VA_OPT__ use.  */
2133 static void
2134 maybe_va_opt_error (cpp_reader *pfile)
2135 {
2136   if (CPP_PEDANTIC (pfile) && !CPP_OPTION (pfile, va_opt))
2137     {
2138       /* __VA_OPT__ should not be accepted at all, but allow it in
2139          system headers.  */
2140       if (!_cpp_in_system_header (pfile))
2141         {
2142           if (CPP_OPTION (pfile, cplusplus))
2143             cpp_error (pfile, CPP_DL_PEDWARN,
2144                        "__VA_OPT__ is not available until C++20");
2145           else
2146             cpp_error (pfile, CPP_DL_PEDWARN,
2147                        "__VA_OPT__ is not available until C23");
2148         }
2149     }
2150   else if (!pfile->state.va_args_ok)
2151     {
2152       /* __VA_OPT__ should only appear in the replacement list of a
2153          variadic macro.  */
2154       cpp_error (pfile, CPP_DL_PEDWARN,
2155                  "__VA_OPT__ can only appear in the expansion"
2156                  " of a C++20 variadic macro");
2157     }
2158 }
2159
2160 /* Helper function to perform diagnostics that are needed (rarely)
2161    when an identifier is lexed.  */
2162 static void
2163 identifier_diagnostics_on_lex (cpp_reader *pfile, cpp_hashnode *node)
2164 {
2165   if (__builtin_expect (!(node->flags & NODE_DIAGNOSTIC)
2166                         || pfile->state.skipping, 1))
2167     return;
2168
2169   /* It is allowed to poison the same identifier twice.  */
2170   if ((node->flags & NODE_POISONED) && !pfile->state.poisoned_ok)
2171     {
2172       cpp_error (pfile, CPP_DL_ERROR, "attempt to use poisoned \"%s\"",
2173                  NODE_NAME (node));
2174       const auto data = (cpp_hashnode_extra *)
2175         ht_lookup (pfile->extra_hash_table, node->ident, HT_NO_INSERT);
2176       if (data && data->poisoned_loc)
2177         cpp_error_at (pfile, CPP_DL_NOTE, data->poisoned_loc, "poisoned here");
2178     }
2179
2180   /* Constraint 6.10.3.5: __VA_ARGS__ should only appear in the
2181      replacement list of a variadic macro.  */
2182   if (node == pfile->spec_nodes.n__VA_ARGS__
2183       && !pfile->state.va_args_ok)
2184     {
2185       if (CPP_OPTION (pfile, cplusplus))
2186         cpp_error (pfile, CPP_DL_PEDWARN,
2187                    "__VA_ARGS__ can only appear in the expansion"
2188                    " of a C++11 variadic macro");
2189       else
2190         cpp_error (pfile, CPP_DL_PEDWARN,
2191                    "__VA_ARGS__ can only appear in the expansion"
2192                    " of a C99 variadic macro");
2193     }
2194
2195   /* __VA_OPT__ should only appear in the replacement list of a
2196      variadic macro.  */
2197   if (node == pfile->spec_nodes.n__VA_OPT__)
2198     maybe_va_opt_error (pfile);
2199
2200   /* For -Wc++-compat, warn about use of C++ named operators.  */
2201   if (node->flags & NODE_WARN_OPERATOR)
2202     cpp_warning (pfile, CPP_W_CXX_OPERATOR_NAMES,
2203                  "identifier \"%s\" is a special operator name in C++",
2204                  NODE_NAME (node));
2205 }
2206
2207 /* Helper function to get the cpp_hashnode of the identifier BASE.  */
2208 static cpp_hashnode *
2209 lex_identifier_intern (cpp_reader *pfile, const uchar *base)
2210 {
2211   cpp_hashnode *result;
2212   const uchar *cur;
2213   unsigned int len;
2214   unsigned int hash = HT_HASHSTEP (0, *base);
2215
2216   cur = base + 1;
2217   while (ISIDNUM (*cur))
2218     {
2219       hash = HT_HASHSTEP (hash, *cur);
2220       cur++;
2221     }
2222   len = cur - base;
2223   hash = HT_HASHFINISH (hash, len);
2224   result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
2225                                               base, len, hash, HT_ALLOC));
2226   identifier_diagnostics_on_lex (pfile, result);
2227   return result;
2228 }
2229
2230 /* Get the cpp_hashnode of an identifier specified by NAME in
2231    the current cpp_reader object.  If none is found, NULL is returned.  */
2232 cpp_hashnode *
2233 _cpp_lex_identifier (cpp_reader *pfile, const char *name)
2234 {
2235   cpp_hashnode *result;
2236   result = lex_identifier_intern (pfile, (uchar *) name);
2237   return result;
2238 }
2239
2240 /* Lex an identifier starting at BASE.  BUFFER->CUR is expected to point
2241    one past the first character at BASE, which may be a (possibly multi-byte)
2242    character if STARTS_UCN is true.  */
2243 static cpp_hashnode *
2244 lex_identifier (cpp_reader *pfile, const uchar *base, bool starts_ucn,
2245                 struct normalize_state *nst, cpp_hashnode **spelling)
2246 {
2247   cpp_hashnode *result;
2248   const uchar *cur;
2249   unsigned int len;
2250   unsigned int hash = HT_HASHSTEP (0, *base);
2251   const bool warn_bidi_p = pfile->warn_bidi_p ();
2252
2253   cur = pfile->buffer->cur;
2254   if (! starts_ucn)
2255     {
2256       while (ISIDNUM (*cur))
2257         {
2258           hash = HT_HASHSTEP (hash, *cur);
2259           cur++;
2260         }
2261       NORMALIZE_STATE_UPDATE_IDNUM (nst, *(cur - 1));
2262     }
2263   pfile->buffer->cur = cur;
2264   if (starts_ucn || forms_identifier_p (pfile, false, nst))
2265     {
2266       /* Slower version for identifiers containing UCNs
2267          or extended chars (including $).  */
2268       do {
2269         while (ISIDNUM (*pfile->buffer->cur))
2270           {
2271             NORMALIZE_STATE_UPDATE_IDNUM (nst, *pfile->buffer->cur);
2272             pfile->buffer->cur++;
2273           }
2274       } while (forms_identifier_p (pfile, false, nst));
2275       if (warn_bidi_p)
2276         maybe_warn_bidi_on_close (pfile, pfile->buffer->cur);
2277       result = _cpp_interpret_identifier (pfile, base,
2278                                           pfile->buffer->cur - base);
2279       *spelling = cpp_lookup (pfile, base, pfile->buffer->cur - base);
2280     }
2281   else
2282     {
2283       len = cur - base;
2284       hash = HT_HASHFINISH (hash, len);
2285
2286       result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
2287                                                   base, len, hash, HT_ALLOC));
2288       *spelling = result;
2289     }
2290
2291   return result;
2292 }
2293
2294 /* Struct to hold the return value of the scan_cur_identifier () helper
2295    function below.  */
2296
2297 struct scan_id_result
2298 {
2299   cpp_hashnode *node;
2300   normalize_state nst;
2301
2302   scan_id_result ()
2303     : node (nullptr)
2304   {
2305     nst = INITIAL_NORMALIZE_STATE;
2306   }
2307
2308   explicit operator bool () const { return node; }
2309 };
2310
2311 /* Helper function to scan an entire identifier beginning at
2312    pfile->buffer->cur, and possibly containing extended characters (UCNs
2313    and/or UTF-8).  Returns the cpp_hashnode for the identifier on success, or
2314    else nullptr, as well as a normalize_state so that normalization warnings
2315    may be issued once the token lexing is complete.  */
2316
2317 static scan_id_result
2318 scan_cur_identifier (cpp_reader *pfile)
2319 {
2320   const auto buffer = pfile->buffer;
2321   const auto begin = buffer->cur;
2322   scan_id_result result;
2323   if (ISIDST (*buffer->cur))
2324     {
2325       ++buffer->cur;
2326       cpp_hashnode *ignore;
2327       result.node = lex_identifier (pfile, begin, false, &result.nst, &ignore);
2328     }
2329   else if (forms_identifier_p (pfile, true, &result.nst))
2330     {
2331       /* buffer->cur has been moved already by the call
2332          to forms_identifier_p.  */
2333       cpp_hashnode *ignore;
2334       result.node = lex_identifier (pfile, begin, true, &result.nst, &ignore);
2335     }
2336   return result;
2337 }
2338
2339 /* Lex a number to NUMBER starting at BUFFER->CUR - 1.  */
2340 static void
2341 lex_number (cpp_reader *pfile, cpp_string *number,
2342             struct normalize_state *nst)
2343 {
2344   const uchar *cur;
2345   const uchar *base;
2346   uchar *dest;
2347
2348   base = pfile->buffer->cur - 1;
2349   do
2350     {
2351       const uchar *adj_digit_sep = NULL;
2352       cur = pfile->buffer->cur;
2353
2354       /* N.B. ISIDNUM does not include $.  */
2355       while (ISIDNUM (*cur)
2356              || (*cur == '.' && !DIGIT_SEP (cur[-1]))
2357              || DIGIT_SEP (*cur)
2358              || (VALID_SIGN (*cur, cur[-1]) && !DIGIT_SEP (cur[-2])))
2359         {
2360           NORMALIZE_STATE_UPDATE_IDNUM (nst, *cur);
2361           /* Adjacent digit separators do not form part of the pp-number syntax.
2362              However, they can safely be diagnosed here as an error, since '' is
2363              not a valid preprocessing token.  */
2364           if (DIGIT_SEP (*cur) && DIGIT_SEP (cur[-1]) && !adj_digit_sep)
2365             adj_digit_sep = cur;
2366           cur++;
2367         }
2368       /* A number can't end with a digit separator.  */
2369       while (cur > pfile->buffer->cur && DIGIT_SEP (cur[-1]))
2370         --cur;
2371       if (adj_digit_sep && adj_digit_sep < cur)
2372         cpp_error (pfile, CPP_DL_ERROR, "adjacent digit separators");
2373
2374       pfile->buffer->cur = cur;
2375     }
2376   while (forms_identifier_p (pfile, false, nst));
2377
2378   number->len = cur - base;
2379   dest = _cpp_unaligned_alloc (pfile, number->len + 1);
2380   memcpy (dest, base, number->len);
2381   dest[number->len] = '\0';
2382   number->text = dest;
2383 }
2384
2385 /* Create a token of type TYPE with a literal spelling.  */
2386 static void
2387 create_literal (cpp_reader *pfile, cpp_token *token, const uchar *base,
2388                 unsigned int len, enum cpp_ttype type)
2389 {
2390   token->type = type;
2391   token->val.str.len = len;
2392   token->val.str.text = cpp_alloc_token_string (pfile, base, len);
2393 }
2394
2395 /* Like create_literal(), but construct it from two separate strings
2396    which are concatenated.  LEN2 may be 0 if no second string is
2397    required.  */
2398 static void
2399 create_literal2 (cpp_reader *pfile, cpp_token *token, const uchar *base1,
2400                  unsigned int len1, const uchar *base2, unsigned int len2,
2401                  enum cpp_ttype type)
2402 {
2403   token->type = type;
2404   token->val.str.len = len1 + len2;
2405   uchar *const dest = _cpp_unaligned_alloc (pfile, len1 + len2 + 1);
2406   memcpy (dest, base1, len1);
2407   if (len2)
2408     memcpy (dest+len1, base2, len2);
2409   dest[len1 + len2] = 0;
2410   token->val.str.text = dest;
2411 }
2412
2413 const uchar *
2414 cpp_alloc_token_string (cpp_reader *pfile,
2415                         const unsigned char *ptr, unsigned len)
2416 {
2417   uchar *dest = _cpp_unaligned_alloc (pfile, len + 1);
2418
2419   dest[len] = 0;
2420   memcpy (dest, ptr, len);
2421   return dest;
2422 }
2423
2424 /* A pair of raw buffer pointers.  The currently open one is [1], the
2425    first one is [0].  Used for string literal lexing.  */
2426 struct lit_accum {
2427   _cpp_buff *first;
2428   _cpp_buff *last;
2429   const uchar *rpos;
2430   size_t accum;
2431
2432   lit_accum ()
2433     : first (NULL), last (NULL), rpos (0), accum (0)
2434   {
2435   }
2436
2437   void append (cpp_reader *, const uchar *, size_t);
2438
2439   void read_begin (cpp_reader *);
2440   bool reading_p () const
2441   {
2442     return rpos != NULL;
2443   }
2444   char read_char ()
2445   {
2446     char c = *rpos++;
2447     if (rpos == BUFF_FRONT (last))
2448       rpos = NULL;
2449     return c;
2450   }
2451
2452   void create_literal2 (cpp_reader *pfile, cpp_token *token,
2453                         const uchar *base1, unsigned int len1,
2454                         const uchar *base2, unsigned int len2,
2455                         enum cpp_ttype type);
2456 };
2457
2458 /* Subroutine of lex_raw_string: Append LEN chars from BASE to the buffer
2459    sequence from *FIRST_BUFF_P to LAST_BUFF_P.  */
2460
2461 void
2462 lit_accum::append (cpp_reader *pfile, const uchar *base, size_t len)
2463 {
2464   if (!last)
2465     /* Starting.  */
2466     first = last = _cpp_get_buff (pfile, len);
2467   else if (len > BUFF_ROOM (last))
2468     {
2469       /* There is insufficient room in the buffer.  Copy what we can,
2470          and then either extend or create a new one.  */
2471       size_t room = BUFF_ROOM (last);
2472       memcpy (BUFF_FRONT (last), base, room);
2473       BUFF_FRONT (last) += room;
2474       base += room;
2475       len -= room;
2476       accum += room;
2477
2478       gcc_checking_assert (!rpos);
2479
2480       last = _cpp_append_extend_buff (pfile, last, len);
2481     }
2482
2483   memcpy (BUFF_FRONT (last), base, len);
2484   BUFF_FRONT (last) += len;
2485   accum += len;
2486 }
2487
2488 void
2489 lit_accum::read_begin (cpp_reader *pfile)
2490 {
2491   /* We never accumulate more than 4 chars to read.  */
2492   if (BUFF_ROOM (last) < 4)
2493
2494     last = _cpp_append_extend_buff (pfile, last, 4);
2495   rpos = BUFF_FRONT (last);
2496 }
2497
2498 /* Helper function to check if a string format macro, say from inttypes.h, is
2499    placed touching a string literal, in which case it could be parsed as a C++11
2500    user-defined string literal thus breaking the program.  Return TRUE if the
2501    UDL should be ignored for now and preserved for potential macro
2502    expansion.  */
2503
2504 static bool
2505 maybe_ignore_udl_macro_suffix (cpp_reader *pfile, location_t src_loc,
2506                                const uchar *suffix_begin, cpp_hashnode *node)
2507 {
2508   /* User-defined literals outside of namespace std must start with a single
2509      underscore, so assume anything of that form really is a UDL suffix.
2510      We don't need to worry about UDLs defined inside namespace std because
2511      their names are reserved, so cannot be used as macro names in valid
2512      programs.  */
2513   if ((suffix_begin[0] == '_' && suffix_begin[1] != '_')
2514       || !cpp_macro_p (node))
2515     return false;
2516
2517   /* Maybe raise a warning here; caller should arrange not to consume
2518      the tokens.  */
2519   if (CPP_OPTION (pfile, warn_literal_suffix) && !pfile->state.skipping)
2520     cpp_warning_with_line (pfile, CPP_W_LITERAL_SUFFIX, src_loc, 0,
2521                            "invalid suffix on literal; C++11 requires a space "
2522                            "between literal and string macro");
2523   return true;
2524 }
2525
2526 /* Like create_literal2(), but also prepend all the accumulated data from
2527    the lit_accum struct.  */
2528 void
2529 lit_accum::create_literal2 (cpp_reader *pfile, cpp_token *token,
2530                             const uchar *base1, unsigned int len1,
2531                             const uchar *base2, unsigned int len2,
2532                             enum cpp_ttype type)
2533 {
2534   const unsigned int tot_len = accum + len1 + len2;
2535   uchar *dest = _cpp_unaligned_alloc (pfile, tot_len + 1);
2536   token->type = type;
2537   token->val.str.len = tot_len;
2538   token->val.str.text = dest;
2539   for (_cpp_buff *buf = first; buf; buf = buf->next)
2540     {
2541       size_t len = BUFF_FRONT (buf) - buf->base;
2542       memcpy (dest, buf->base, len);
2543       dest += len;
2544     }
2545   memcpy (dest, base1, len1);
2546   dest += len1;
2547   if (len2)
2548     memcpy (dest, base2, len2);
2549   dest += len2;
2550   *dest = '\0';
2551 }
2552
2553 /* Lexes a raw string.  The stored string contains the spelling,
2554    including double quotes, delimiter string, '(' and ')', any leading
2555    'L', 'u', 'U' or 'u8' and 'R' modifier.  The created token contains
2556    the type of the literal, or CPP_OTHER if it was not properly
2557    terminated.
2558
2559    BASE is the start of the token.  Updates pfile->buffer->cur to just
2560    after the lexed string.
2561
2562    The spelling is NUL-terminated, but it is not guaranteed that this
2563    is the first NUL since embedded NULs are preserved.  */
2564
2565 static void
2566 lex_raw_string (cpp_reader *pfile, cpp_token *token, const uchar *base)
2567 {
2568   const uchar *pos = base;
2569   const bool warn_bidi_p = pfile->warn_bidi_p ();
2570   const bool warn_invalid_utf8_p = CPP_OPTION (pfile, cpp_warn_invalid_utf8);
2571   const bool warn_bidi_or_invalid_utf8_p = warn_bidi_p | warn_invalid_utf8_p;
2572
2573   /* 'tis a pity this information isn't passed down from the lexer's
2574      initial categorization of the token.  */
2575   enum cpp_ttype type = CPP_STRING;
2576
2577   if (*pos == 'L')
2578     {
2579       type = CPP_WSTRING;
2580       pos++;
2581     }
2582   else if (*pos == 'U')
2583     {
2584       type = CPP_STRING32;
2585       pos++;
2586     }
2587   else if (*pos == 'u')
2588     {
2589       if (pos[1] == '8')
2590         {
2591           type = CPP_UTF8STRING;
2592           pos++;
2593         }
2594       else
2595         type = CPP_STRING16;
2596       pos++;
2597     }
2598
2599   gcc_checking_assert (pos[0] == 'R' && pos[1] == '"');
2600   pos += 2;
2601
2602   _cpp_line_note *note = &pfile->buffer->notes[pfile->buffer->cur_note];
2603
2604   /* Skip notes before the ".  */
2605   while (note->pos < pos)
2606     ++note;
2607
2608   lit_accum accum;
2609
2610   uchar prefix[17];
2611   unsigned prefix_len = 0;
2612   enum Phase
2613   {
2614    PHASE_PREFIX = -2,
2615    PHASE_NONE = -1,
2616    PHASE_SUFFIX = 0
2617   } phase = PHASE_PREFIX;
2618
2619   for (;;)
2620     {
2621       gcc_checking_assert (note->pos >= pos);
2622
2623       /* Undo any escaped newlines and trigraphs.  */
2624       if (!accum.reading_p () && note->pos == pos)
2625         switch (note->type)
2626           {
2627           case '\\':
2628           case ' ':
2629             /* Restore backslash followed by newline.  */
2630             accum.append (pfile, base, pos - base);
2631             base = pos;
2632             accum.read_begin (pfile);
2633             accum.append (pfile, UC"\\", 1);
2634
2635           after_backslash:
2636             if (note->type == ' ')
2637               /* GNU backslash whitespace newline extension.  FIXME
2638                  could be any sequence of non-vertical space.  When we
2639                  can properly restore any such sequence, we should
2640                  mark this note as handled so _cpp_process_line_notes
2641                  doesn't warn.  */
2642               accum.append (pfile, UC" ", 1);
2643
2644             accum.append (pfile, UC"\n", 1);
2645             note++;
2646             break;
2647
2648           case '\n':
2649             /* This can happen for ??/<NEWLINE> when trigraphs are not
2650                being interpretted.  */
2651             gcc_checking_assert (!CPP_OPTION (pfile, trigraphs));
2652             note->type = 0;
2653             note++;
2654             break;
2655
2656           default:
2657             gcc_checking_assert (_cpp_trigraph_map[note->type]);
2658
2659             /* Don't warn about this trigraph in
2660                _cpp_process_line_notes, since trigraphs show up as
2661                trigraphs in raw strings.  */
2662             uchar type = note->type;
2663             note->type = 0;
2664
2665             if (CPP_OPTION (pfile, trigraphs))
2666               {
2667                 accum.append (pfile, base, pos - base);
2668                 base = pos;
2669                 accum.read_begin (pfile);
2670                 accum.append (pfile, UC"??", 2);
2671                 accum.append (pfile, &type, 1);
2672
2673                 /* ??/ followed by newline gets two line notes, one for
2674                    the trigraph and one for the backslash/newline.  */
2675                 if (type == '/' && note[1].pos == pos)
2676                   {
2677                     note++;
2678                     gcc_assert (note->type == '\\' || note->type == ' ');
2679                     goto after_backslash;
2680                   }
2681                 /* Skip the replacement character.  */
2682                 base = ++pos;
2683               }
2684
2685             note++;
2686             break;
2687           }
2688
2689       /* Now get a char to process.  Either from an expanded note, or
2690          from the line buffer.  */
2691       bool read_note = accum.reading_p ();
2692       char c = read_note ? accum.read_char () : *pos++;
2693
2694       if (phase == PHASE_PREFIX)
2695         {
2696           if (c == '(')
2697             {
2698               /* Done.  */
2699               phase = PHASE_NONE;
2700               prefix[prefix_len++] = '"';
2701             }
2702           else if (prefix_len < 16
2703                    /* Prefix chars are any of the basic character set,
2704                       [lex.charset] except for '
2705                       ()\\\t\v\f\n'. Optimized for a contiguous
2706                       alphabet.  */
2707                    /* Unlike a switch, this collapses down to one or
2708                       two shift and bitmask operations on an ASCII
2709                       system, with an outlier or two.   */
2710                    && (('Z' - 'A' == 25
2711                         ? ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z'))
2712                         : ISIDST (c))
2713                        || (c >= '0' && c <= '9')
2714                        || c == '_' || c == '{' || c == '}'
2715                        || c == '[' || c == ']' || c == '#'
2716                        || c == '<' || c == '>' || c == '%'
2717                        || c == ':' || c == ';' || c == '.' || c == '?'
2718                        || c == '*' || c == '+' || c == '-' || c == '/'
2719                        || c == '^' || c == '&' || c == '|' || c == '~'
2720                        || c == '!' || c == '=' || c == ','
2721                        || c == '"' || c == '\''))
2722             prefix[prefix_len++] = c;
2723           else
2724             {
2725               /* Something is wrong.  */
2726               int col = CPP_BUF_COLUMN (pfile->buffer, pos) + read_note;
2727               if (prefix_len == 16)
2728                 cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc,
2729                                      col, "raw string delimiter longer "
2730                                      "than 16 characters");
2731               else if (c == '\n')
2732                 cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc,
2733                                      col, "invalid new-line in raw "
2734                                      "string delimiter");
2735               else
2736                 cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc,
2737                                      col, "invalid character '%c' in "
2738                                      "raw string delimiter", c);
2739               type = CPP_OTHER;
2740               phase = PHASE_NONE;
2741               /* Continue until we get a close quote, that's probably
2742                  the best failure mode.  */
2743               prefix_len = 0;
2744             }
2745           if (c != '\n')
2746             continue;
2747         }
2748
2749       if (phase != PHASE_NONE)
2750         {
2751           if (prefix[phase] != c)
2752             phase = PHASE_NONE;
2753           else if (unsigned (phase + 1) == prefix_len)
2754             break;
2755           else
2756             {
2757               phase = Phase (phase + 1);
2758               continue;
2759             }
2760         }
2761
2762       if (!prefix_len && c == '"')
2763         /* Failure mode lexing.  */
2764         goto out;
2765       else if (prefix_len && c == ')')
2766         phase = PHASE_SUFFIX;
2767       else if (!read_note && c == '\n')
2768         {
2769           pos--;
2770           pfile->buffer->cur = pos;
2771           if ((pfile->state.in_directive || pfile->state.parsing_args)
2772               && pfile->buffer->next_line >= pfile->buffer->rlimit)
2773             {
2774               cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc, 0,
2775                                    "unterminated raw string");
2776               type = CPP_OTHER;
2777               goto out;
2778             }
2779
2780           accum.append (pfile, base, pos - base + 1);
2781           _cpp_process_line_notes (pfile, false);
2782
2783           if (pfile->buffer->next_line < pfile->buffer->rlimit)
2784             CPP_INCREMENT_LINE (pfile, 0);
2785           pfile->buffer->need_line = true;
2786
2787           if (!get_fresh_line_impl<true> (pfile))
2788             {
2789               /* We ran out of file and failed to get a line.  */
2790               location_t src_loc = token->src_loc;
2791               token->type = CPP_EOF;
2792               /* Tell the compiler the line number of the EOF token.  */
2793               token->src_loc = pfile->line_table->highest_line;
2794               token->flags = BOL;
2795               if (accum.first)
2796                 _cpp_release_buff (pfile, accum.first);
2797               cpp_error_with_line (pfile, CPP_DL_ERROR, src_loc, 0,
2798                                    "unterminated raw string");
2799
2800               /* Now pop the buffer that get_fresh_line_impl() did not.  Popping
2801                  is not safe if processing a directive, however this cannot
2802                  happen as we already checked above that a line would be
2803                  available, and get_fresh_line_impl() can't fail in this
2804                  case.  */
2805               gcc_assert (!pfile->state.in_directive);
2806               _cpp_pop_buffer (pfile);
2807
2808               return;
2809             }
2810
2811           pos = base = pfile->buffer->cur;
2812           note = &pfile->buffer->notes[pfile->buffer->cur_note];
2813         }
2814       else if (__builtin_expect ((unsigned char) c >= utf8_continuation, 0)
2815                && warn_bidi_or_invalid_utf8_p)
2816         pos = _cpp_handle_multibyte_utf8 (pfile, c, pos, warn_bidi_p,
2817                                           warn_invalid_utf8_p);
2818     }
2819
2820   if (warn_bidi_p)
2821     maybe_warn_bidi_on_close (pfile, pos);
2822
2823   if (CPP_OPTION (pfile, user_literals))
2824     {
2825       const uchar *const suffix_begin = pos;
2826       pfile->buffer->cur = pos;
2827
2828       if (const auto sr = scan_cur_identifier (pfile))
2829         {
2830           if (maybe_ignore_udl_macro_suffix (pfile, token->src_loc,
2831                                              suffix_begin, sr.node))
2832               pfile->buffer->cur = suffix_begin;
2833           else
2834             {
2835               type = cpp_userdef_string_add_type (type);
2836               accum.create_literal2 (pfile, token, base, suffix_begin - base,
2837                                      NODE_NAME (sr.node), NODE_LEN (sr.node),
2838                                      type);
2839               if (accum.first)
2840                 _cpp_release_buff (pfile, accum.first);
2841               warn_about_normalization (pfile, token, &sr.nst, true);
2842               return;
2843             }
2844         }
2845     }
2846
2847  out:
2848   pfile->buffer->cur = pos;
2849   if (!accum.accum)
2850     create_literal (pfile, token, base, pos - base, type);
2851   else
2852     {
2853       accum.create_literal2 (pfile, token, base, pos - base, nullptr, 0, type);
2854       _cpp_release_buff (pfile, accum.first);
2855     }
2856 }
2857
2858 /* Lexes a string, character constant, or angle-bracketed header file
2859    name.  The stored string contains the spelling, including opening
2860    quote and any leading 'L', 'u', 'U' or 'u8' and optional
2861    'R' modifier.  It returns the type of the literal, or CPP_OTHER
2862    if it was not properly terminated, or CPP_LESS for an unterminated
2863    header name which must be relexed as normal tokens.
2864
2865    The spelling is NUL-terminated, but it is not guaranteed that this
2866    is the first NUL since embedded NULs are preserved.  */
2867 static void
2868 lex_string (cpp_reader *pfile, cpp_token *token, const uchar *base)
2869 {
2870   bool saw_NUL = false;
2871   const uchar *cur;
2872   cppchar_t terminator;
2873   enum cpp_ttype type;
2874
2875   cur = base;
2876   terminator = *cur++;
2877   if (terminator == 'L' || terminator == 'U')
2878     terminator = *cur++;
2879   else if (terminator == 'u')
2880     {
2881       terminator = *cur++;
2882       if (terminator == '8')
2883         terminator = *cur++;
2884     }
2885   if (terminator == 'R')
2886     {
2887       lex_raw_string (pfile, token, base);
2888       return;
2889     }
2890   if (terminator == '"')
2891     type = (*base == 'L' ? CPP_WSTRING :
2892             *base == 'U' ? CPP_STRING32 :
2893             *base == 'u' ? (base[1] == '8' ? CPP_UTF8STRING : CPP_STRING16)
2894                          : CPP_STRING);
2895   else if (terminator == '\'')
2896     type = (*base == 'L' ? CPP_WCHAR :
2897             *base == 'U' ? CPP_CHAR32 :
2898             *base == 'u' ? (base[1] == '8' ? CPP_UTF8CHAR : CPP_CHAR16)
2899                          : CPP_CHAR);
2900   else
2901     terminator = '>', type = CPP_HEADER_NAME;
2902
2903   const bool warn_bidi_p = pfile->warn_bidi_p ();
2904   const bool warn_invalid_utf8_p = CPP_OPTION (pfile, cpp_warn_invalid_utf8);
2905   const bool warn_bidi_or_invalid_utf8_p = warn_bidi_p | warn_invalid_utf8_p;
2906   for (;;)
2907     {
2908       cppchar_t c = *cur++;
2909
2910       /* In #include-style directives, terminators are not escapable.  */
2911       if (c == '\\' && !pfile->state.angled_headers && *cur != '\n')
2912         {
2913           if ((cur[0] == 'u' || cur[0] == 'U' || cur[0] == 'N') && warn_bidi_p)
2914             {
2915               location_t loc;
2916               bidi::kind kind;
2917               if (cur[0] == 'N')
2918                 kind = get_bidi_named (pfile, cur + 1, &loc);
2919               else
2920                 kind = get_bidi_ucn (pfile, cur + 1, cur[0] == 'U', &loc);
2921               maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/true, loc);
2922             }
2923           cur++;
2924         }
2925       else if (c == terminator)
2926         {
2927           if (warn_bidi_p)
2928             maybe_warn_bidi_on_close (pfile, cur - 1);
2929           break;
2930         }
2931       else if (c == '\n')
2932         {
2933           cur--;
2934           /* Unmatched quotes always yield undefined behavior, but
2935              greedy lexing means that what appears to be an unterminated
2936              header name may actually be a legitimate sequence of tokens.  */
2937           if (terminator == '>')
2938             {
2939               token->type = CPP_LESS;
2940               return;
2941             }
2942           type = CPP_OTHER;
2943           break;
2944         }
2945       else if (c == '\0')
2946         saw_NUL = true;
2947       else if (__builtin_expect (c >= utf8_continuation, 0)
2948                && warn_bidi_or_invalid_utf8_p)
2949         cur = _cpp_handle_multibyte_utf8 (pfile, c, cur, warn_bidi_p,
2950                                           warn_invalid_utf8_p);
2951     }
2952
2953   if (saw_NUL && !pfile->state.skipping)
2954     cpp_error (pfile, CPP_DL_WARNING,
2955                "null character(s) preserved in literal");
2956
2957   if (type == CPP_OTHER && CPP_OPTION (pfile, lang) != CLK_ASM)
2958     cpp_error (pfile, CPP_DL_PEDWARN, "missing terminating %c character",
2959                (int) terminator);
2960
2961   pfile->buffer->cur = cur;
2962   const uchar *const suffix_begin = cur;
2963
2964   if (CPP_OPTION (pfile, user_literals))
2965     {
2966       if (const auto sr = scan_cur_identifier (pfile))
2967         {
2968           if (maybe_ignore_udl_macro_suffix (pfile, token->src_loc,
2969                                              suffix_begin, sr.node))
2970             pfile->buffer->cur = suffix_begin;
2971           else
2972             {
2973               /* Grab user defined literal suffix.  */
2974               type = cpp_userdef_char_add_type (type);
2975               type = cpp_userdef_string_add_type (type);
2976               create_literal2 (pfile, token, base, suffix_begin - base,
2977                                NODE_NAME (sr.node), NODE_LEN (sr.node), type);
2978               warn_about_normalization (pfile, token, &sr.nst, true);
2979               return;
2980             }
2981         }
2982     }
2983   else if (CPP_OPTION (pfile, cpp_warn_cxx11_compat)
2984            && !pfile->state.skipping)
2985     {
2986       const auto sr = scan_cur_identifier (pfile);
2987       /* Maybe raise a warning, but do not consume the tokens.  */
2988       pfile->buffer->cur = suffix_begin;
2989       if (sr && cpp_macro_p (sr.node))
2990         cpp_warning_with_line (pfile, CPP_W_CXX11_COMPAT,
2991                                token->src_loc, 0, "C++11 requires a space "
2992                                "between string literal and macro");
2993     }
2994
2995   create_literal (pfile, token, base, cur - base, type);
2996 }
2997
2998 /* Return the comment table. The client may not make any assumption
2999    about the ordering of the table.  */
3000 cpp_comment_table *
3001 cpp_get_comments (cpp_reader *pfile)
3002 {
3003   return &pfile->comments;
3004 }
3005
3006 /* Append a comment to the end of the comment table. */
3007 static void
3008 store_comment (cpp_reader *pfile, cpp_token *token)
3009 {
3010   int len;
3011
3012   if (pfile->comments.allocated == 0)
3013     {
3014       pfile->comments.allocated = 256;
3015       pfile->comments.entries = (cpp_comment *) xmalloc
3016         (pfile->comments.allocated * sizeof (cpp_comment));
3017     }
3018
3019   if (pfile->comments.count == pfile->comments.allocated)
3020     {
3021       pfile->comments.allocated *= 2;
3022       pfile->comments.entries = (cpp_comment *) xrealloc
3023         (pfile->comments.entries,
3024          pfile->comments.allocated * sizeof (cpp_comment));
3025     }
3026
3027   len = token->val.str.len;
3028
3029   /* Copy comment. Note, token may not be NULL terminated. */
3030   pfile->comments.entries[pfile->comments.count].comment =
3031     (char *) xmalloc (sizeof (char) * (len + 1));
3032   memcpy (pfile->comments.entries[pfile->comments.count].comment,
3033           token->val.str.text, len);
3034   pfile->comments.entries[pfile->comments.count].comment[len] = '\0';
3035
3036   /* Set source location. */
3037   pfile->comments.entries[pfile->comments.count].sloc = token->src_loc;
3038
3039   /* Increment the count of entries in the comment table. */
3040   pfile->comments.count++;
3041 }
3042
3043 /* The stored comment includes the comment start and any terminator.  */
3044 static void
3045 save_comment (cpp_reader *pfile, cpp_token *token, const unsigned char *from,
3046               cppchar_t type)
3047 {
3048   unsigned char *buffer;
3049   unsigned int len, clen, i;
3050
3051   len = pfile->buffer->cur - from + 1; /* + 1 for the initial '/'.  */
3052
3053   /* C++ comments probably (not definitely) have moved past a new
3054      line, which we don't want to save in the comment.  */
3055   if (is_vspace (pfile->buffer->cur[-1]))
3056     len--;
3057
3058   /* If we are currently in a directive or in argument parsing, then
3059      we need to store all C++ comments as C comments internally, and
3060      so we need to allocate a little extra space in that case.
3061
3062      Note that the only time we encounter a directive here is
3063      when we are saving comments in a "#define".  */
3064   clen = ((pfile->state.in_directive || pfile->state.parsing_args)
3065           && type == '/') ? len + 2 : len;
3066
3067   buffer = _cpp_unaligned_alloc (pfile, clen);
3068
3069   token->type = CPP_COMMENT;
3070   token->val.str.len = clen;
3071   token->val.str.text = buffer;
3072
3073   buffer[0] = '/';
3074   memcpy (buffer + 1, from, len - 1);
3075
3076   /* Finish conversion to a C comment, if necessary.  */
3077   if ((pfile->state.in_directive || pfile->state.parsing_args) && type == '/')
3078     {
3079       buffer[1] = '*';
3080       buffer[clen - 2] = '*';
3081       buffer[clen - 1] = '/';
3082       /* As there can be in a C++ comments illegal sequences for C comments
3083          we need to filter them out.  */
3084       for (i = 2; i < (clen - 2); i++)
3085         if (buffer[i] == '/' && (buffer[i - 1] == '*' || buffer[i + 1] == '*'))
3086           buffer[i] = '|';
3087     }
3088
3089   /* Finally store this comment for use by clients of libcpp. */
3090   store_comment (pfile, token);
3091 }
3092
/* Returns true if comment at COMMENT_START is a recognized FALLTHROUGH
   comment.

   COMMENT_START addresses the character that follows the comment's
   initial '/': a '*' there selects the block-comment terminator check
   at the end of this function, anything else the line-comment one.
   Matching starts one character later, and every length check is made
   against pfile->buffer->cur.

   The accepted spellings depend on the cpp_warn_implicit_fallthrough
   option: 1 accepts every comment, 2 searches the comment for a
   case-insensitive pattern, 3 and 4 require the whole comment to have
   one of a fixed set of forms, and any other value accepts nothing.  */

static bool
fallthrough_comment_p (cpp_reader *pfile, const unsigned char *comment_start)
{
  const unsigned char *from = comment_start + 1;

  switch (CPP_OPTION (pfile, cpp_warn_implicit_fallthrough))
    {
      /* For both -Wimplicit-fallthrough=0 and -Wimplicit-fallthrough=5 we
         don't recognize any comments.  The latter only checks attributes,
         the former doesn't warn.  */
    case 0:
    default:
      return false;
      /* -Wimplicit-fallthrough=1 considers any comment, no matter what
         content it has.  */
    case 1:
      return true;
    case 2:
      /* -Wimplicit-fallthrough=2 looks for (case insensitive)
         .*falls?[ \t-]*thr(u|ough).* regex.  */
      for (; (size_t) (pfile->buffer->cur - from) >= sizeof "fallthru" - 1;
           from++)
        {
          /* Is there anything like strpbrk with upper boundary, or
             memchr looking for 2 characters rather than just one?  */
          if (from[0] != 'f' && from[0] != 'F')
            continue;
          if (from[1] != 'a' && from[1] != 'A')
            continue;
          if (from[2] != 'l' && from[2] != 'L')
            continue;
          if (from[3] != 'l' && from[3] != 'L')
            continue;
          /* FROM stays advanced if one of the tests below fails, so the
             search resumes after the text consumed here.  */
          from += sizeof "fall" - 1;
          if (from[0] == 's' || from[0] == 'S')
            from++;
          while (*from == ' ' || *from == '\t' || *from == '-')
            from++;
          if (from[0] != 't' && from[0] != 'T')
            continue;
          if (from[1] != 'h' && from[1] != 'H')
            continue;
          if (from[2] != 'r' && from[2] != 'R')
            continue;
          /* "thru" is enough; otherwise require the full "through".  */
          if (from[3] == 'u' || from[3] == 'U')
            return true;
          if (from[3] != 'o' && from[3] != 'O')
            continue;
          if (from[4] != 'u' && from[4] != 'U')
            continue;
          if (from[5] != 'g' && from[5] != 'G')
            continue;
          if (from[6] != 'h' && from[6] != 'H')
            continue;
          return true;
        }
      return false;
      /* Levels 3 and 4 are handled by the whole-comment matching
         below.  */
    case 3:
    case 4:
      break;
    }

  /* Whole comment contents:
     -fallthrough
     @fallthrough@
   */
  if (*from == '-' || *from == '@')
    {
      size_t len = sizeof "fallthrough" - 1;
      if ((size_t) (pfile->buffer->cur - from - 1) < len)
        return false;
      if (memcmp (from + 1, "fallthrough", len))
        return false;
      if (*from == '@')
        {
          if (from[len + 1] != '@')
            return false;
          /* Account for the closing '@'.  */
          len++;
        }
      from += 1 + len;
    }
  /* Whole comment contents (regex):
     lint -fallthrough[ \t]*
   */
  else if (*from == 'l')
    {
      size_t len = sizeof "int -fallthrough" - 1;
      if ((size_t) (pfile->buffer->cur - from - 1) < len)
        return false;
      if (memcmp (from + 1, "int -fallthrough", len))
        return false;
      from += 1 + len;
      while (*from == ' ' || *from == '\t')
        from++;
    }
  /* Whole comment contents (regex):
     [ \t]*FALLTHR(U|OUGH)[ \t]*
   */
  else if (CPP_OPTION (pfile, cpp_warn_implicit_fallthrough) == 4)
    {
      while (*from == ' ' || *from == '\t')
        from++;
      if ((size_t) (pfile->buffer->cur - from)  < sizeof "FALLTHRU" - 1)
        return false;
      if (memcmp (from, "FALLTHR", sizeof "FALLTHR" - 1))
        return false;
      from += sizeof "FALLTHR" - 1;
      if (*from == 'U')
        from++;
      else if ((size_t) (pfile->buffer->cur - from)  < sizeof "OUGH" - 1)
        return false;
      else if (memcmp (from, "OUGH", sizeof "OUGH" - 1))
        return false;
      else
        from += sizeof "OUGH" - 1;
      while (*from == ' ' || *from == '\t')
        from++;
    }
  /* Whole comment contents (regex):
     [ \t.!]*(ELSE,? |INTENTIONAL(LY)? )?FALL(S | |-)?THR(OUGH|U)[ \t.!]*(-[^\n\r]*)?
     [ \t.!]*(Else,? |Intentional(ly)? )?Fall((s | |-)[Tt]|t)hr(ough|u)[ \t.!]*(-[^\n\r]*)?
     [ \t.!]*([Ee]lse,? |[Ii]ntentional(ly)? )?fall(s | |-)?thr(ough|u)[ \t.!]*(-[^\n\r]*)?
   */
  else
    {
      /* Only level 3 gets here: level 4 was taken by the branch
         above.  F tracks the first letter of the word being matched and
         ALL_UPPER records that an all-capitals spelling has been seen,
         so that the three casings listed above are not mixed.  */
      while (*from == ' ' || *from == '\t' || *from == '.' || *from == '!')
        from++;
      unsigned char f = *from;
      bool all_upper = false;
      if (f == 'E' || f == 'e')
        {
          if ((size_t) (pfile->buffer->cur - from)
              < sizeof "else fallthru" - 1)
            return false;
          if (f == 'E' && memcmp (from + 1, "LSE", sizeof "LSE" - 1) == 0)
            all_upper = true;
          else if (memcmp (from + 1, "lse", sizeof "lse" - 1))
            return false;
          from += sizeof "else" - 1;
          if (*from == ',')
            from++;
          if (*from != ' ')
            return false;
          from++;
          if (all_upper && *from == 'f')
            return false;
          if (f == 'e' && *from == 'F')
            return false;
          f = *from;
        }
      else if (f == 'I' || f == 'i')
        {
          if ((size_t) (pfile->buffer->cur - from)
              < sizeof "intentional fallthru" - 1)
            return false;
          if (f == 'I' && memcmp (from + 1, "NTENTIONAL",
                                  sizeof "NTENTIONAL" - 1) == 0)
            all_upper = true;
          else if (memcmp (from + 1, "ntentional",
                           sizeof "ntentional" - 1))
            return false;
          from += sizeof "intentional" - 1;
          if (*from == ' ')
            {
              from++;
              if (all_upper && *from == 'f')
                return false;
            }
          else if (all_upper)
            {
              if (memcmp (from, "LY F", sizeof "LY F" - 1))
                return false;
              from += sizeof "LY " - 1;
            }
          else
            {
              if (memcmp (from, "ly ", sizeof "ly " - 1))
                return false;
              from += sizeof "ly " - 1;
            }
          if (f == 'i' && *from == 'F')
            return false;
          f = *from;
        }
      /* Now the "fall" word itself.  */
      if (f != 'F' && f != 'f')
        return false;
      if ((size_t) (pfile->buffer->cur - from) < sizeof "fallthru" - 1)
        return false;
      if (f == 'F' && memcmp (from + 1, "ALL", sizeof "ALL" - 1) == 0)
        all_upper = true;
      else if (all_upper)
        return false;
      else if (memcmp (from + 1, "all", sizeof "all" - 1))
        return false;
      from += sizeof "fall" - 1;
      /* Optional "s ", " " or "-" between "fall" and "thr...".  */
      if (*from == (all_upper ? 'S' : 's') && from[1] == ' ')
        from += 2;
      else if (*from == ' ' || *from == '-')
        from++;
      else if (*from != (all_upper ? 'T' : 't'))
        return false;
      if ((f == 'f' || *from != 'T') && (all_upper || *from != 't'))
        return false;
      if ((size_t) (pfile->buffer->cur - from) < sizeof "thru" - 1)
        return false;
      if (memcmp (from + 1, all_upper ? "HRU" : "hru", sizeof "hru" - 1))
        {
          if ((size_t) (pfile->buffer->cur - from) < sizeof "through" - 1)
            return false;
          if (memcmp (from + 1, all_upper ? "HROUGH" : "hrough",
                      sizeof "hrough" - 1))
            return false;
          from += sizeof "through" - 1;
        }
      else
        from += sizeof "thru" - 1;
      while (*from == ' ' || *from == '\t' || *from == '.' || *from == '!')
        from++;
      /* A '-' introduces free text.  Skip it up to the end of the line,
         a NUL, or (in a block comment) the closing delimiter.  */
      if (*from == '-')
        {
          from++;
          if (*comment_start == '*')
            {
              do
                {
                  while (*from && *from != '*'
                         && *from != '\n' && *from != '\r')
                    from++;
                  if (*from != '*' || from[1] == '/')
                    break;
                  from++;
                }
              while (1);
            }
          else
            while (*from && *from != '\n' && *from != '\r')
              from++;
        }
    }
  /* Whichever form matched, only the comment terminator may follow.  */
  /* C block comment.  */
  if (*comment_start == '*')
    {
      if (*from != '*' || from[1] != '/')
        return false;
    }
  /* C++ line comment.  */
  else if (*from != '\n')
    return false;

  return true;
}
3347
3348 /* Allocate COUNT tokens for RUN.  */
3349 void
3350 _cpp_init_tokenrun (tokenrun *run, unsigned int count)
3351 {
3352   run->base = XNEWVEC (cpp_token, count);
3353   run->limit = run->base + count;
3354   run->next = NULL;
3355 }
3356
3357 /* Returns the next tokenrun, or creates one if there is none.  */
3358 static tokenrun *
3359 next_tokenrun (tokenrun *run)
3360 {
3361   if (run->next == NULL)
3362     {
3363       run->next = XNEW (tokenrun);
3364       run->next->prev = run;
3365       _cpp_init_tokenrun (run->next, 250);
3366     }
3367
3368   return run->next;
3369 }
3370
3371 /* Return the number of not yet processed token in a given
3372    context.  */
3373 int
3374 _cpp_remaining_tokens_num_in_context (cpp_context *context)
3375 {
3376   if (context->tokens_kind == TOKENS_KIND_DIRECT)
3377     return (LAST (context).token - FIRST (context).token);
3378   else if (context->tokens_kind == TOKENS_KIND_INDIRECT
3379            || context->tokens_kind == TOKENS_KIND_EXTENDED)
3380     return (LAST (context).ptoken - FIRST (context).ptoken);
3381   else
3382       abort ();
3383 }
3384
3385 /* Returns the token present at index INDEX in a given context.  If
3386    INDEX is zero, the next token to be processed is returned.  */
3387 static const cpp_token*
3388 _cpp_token_from_context_at (cpp_context *context, int index)
3389 {
3390   if (context->tokens_kind == TOKENS_KIND_DIRECT)
3391     return &(FIRST (context).token[index]);
3392   else if (context->tokens_kind == TOKENS_KIND_INDIRECT
3393            || context->tokens_kind == TOKENS_KIND_EXTENDED)
3394     return FIRST (context).ptoken[index];
3395  else
3396    abort ();
3397 }
3398
3399 /* Look ahead in the input stream.  */
3400 const cpp_token *
3401 cpp_peek_token (cpp_reader *pfile, int index)
3402 {
3403   cpp_context *context = pfile->context;
3404   const cpp_token *peektok;
3405   int count;
3406
3407   /* First, scan through any pending cpp_context objects.  */
3408   while (context->prev)
3409     {
3410       ptrdiff_t sz = _cpp_remaining_tokens_num_in_context (context);
3411
3412       if (index < (int) sz)
3413         return _cpp_token_from_context_at (context, index);
3414       index -= (int) sz;
3415       context = context->prev;
3416     }
3417
3418   /* We will have to read some new tokens after all (and do so
3419      without invalidating preceding tokens).  */
3420   count = index;
3421   pfile->keep_tokens++;
3422
3423   /* For peeked tokens temporarily disable line_change reporting,
3424      until the tokens are parsed for real.  */
3425   void (*line_change) (cpp_reader *, const cpp_token *, int)
3426     = pfile->cb.line_change;
3427   pfile->cb.line_change = NULL;
3428
3429   do
3430     {
3431       peektok = _cpp_lex_token (pfile);
3432       if (peektok->type == CPP_EOF)
3433         {
3434           index--;
3435           break;
3436         }
3437       else if (peektok->type == CPP_PRAGMA)
3438         {
3439           /* Don't peek past a pragma.  */
3440           if (peektok == &pfile->directive_result)
3441             /* Save the pragma in the buffer.  */
3442             *pfile->cur_token++ = *peektok;
3443           index--;
3444           break;
3445         }
3446     }
3447   while (index--);
3448
3449   _cpp_backup_tokens_direct (pfile, count - index);
3450   pfile->keep_tokens--;
3451   pfile->cb.line_change = line_change;
3452
3453   return peektok;
3454 }
3455
3456 /* Allocate a single token that is invalidated at the same time as the
3457    rest of the tokens on the line.  Has its line and col set to the
3458    same as the last lexed token, so that diagnostics appear in the
3459    right place.

        PFILE is the reader whose token run supplies the slot.  The slot
        handed out is the one at pfile->cur_token, which is then advanced
        past it.  When lookahead tokens are pending (pfile->lookaheads is
        non-zero) they are first moved one slot later so that they are
        not overwritten.  Returns the new token; the only field set here
        is src_loc, copied from the token just before the original
        cur_token.  */
3460 cpp_token *
3461 _cpp_temp_token (cpp_reader *pfile)
3462 {
3463   cpp_token *old, *result;
       /* SZ is the number of token slots from cur_token up to the limit
          of the current run; LA is the number of pending lookaheads.  */
3464   ptrdiff_t sz = pfile->cur_run->limit - pfile->cur_token;
3465   ptrdiff_t la = (ptrdiff_t) pfile->lookaheads;
3466
       /* The token immediately before cur_token supplies the location.  */
3467   old = pfile->cur_token - 1;
3468   /* Any pre-existing lookaheads must not be clobbered.
          NOTE(review): the shuffling below assumes the lookaheads are
          stored contiguously from cur_token and continue at the base of
          the next run -- confirm against the code that backs up tokens.  */
3469   if (la)
3470     {
3471       if (sz <= la)
3472         {
               /* The lookaheads use every remaining slot of this run, so
                  the next run has to take one more token: slide the
                  (la - sz) tokens at its base up by one (if any), then
                  copy the last slot of the current run into its base.  */
3473           tokenrun *next = next_tokenrun (pfile->cur_run);
3474
3475           if (sz < la)
3476             memmove (next->base + 1, next->base,
3477                      (la - sz) * sizeof (cpp_token));
3478
3479           next->base[0] = pfile->cur_run->limit[-1];
3480         }
3481
           /* Slide the lookaheads that remain in this run up by one,
              which frees the slot at cur_token.  At most sz - 1 tokens
              are moved, so the copy ends at the limit of the run.  */
3482       if (sz > 1)
3483         memmove (pfile->cur_token + 1, pfile->cur_token,
3484                  MIN (la, sz - 1) * sizeof (cpp_token));
3485     }
3486
       /* No slot left in this run: hand out the first slot of the next
          run instead.  */
3487   if (!sz && pfile->cur_token == pfile->cur_run->limit)
3488     {
3489       pfile->cur_run = next_tokenrun (pfile->cur_run);
3490       pfile->cur_token = pfile->cur_run->base;
3491     }
3492
3493   result = pfile->cur_token++;
3494   result->src_loc = old->src_loc;
3495   return result;
3496 }
3497
3498 /* We're at the beginning of a logical line (so not in
3499   directives-mode) and RESULT is a CPP_NAME with NODE_MODULE set.  See
3500   if we should enter deferred_pragma mode to tokenize the rest of the
3501   line as a module control-line.

       Up to two further tokens are peeked with _cpp_lex_direct.  If the
       line starts with [export] {module|import|__import} followed by an
       acceptable token, the leading tokens are flagged NO_EXPAND, their
       nodes are replaced by the [1] entries of n_modules, and PFILE is
       left in deferred-pragma mode.  Otherwise in_deferred_pragma,
       save_comments and angled_headers are put back to their values on
       entry.  In both cases the peeked tokens are pushed back, except
       that a peeked CPP_PRAGMA_EOL is dropped.  */
3502
3503 static void
3504 cpp_maybe_module_directive (cpp_reader *pfile, cpp_token *result)
3505 {
3506   unsigned backup = 0; /* Tokens we peeked.  */
3507   cpp_hashnode *node = result->val.node.node;
3508   cpp_token *peek = result;
       /* The module/import keyword token: RESULT itself, or the token
          that follows a leading `export'.  */
3509   cpp_token *keyword = peek;
3510   cpp_hashnode *(&n_modules)[spec_nodes::M_HWM][2] = pfile->spec_nodes.n_modules;
       /* Stays zero for `module'; non-zero for `import' and `__import'.
          Its value is later stored in state.directive_file_token.  */
3511   int header_count = 0;
3512
3513   /* Make sure the incoming state is as we expect it.  This way we
3514      can restore it using constants.  */
3515   gcc_checking_assert (!pfile->state.in_deferred_pragma
3516                        && !pfile->state.skipping
3517                        && !pfile->state.parsing_args
3518                        && !pfile->state.angled_headers
3519                        && (pfile->state.save_comments
3520                            == !CPP_OPTION (pfile, discard_comments)));
3521
3522   /* Enter directives mode sufficiently for peeking.  We don't have
3523      to actually set in_directive.  */
3524   pfile->state.in_deferred_pragma = true;
3525
3526   /* These two fields are needed to process tokenization in deferred
3527      pragma mode.  They are not used outside deferred pragma mode or
3528      directives mode.  */
3529   pfile->state.pragma_allow_expansion = true;
3530   pfile->directive_line = result->src_loc;
3531
3532   /* Saving comments is incompatible with directives mode.  */
3533   pfile->state.save_comments = 0;
3534
       /* A leading `export' must be followed by a NODE_MODULE name, which
          then becomes the keyword to classify.  */
3535   if (node == n_modules[spec_nodes::M_EXPORT][0])
3536     {
3537       peek = _cpp_lex_direct (pfile);
3538       keyword = peek;
3539       backup++;
3540       if (keyword->type != CPP_NAME)
3541         goto not_module;
3542       node = keyword->val.node.node;
3543       if (!(node->flags & NODE_MODULE))
3544         goto not_module;
3545     }
3546
       /* Classify the keyword.  NOTE(review): what the "+ 2" and "+ 16"
          components of header_count mean is decided by the consumer of
          state.directive_file_token, which is not visible here; 16 is
          added for __import always, and for import only when the
          `preprocessed' option is set.  */
3547   if (node == n_modules[spec_nodes::M__IMPORT][0])
3548     /* __import  */
3549     header_count = backup + 2 + 16;
3550   else if (node == n_modules[spec_nodes::M_IMPORT][0])
3551     /* import  */
3552     header_count = backup + 2 + (CPP_OPTION (pfile, preprocessed) ? 16 : 0);
3553   else if (node == n_modules[spec_nodes::M_MODULE][0])
3554     ; /* module  */
3555   else
3556     goto not_module;
3557
3558   /* We've seen [export] {module|import|__import}.  Check the next token.  */
3559   if (header_count)
3560     /* After '{,__}import' a header name may appear.  */
3561     pfile->state.angled_headers = true;
3562   peek = _cpp_lex_direct (pfile);
3563   backup++;
3564
3565   /* ... import followed by identifier, ':', '<' or
3566      header-name preprocessing tokens, or module
3567      followed by cpp-identifier, ':' or ';' preprocessing
3568      tokens.  C++ keywords are not yet relevant.  */
3569   if (peek->type == CPP_NAME
3570       || peek->type == CPP_COLON
3571       ||  (header_count
3572            ? (peek->type == CPP_LESS
3573               || (peek->type == CPP_STRING && peek->val.str.text[0] != 'R')
3574               || peek->type == CPP_HEADER_NAME)
3575            : peek->type == CPP_SEMICOLON))
3576     {
           /* Expansion stays enabled unless the input is already
              preprocessed, in which case it is blocked via
              prevent_expansion.  */
3577       pfile->state.pragma_allow_expansion = !CPP_OPTION (pfile, preprocessed);
3578       if (!pfile->state.pragma_allow_expansion)
3579         pfile->state.prevent_expansion++;
3580
           /* Only `module' (header_count == 0) is rejected in an included
              file.  */
3581       if (!header_count && linemap_included_from
3582           (LINEMAPS_LAST_ORDINARY_MAP (pfile->line_table)))
3583         cpp_error_with_line (pfile, CPP_DL_ERROR, keyword->src_loc, 0,
3584                              "module control-line cannot be in included file");
3585
3586       /* The first one or two tokens cannot be macro names.  IX runs
              from backup - 1 down to 0: index 0 is RESULT, and index 1
              (only present after `export') is KEYWORD.  */
3587       for (int ix = backup; ix--;)
3588         {
3589           cpp_token *tok = ix ? keyword : result;
               /* Shadows the function-level NODE.  */
3590           cpp_hashnode *node = tok->val.node.node;
3591
3592           /* Don't attempt to expand the token.  */
3593           tok->flags |= NO_EXPAND;
3594           if (_cpp_defined_macro_p (node)
3595               && _cpp_maybe_notify_macro_use (pfile, node, tok->src_loc)
3596               && !cpp_fun_like_macro_p (node))
3597             cpp_error_with_line (pfile, CPP_DL_ERROR, tok->src_loc, 0,
3598                                  "module control-line \"%s\" cannot be"
3599                                  " an object-like macro",
3600                                  NODE_NAME (node));
3601         }
3602
3603       /* Map to underbar variants.  Both import and __import map to the
              M_IMPORT entry.  */
3604       keyword->val.node.node = n_modules[header_count
3605                                          ? spec_nodes::M_IMPORT
3606                                          : spec_nodes::M_MODULE][1];
           /* backup != 1 here means a leading `export' was consumed, so
              RESULT is that `export' token.  */
3607       if (backup != 1)
3608         result->val.node.node = n_modules[spec_nodes::M_EXPORT][1];
3609
3610       /* Maybe tell the tokenizer we expect a header-name down the
3611          road.  */
3612       pfile->state.directive_file_token = header_count;
3613     }
3614   else
3615     {
3616     not_module:
3617       /* Drop out of directive mode.  */
3618       /* We asserted save_comments had this value upon entry.  */
3619       pfile->state.save_comments
3620         = !CPP_OPTION (pfile, discard_comments);
3621       pfile->state.in_deferred_pragma = false;
3622       /* Do not let this remain on.  */
3623       pfile->state.angled_headers = false;
3624     }
3625
3626   /* In either case we want to backup the peeked tokens.  */
3627   if (backup)
3628     {
3629       /* If we saw EOL, we should drop it, because this isn't a module
3630          control-line after all.  When the EOL is the only peeked
              token nothing is pushed back at all.  */
3631       bool eol = peek->type == CPP_PRAGMA_EOL;
3632       if (!eol || backup > 1)
3633         {
3634           /* Put the peeked tokens back.  */
3635           _cpp_backup_tokens_direct (pfile, backup);
3636           /* But if the last one was an EOL, forget it.  */
3637           if (eol)
3638             pfile->lookaheads--;
3639         }
3640     }
3641 }
3642
3643 /* Lex a token into RESULT (external interface).  Takes care of issues
3644    like directive handling, token lookahead, multiple include
3645    optimization and skipping.

        Returns the token obtained, which is either a slot of the current
        token run or &pfile->directive_result.  Outside directives and
        deferred pragmas, tokens are discarded while pfile->state.skipping
        is set, until a CPP_EOF is seen.  */
3646 const cpp_token *
3647 _cpp_lex_token (cpp_reader *pfile)
3648 {
3649   cpp_token *result;
3650
3651   for (;;)
3652     {
           /* Current run exhausted: continue in the next one.  */
3653       if (pfile->cur_token == pfile->cur_run->limit)
3654         {
3655           pfile->cur_run = next_tokenrun (pfile->cur_run);
3656           pfile->cur_token = pfile->cur_run->base;
3657         }
3658       /* We assume that the current token is somewhere in the current
3659          run.  */
3660       if (pfile->cur_token < pfile->cur_run->base
3661           || pfile->cur_token >= pfile->cur_run->limit)
3662         abort ();
3663
           /* With lookaheads pending, reuse the token already sitting at
              cur_token instead of lexing a new one.  */
3664       if (pfile->lookaheads)
3665         {
3666           pfile->lookaheads--;
3667           result = pfile->cur_token++;
3668         }
3669       else
3670         result = _cpp_lex_direct (pfile);
3671
3672       if (result->flags & BOL)
3673         {
3674           /* Is this a directive?  If _cpp_handle_directive returns
3675              false, it is an assembler #.  */
3676           if (result->type == CPP_HASH
3677               /* 6.10.3 p 11: Directives in a list of macro arguments
3678                  gives undefined behavior.  This implementation
3679                  handles the directive as normal.  */
3680               && pfile->state.parsing_args != 1)
3681             {
3682               if (_cpp_handle_directive (pfile, result->flags & PREV_WHITE))
3683                 {
                       /* A CPP_PADDING directive_result is not returned;
                          go round the loop again.  Anything else in
                          directive_result becomes the token.  */
3684                   if (pfile->directive_result.type == CPP_PADDING)
3685                     continue;
3686                   result = &pfile->directive_result;
3687                 }
3688             }
3689           else if (pfile->state.in_deferred_pragma)
3690             result = &pfile->directive_result;
3691           else if (result->type == CPP_NAME
3692                    && (result->val.node.node->flags & NODE_MODULE)
3693                    && !pfile->state.skipping
3694                    /* Unlike regular directives, we do not deal with
3695                       tokenizing module directives as macro arguments.
3696                       That's not permitted.  */
3697                    && !pfile->state.parsing_args)
3698             {
3699               /* P1857.  Before macro expansion, At start of logical
3700                  line ... */
3701               /* We don't have to consider lookaheads at this point.  */
3702               gcc_checking_assert (!pfile->lookaheads);
3703
3704               cpp_maybe_module_directive (pfile, result);
3705             }
3706
               /* Report the first token of the line to the client's
                  line_change callback, if one is installed and we are not
                  skipping.  */
3707           if (pfile->cb.line_change && !pfile->state.skipping)
3708             pfile->cb.line_change (pfile, result, pfile->state.parsing_args);
3709         }
3710
3711       /* We don't skip tokens in directives.  */
3712       if (pfile->state.in_directive || pfile->state.in_deferred_pragma)
3713         break;
3714
3715       /* Outside a directive, invalidate controlling macros.  At file
3716          EOF, _cpp_lex_direct takes care of popping the buffer, so we never
3717          get here and MI optimization works.  */
3718       pfile->mi_valid = false;
3719
           /* While skipping, only CPP_EOF is handed back; every other
              token is dropped by looping.  */
3720       if (!pfile->state.skipping || result->type == CPP_EOF)
3721         break;
3722     }
3723
3724   return result;
3725 }
3726
3727 /* Returns true if a fresh line has been loaded, or if the current
        buffer does not need one (buffer->need_line is clear).

        LEXING_RAW_STRING selects the behavior inside a directive: when
        false, no line is fetched while pfile->state.in_directive is set;
        when true, a further line of the current buffer may be fetched,
        but the function returns false rather than go past the end of
        that buffer.  False is also returned when the buffer is exhausted
        while macro arguments are being parsed, and when the exhausted
        buffer has no predecessor or is marked return_at_eof, in which
        case the line number is incremented first.  */
3728 template <bool lexing_raw_string>
3729 static bool
3730 get_fresh_line_impl (cpp_reader *pfile)
3731 {
3732   /* We can't get a new line until we leave the current directive, unless we
3733      are lexing a raw string, in which case it will be OK as long as we don't
3734      pop the current buffer.  */
3735   if (!lexing_raw_string && pfile->state.in_directive)
3736     return false;
3737
       /* Each iteration looks at pfile->buffer afresh; the loop only
          repeats after _cpp_pop_buffer below.  */
3738   for (;;)
3739     {
3740       cpp_buffer *buffer = pfile->buffer;
3741
3742       if (!buffer->need_line)
3743         return true;
3744
           /* There is unread text left in this buffer: prepare its next
              line.  */
3745       if (buffer->next_line < buffer->rlimit)
3746         {
3747           _cpp_clean_line (pfile);
3748           return true;
3749         }
3750
3751       /* We can't change buffers until we leave the current directive.  */
3752       if (lexing_raw_string && pfile->state.in_directive)
3753         return false;
3754
3755       /* First, get out of parsing arguments state.  */
3756       if (pfile->state.parsing_args)
3757         return false;
3758
3759       /* End of buffer.  Non-empty files should end in a newline.  */
3760       if (buffer->buf != buffer->rlimit
3761           && buffer->next_line > buffer->rlimit
3762           && !buffer->from_stage3)
3763         {
3764           /* Clip to buffer size.  */
3765           buffer->next_line = buffer->rlimit;
3766         }
3767
3768       if (buffer->prev && !buffer->return_at_eof)
3769         _cpp_pop_buffer (pfile);
3770       else
3771         {
3772           /* End of translation.  Do not pop the buffer yet. Increment
3773              line number so that the EOF token is on a line of its own
3774              (_cpp_lex_direct doesn't increment in that case, because
3775              it's hard for it to distinguish this special case). */
3776           CPP_INCREMENT_LINE (pfile, 0);
3777           return false;
3778         }
3779     }
3780 }
3781
     /* Non-template entry point for fetching a fresh line: forwards to
        get_fresh_line_impl with LEXING_RAW_STRING false and returns its
        result.  */
3782 bool
3783 _cpp_get_fresh_line (cpp_reader *pfile)
3784 {
3785   return get_fresh_line_impl<false> (pfile);
3786 }
3787
3788
     /* Helper for two-character operators.  Sets result->type to
        ELSE_TYPE; then, if the next unread character (*buffer->cur) is
        CHAR, consumes it and sets result->type to THEN_TYPE instead.
        Uses the variables `result' and `buffer' of the enclosing
        function.  CHAR is evaluated once.  */
3789 #define IF_NEXT_IS(CHAR, THEN_TYPE, ELSE_TYPE)          \
3790   do                                                    \
3791     {                                                   \
3792       result->type = ELSE_TYPE;                         \
3793       if (*buffer->cur == CHAR)                         \
3794         buffer->cur++, result->type = THEN_TYPE;        \
3795     }                                                   \
3796   while (0)
3797
3798 /* Lex a token into pfile->cur_token, which is also incremented, to
3799    get diagnostics pointing to the correct location.
3800
3801    Does not handle issues such as token lookahead, multiple-include
3802    optimization, directives, skipping etc.  This function is only
3803    suitable for use by _cpp_lex_token, and in special cases like
3804    lex_expansion_token which doesn't care for any of these issues.
3805
3806    When meeting a newline, returns CPP_EOF if parsing a directive,
3807    otherwise returns to the start of the token buffer if permissible.
3808    Returns a pointer to the lexed token.  */
3809 cpp_token *
3810 _cpp_lex_direct (cpp_reader *pfile)
3811 {
3812   cppchar_t c = 0;
3813   cpp_buffer *buffer;
3814   const unsigned char *comment_start;
3815   bool fallthrough_comment = false;
3816   cpp_token *result = pfile->cur_token++;
3817
3818  fresh_line:
3819   result->flags = 0;
3820   buffer = pfile->buffer;
3821   if (buffer->need_line)
3822     {
3823       if (pfile->state.in_deferred_pragma)
3824         {
3825           /* This can happen in cases like:
3826              #define loop(x) whatever
3827              #pragma omp loop
3828              where when trying to expand loop we need to peek
3829              next token after loop, but aren't still in_deferred_pragma
3830              mode but are in in_directive mode, so buffer->need_line
3831              is set, a CPP_EOF is peeked.  */
3832           result->type = CPP_PRAGMA_EOL;
3833           pfile->state.in_deferred_pragma = false;
3834           if (!pfile->state.pragma_allow_expansion)
3835             pfile->state.prevent_expansion--;
3836           result->src_loc = pfile->line_table->highest_line;
3837           return result;
3838         }
3839       if (!_cpp_get_fresh_line (pfile))
3840         {
3841           result->type = CPP_EOF;
3842           /* Not a real EOF in a directive or arg parsing -- we refuse
3843              to advance to the next file now, and will once we're out
3844              of those modes.  */
3845           if (!pfile->state.in_directive && !pfile->state.parsing_args)
3846             {
3847               /* Tell the compiler the line number of the EOF token.  */
3848               result->src_loc = pfile->line_table->highest_line;
3849               result->flags = BOL;
3850               /* Now pop the buffer that _cpp_get_fresh_line did not.  */
3851               _cpp_pop_buffer (pfile);
3852             }
3853           else if (c == 0)
3854             result->src_loc = pfile->line_table->highest_line;
3855           return result;
3856         }
3857       if (buffer != pfile->buffer)
3858         fallthrough_comment = false;
3859       if (!pfile->keep_tokens)
3860         {
3861           pfile->cur_run = &pfile->base_run;
3862           result = pfile->base_run.base;
3863           pfile->cur_token = result + 1;
3864         }
3865       result->flags = BOL;
3866       if (pfile->state.parsing_args == 2)
3867         result->flags |= PREV_WHITE;
3868     }
3869   buffer = pfile->buffer;
3870  update_tokens_line:
3871   result->src_loc = pfile->line_table->highest_line;
3872
3873  skipped_white:
3874   if (buffer->cur >= buffer->notes[buffer->cur_note].pos
3875       && !pfile->overlaid_buffer)
3876     {
3877       _cpp_process_line_notes (pfile, false);
3878       result->src_loc = pfile->line_table->highest_line;
3879     }
3880   c = *buffer->cur++;
3881
3882   if (pfile->forced_token_location)
3883     result->src_loc = pfile->forced_token_location;
3884   else
3885     result->src_loc = linemap_position_for_column (pfile->line_table,
3886                                           CPP_BUF_COLUMN (buffer, buffer->cur));
3887
3888   switch (c)
3889     {
3890     case ' ': case '\t': case '\f': case '\v': case '\0':
3891       result->flags |= PREV_WHITE;
3892       skip_whitespace (pfile, c);
3893       goto skipped_white;
3894
3895     case '\n':
3896       /* Increment the line, unless this is the last line ...  */
3897       if (buffer->cur < buffer->rlimit
3898           /* ... or this is a #include, (where _cpp_stack_file needs to
3899              unwind by one line) ...  */
3900           || (pfile->state.in_directive > 1
3901               /* ... except traditional-cpp increments this elsewhere.  */
3902               && !CPP_OPTION (pfile, traditional)))
3903         CPP_INCREMENT_LINE (pfile, 0);
3904       buffer->need_line = true;
3905       if (pfile->state.in_deferred_pragma)
3906         {
3907           /* Produce the PRAGMA_EOL on this line.  File reading
3908              ensures there is always a \n at end of the buffer, thus
3909              in a deferred pragma we always see CPP_PRAGMA_EOL before
3910              any CPP_EOF.  */
3911           result->type = CPP_PRAGMA_EOL;
3912           result->flags &= ~PREV_WHITE;
3913           pfile->state.in_deferred_pragma = false;
3914           if (!pfile->state.pragma_allow_expansion)
3915             pfile->state.prevent_expansion--;
3916           return result;
3917         }
3918       goto fresh_line;
3919
3920     case '0': case '1': case '2': case '3': case '4':
3921     case '5': case '6': case '7': case '8': case '9':
3922       {
3923         struct normalize_state nst = INITIAL_NORMALIZE_STATE;
3924         result->type = CPP_NUMBER;
3925         lex_number (pfile, &result->val.str, &nst);
3926         warn_about_normalization (pfile, result, &nst, false);
3927         break;
3928       }
3929
3930     case 'L':
3931     case 'u':
3932     case 'U':
3933     case 'R':
3934       /* 'L', 'u', 'U', 'u8' or 'R' may introduce wide characters,
3935          wide strings or raw strings.  */
3936       if (c == 'L' || CPP_OPTION (pfile, rliterals)
3937           || (c != 'R' && CPP_OPTION (pfile, uliterals)))
3938         {
3939           if ((*buffer->cur == '\'' && c != 'R')
3940               || *buffer->cur == '"'
3941               || (*buffer->cur == 'R'
3942                   && c != 'R'
3943                   && buffer->cur[1] == '"'
3944                   && CPP_OPTION (pfile, rliterals))
3945               || (*buffer->cur == '8'
3946                   && c == 'u'
3947                   && ((buffer->cur[1] == '"' || (buffer->cur[1] == '\''
3948                                 && CPP_OPTION (pfile, utf8_char_literals)))
3949                       || (buffer->cur[1] == 'R' && buffer->cur[2] == '"'
3950                           && CPP_OPTION (pfile, rliterals)))))
3951             {
3952               lex_string (pfile, result, buffer->cur - 1);
3953               break;
3954             }
3955         }
3956       /* Fall through.  */
3957
3958     case '_':
3959     case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
3960     case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
3961     case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
3962     case 's': case 't':           case 'v': case 'w': case 'x':
3963     case 'y': case 'z':
3964     case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
3965     case 'G': case 'H': case 'I': case 'J': case 'K':
3966     case 'M': case 'N': case 'O': case 'P': case 'Q':
3967     case 'S': case 'T':           case 'V': case 'W': case 'X':
3968     case 'Y': case 'Z':
3969       result->type = CPP_NAME;
3970       {
3971         struct normalize_state nst = INITIAL_NORMALIZE_STATE;
3972         const auto node = lex_identifier (pfile, buffer->cur - 1, false, &nst,
3973                                           &result->val.node.spelling);
3974         result->val.node.node = node;
3975         identifier_diagnostics_on_lex (pfile, node);
3976         warn_about_normalization (pfile, result, &nst, true);
3977       }
3978
3979       /* Convert named operators to their proper types.  */
3980       if (result->val.node.node->flags & NODE_OPERATOR)
3981         {
3982           result->flags |= NAMED_OP;
3983           result->type = (enum cpp_ttype) result->val.node.node->directive_index;
3984         }
3985
3986       /* Signal FALLTHROUGH comment followed by another token.  */
3987       if (fallthrough_comment)
3988         result->flags |= PREV_FALLTHROUGH;
3989       break;
3990
3991     case '\'':
3992     case '"':
3993       lex_string (pfile, result, buffer->cur - 1);
3994       break;
3995
3996     case '/':
3997       /* A potential block or line comment.  */
3998       comment_start = buffer->cur;
3999       c = *buffer->cur;
4000
4001       if (c == '*')
4002         {
4003           if (_cpp_skip_block_comment (pfile))
4004             cpp_error (pfile, CPP_DL_ERROR, "unterminated comment");
4005         }
4006       else if (c == '/' && ! CPP_OPTION (pfile, traditional))
4007         {
4008           /* Don't warn for system headers.  */
4009           if (_cpp_in_system_header (pfile))
4010             ;
4011           /* Warn about comments if pedantically GNUC89, and not
4012              in system headers.  */
4013           else if (CPP_OPTION (pfile, lang) == CLK_GNUC89
4014                    && CPP_PEDANTIC (pfile)
4015                    && ! buffer->warned_cplusplus_comments)
4016             {
4017               if (cpp_error (pfile, CPP_DL_PEDWARN,
4018                              "C++ style comments are not allowed in ISO C90"))
4019                 cpp_error (pfile, CPP_DL_NOTE,
4020                            "(this will be reported only once per input file)");
4021               buffer->warned_cplusplus_comments = 1;
4022             }
4023           /* Or if specifically desired via -Wc90-c99-compat.  */
4024           else if (CPP_OPTION (pfile, cpp_warn_c90_c99_compat) > 0
4025                    && ! CPP_OPTION (pfile, cplusplus)
4026                    && ! buffer->warned_cplusplus_comments)
4027             {
4028               if (cpp_error (pfile, CPP_DL_WARNING,
4029                              "C++ style comments are incompatible with C90"))
4030                 cpp_error (pfile, CPP_DL_NOTE,
4031                            "(this will be reported only once per input file)");
4032               buffer->warned_cplusplus_comments = 1;
4033             }
4034           /* In C89/C94, C++ style comments are forbidden.  */
4035           else if ((CPP_OPTION (pfile, lang) == CLK_STDC89
4036                     || CPP_OPTION (pfile, lang) == CLK_STDC94))
4037             {
4038               /* But don't be confused about valid code such as
4039                  - // immediately followed by *,
4040                  - // in a preprocessing directive,
4041                  - // in an #if 0 block.  */
4042               if (buffer->cur[1] == '*'
4043                   || pfile->state.in_directive
4044                   || pfile->state.skipping)
4045                 {
4046                   result->type = CPP_DIV;
4047                   break;
4048                 }
4049               else if (! buffer->warned_cplusplus_comments)
4050                 {
4051                   if (cpp_error (pfile, CPP_DL_ERROR,
4052                                  "C++ style comments are not allowed in "
4053                                  "ISO C90"))
4054                     cpp_error (pfile, CPP_DL_NOTE,
4055                                "(this will be reported only once per input "
4056                                "file)");
4057                   buffer->warned_cplusplus_comments = 1;
4058                 }
4059             }
4060           if (skip_line_comment (pfile) && CPP_OPTION (pfile, warn_comments))
4061             cpp_warning (pfile, CPP_W_COMMENTS, "multi-line comment");
4062         }
4063       else if (c == '=')
4064         {
4065           buffer->cur++;
4066           result->type = CPP_DIV_EQ;
4067           break;
4068         }
4069       else
4070         {
4071           result->type = CPP_DIV;
4072           break;
4073         }
4074
4075       if (fallthrough_comment_p (pfile, comment_start))
4076         fallthrough_comment = true;
4077
4078       if (pfile->cb.comment)
4079         {
4080           size_t len = pfile->buffer->cur - comment_start;
4081           pfile->cb.comment (pfile, result->src_loc, comment_start - 1,
4082                              len + 1);
4083         }
4084
4085       if (!pfile->state.save_comments)
4086         {
4087           result->flags |= PREV_WHITE;
4088           goto update_tokens_line;
4089         }
4090
4091       if (fallthrough_comment)
4092         result->flags |= PREV_FALLTHROUGH;
4093
4094       /* Save the comment as a token in its own right.  */
4095       save_comment (pfile, result, comment_start, c);
4096       break;
4097
4098     case '<':
4099       if (pfile->state.angled_headers)
4100         {
4101           lex_string (pfile, result, buffer->cur - 1);
4102           if (result->type != CPP_LESS)
4103             break;
4104         }
4105
4106       result->type = CPP_LESS;
4107       if (*buffer->cur == '=')
4108         {
4109           buffer->cur++, result->type = CPP_LESS_EQ;
4110           if (*buffer->cur == '>'
4111               && CPP_OPTION (pfile, cplusplus)
4112               && CPP_OPTION (pfile, lang) >= CLK_GNUCXX20)
4113             buffer->cur++, result->type = CPP_SPACESHIP;
4114         }
4115       else if (*buffer->cur == '<')
4116         {
4117           buffer->cur++;
4118           IF_NEXT_IS ('=', CPP_LSHIFT_EQ, CPP_LSHIFT);
4119         }
4120       else if (CPP_OPTION (pfile, digraphs))
4121         {
4122           if (*buffer->cur == ':')
4123             {
4124               /* C++11 [2.5/3 lex.pptoken], "Otherwise, if the next
4125                  three characters are <:: and the subsequent character
4126                  is neither : nor >, the < is treated as a preprocessor
4127                  token by itself".  */
4128               if (CPP_OPTION (pfile, cplusplus)
4129                   && CPP_OPTION (pfile, lang) != CLK_CXX98
4130                   && CPP_OPTION (pfile, lang) != CLK_GNUCXX
4131                   && buffer->cur[1] == ':'
4132                   && buffer->cur[2] != ':' && buffer->cur[2] != '>')
4133                 break;
4134
4135               buffer->cur++;
4136               result->flags |= DIGRAPH;
4137               result->type = CPP_OPEN_SQUARE;
4138             }
4139           else if (*buffer->cur == '%')
4140             {
4141               buffer->cur++;
4142               result->flags |= DIGRAPH;
4143               result->type = CPP_OPEN_BRACE;
4144             }
4145         }
4146       break;
4147
4148     case '>':
4149       result->type = CPP_GREATER;
4150       if (*buffer->cur == '=')
4151         buffer->cur++, result->type = CPP_GREATER_EQ;
4152       else if (*buffer->cur == '>')
4153         {
4154           buffer->cur++;
4155           IF_NEXT_IS ('=', CPP_RSHIFT_EQ, CPP_RSHIFT);
4156         }
4157       break;
4158
4159     case '%':
4160       result->type = CPP_MOD;
4161       if (*buffer->cur == '=')
4162         buffer->cur++, result->type = CPP_MOD_EQ;
4163       else if (CPP_OPTION (pfile, digraphs))
4164         {
4165           if (*buffer->cur == ':')
4166             {
4167               buffer->cur++;
4168               result->flags |= DIGRAPH;
4169               result->type = CPP_HASH;
4170               if (*buffer->cur == '%' && buffer->cur[1] == ':')
4171                 buffer->cur += 2, result->type = CPP_PASTE, result->val.token_no = 0;
4172             }
4173           else if (*buffer->cur == '>')
4174             {
4175               buffer->cur++;
4176               result->flags |= DIGRAPH;
4177               result->type = CPP_CLOSE_BRACE;
4178             }
4179         }
4180       break;
4181
4182     case '.':
4183       result->type = CPP_DOT;
4184       if (ISDIGIT (*buffer->cur))
4185         {
4186           struct normalize_state nst = INITIAL_NORMALIZE_STATE;
4187           result->type = CPP_NUMBER;
4188           lex_number (pfile, &result->val.str, &nst);
4189           warn_about_normalization (pfile, result, &nst, false);
4190         }
4191       else if (*buffer->cur == '.' && buffer->cur[1] == '.')
4192         buffer->cur += 2, result->type = CPP_ELLIPSIS;
4193       else if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
4194         buffer->cur++, result->type = CPP_DOT_STAR;
4195       break;
4196
4197     case '+':
4198       result->type = CPP_PLUS;
4199       if (*buffer->cur == '+')
4200         buffer->cur++, result->type = CPP_PLUS_PLUS;
4201       else if (*buffer->cur == '=')
4202         buffer->cur++, result->type = CPP_PLUS_EQ;
4203       break;
4204
4205     case '-':
4206       result->type = CPP_MINUS;
4207       if (*buffer->cur == '>')
4208         {
4209           buffer->cur++;
4210           result->type = CPP_DEREF;
4211           if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
4212             buffer->cur++, result->type = CPP_DEREF_STAR;
4213         }
4214       else if (*buffer->cur == '-')
4215         buffer->cur++, result->type = CPP_MINUS_MINUS;
4216       else if (*buffer->cur == '=')
4217         buffer->cur++, result->type = CPP_MINUS_EQ;
4218       break;
4219
4220     case '&':
4221       result->type = CPP_AND;
4222       if (*buffer->cur == '&')
4223         buffer->cur++, result->type = CPP_AND_AND;
4224       else if (*buffer->cur == '=')
4225         buffer->cur++, result->type = CPP_AND_EQ;
4226       break;
4227
4228     case '|':
4229       result->type = CPP_OR;
4230       if (*buffer->cur == '|')
4231         buffer->cur++, result->type = CPP_OR_OR;
4232       else if (*buffer->cur == '=')
4233         buffer->cur++, result->type = CPP_OR_EQ;
4234       break;
4235
4236     case ':':
4237       result->type = CPP_COLON;
4238       if (*buffer->cur == ':')
4239         {
4240           if (CPP_OPTION (pfile, scope))
4241             buffer->cur++, result->type = CPP_SCOPE;
4242           else
4243             result->flags |= COLON_SCOPE;
4244         }
4245       else if (*buffer->cur == '>' && CPP_OPTION (pfile, digraphs))
4246         {
4247           buffer->cur++;
4248           result->flags |= DIGRAPH;
4249           result->type = CPP_CLOSE_SQUARE;
4250         }
4251       break;
4252
4253     case '*': IF_NEXT_IS ('=', CPP_MULT_EQ, CPP_MULT); break;
4254     case '=': IF_NEXT_IS ('=', CPP_EQ_EQ, CPP_EQ); break;
4255     case '!': IF_NEXT_IS ('=', CPP_NOT_EQ, CPP_NOT); break;
4256     case '^': IF_NEXT_IS ('=', CPP_XOR_EQ, CPP_XOR); break;
4257     case '#': IF_NEXT_IS ('#', CPP_PASTE, CPP_HASH); result->val.token_no = 0; break;
4258
4259     case '?': result->type = CPP_QUERY; break;
4260     case '~': result->type = CPP_COMPL; break;
4261     case ',': result->type = CPP_COMMA; break;
4262     case '(': result->type = CPP_OPEN_PAREN; break;
4263     case ')': result->type = CPP_CLOSE_PAREN; break;
4264     case '[': result->type = CPP_OPEN_SQUARE; break;
4265     case ']': result->type = CPP_CLOSE_SQUARE; break;
4266     case '{': result->type = CPP_OPEN_BRACE; break;
4267     case '}': result->type = CPP_CLOSE_BRACE; break;
4268     case ';': result->type = CPP_SEMICOLON; break;
4269
4270       /* @ is a punctuator in Objective-C.  */
4271     case '@': result->type = CPP_ATSIGN; break;
4272
4273     default:
4274       {
4275         const uchar *base = --buffer->cur;
4276         static int no_warn_cnt;
4277
4278         /* Check for an extended identifier ($ or UCN or UTF-8).  */
4279         struct normalize_state nst = INITIAL_NORMALIZE_STATE;
4280         if (forms_identifier_p (pfile, true, &nst))
4281           {
4282             result->type = CPP_NAME;
4283             const auto node = lex_identifier (pfile, base, true, &nst,
4284                                               &result->val.node.spelling);
4285             result->val.node.node = node;
4286             identifier_diagnostics_on_lex (pfile, node);
4287             warn_about_normalization (pfile, result, &nst, true);
4288             break;
4289           }
4290
4291         /* Otherwise this will form a CPP_OTHER token.  Parse valid UTF-8 as a
4292            single token.  */
4293         buffer->cur++;
4294         if (c >= utf8_signifier)
4295           {
4296             const uchar *pstr = base;
4297             cppchar_t s;
4298             if (_cpp_valid_utf8 (pfile, &pstr, buffer->rlimit, 0, NULL, &s))
4299               {
4300                 if (s > UCS_LIMIT && CPP_OPTION (pfile, cpp_warn_invalid_utf8))
4301                   {
4302                     buffer->cur = base;
4303                     _cpp_warn_invalid_utf8 (pfile);
4304                   }
4305                 buffer->cur = pstr;
4306               }
4307             else if (CPP_OPTION (pfile, cpp_warn_invalid_utf8))
4308               {
4309                 buffer->cur = base;
4310                 const uchar *end = _cpp_warn_invalid_utf8 (pfile);
4311                 buffer->cur = base + 1;
4312                 no_warn_cnt = end - buffer->cur;
4313               }
4314           }
4315         else if (c >= utf8_continuation
4316                  && CPP_OPTION (pfile, cpp_warn_invalid_utf8))
4317           {
4318             if (no_warn_cnt)
4319               --no_warn_cnt;
4320             else
4321               {
4322                 buffer->cur = base;
4323                 _cpp_warn_invalid_utf8 (pfile);
4324                 buffer->cur = base + 1;
4325               }
4326           }
4327         create_literal (pfile, result, base, buffer->cur - base, CPP_OTHER);
4328         break;
4329       }
4330
4331     }
4332
4333   /* Potentially convert the location of the token to a range.  */
4334   if (result->src_loc >= RESERVED_LOCATION_COUNT
4335       && result->type != CPP_EOF)
4336     {
4337       /* Ensure that any line notes are processed, so that we have the
4338          correct physical line/column for the end-point of the token even
4339          when a logical line is split via one or more backslashes.  */
4340       if (buffer->cur >= buffer->notes[buffer->cur_note].pos
4341           && !pfile->overlaid_buffer)
4342         _cpp_process_line_notes (pfile, false);
4343
4344       source_range tok_range;
4345       tok_range.m_start = result->src_loc;
4346       tok_range.m_finish
4347         = linemap_position_for_column (pfile->line_table,
4348                                        CPP_BUF_COLUMN (buffer, buffer->cur));
4349
4350       result->src_loc
4351         = pfile->line_table->get_or_create_combined_loc (result->src_loc,
4352                                                          tok_range, nullptr, 0);
4353     }
4354
4355   return result;
4356 }
4357
4358 /* An upper bound on the number of bytes needed to spell TOKEN.
4359    Does not include preceding whitespace.  */
4360 unsigned int
4361 cpp_token_len (const cpp_token *token)
4362 {
4363   unsigned int len;
4364
4365   switch (TOKEN_SPELL (token))
4366     {
4367     default:            len = 6;                                break;
4368     case SPELL_LITERAL: len = token->val.str.len;               break;
4369     case SPELL_IDENT:   len = NODE_LEN (token->val.node.node) * 10;     break;
4370     }
4371
4372   return len;
4373 }
4374
/* Decode the single UTF-8 sequence that starts at NAME and store its
   code point in BUFFER as a "\UXXXXXXXX" escape: a backslash, 'U' and
   eight lower-case hex digits, ten bytes in all, with no terminating
   NUL.  Return the number of bytes of NAME that were consumed.
   Calls abort if a continuation byte is not of the form 10xxxxxx.  */

static size_t
utf8_to_ucn (unsigned char *buffer, const unsigned char *name)
{
  static const char hex_digits[] = "0123456789abcdef";
  int seq_len = 0;
  unsigned lead;
  unsigned long code_point;

  /* The count of leading one bits in the first byte gives the length
     of the sequence.  */
  lead = *name;
  while (lead & 0x80)
    {
      seq_len++;
      lead <<= 1;
    }

  /* Whatever is left of the first byte below the length marker is
     payload.  */
  code_point = *name & (0x7F >> seq_len);

  /* Every continuation byte contributes six further bits.  */
  for (int k = 1; k < seq_len; k++)
    {
      const unsigned char cont = *++name;

      code_point = (code_point << 6) | (cont & 0x3F);

      /* Ill-formed UTF-8.  */
      if ((cont & ~0x3F) != 0x80)
        abort ();
    }

  buffer[0] = '\\';
  buffer[1] = 'U';
  for (int shift = 28, pos = 2; shift >= 0; shift -= 4, pos++)
    buffer[pos] = hex_digits[(code_point >> shift) & 0xF];

  return seq_len;
}
4408
4409 /* Given a token TYPE corresponding to a digraph, return a pointer to
4410    the spelling of the digraph.  */
4411 static const unsigned char *
4412 cpp_digraph2name (enum cpp_ttype type)
4413 {
4414   return digraph_spellings[(int) type - (int) CPP_FIRST_DIGRAPH];
4415 }
4416
4417 /* Write the spelling of an identifier IDENT, using UCNs, to BUFFER.
4418    The buffer must already contain enough space to hold the
4419    token's spelling.  Returns a pointer to the character after the
4420    last character written.  */
4421 unsigned char *
4422 _cpp_spell_ident_ucns (unsigned char *buffer, cpp_hashnode *ident)
4423 {
4424   size_t i;
4425   const unsigned char *name = NODE_NAME (ident);
4426
4427   for (i = 0; i < NODE_LEN (ident); i++)
4428     if (name[i] & ~0x7F)
4429       {
4430         i += utf8_to_ucn (buffer, name + i) - 1;
4431         buffer += 10;
4432       }
4433     else
4434       *buffer++ = name[i];
4435
4436   return buffer;
4437 }
4438
4439 /* Write the spelling of a token TOKEN to BUFFER.  The buffer must
4440    already contain enough space to hold the token's spelling.
4441    Returns a pointer to the character after the last character written.
4442    FORSTRING is true if this is to be the spelling after translation
4443    phase 1 (with the original spelling of extended identifiers), false
4444    if extended identifiers should always be written using UCNs (there is
4445    no option for always writing them in the internal UTF-8 form).
4446    FIXME: Would be nice if we didn't need the PFILE argument.  */
4447 unsigned char *
4448 cpp_spell_token (cpp_reader *pfile, const cpp_token *token,
4449                  unsigned char *buffer, bool forstring)
4450 {
4451   switch (TOKEN_SPELL (token))
4452     {
4453     case SPELL_OPERATOR:
4454       {
4455         const unsigned char *spelling;
4456         unsigned char c;
4457
4458         if (token->flags & DIGRAPH)
4459           spelling = cpp_digraph2name (token->type);
4460         else if (token->flags & NAMED_OP)
4461           goto spell_ident;
4462         else
4463           spelling = TOKEN_NAME (token);
4464
4465         while ((c = *spelling++) != '\0')
4466           *buffer++ = c;
4467       }
4468       break;
4469
4470     spell_ident:
4471     case SPELL_IDENT:
4472       if (forstring)
4473         {
4474           memcpy (buffer, NODE_NAME (token->val.node.spelling),
4475                   NODE_LEN (token->val.node.spelling));
4476           buffer += NODE_LEN (token->val.node.spelling);
4477         }
4478       else
4479         buffer = _cpp_spell_ident_ucns (buffer, token->val.node.node);
4480       break;
4481
4482     case SPELL_LITERAL:
4483       memcpy (buffer, token->val.str.text, token->val.str.len);
4484       buffer += token->val.str.len;
4485       break;
4486
4487     case SPELL_NONE:
4488       cpp_error (pfile, CPP_DL_ICE,
4489                  "unspellable token %s", TOKEN_NAME (token));
4490       break;
4491     }
4492
4493   return buffer;
4494 }
4495
4496 /* Returns TOKEN spelt as a null-terminated string.  The string is
4497    freed when the reader is destroyed.  Useful for diagnostics.  */
4498 unsigned char *
4499 cpp_token_as_text (cpp_reader *pfile, const cpp_token *token)
4500 {
4501   unsigned int len = cpp_token_len (token) + 1;
4502   unsigned char *start = _cpp_unaligned_alloc (pfile, len), *end;
4503
4504   end = cpp_spell_token (pfile, token, start, false);
4505   end[0] = '\0';
4506
4507   return start;
4508 }
4509
4510 /* Returns a pointer to a string which spells the token defined by
4511    TYPE and FLAGS.  Used by C front ends, which really should move to
4512    using cpp_token_as_text.  */
4513 const char *
4514 cpp_type2name (enum cpp_ttype type, unsigned char flags)
4515 {
4516   if (flags & DIGRAPH)
4517     return (const char *) cpp_digraph2name (type);
4518   else if (flags & NAMED_OP)
4519     return cpp_named_operator2name (type);
4520
4521   return (const char *) token_spellings[type].name;
4522 }
4523
4524 /* Writes the spelling of token to FP, without any preceding space.
4525    Separated from cpp_spell_token for efficiency - to avoid stdio
4526    double-buffering.  */
4527 void
4528 cpp_output_token (const cpp_token *token, FILE *fp)
4529 {
4530   switch (TOKEN_SPELL (token))
4531     {
4532     case SPELL_OPERATOR:
4533       {
4534         const unsigned char *spelling;
4535         int c;
4536
4537         if (token->flags & DIGRAPH)
4538           spelling = cpp_digraph2name (token->type);
4539         else if (token->flags & NAMED_OP)
4540           goto spell_ident;
4541         else
4542           spelling = TOKEN_NAME (token);
4543
4544         c = *spelling;
4545         do
4546           putc (c, fp);
4547         while ((c = *++spelling) != '\0');
4548       }
4549       break;
4550
4551     spell_ident:
4552     case SPELL_IDENT:
4553       {
4554         size_t i;
4555         const unsigned char * name = NODE_NAME (token->val.node.node);
4556
4557         for (i = 0; i < NODE_LEN (token->val.node.node); i++)
4558           if (name[i] & ~0x7F)
4559             {
4560               unsigned char buffer[10];
4561               i += utf8_to_ucn (buffer, name + i) - 1;
4562               fwrite (buffer, 1, 10, fp);
4563             }
4564           else
4565             fputc (NODE_NAME (token->val.node.node)[i], fp);
4566       }
4567       break;
4568
4569     case SPELL_LITERAL:
4570       if (token->type == CPP_HEADER_NAME)
4571         fputc ('"', fp);
4572       fwrite (token->val.str.text, 1, token->val.str.len, fp);
4573       if (token->type == CPP_HEADER_NAME)
4574         fputc ('"', fp);
4575       break;
4576
4577     case SPELL_NONE:
4578       /* An error, most probably.  */
4579       break;
4580     }
4581 }
4582
4583 /* Compare two tokens.  */
4584 int
4585 _cpp_equiv_tokens (const cpp_token *a, const cpp_token *b)
4586 {
4587   if (a->type == b->type && a->flags == b->flags)
4588     switch (TOKEN_SPELL (a))
4589       {
4590       default:                  /* Keep compiler happy.  */
4591       case SPELL_OPERATOR:
4592         /* token_no is used to track where multiple consecutive ##
4593            tokens were originally located.  */
4594         return (a->type != CPP_PASTE || a->val.token_no == b->val.token_no);
4595       case SPELL_NONE:
4596         return (a->type != CPP_MACRO_ARG
4597                 || (a->val.macro_arg.arg_no == b->val.macro_arg.arg_no
4598                     && a->val.macro_arg.spelling == b->val.macro_arg.spelling));
4599       case SPELL_IDENT:
4600         return (a->val.node.node == b->val.node.node
4601                 && a->val.node.spelling == b->val.node.spelling);
4602       case SPELL_LITERAL:
4603         return (a->val.str.len == b->val.str.len
4604                 && !memcmp (a->val.str.text, b->val.str.text,
4605                             a->val.str.len));
4606       }
4607
4608   return 0;
4609 }
4610
/* Returns nonzero if a space should be inserted to avoid an
   accidental token paste for output.  For simplicity, it is
   conservative, and occasionally advises a space where one is not
   needed, e.g. "." and ".2".

   TOKEN1 is the token already output and TOKEN2 the one about to
   follow it.  The decision is made from TOKEN1's type, TOKEN2's type
   and, when TOKEN2 is spelled as an operator, the first character of
   TOKEN2's spelling.  PFILE supplies the language options consulted
   below and is passed on to name_p.  */
int
cpp_avoid_paste (cpp_reader *pfile, const cpp_token *token1,
                 const cpp_token *token2)
{
  enum cpp_ttype a = token1->type, b = token2->type;
  cppchar_t c;

  /* Tokens flagged NAMED_OP are treated as names on either side.  */
  if (token1->flags & NAMED_OP)
    a = CPP_NAME;
  if (token2->flags & NAMED_OP)
    b = CPP_NAME;

  /* C is the first character of TOKEN2's spelling if it is spelled as
     an operator (using the digraph spelling when it is flagged
     DIGRAPH), and EOF otherwise.  */
  c = EOF;
  if (token2->flags & DIGRAPH)
    c = digraph_spellings[(int) b - (int) CPP_FIRST_DIGRAPH][0];
  else if (token_spellings[b].category == SPELL_OPERATOR)
    c = token_spellings[b].name[0];

  /* Quickly get everything that can paste with an '='.  */
  if ((int) a <= (int) CPP_LAST_EQ && c == '=')
    return 1;

  /* Otherwise decide per type of the first token.  */
  switch (a)
    {
    case CPP_GREATER:   return c == '>';
    case CPP_LESS:      return c == '<' || c == '%' || c == ':';
    case CPP_PLUS:      return c == '+';
    case CPP_MINUS:     return c == '-' || c == '>';
    case CPP_DIV:       return c == '/' || c == '*'; /* Comments.  */
    case CPP_MOD:       return c == ':' || c == '>';
    case CPP_AND:       return c == '&';
    case CPP_OR:        return c == '|';
    case CPP_COLON:     return c == ':' || c == '>';
    case CPP_DEREF:     return c == '*';
    case CPP_DOT:       return c == '.' || c == '%' || b == CPP_NUMBER;
    case CPP_HASH:      return c == '#' || c == '%'; /* Digraph form.  */
    case CPP_PRAGMA:
    case CPP_NAME:      return ((b == CPP_NUMBER
                                 && name_p (pfile, &token2->val.str))
                                || b == CPP_NAME
                                || b == CPP_CHAR || b == CPP_STRING); /* L */
    case CPP_NUMBER:    return (b == CPP_NUMBER || b == CPP_NAME
                                || b == CPP_CHAR
                                || c == '.' || c == '+' || c == '-');
                                      /* UCNs */
    case CPP_OTHER:     return ((token1->val.str.text[0] == '\\'
                                 && b == CPP_NAME)
                                || (CPP_OPTION (pfile, objc)
                                    && token1->val.str.text[0] == '@'
                                    && (b == CPP_NAME || b == CPP_STRING)));
    case CPP_LESS_EQ:   return c == '>';
    case CPP_STRING:
    case CPP_WSTRING:
    case CPP_UTF8STRING:
    case CPP_STRING16:
      /* With the user_literals option, a string followed by a name or
         by a literal whose text starts with an identifier-start
         character needs separating.  */
    case CPP_STRING32:  return (CPP_OPTION (pfile, user_literals)
                                && (b == CPP_NAME
                                    || (TOKEN_SPELL (token2) == SPELL_LITERAL
                                        && ISIDST (token2->val.str.text[0]))));

    default:            break;
    }

  return 0;
}
4680
4681 /* Output all the remaining tokens on the current line, and a newline
4682    character, to FP.  Leading whitespace is removed.  If there are
4683    macros, special token padding is not performed.  */
4684 void
4685 cpp_output_line (cpp_reader *pfile, FILE *fp)
4686 {
4687   const cpp_token *token;
4688
4689   token = cpp_get_token (pfile);
4690   while (token->type != CPP_EOF)
4691     {
4692       cpp_output_token (token, fp);
4693       token = cpp_get_token (pfile);
4694       if (token->flags & PREV_WHITE)
4695         putc (' ', fp);
4696     }
4697
4698   putc ('\n', fp);
4699 }
4700
4701 /* Return a string representation of all the remaining tokens on the
4702    current line.  The result is allocated using xmalloc and must be
4703    freed by the caller.  */
4704 unsigned char *
4705 cpp_output_line_to_string (cpp_reader *pfile, const unsigned char *dir_name)
4706 {
4707   const cpp_token *token;
4708   unsigned int out = dir_name ? ustrlen (dir_name) : 0;
4709   unsigned int alloced = 120 + out;
4710   unsigned char *result = (unsigned char *) xmalloc (alloced);
4711
4712   /* If DIR_NAME is empty, there are no initial contents.  */
4713   if (dir_name)
4714     {
4715       sprintf ((char *) result, "#%s ", dir_name);
4716       out += 2;
4717     }
4718
4719   token = cpp_get_token (pfile);
4720   while (token->type != CPP_EOF)
4721     {
4722       unsigned char *last;
4723       /* Include room for a possible space and the terminating nul.  */
4724       unsigned int len = cpp_token_len (token) + 2;
4725
4726       if (out + len > alloced)
4727         {
4728           alloced *= 2;
4729           if (out + len > alloced)
4730             alloced = out + len;
4731           result = (unsigned char *) xrealloc (result, alloced);
4732         }
4733
4734       last = cpp_spell_token (pfile, token, &result[out], 0);
4735       out = last - result;
4736
4737       token = cpp_get_token (pfile);
4738       if (token->flags & PREV_WHITE)
4739         result[out++] = ' ';
4740     }
4741
4742   result[out] = '\0';
4743   return result;
4744 }
4745
/* Memory buffers.  Changing these three constants can have a dramatic
   effect on performance.  The values here are reasonable defaults,
   but might be tuned.  If you adjust them, be sure to test across a
   range of uses of cpplib, including heavy nested function-like macro
   expansion.  Also check the change in peak memory usage (NJAMD is a
   good tool for this).

   MIN_BUFF_SIZE is the smallest buffer ever allocated.
   BUFF_SIZE_UPPER_BOUND (MIN_SIZE) is the largest free buffer that
   will be reused for a request of MIN_SIZE bytes.
   EXTENDED_BUFF_SIZE (BUFF, MIN_EXTRA) is the size requested when
   extending BUFF: MIN_EXTRA plus twice the room left in BUFF.  Both
   macro arguments are parenthesized in the expansion so that any
   expression may be passed.  */
#define MIN_BUFF_SIZE 8000
#define BUFF_SIZE_UPPER_BOUND(MIN_SIZE) (MIN_BUFF_SIZE + (MIN_SIZE) * 3 / 2)
#define EXTENDED_BUFF_SIZE(BUFF, MIN_EXTRA) \
        ((MIN_EXTRA) + ((BUFF)->limit - (BUFF)->cur) * 2)

#if MIN_BUFF_SIZE > BUFF_SIZE_UPPER_BOUND (0)
  #error BUFF_SIZE_UPPER_BOUND must be at least as large as MIN_BUFF_SIZE!
#endif
4760
4761 /* Create a new allocation buffer.  Place the control block at the end
4762    of the buffer, so that buffer overflows will cause immediate chaos.  */
4763 static _cpp_buff *
4764 new_buff (size_t len)
4765 {
4766   _cpp_buff *result;
4767   unsigned char *base;
4768
4769   if (len < MIN_BUFF_SIZE)
4770     len = MIN_BUFF_SIZE;
4771   len = CPP_ALIGN (len);
4772
4773 #ifdef ENABLE_VALGRIND_WORKAROUNDS
4774   /* Valgrind warns about uses of interior pointers, so put _cpp_buff
4775      struct first.  */
4776   size_t slen = CPP_ALIGN2 (sizeof (_cpp_buff), 2 * DEFAULT_ALIGNMENT);
4777   base = XNEWVEC (unsigned char, len + slen);
4778   result = (_cpp_buff *) base;
4779   base += slen;
4780 #else
4781   base = XNEWVEC (unsigned char, len + sizeof (_cpp_buff));
4782   result = (_cpp_buff *) (base + len);
4783 #endif
4784   result->base = base;
4785   result->cur = base;
4786   result->limit = base + len;
4787   result->next = NULL;
4788   return result;
4789 }
4790
4791 /* Place a chain of unwanted allocation buffers on the free list.  */
4792 void
4793 _cpp_release_buff (cpp_reader *pfile, _cpp_buff *buff)
4794 {
4795   _cpp_buff *end = buff;
4796
4797   while (end->next)
4798     end = end->next;
4799   end->next = pfile->free_buffs;
4800   pfile->free_buffs = buff;
4801 }
4802
4803 /* Return a free buffer of size at least MIN_SIZE.  */
4804 _cpp_buff *
4805 _cpp_get_buff (cpp_reader *pfile, size_t min_size)
4806 {
4807   _cpp_buff *result, **p;
4808
4809   for (p = &pfile->free_buffs;; p = &(*p)->next)
4810     {
4811       size_t size;
4812
4813       if (*p == NULL)
4814         return new_buff (min_size);
4815       result = *p;
4816       size = result->limit - result->base;
4817       /* Return a buffer that's big enough, but don't waste one that's
4818          way too big.  */
4819       if (size >= min_size && size <= BUFF_SIZE_UPPER_BOUND (min_size))
4820         break;
4821     }
4822
4823   *p = result->next;
4824   result->next = NULL;
4825   result->cur = result->base;
4826   return result;
4827 }
4828
4829 /* Creates a new buffer with enough space to hold the uncommitted
4830    remaining bytes of BUFF, and at least MIN_EXTRA more bytes.  Copies
4831    the excess bytes to the new buffer.  Chains the new buffer after
4832    BUFF, and returns the new buffer.  */
4833 _cpp_buff *
4834 _cpp_append_extend_buff (cpp_reader *pfile, _cpp_buff *buff, size_t min_extra)
4835 {
4836   size_t size = EXTENDED_BUFF_SIZE (buff, min_extra);
4837   _cpp_buff *new_buff = _cpp_get_buff (pfile, size);
4838
4839   buff->next = new_buff;
4840   memcpy (new_buff->base, buff->cur, BUFF_ROOM (buff));
4841   return new_buff;
4842 }
4843
4844 /* Creates a new buffer with enough space to hold the uncommitted
4845    remaining bytes of the buffer pointed to by BUFF, and at least
4846    MIN_EXTRA more bytes.  Copies the excess bytes to the new buffer.
4847    Chains the new buffer before the buffer pointed to by BUFF, and
4848    updates the pointer to point to the new buffer.  */
4849 void
4850 _cpp_extend_buff (cpp_reader *pfile, _cpp_buff **pbuff, size_t min_extra)
4851 {
4852   _cpp_buff *new_buff, *old_buff = *pbuff;
4853   size_t size = EXTENDED_BUFF_SIZE (old_buff, min_extra);
4854
4855   new_buff = _cpp_get_buff (pfile, size);
4856   memcpy (new_buff->base, old_buff->cur, BUFF_ROOM (old_buff));
4857   new_buff->next = old_buff;
4858   *pbuff = new_buff;
4859 }
4860
4861 /* Free a chain of buffers starting at BUFF.  */
4862 void
4863 _cpp_free_buff (_cpp_buff *buff)
4864 {
4865   _cpp_buff *next;
4866
4867   for (; buff; buff = next)
4868     {
4869       next = buff->next;
4870 #ifdef ENABLE_VALGRIND_WORKAROUNDS
4871       free (buff);
4872 #else
4873       free (buff->base);
4874 #endif
4875     }
4876 }
4877
4878 /* Allocate permanent, unaligned storage of length LEN.  */
4879 unsigned char *
4880 _cpp_unaligned_alloc (cpp_reader *pfile, size_t len)
4881 {
4882   _cpp_buff *buff = pfile->u_buff;
4883   unsigned char *result = buff->cur;
4884
4885   if (len > (size_t) (buff->limit - result))
4886     {
4887       buff = _cpp_get_buff (pfile, len);
4888       buff->next = pfile->u_buff;
4889       pfile->u_buff = buff;
4890       result = buff->cur;
4891     }
4892
4893   buff->cur = result + len;
4894   return result;
4895 }
4896
4897 /* Allocate permanent, unaligned storage of length LEN from a_buff.
4898    That buffer is used for growing allocations when saving macro
4899    replacement lists in a #define, and when parsing an answer to an
4900    assertion in #assert, #unassert or #if (and therefore possibly
4901    whilst expanding macros).  It therefore must not be used by any
4902    code that they might call: specifically the lexer and the guts of
4903    the macro expander.
4904
4905    All existing other uses clearly fit this restriction: storing
4906    registered pragmas during initialization.  */
4907 unsigned char *
4908 _cpp_aligned_alloc (cpp_reader *pfile, size_t len)
4909 {
4910   _cpp_buff *buff = pfile->a_buff;
4911   unsigned char *result = buff->cur;
4912
4913   if (len > (size_t) (buff->limit - result))
4914     {
4915       buff = _cpp_get_buff (pfile, len);
4916       buff->next = pfile->a_buff;
4917       pfile->a_buff = buff;
4918       result = buff->cur;
4919     }
4920
4921   buff->cur = result + len;
4922   return result;
4923 }
4924
4925 /* Commit or allocate storage from a buffer.  */
4926
4927 void *
4928 _cpp_commit_buff (cpp_reader *pfile, size_t size)
4929 {
4930   void *ptr = BUFF_FRONT (pfile->a_buff);
4931
4932   if (pfile->hash_table->alloc_subobject)
4933     {
4934       void *copy = pfile->hash_table->alloc_subobject (size);
4935       memcpy (copy, ptr, size);
4936       ptr = copy;
4937     }
4938   else
4939     BUFF_FRONT (pfile->a_buff) += size;
4940
4941   return ptr;
4942 }
4943
4944 /* Say which field of TOK is in use.  */
4945
4946 enum cpp_token_fld_kind
4947 cpp_token_val_index (const cpp_token *tok)
4948 {
4949   switch (TOKEN_SPELL (tok))
4950     {
4951     case SPELL_IDENT:
4952       return CPP_TOKEN_FLD_NODE;
4953     case SPELL_LITERAL:
4954       return CPP_TOKEN_FLD_STR;
4955     case SPELL_OPERATOR:
4956       /* Operands which were originally spelled as ident keep around
4957          the node for the exact spelling.  */
4958       if (tok->flags & NAMED_OP)
4959         return CPP_TOKEN_FLD_NODE;
4960       else if (tok->type == CPP_PASTE)
4961         return CPP_TOKEN_FLD_TOKEN_NO;
4962       else
4963         return CPP_TOKEN_FLD_NONE;
4964     case SPELL_NONE:
4965       if (tok->type == CPP_MACRO_ARG)
4966         return CPP_TOKEN_FLD_ARG_NO;
4967       else if (tok->type == CPP_PADDING)
4968         return CPP_TOKEN_FLD_SOURCE;
4969       else if (tok->type == CPP_PRAGMA)
4970         return CPP_TOKEN_FLD_PRAGMA;
4971       /* fall through */
4972     default:
4973       return CPP_TOKEN_FLD_NONE;
4974     }
4975 }
4976
4977 /* All tokens lexed in R after calling this function will be forced to
4978    have their location_t to be P, until
4979    cpp_stop_forcing_token_locations is called for R.  */
4980
4981 void
4982 cpp_force_token_locations (cpp_reader *r, location_t loc)
4983 {
4984   r->forced_token_location = loc;
4985 }
4986
4987 /* Go back to assigning locations naturally for lexed tokens.  */
4988
4989 void
4990 cpp_stop_forcing_token_locations (cpp_reader *r)
4991 {
4992   r->forced_token_location = 0;
4993 }
4994
/* PEEK points at a backslash.  If that backslash escapes an end of
   line (LF, CR or CR LF), return a pointer to the first character
   after the line ending, repeating for as long as that character is
   itself a backslash escaping another end of line.  If stepping over
   a line ending would not stay below LIMIT, don't advance.  A
   backslash that escapes nothing is returned as is.  */

static const unsigned char *
do_peek_backslash (const unsigned char *peek, const unsigned char *limit)
{
  for (;;)
    {
      size_t skip;

      /* Work out how long the backslash plus line ending is.  */
      if (__builtin_expect (peek[1] == '\n', true))
        skip = 2;
      else if (__builtin_expect (peek[1] == '\r', false))
        skip = peek[2] == '\n' ? 3 : 2;
      else
        return peek;

      if (__builtin_expect (!(peek + skip < limit), false))
        return peek;

      peek += skip;

      /* The user might be perverse and escape several line endings in
         a row; otherwise this is the answer.  */
      if (*peek != '\\')
        return peek;
    }
}
5024
/* Return the position of the next character to examine at PEEK: PEEK
   itself unless it points at a backslash, in which case any escaped
   line endings are looked past, never reaching LIMIT.  */
static const unsigned char *
do_peek_next (const unsigned char *peek, const unsigned char *limit)
{
  return (__builtin_expect (*peek == '\\', false)
          ? do_peek_backslash (peek, limit)
          : peek);
}
5032
/* Step backwards from PEEK and return a pointer to the previous
   character, never reading before BOUND.  Return NULL if PEEK is
   already at BOUND.

   If the previous character is a line ending (LF, CR or CR LF) that
   is itself preceded by a backslash, the escaped line ending is
   skipped and the search continues from the backslash, so the result
   is the character before the backslash (or NULL if the backslash is
   at BOUND).  A line ending that is not escaped, or that sits at
   BOUND, is returned like any other character.  */
static const unsigned char *
do_peek_prev (const unsigned char *peek, const unsigned char *bound)
{
  if (peek == bound)
    return NULL;

  unsigned char c = *--peek;
  /* Only a real line-ending character can be part of an escaped
     newline; note this is a carriage return, not the letter 'r'.  */
  if (__builtin_expect (c == '\n', false)
      || __builtin_expect (c == '\r', false))
    {
      if (peek == bound)
        return peek;

      /* IX is the offset from PEEK at which a backslash would sit.  */
      int ix = -1;
      if (c == '\n' && peek[ix] == '\r')
        {
          /* CR LF: the backslash, if any, is one further back.  */
          if (peek + ix == bound)
            return peek;
          ix--;
        }

      if (peek[ix] == '\\')
        return do_peek_prev (peek + ix, bound);

      return peek;
    }
  else
    return peek;
}
5061
/* The caller has already matched the first character of the
   identifier MATCH at PEEK[-1].  Check that the rest of MATCH follows
   at PEEK, allowing escaped line endings between characters, and that
   the identifier ends there.  On success scan past trailing spaces,
   tabs and escaped line endings and return the resulting position.
   Otherwise return NULL.  LIMIT bounds how far escaped line endings
   are looked past.  */

static const unsigned char *
do_peek_ident (const char *match, const unsigned char *peek,
               const unsigned char *limit)
{
  /* MATCH[0] is the caller's business; compare from MATCH[1] on.  */
  for (match++; *match != '\0'; match++)
    {
      if (*peek != *match)
        {
          /* Retry once after looking past an escaped line ending.  */
          peek = do_peek_next (peek, limit);
          if (*peek != *match)
            return NULL;
        }
      peek++;
    }

  /* Must now not be looking at an identifier char.  */
  peek = do_peek_next (peek, limit);
  if (ISIDNUM (*peek))
    return NULL;

  /* Skip control-line whitespace.  */
  for (;;)
    {
      while (*peek == ' ' || *peek == '\t')
        peek++;

      if (__builtin_expect (*peek != '\\', true))
        break;

      peek = do_peek_backslash (peek, limit);
      /* Still at a backslash: it did not escape a line ending (or the
         position could not advance), so stop here.  */
      if (*peek == '\\')
        break;
    }

  return peek;
}
5095
/* Are we looking at a module control line starting as PEEK - 1?

   C is the character at PEEK - 1, which the caller has already read.
   Return true if the text spells one of the keywords `import',
   `__import' or `module', optionally preceded by `export' (not for
   `__import'), and the first character after the keyword and any
   following whitespace could begin a token that is acceptable there.
   LIMIT bounds how far escaped line endings are looked past.  PFILE
   supplies the rliterals and digraphs options.  */

static bool
do_peek_module (cpp_reader *pfile, unsigned char c,
                const unsigned char *peek, const unsigned char *limit)
{
  /* Set once the keyword found is import or __import rather than
     module; the two accept different following tokens.  */
  bool import = false;

  /* In each branch the second character is checked cheaply first (a
     backslash is allowed because it may start an escaped line ending)
     before do_peek_ident does the full comparison.  */
  if (__builtin_expect (c == 'e', false))
    {
      if (!((peek[0] == 'x' || peek[0] == '\\')
            && (peek = do_peek_ident ("export", peek, limit))))
        return false;

      /* export, peek for import or module.  No need to peek __import
         here.  */
      if (peek[0] == 'i')
        {
          if (!((peek[1] == 'm' || peek[1] == '\\')
                && (peek = do_peek_ident ("import", peek + 1, limit))))
            return false;
          import = true;
        }
      else if (peek[0] == 'm')
        {
          if (!((peek[1] == 'o' || peek[1] == '\\')
                && (peek = do_peek_ident ("module", peek + 1, limit))))
            return false;
        }
      else
        return false;
    }
  else if (__builtin_expect (c == 'i', false))
    {
      if (!((peek[0] == 'm' || peek[0] == '\\')
            && (peek = do_peek_ident ("import", peek, limit))))
        return false;
      import = true;
    }
  else if (__builtin_expect (c == '_', false))
    {
      /* Needed for translated includes.   */
      if (!((peek[0] == '_' || peek[0] == '\\')
            && (peek = do_peek_ident ("__import", peek, limit))))
        return false;
      import = true;
    }
  else if (__builtin_expect (c == 'm', false))
    {
      if (!((peek[0] == 'o' || peek[0] == '\\')
            && (peek = do_peek_ident ("module", peek, limit))))
        return false;
    }
  else
    return false;

  /* Peek the next character to see if it's good enough.  We'll be at
     the first non-whitespace char, including skipping an escaped
     newline.  */
  /* ... import followed by identifier, ':', '<' or header-name
     preprocessing tokens, or module followed by identifier, ':' or
     ';' preprocessing tokens.  */
  unsigned char p = *peek++;

  /* A character literal is ... single quotes, ... optionally preceded
     by u8, u, U, or L */
  /* A string-literal is a ... double quotes, optionally prefixed by
     R, u8, u8R, u, uR, U, UR, L, or LR */
  /* The branches below reject a prefix that turns out to introduce a
     character or string literal; anything else starting with a letter
     is taken to be an identifier.  */
  if (p == 'u')
    {
      peek = do_peek_next (peek, limit);
      if (*peek == '8')
        {
          peek++;
          goto peek_u8;
        }
      goto peek_u;
    }
  else if (p == 'U' || p == 'L')
    {
    peek_u8:
      peek = do_peek_next (peek, limit);
    peek_u:
      if (*peek == '\"' || *peek == '\'')
        return false;

      /* NOTE(review): on this path PEEK still points at the 'R'
         itself, whereas the P == 'R' branch arrives at peek_R with
         PEEK already past it, so the quote test below looks at the
         'R' here -- confirm whether PEEK should be advanced first.  */
      if (*peek == 'R')
        goto peek_R;
      /* Identifier. Ok.  */
    }
  else if (p == 'R')
    {
    peek_R:
      if (CPP_OPTION (pfile, rliterals))
        {
          peek = do_peek_next (peek, limit);
          if (*peek == '\"')
            return false;
        }
      /* Identifier. Ok.  */
    }
  /* When the upper-case letters are contiguous in the execution
     character set, test letter ranges directly; otherwise fall back
     on ISIDST.  */
  else if ('Z' - 'A' == 25
           ? ((p >= 'A' && p <= 'Z') || (p >= 'a' && p <= 'z') || p == '_')
           : ISIDST (p))
    {
      /* Identifier.  Ok. */
    }
  else if (p == '<')
    {
      /* Maybe angle header, ok for import.  Reject
         '<=', '<<' digraph:'<:'.  */
      if (!import)
        return false;
      peek = do_peek_next (peek, limit);
      if (*peek == '=' || *peek == '<'
          || (*peek == ':' && CPP_OPTION (pfile, digraphs)))
        return false;
    }
  else if (p == ';')
    {
      /* SEMICOLON, ok for module.  */
      if (import)
        return false;
    }
  else if (p == '"')
    {
      /* STRING, ok for import.  */
      if (!import)
        return false;
    }
  else if (p == ':')
    {
      /* Maybe COLON, ok.  Reject '::', digraph:':>'.  */
      peek = do_peek_next (peek, limit);
      if (*peek == ':' || (*peek == '>' && CPP_OPTION (pfile, digraphs)))
        return false;
    }
  else
    /* FIXME: Detect a unicode character, excluding those not
       permitted as the initial character. [lex.name]/1.  I presume
       we need to check the \[uU] spellings, and directly using
       Unicode in say UTF8 form?  Or perhaps we do the phase-1
       conversion of UTF8 to universal-character-names?  */
    return false;

  return true;
}
5243
5244 /* Directives-only scanning.  Somewhat more relaxed than correct
5245    parsing -- some ill-formed programs will not be rejected.

        Scan each buffer on PFILE's buffer stack in turn, popping it once
        it has been scanned, until no buffer remains.  The text is not
        tokenized; the scan only tracks line ends, comments and string or
        character literals (including raw strings) so that it can
        recognise a '#' that begins a directive and, when the
        module_directives option is set, a line that do_peek_module
        accepts as a module control line.

        CB is invoked with PFILE, a task code, DATA (passed through
        untouched) and task-specific arguments:
          CPP_DO_print     line count, start pointer and byte length of
                           a run of text that was scanned over;
          CPP_DO_location  the line table's highest_line, after a
                           directive or module line has been handled;
          CPP_DO_token     a token and its spelling location, for each
                           token of a module control line.  */
5246
5247 void
5248 cpp_directive_only_process (cpp_reader *pfile,
5249                             void *data,
5250                             void (*cb) (cpp_reader *, CPP_DO_task, void *, ...))
5251 {
5252   bool module_p = CPP_OPTION (pfile, module_directives);
5253
5254   do
5255     {
5256     restart:
           /* Jumped to after every directive or module control line: all
              scanning state below is rebuilt from PFILE's current buffer
              and that buffer's next_line.  */
5257       /* Buffer initialization, but no line cleaning. */
5258       cpp_buffer *buffer = pfile->buffer;
5259       buffer->cur_note = buffer->notes_used = 0;
5260       buffer->cur = buffer->line_base = buffer->next_line;
5261       buffer->need_line = false;
5262       /* Files always end in a newline or carriage return.  We rely on this for
5263          character peeking safety.  */
5264       gcc_assert (buffer->rlimit[0] == '\n' || buffer->rlimit[0] == '\r');
5265
           /* BASE is the start of the text not yet handed to CB,
              LINE_COUNT the number of line ends seen since BASE, and
              LINE_START the start of the current physical line.  */
5266       const unsigned char *base = buffer->cur;
5267       unsigned line_count = 0;
5268       const unsigned char *line_start = base;
5269
           /* BOL stays true while only whitespace and comments have been
              seen on the current line.  RAW is set by the backwards
              prefix scan when a '"' is preceded by a raw-string prefix,
              and is consumed by the literal scanner.  */
5270       bool bol = true;
5271       bool raw = false;
5272
           /* LWM (presumably "low-water mark") is the lowest address the
              backwards prefix scan is allowed to look at; it is moved up
              past each comment and literal.  */
5273       const unsigned char *lwm = base;
5274       for (const unsigned char *pos = base, *limit = buffer->rlimit;
5275            pos < limit;)
5276         {
5277           unsigned char c = *pos++;
5278           /* This matches the switch in _cpp_lex_direct.  */
5279           switch (c)
5280             {
5281             case ' ': case '\t': case '\f': case '\v':
5282               /* Whitespace, do nothing.  */
5283               break;
5284
5285             case '\r': /* MAC line ending, or Windows \r\n  */
5286               if (*pos == '\n')
5287                 pos++;
5288               /* FALLTHROUGH */
5289
5290             case '\n':
5291               bol = true;
5292
5293             next_line:
                   /* Also reached for <backslash><newline>, which counts a
                      line but leaves BOL as it was.  */
5294               CPP_INCREMENT_LINE (pfile, 0);
5295               line_count++;
5296               line_start = pos;
5297               break;
5298
5299             case '\\':
5300               /* <backslash><newline> is removed, and doesn't undo any
5301                  preceding escape or whatnot.  */
5302               if (*pos == '\n')
5303                 {
5304                   pos++;
5305                   goto next_line;
5306                 }
5307               else if (*pos == '\r')
5308                 {
5309                   if (pos[1] == '\n')
5310                     pos++;
5311                   pos++;
5312                   goto next_line;
5313                 }
5314               goto dflt;
5315
5316             case '#':
5317               if (bol)
5318                 {
5319                   /* Line directive.  */
                       /* First hand over the text scanned before the '#',
                          unless it lies in a skipped conditional group.  */
5320                   if (pos - 1 > base && !pfile->state.skipping)
5321                     cb (pfile, CPP_DO_print, data,
5322                         line_count, base, pos - 1 - base);
5323
5324                   /* Prep things for directive handling. */
                       /* POS is just past the '#', so the fresh line starts
                          with the directive name.  */
5325                   buffer->next_line = pos;
5326                   buffer->need_line = true;
5327                   bool ok = _cpp_get_fresh_line (pfile);
5328                   gcc_checking_assert (ok);
5329
5330                   /* Ensure proper column numbering for generated
5331                      error messages. */
5332                   buffer->line_base -= pos - line_start;
5333
                       /* The second argument is true when the '#' was not
                          the first character of its line.  */
5334                   _cpp_handle_directive (pfile, line_start + 1 != pos);
5335
5336                   /* Sanitize the line settings.  Duplicate #include's can
5337                      mess things up. */
5338                   // FIXME: Necessary?
5339                   pfile->line_table->highest_location
5340                     = pfile->line_table->highest_line;
5341
5342                   if (!pfile->state.skipping
5343                       && pfile->buffer->next_line < pfile->buffer->rlimit)
5344                     cb (pfile, CPP_DO_location, data,
5345                         pfile->line_table->highest_line);
5346
5347                   goto restart;
5348                 }
5349               goto dflt;
5350
5351             case '/':
5352               {
                     /* PEEK addresses the character that follows the '/',
                        as located by do_peek_next; it is the second
                        character of the comment opener, if this is one.  */
5353                 const unsigned char *peek = do_peek_next (pos, limit);
5354                 if (!(*peek == '/' || *peek == '*'))
5355                   goto dflt;
5356
5357                 /* Line or block comment  */
                     /* STAR records that the previous character was a '*'
                        that may begin the block-comment closer.  ESC
                        records that the previous character was a
                        backslash, so that a following newline is a line
                        continuation rather than the end of a line
                        comment.  */
5358                 bool is_block = *peek == '*';
5359                 bool star = false;
5360                 bool esc = false;
5361                 location_t sloc
5362                   = linemap_position_for_column (pfile->line_table,
5363                                                  pos - line_start);
5364
5365                 while (pos < limit)
5366                   {
5367                     char c = *pos++;
5368                     switch (c)
5369                       {
5370                       case '\\':
5371                         esc = true;
5372                         break;
5373
5374                       case '\r':
5375                         if (*pos == '\n')
5376                           pos++;
5377                         /* FALLTHROUGH  */
5378
5379                       case '\n':
5380                         {
5381                           CPP_INCREMENT_LINE (pfile, 0);
5382                           line_count++;
5383                           line_start = pos;
                               /* An unescaped newline ends a line comment.  */
5384                           if (!esc && !is_block)
5385                             {
5386                               bol = true;
5387                               goto done_comment;
5388                             }
5389                         }
                             /* An escaped newline is spliced out, so a '*'
                                before it can still pair with a '/' after
                                it.  */
5390                         if (!esc)
5391                           star = false;
5392                         esc = false;
5393                         break;
5394
5395                       case '*':
                             /* The '*' just read sits at pos - 1.  The one
                                at PEEK opened the comment and must not
                                double as the start of the closer: a
                                slash-star-slash sequence does not end a
                                comment.  */
5396                         if (pos - 1 > peek)
5397                           star = is_block;
5398                         esc = false;
5399                         break;
5400
5401                       case '/':
5402                         if (star)
5403                           goto done_comment;
5404                         /* FALLTHROUGH  */
5405
5406                       default:
5407                         star = false;
5408                         esc = false;
5409                         break;
5410                       }
5411                   }
                     /* The loop above only falls out with POS at LIMIT, so
                        in effect only an unterminated block comment is
                        diagnosed; a line comment that runs up to LIMIT is
                        accepted.  */
5412                 if (pos < limit || is_block)
5413                   cpp_error_with_line (pfile, CPP_DL_ERROR, sloc, 0,
5414                                        "unterminated comment");
5415               done_comment:
                     /* Later backwards prefix scans must not look into the
                        comment.  */
5416                 lwm = pos;
5417                 break;
5418               }
5419
5420             case '\'':
5421               if (!CPP_OPTION (pfile, digit_separators))
5422                 goto delimited_string;
5423
5424               /* Possibly a number punctuator.  */
5425               if (!ISIDNUM (*do_peek_next (pos, limit)))
5426                 goto delimited_string;
5427
5428               goto quote_peek;
5429
5430             case '\"':
5431               if (!CPP_OPTION (pfile, rliterals))
5432                 goto delimited_string;
5433
5434             quote_peek:
5435               {
5436                 /* For ' see if it's a number punctuator
5437                    \.?<digit>(<digit>|<identifier-nondigit>
5438                    |'<digit>|'<nondigit>|[eEpP]<sign>|\.)* */
5439                 /* For " see if it's a raw string
5440                    {U,L,u,u8}R.  This includes CPP_NUMBER detection,
5441                    because that could be 0e+R.  */
                     /* The scan walks backwards from the quote using
                        do_peek_prev and stops when that returns null
                        (presumably on reaching LWM) or on an explicit
                        break.
                          QUOTE_FIRST         still examining the 'R' and
                                              encoding prefix directly
                                              before a '"';
                          QUOTE_EIGHT         an '8' was seen before the
                                              'R', so a 'u' must precede
                                              it;
                          MAYBE_NUMBER_START  the earliest character seen
                                              so far could begin a number;
                          WANT_NUMBER         a '.' or an exponent sign was
                                              seen, which only makes sense
                                              inside a number.  */
5442                 const unsigned char *peek = pos - 1;
5443                 bool quote_first = c == '"';
5444                 bool quote_eight = false;
5445                 bool maybe_number_start = false;
5446                 bool want_number = false;
5447
5448                 while ((peek = do_peek_prev (peek, lwm)))
5449                   {
5450                     unsigned char p = *peek;
5451                     if (quote_first)
5452                       {
5453                         if (!raw)
5454                           {
                               /* The character directly before the '"'
                                  must be 'R' for a raw string.  */
5455                             if (p != 'R')
5456                               break;
5457                             raw = true;
5458                             continue;
5459                           }
5460
                             /* The character before the 'R': an encoding
                                prefix, the '8' of u8, or something
                                else.  */
5461                         quote_first = false;
5462                         if (p == 'L' || p == 'U' || p == 'u')
5463                           ;
5464                         else if (p == '8')
5465                           quote_eight = true;
5466                         else
5467                           goto second_raw;
5468                       }
5469                     else if (quote_eight)
5470                       {
5471                         if (p != 'u')
5472                           {
5473                             raw = false;
5474                             break;
5475                           }
5476                         quote_eight = false;
5477                       }
5478                     else if (c == '"')
5479                       {
5480                       second_raw:;
                             /* An identifier character before the prefix
                                means the prefix is only the tail of a
                                longer name, unless we are inside a
                                number.  */
5481                         if (!want_number && ISIDNUM (p))
5482                           {
5483                             raw = false;
5484                             break;
5485                           }
5486                       }
5487
5488                     if (ISDIGIT (p))
5489                       maybe_number_start = true;
5490                     else if (p == '.')
5491                       want_number = true;
5492                     else if (ISIDNUM (p))
5493                       maybe_number_start = false;
5494                     else if (p == '+' || p == '-')
5495                       {
                             /* A sign only continues a number directly
                                after an exponent letter.  */
5496                         if (const unsigned char *peek_prev
5497                             = do_peek_prev (peek, lwm))
5498                           {
5499                             p = *peek_prev;
5500                             if (p == 'e' || p == 'E'
5501                                 || p == 'p' || p == 'P')
5502                               {
5503                                 want_number = true;
5504                                 maybe_number_start = false;
5505                               }
5506                             else
5507                               break;
5508                           }
5509                         else
5510                           break;
5511                       }
5512                     else if (p == '\'' || p == '\"')
5513                       {
5514                         /* If this is lwm, this must be the end of a
5515                            previous string.  So this is a trailing
5516                            literal type, (a) if those are allowed,
5517                              and (b) maybe_number_start is false.
5518                              Otherwise this must be a CPP_NUMBER because
5519                              we've met another ', and we'd have checked
5520                              that in its own right.  */
5521                         if (peek == lwm && CPP_OPTION (pfile, uliterals))
5522                           {
5523                             if  (!maybe_number_start && !want_number)
5524                               /* Must be a literal type.  */
5525                               raw = false;
5526                           }
5527                         else if (p == '\''
5528                                  && CPP_OPTION (pfile, digit_separators))
5529                           maybe_number_start = true;
5530                         break;
5531                       }
5532                     else if (c == '\'')
5533                       break;
5534                     else if (!quote_first && !quote_eight)
5535                       break;
5536                   }
5537
                     /* What precedes the quote is a number: a ' is then a
                        digit separator, not the start of a character
                        literal, and a '"' cannot open a raw string.  */
5538                 if (maybe_number_start)
5539                   {
5540                     if (c == '\'')
5541                       /* A CPP NUMBER.  */
5542                       goto dflt;
5543                     raw = false;
5544                   }
5545
5546                 goto delimited_string;
5547               }
5548
5549             delimited_string:
5550               {
5551                 /* (Possibly raw) string or char literal.  */
                     /* END is the quote that opened the literal.  DELIM and
                        DELIM_LEN describe the raw string delimiter; they
                        are only meaningful while RAW is set.  ESC counts
                        consecutive backslashes in a non-raw literal.  */
5552                 unsigned char end = c;
5553                 int delim_len = -1;
5554                 const unsigned char *delim = NULL;
5555                 location_t sloc = linemap_position_for_column (pfile->line_table,
5556                                                                pos - line_start);
5557                 int esc = 0;
5558
5559                 if (raw)
5560                   {
5561                     /* There can be no line breaks in the delimiter.  */
                         /* On a bad delimiter, report it and rescan from
                            just after the quote as an ordinary string.  */
5562                     delim = pos;
5563                     for (delim_len = 0; (c = *pos++) != '('; delim_len++)
5564                       {
5565                         if (delim_len == 16)
5566                           {
5567                             cpp_error_with_line (pfile, CPP_DL_ERROR,
5568                                                  sloc, 0,
5569                                                  "raw string delimiter"
5570                                                  " longer than %d"
5571                                                  " characters",
5572                                                  delim_len);
5573                             raw = false;
5574                             pos = delim;
5575                             break;
5576                           }
                             /* strchr also matches a NUL byte, via the
                                string's own terminator.  */
5577                         if (strchr (") \\\t\v\f\n", c))
5578                           {
5579                             cpp_error_with_line (pfile, CPP_DL_ERROR,
5580                                                  sloc, 0,
5581                                                  "invalid character '%c'"
5582                                                  " in raw string"
5583                                                  " delimiter", c);
5584                             raw = false;
5585                             pos = delim;
5586                             break;
5587                           }
5588                         if (pos >= limit)
5589                           goto bad_string;
5590                       }
5591                   }
5592
5593                 while (pos < limit)
5594                   {
5595                     char c = *pos++;
5596                     switch (c)
5597                       {
5598                       case '\\':
5599                         if (!raw)
5600                           esc++;
5601                         break;
5602
5603                       case '\r':
5604                         if (*pos == '\n')
5605                           pos++;
5606                         /* FALLTHROUGH  */
5607
5608                       case '\n':
5609                         {
5610                           CPP_INCREMENT_LINE (pfile, 0);
5611                           line_count++;
5612                           line_start = pos;
5613                         }
                             /* A backslash directly before the newline is a
                                line continuation, not an escape.  */
5614                         if (esc)
5615                           esc--;
5616                         break;
5617
5618                       case ')':
                             /* POS is just past the ')'.  A raw string ends
                                with the delimiter followed by the quote, so
                                the quote at pos[delim_len] must lie before
                                LIMIT; it may be the very last character
                                before it.  */
5619                         if (raw
5620                             && pos + delim_len < limit
5621                             && pos[delim_len] == end
5622                             && !memcmp (delim, pos, delim_len))
5623                           {
5624                             pos += delim_len + 1;
5625                             raw = false;
5626                             goto done_string;
5627                           }
5628                         break;
5629
5630                       default:
                             /* An unescaped closing quote ends a non-raw
                                literal.  */
5631                         if (!raw && !(esc & 1) && c == end)
5632                           goto done_string;
5633                         esc = 0;
5634                         break;
5635                       }
5636                   }
5637               bad_string:
5638                 cpp_error_with_line (pfile, CPP_DL_ERROR, sloc, 0,
5639                                      "unterminated literal");
5640
5641               done_string:
                     /* LWM becomes the last character consumed, which is the
                        closing quote when the literal was terminated; the
                        backwards prefix scan tests for a quote at LWM.  */
5642                 raw = false;
5643                 lwm = pos - 1;
5644               }
5645               goto dflt;
5646
5647             case '_':
5648             case 'e':
5649             case 'i':
5650             case 'm':
                   /* Only these initial letters are offered to
                      do_peek_module as a possible module control line.  */
5651               if (bol && module_p && !pfile->state.skipping
5652                   && do_peek_module (pfile, c, pos, limit))
5653                 {
5654                   /* We've seen the start of a module control line.
5655                      Start up the tokenizer.  */
5656                   pos--; /* Backup over the first character.  */
5657
5658                   /* Backup over whitespace to start of line.  */
5659                   while (pos > line_start
5660                          && (pos[-1] == ' ' || pos[-1] == '\t'))
5661                     pos--;
5662
                       /* Hand over the text scanned before this line.  */
5663                   if (pos > base)
5664                     cb (pfile, CPP_DO_print, data, line_count, base, pos - base);
5665
5666                   /* Prep things for directive handling. */
5667                   buffer->next_line = pos;
5668                   buffer->need_line = true;
5669
5670                   /* Now get tokens until the PRAGMA_EOL.  */
5671                   do
5672                     {
5673                       location_t spelling;
5674                       const cpp_token *tok
5675                         = cpp_get_token_with_location (pfile, &spelling);
5676
5677                       gcc_assert (pfile->state.in_deferred_pragma
5678                                   || tok->type == CPP_PRAGMA_EOL);
5679                       cb (pfile, CPP_DO_token, data, tok, spelling);
5680                     }
5681                   while (pfile->state.in_deferred_pragma);
5682
5683                   if (pfile->buffer->next_line < pfile->buffer->rlimit)
5684                     cb (pfile, CPP_DO_location, data,
5685                         pfile->line_table->highest_line);
5686
5687                   pfile->mi_valid = false;
5688                   goto restart;
5689                 }
5690               goto dflt;
5691
5692             default:
5693             dflt:
                   /* Anything else is ordinary text: the line no longer
                      starts a directive.  */
5694               bol = false;
5695               pfile->mi_valid = false;
5696               break;
5697             }
5698         }
5699
           /* The buffer has been scanned to its end: hand over whatever
              text is still pending, then drop the buffer.  */
5700       if (buffer->rlimit > base && !pfile->state.skipping)
5701         {
5702           const unsigned char *limit = buffer->rlimit;
5703           /* If the file was not newline terminated, add rlimit, which is
5704              guaranteed to point to a newline, to the end of our range.  */
5705           if (limit[-1] != '\n')
5706             {
5707               limit++;
5708               CPP_INCREMENT_LINE (pfile, 0);
5709               line_count++;
5710             }
5711           cb (pfile, CPP_DO_print, data, line_count, base, limit - base);
5712         }
5713
5714       _cpp_pop_buffer (pfile);
5715     }
5716   while (pfile->buffer);
5717 }