libcpp/lex.cc

   1 /* CPP Library - lexical analysis.
   2    Copyright (C) 2000-2024 Free Software Foundation, Inc.
   3    Contributed by Per Bothner, 1994-95.
   4    Based on CCCP program by Paul Rubin, June 1986
   5    Adapted to ANSI C, Richard Stallman, Jan 1987
   6    Broken out to separate file, Zack Weinberg, Mar 2000
   7
   8 This program is free software; you can redistribute it and/or modify it
   9 under the terms of the GNU General Public License as published by the
  10 Free Software Foundation; either version 3, or (at your option) any
  11 later version.
  12
  13 This program is distributed in the hope that it will be useful,
  14 but WITHOUT ANY WARRANTY; without even the implied warranty of
  15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  16 GNU General Public License for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with this program; see the file COPYING3.  If not see
  20 <http://www.gnu.org/licenses/>.  */
  21
  22 #include "config.h"
  23 #include "system.h"
  24 #include "cpplib.h"
  25 #include "internal.h"
  26
  27 enum spell_type
  28 {
  29   SPELL_OPERATOR = 0,
  30   SPELL_IDENT,
  31   SPELL_LITERAL,
  32   SPELL_NONE
  33 };
  34
/* One entry of the token_spellings table.  The field order must match
   the brace initializers produced by the OP() and TK() macros.  */
struct token_spelling
{
  /* Spelling category of the token type.  */
  enum spell_type category;
  /* For OP() entries the operator's spelling; for TK() entries the
     stringified enumerator name.  */
  const unsigned char *name;
};
  40
/* Spellings of the digraph tokens.  NOTE(review): the element order is
   presumably significant to whatever indexes this table; its users are
   not visible in this part of the file -- confirm before reordering.  */
static const unsigned char *const digraph_spellings[] =
{ UC"%:", UC"%:%:", UC"<:", UC":>", UC"<%", UC"%>" };

/* Expand TTYPE_TABLE into one token_spelling initializer per token type.
   OP(e, s) entries are always SPELL_OPERATOR and carry the spelling S.
   TK(e, s) entries take their category from S (pasted onto SPELL_) and
   carry the stringified enumerator name E.  */
#define OP(e, s) { SPELL_OPERATOR, UC s  },
#define TK(e, s) { SPELL_ ## s,    UC #e },
static const struct token_spelling token_spellings[N_TTYPES] = { TTYPE_TABLE };
#undef OP
#undef TK

/* Look up the spelling category and the name of TOKEN (a pointer to a
   token) using its type as the index into token_spellings.  */
#define TOKEN_SPELL(token) (token_spellings[(token)->type].category)
#define TOKEN_NAME(token) (token_spellings[(token)->type].name)

/* ISO 10646 defines the UCS codespace as the range 0-0x10FFFF inclusive.  */
#define UCS_LIMIT 0x10FFFF
  55
/* Forward declarations of helpers that are local to this file.  */
static void add_line_note (cpp_buffer *, const uchar *, unsigned int);
static int skip_line_comment (cpp_reader *);
static void skip_whitespace (cpp_reader *, cppchar_t);
static void lex_string (cpp_reader *, cpp_token *, const uchar *);
static void save_comment (cpp_reader *, cpp_token *, const uchar *, cppchar_t);
static void store_comment (cpp_reader *, cpp_token *);
static void create_literal (cpp_reader *, cpp_token *, const uchar *,
                            unsigned int, enum cpp_ttype);
static bool warn_in_comment (cpp_reader *, _cpp_line_note *);
static int name_p (cpp_reader *, const cpp_string *);
static tokenrun *next_tokenrun (tokenrun *);

static _cpp_buff *new_buff (size_t);
  69
  70
  71 /* Utility routine:
  72
  73    Compares, the token TOKEN to the NUL-terminated string STRING.
  74    TOKEN must be a CPP_NAME.  Returns 1 for equal, 0 for unequal.  */
  75 int
  76 cpp_ideq (const cpp_token *token, const char *string)
  77 {
  78   if (token->type != CPP_NAME)
  79     return 0;
  80
  81   return !ustrcmp (NODE_NAME (token->val.node.node), (const uchar *) string);
  82 }
  83
  84 /* Record a note TYPE at byte POS into the current cleaned logical
  85    line.  */
  86 static void
  87 add_line_note (cpp_buffer *buffer, const uchar *pos, unsigned int type)
  88 {
  89   if (buffer->notes_used == buffer->notes_cap)
  90     {
  91       buffer->notes_cap = buffer->notes_cap * 2 + 200;
  92       buffer->notes = XRESIZEVEC (_cpp_line_note, buffer->notes,
  93                                   buffer->notes_cap);
  94     }
  95
  96   buffer->notes[buffer->notes_used].pos = pos;
  97   buffer->notes[buffer->notes_used].type = type;
  98   buffer->notes_used++;
  99 }
 100
 101 \f
 102 /* Fast path to find line special characters using optimized character
 103    scanning algorithms.  Anything complicated falls back to the slow
 104    path below.  Since this loop is very hot it's worth doing these kinds
 105    of optimizations.
 106
 107    One of the paths through the ifdefs should provide
 108
 109      const uchar *search_line_fast (const uchar *s, const uchar *end);
 110
 111    Between S and END, search for \n, \r, \\, ?.  Return a pointer to
 112    the found character.
 113
 114    Note that the last character of the buffer is *always* a newline,
 115    as forced by _cpp_convert_input.  This fact can be used to avoid
 116    explicitly looking for the end of the buffer.  */
 117
 118 /* Configure gives us an ifdef test.  */
 119 #ifndef WORDS_BIGENDIAN
 120 #define WORDS_BIGENDIAN 0
 121 #endif
 122
 123 /* We'd like the largest integer that fits into a register.  There's nothing
 124    in <stdint.h> that gives us that.  For most hosts this is unsigned long,
 125    but MS decided on an LLP64 model.  Thankfully when building with GCC we
 126    can get the "real" word size.  */
 127 #ifdef __GNUC__
 128 typedef unsigned int word_type __attribute__((__mode__(__word__)));
 129 #else
 130 typedef unsigned long word_type;
 131 #endif
 132
 133 /* The code below is only expecting sizes 4 or 8.
 134    Die at compile-time if this expectation is violated.  */
 135 typedef char check_word_type_size
 136   [(sizeof(word_type) == 8 || sizeof(word_type) == 4) * 2 - 1];
 137
 138 /* Return X with the first N bytes forced to values that won't match one
 139    of the interesting characters.  Note that NUL is not interesting.  */
 140
 141 static inline word_type
 142 acc_char_mask_misalign (word_type val, unsigned int n)
 143 {
 144   word_type mask = -1;
 145   if (WORDS_BIGENDIAN)
 146     mask >>= n * 8;
 147   else
 148     mask <<= n * 8;
 149   return val & mask;
 150 }
 151
 152 /* Return X replicated to all byte positions within WORD_TYPE.  */
 153
 154 static inline word_type
 155 acc_char_replicate (uchar x)
 156 {
 157   word_type ret;
 158
 159   ret = (x << 24) | (x << 16) | (x << 8) | x;
 160   if (sizeof(word_type) == 8)
 161     ret = (ret << 16 << 16) | ret;
 162   return ret;
 163 }
 164
 165 /* Return non-zero if some byte of VAL is (probably) C.  */
 166
 167 static inline word_type
 168 acc_char_cmp (word_type val, word_type c)
 169 {
 170 #if defined(__GNUC__) && defined(__alpha__)
 171   /* We can get exact results using a compare-bytes instruction.
 172      Get (val == c) via (0 >= (val ^ c)).  */
 173   return __builtin_alpha_cmpbge (0, val ^ c);
 174 #else
 175   word_type magic = 0x7efefefeU;
 176   if (sizeof(word_type) == 8)
 177     magic = (magic << 16 << 16) | 0xfefefefeU;
 178   magic |= 1;
 179
 180   val ^= c;
 181   return ((val + magic) ^ ~val) & ~magic;
 182 #endif
 183 }
 184
 185 /* Given the result of acc_char_cmp is non-zero, return the index of
 186    the found character.  If this was a false positive, return -1.  */
 187
 188 static inline int
 189 acc_char_index (word_type cmp ATTRIBUTE_UNUSED,
 190                 word_type val ATTRIBUTE_UNUSED)
 191 {
 192 #if defined(__GNUC__) && defined(__alpha__) && !WORDS_BIGENDIAN
 193   /* The cmpbge instruction sets *bits* of the result corresponding to
 194      matches in the bytes with no false positives.  */
 195   return __builtin_ctzl (cmp);
 196 #else
 197   unsigned int i;
 198
 199   /* ??? It would be nice to force unrolling here,
 200      and have all of these constants folded.  */
 201   for (i = 0; i < sizeof(word_type); ++i)
 202     {
 203       uchar c;
 204       if (WORDS_BIGENDIAN)
 205         c = (val >> (sizeof(word_type) - i - 1) * 8) & 0xff;
 206       else
 207         c = (val >> i * 8) & 0xff;
 208
 209       if (c == '\n' || c == '\r' || c == '\\' || c == '?')
 210         return i;
 211     }
 212
 213   return -1;
 214 #endif
 215 }
 216
 217 /* A version of the fast scanner using bit fiddling techniques.
 218
 219    For 32-bit words, one would normally perform 16 comparisons and
 220    16 branches.  With this algorithm one performs 24 arithmetic
 221    operations and one branch.  Whether this is faster with a 32-bit
 222    word size is going to be somewhat system dependent.
 223
 224    For 64-bit words, we eliminate twice the number of comparisons
 225    and branches without increasing the number of arithmetic operations.
 226    It's almost certainly going to be a win with 64-bit word size.  */
 227
 228 static const uchar * search_line_acc_char (const uchar *, const uchar *)
 229   ATTRIBUTE_UNUSED;
 230
 231 static const uchar *
 232 search_line_acc_char (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 233 {
 234   const word_type repl_nl = acc_char_replicate ('\n');
 235   const word_type repl_cr = acc_char_replicate ('\r');
 236   const word_type repl_bs = acc_char_replicate ('\\');
 237   const word_type repl_qm = acc_char_replicate ('?');
 238
 239   unsigned int misalign;
 240   const word_type *p;
 241   word_type val, t;
 242
 243   /* Align the buffer.  Mask out any bytes from before the beginning.  */
 244   p = (word_type *)((uintptr_t)s & -sizeof(word_type));
 245   val = *p;
 246   misalign = (uintptr_t)s & (sizeof(word_type) - 1);
 247   if (misalign)
 248     val = acc_char_mask_misalign (val, misalign);
 249
 250   /* Main loop.  */
 251   while (1)
 252     {
 253       t  = acc_char_cmp (val, repl_nl);
 254       t |= acc_char_cmp (val, repl_cr);
 255       t |= acc_char_cmp (val, repl_bs);
 256       t |= acc_char_cmp (val, repl_qm);
 257
 258       if (__builtin_expect (t != 0, 0))
 259         {
 260           int i = acc_char_index (t, val);
 261           if (i >= 0)
 262             return (const uchar *)p + i;
 263         }
 264
 265       val = *++p;
 266     }
 267 }
 268
 269 /* Disable on Solaris 2/x86 until the following problem can be properly
 270    autoconfed:
 271
 272    The Solaris 10+ assembler tags objects with the instruction set
 273    extensions used, so SSE4.2 executables cannot run on machines that
 274    don't support that extension.  */
 275
 276 #if (GCC_VERSION >= 4005) && (__GNUC__ >= 5 || !defined(__PIC__)) && (defined(__i386__) || defined(__x86_64__)) && !(defined(__sun__) && defined(__svr4__))
 277
/* Replicated character data to be shared between implementations.
   Recall that outside of a context with vector support we can't
   define compatible vector types, therefore these are all defined
   in terms of raw characters.

   Row 0 is '\n', row 1 is '\r', row 2 is '\\' and row 3 is '?'; the
   MMX and SSE2 scanners load the rows by these indices.  The MMX
   scanner reads only the first 8 bytes of each 16-byte row.  */
static const char repl_chars[4][16] __attribute__((aligned(16))) = {
  { '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
    '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n' },
  { '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
    '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r' },
  { '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
    '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\' },
  { '?', '?', '?', '?', '?', '?', '?', '?',
    '?', '?', '?', '?', '?', '?', '?', '?' },
};
 292
 293 /* A version of the fast scanner using MMX vectorized byte compare insns.
 294
 295    This uses the PMOVMSKB instruction which was introduced with "MMX2",
 296    which was packaged into SSE1; it is also present in the AMD MMX
 297    extension.  Mark the function as using "sse" so that we emit a real
 298    "emms" instruction, rather than the 3dNOW "femms" instruction.  */
 299
 300 static const uchar *
 301 #ifndef __SSE__
 302 __attribute__((__target__("sse")))
 303 #endif
 304 search_line_mmx (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 305 {
 306   typedef char v8qi __attribute__ ((__vector_size__ (8)));
 307   typedef int __m64 __attribute__ ((__vector_size__ (8), __may_alias__));
 308
 309   const v8qi repl_nl = *(const v8qi *)repl_chars[0];
 310   const v8qi repl_cr = *(const v8qi *)repl_chars[1];
 311   const v8qi repl_bs = *(const v8qi *)repl_chars[2];
 312   const v8qi repl_qm = *(const v8qi *)repl_chars[3];
 313
 314   unsigned int misalign, found, mask;
 315   const v8qi *p;
 316   v8qi data, t, c;
 317
 318   /* Align the source pointer.  While MMX doesn't generate unaligned data
 319      faults, this allows us to safely scan to the end of the buffer without
 320      reading beyond the end of the last page.  */
 321   misalign = (uintptr_t)s & 7;
 322   p = (const v8qi *)((uintptr_t)s & -8);
 323   data = *p;
 324
 325   /* Create a mask for the bytes that are valid within the first
 326      16-byte block.  The Idea here is that the AND with the mask
 327      within the loop is "free", since we need some AND or TEST
 328      insn in order to set the flags for the branch anyway.  */
 329   mask = -1u << misalign;
 330
 331   /* Main loop processing 8 bytes at a time.  */
 332   goto start;
 333   do
 334     {
 335       data = *++p;
 336       mask = -1;
 337
 338     start:
 339       t = __builtin_ia32_pcmpeqb(data, repl_nl);
 340       c = __builtin_ia32_pcmpeqb(data, repl_cr);
 341       t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
 342       c = __builtin_ia32_pcmpeqb(data, repl_bs);
 343       t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
 344       c = __builtin_ia32_pcmpeqb(data, repl_qm);
 345       t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
 346       found = __builtin_ia32_pmovmskb (t);
 347       found &= mask;
 348     }
 349   while (!found);
 350
 351   __builtin_ia32_emms ();
 352
 353   /* FOUND contains 1 in bits for which we matched a relevant
 354      character.  Conversion to the byte index is trivial.  */
 355   found = __builtin_ctz(found);
 356   return (const uchar *)p + found;
 357 }
 358
 359 /* A version of the fast scanner using SSE2 vectorized byte compare insns.  */
 360
 361 static const uchar *
 362 #ifndef __SSE2__
 363 __attribute__((__target__("sse2")))
 364 #endif
 365 search_line_sse2 (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 366 {
 367   typedef char v16qi __attribute__ ((__vector_size__ (16)));
 368
 369   const v16qi repl_nl = *(const v16qi *)repl_chars[0];
 370   const v16qi repl_cr = *(const v16qi *)repl_chars[1];
 371   const v16qi repl_bs = *(const v16qi *)repl_chars[2];
 372   const v16qi repl_qm = *(const v16qi *)repl_chars[3];
 373
 374   unsigned int misalign, found, mask;
 375   const v16qi *p;
 376   v16qi data, t;
 377
 378   /* Align the source pointer.  */
 379   misalign = (uintptr_t)s & 15;
 380   p = (const v16qi *)((uintptr_t)s & -16);
 381   data = *p;
 382
 383   /* Create a mask for the bytes that are valid within the first
 384      16-byte block.  The Idea here is that the AND with the mask
 385      within the loop is "free", since we need some AND or TEST
 386      insn in order to set the flags for the branch anyway.  */
 387   mask = -1u << misalign;
 388
 389   /* Main loop processing 16 bytes at a time.  */
 390   goto start;
 391   do
 392     {
 393       data = *++p;
 394       mask = -1;
 395
 396     start:
 397       t  = data == repl_nl;
 398       t |= data == repl_cr;
 399       t |= data == repl_bs;
 400       t |= data == repl_qm;
 401       found = __builtin_ia32_pmovmskb128 (t);
 402       found &= mask;
 403     }
 404   while (!found);
 405
 406   /* FOUND contains 1 in bits for which we matched a relevant
 407      character.  Conversion to the byte index is trivial.  */
 408   found = __builtin_ctz(found);
 409   return (const uchar *)p + found;
 410 }
 411
#ifdef HAVE_SSE4
/* A version of the fast scanner using SSE 4.2 vectorized string insns.

   Returns a pointer to the first \n, \r, ? or \\ at or after S.  END is
   used only to decide whether an unaligned 16-byte read at S is safe.  */

static const uchar *
#ifndef __SSE4_2__
__attribute__((__target__("sse4.2")))
#endif
search_line_sse42 (const uchar *s, const uchar *end)
{
  typedef char v16qi __attribute__ ((__vector_size__ (16)));
  /* The set of characters to look for; only the first 4 elements are
     used, as the length operand passed to PCMPESTRI below is 4.  */
  static const v16qi search = { '\n', '\r', '?', '\\' };

  uintptr_t si = (uintptr_t)s;
  uintptr_t index;

  /* Check for unaligned input.  */
  if (si & 15)
    {
      v16qi sv;

      if (__builtin_expect (end - s < 16, 0)
          && __builtin_expect ((si & 0xfff) > 0xff0, 0))
        {
          /* There are fewer than 16 bytes left in the buffer, and fewer
             than 16 bytes left on the page.  Reading 16 bytes at this
             point might generate a spurious page fault.  Defer to the
             SSE2 implementation, which already handles alignment.  */
          return search_line_sse2 (s, end);
        }

      /* ??? The builtin doesn't understand that the PCMPESTRI read from
         memory need not be aligned.  */
      sv = __builtin_ia32_loaddqu ((const char *) s);
      index = __builtin_ia32_pcmpestri128 (search, 4, sv, 16, 0);

      /* An index below 16 is a match within these 16 bytes; S has not
         been moved yet, so S + INDEX is the answer.  */
      if (__builtin_expect (index < 16, 0))
        goto found;

      /* Advance the pointer to an aligned address.  We will re-scan a
         few bytes, but we no longer need care for reading past the
         end of a page, since we're guaranteed a match.  */
      s = (const uchar *)((si + 15) & -16);
    }

  /* Main loop, processing 16 bytes at a time.  */
#ifdef __GCC_ASM_FLAG_OUTPUTS__
  while (1)
    {
      char f;

      /* By using inline assembly instead of the builtin,
         we can use the result, as well as the flags set.
         F receives the carry flag; the loop stops once it is set.  */
      __asm ("%vpcmpestri\t$0, %2, %3"
             : "=c"(index), "=@ccc"(f)
             : "m"(*s), "x"(search), "a"(4), "d"(16));
      if (f)
        break;

      s += 16;
    }
#else
  /* Pre-decrement because the loop below adds 16 before each compare.  */
  s -= 16;
  /* By doing the whole loop in inline assembly,
     we can make proper use of the flags set.  */
  __asm (      ".balign 16\n"
        "0:     add $16, %1\n"
        "       %vpcmpestri\t$0, (%1), %2\n"
        "       jnc 0b"
        : "=&c"(index), "+r"(s)
        : "x"(search), "a"(4), "d"(16));
#endif

 found:
  return s + index;
}

#else
/* Work around out-dated assemblers without sse4 support: the SSE 4.2
   entry point simply becomes the SSE2 scanner.  */
#define search_line_sse42 search_line_sse2
#endif
 492
 493 /* Check the CPU capabilities.  */
 494
 495 #include "../gcc/config/i386/cpuid.h"
 496
 497 typedef const uchar * (*search_line_fast_type) (const uchar *, const uchar *);
 498 static search_line_fast_type search_line_fast;
 499
 500 #define HAVE_init_vectorized_lexer 1
 501 static inline void
 502 init_vectorized_lexer (void)
 503 {
 504   unsigned dummy, ecx = 0, edx = 0;
 505   search_line_fast_type impl = search_line_acc_char;
 506   int minimum = 0;
 507
 508 #if defined(__SSE4_2__)
 509   minimum = 3;
 510 #elif defined(__SSE2__)
 511   minimum = 2;
 512 #elif defined(__SSE__)
 513   minimum = 1;
 514 #endif
 515
 516   if (minimum == 3)
 517     impl = search_line_sse42;
 518   else if (__get_cpuid (1, &dummy, &dummy, &ecx, &edx) || minimum == 2)
 519     {
 520       if (minimum == 3 || (ecx & bit_SSE4_2))
 521         impl = search_line_sse42;
 522       else if (minimum == 2 || (edx & bit_SSE2))
 523         impl = search_line_sse2;
 524       else if (minimum == 1 || (edx & bit_SSE))
 525         impl = search_line_mmx;
 526     }
 527   else if (__get_cpuid (0x80000001, &dummy, &dummy, &dummy, &edx))
 528     {
 529       if (minimum == 1
 530           || (edx & (bit_MMXEXT | bit_CMOV)) == (bit_MMXEXT | bit_CMOV))
 531         impl = search_line_mmx;
 532     }
 533
 534   search_line_fast = impl;
 535 }
 536
 537 #elif (GCC_VERSION >= 4005) && defined(_ARCH_PWR8) && defined(__ALTIVEC__)
 538
/* A version of the fast scanner using AltiVec vectorized byte compares
   and VSX unaligned loads (when VSX is available).  This is otherwise
   the same as the AltiVec version.

   Returns a pointer to the first \n, \r, \\ or ? at or after S.  */

ATTRIBUTE_NO_SANITIZE_UNDEFINED
static const uchar *
search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
{
  typedef __attribute__((altivec(vector))) unsigned char vc;

  const vc repl_nl = {
    '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
    '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n'
  };
  const vc repl_cr = {
    '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
    '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r'
  };
  const vc repl_bs = {
    '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
    '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\'
  };
  const vc repl_qm = {
    '?', '?', '?', '?', '?', '?', '?', '?',
    '?', '?', '?', '?', '?', '?', '?', '?',
  };
  const vc zero = { 0 };

  vc data, t;

  /* Main loop processing 16 bytes at a time.  S is loaded from directly,
     without first being aligned, and is advanced past each block.  */
  do
    {
      vc m_nl, m_cr, m_bs, m_qm;

      data = __builtin_vec_vsx_ld (0, s);
      s += 16;

      m_nl = (vc) __builtin_vec_cmpeq(data, repl_nl);
      m_cr = (vc) __builtin_vec_cmpeq(data, repl_cr);
      m_bs = (vc) __builtin_vec_cmpeq(data, repl_bs);
      m_qm = (vc) __builtin_vec_cmpeq(data, repl_qm);
      t = (m_nl | m_cr) | (m_bs | m_qm);

      /* T now contains 0xff in bytes for which we matched one of the relevant
         characters.  We want to exit the loop if any byte in T is non-zero.
         Below is the expansion of vec_any_ne(t, zero).  */
    }
  while (!__builtin_vec_vcmpeq_p(/*__CR6_LT_REV*/3, t, zero));

  /* Restore S to point to the 16 bytes we just processed.  */
  s -= 16;

  {
    /* Number of unsigned longs that make up one vector.  */
#define N  (sizeof(vc) / sizeof(long))

    union {
      vc v;
      /* Statically assert that N is 2 or 4.  */
      unsigned long l[(N == 2 || N == 4) ? N : -1];
    } u;
    unsigned long l, i = 0;

    u.v = t;

    /* Find the first word of T that is non-zero, advancing S past each
       all-zero word so that S ends up at the start of that word.  */
    switch (N)
      {
      case 4:
        l = u.l[i++];
        if (l != 0)
          break;
        s += sizeof(unsigned long);
        l = u.l[i++];
        if (l != 0)
          break;
        s += sizeof(unsigned long);
        /* FALLTHRU */
      case 2:
        l = u.l[i++];
        if (l != 0)
          break;
        s += sizeof(unsigned long);
        l = u.l[i];
      }

    /* L now contains 0xff in bytes for which we matched one of the
       relevant characters.  We can find the byte index by finding
       its bit index and dividing by 8.  The first byte in memory is
       the most significant one on big-endian and the least significant
       one on little-endian, hence the choice of clz versus ctz.  */
#ifdef __BIG_ENDIAN__
    l = __builtin_clzl(l) >> 3;
#else
    l = __builtin_ctzl(l) >> 3;
#endif
    return s + l;

#undef N
  }
}
 638
 639 #elif (GCC_VERSION >= 4005) && defined(__ALTIVEC__) && defined (__BIG_ENDIAN__)
 640
/* A version of the fast scanner using AltiVec vectorized byte compares.
   This cannot be used for little endian because vec_lvsl/lvsr are
   deprecated for little endian and the code won't work properly.

   Returns a pointer to the first \n, \r, \\ or ? at or after S.  */
/* ??? Unfortunately, attribute(target("altivec")) is not yet supported,
   so we can't compile this function without -maltivec on the command line
   (or implied by some other switch).  */

static const uchar *
search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
{
  typedef __attribute__((altivec(vector))) unsigned char vc;

  const vc repl_nl = {
    '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
    '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n'
  };
  const vc repl_cr = {
    '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
    '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r'
  };
  const vc repl_bs = {
    '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
    '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\'
  };
  const vc repl_qm = {
    '?', '?', '?', '?', '?', '?', '?', '?',
    '?', '?', '?', '?', '?', '?', '?', '?',
  };
  const vc ones = {
    -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1,
  };
  const vc zero = { 0 };

  vc data, mask, t;

  /* Altivec loads automatically mask addresses with -16.  This lets us
     issue the first load as early as possible.  */
  data = __builtin_vec_ld(0, (const vc *)s);

  /* Discard bytes before the beginning of the buffer.  Do this by
     beginning with all ones and shifting in zeros according to the
     mis-alignment.  The LVSR instruction pulls the exact shift we
     want from the address.  */
  mask = __builtin_vec_lvsr(0, s);
  mask = __builtin_vec_perm(zero, ones, mask);
  data &= mask;

  /* While altivec loads mask addresses, we still need to align S so
     that the offset we compute at the end is correct.  */
  s = (const uchar *)((uintptr_t)s & -16);

  /* Main loop processing 16 bytes at a time.  The first block is
     already loaded and masked, so jump straight to the comparison.  */
  goto start;
  do
    {
      vc m_nl, m_cr, m_bs, m_qm;

      s += 16;
      data = __builtin_vec_ld(0, (const vc *)s);

    start:
      m_nl = (vc) __builtin_vec_cmpeq(data, repl_nl);
      m_cr = (vc) __builtin_vec_cmpeq(data, repl_cr);
      m_bs = (vc) __builtin_vec_cmpeq(data, repl_bs);
      m_qm = (vc) __builtin_vec_cmpeq(data, repl_qm);
      t = (m_nl | m_cr) | (m_bs | m_qm);

      /* T now contains 0xff in bytes for which we matched one of the relevant
         characters.  We want to exit the loop if any byte in T is non-zero.
         Below is the expansion of vec_any_ne(t, zero).  */
    }
  while (!__builtin_vec_vcmpeq_p(/*__CR6_LT_REV*/3, t, zero));

  {
    /* Number of unsigned longs that make up one vector.  */
#define N  (sizeof(vc) / sizeof(long))

    union {
      vc v;
      /* Statically assert that N is 2 or 4.  */
      unsigned long l[(N == 2 || N == 4) ? N : -1];
    } u;
    unsigned long l, i = 0;

    u.v = t;

    /* Find the first word of T that is non-zero, advancing S past each
       all-zero word so that S ends up at the start of that word.  */
    switch (N)
      {
      case 4:
        l = u.l[i++];
        if (l != 0)
          break;
        s += sizeof(unsigned long);
        l = u.l[i++];
        if (l != 0)
          break;
        s += sizeof(unsigned long);
        /* FALLTHROUGH */
      case 2:
        l = u.l[i++];
        if (l != 0)
          break;
        s += sizeof(unsigned long);
        l = u.l[i];
      }

    /* L now contains 0xff in bytes for which we matched one of the
       relevant characters.  We can find the byte index by finding
       its bit index and dividing by 8.  This variant is big-endian
       only, so the first byte in memory is the most significant one.  */
    l = __builtin_clzl(l) >> 3;
    return s + l;

#undef N
  }
}
 757
 758 #elif defined (__ARM_NEON) && defined (__ARM_64BIT_STATE)
 759 #include "arm_neon.h"
 760
 761 /* This doesn't have to be the exact page size, but no system may use
 762    a size smaller than this.  ARMv8 requires a minimum page size of
 763    4k.  The impact of being conservative here is a small number of
 764    cases will take the slightly slower entry path into the main
 765    loop.  */
 766
 767 #define AARCH64_MIN_PAGE_SIZE 4096
 768
/* A version of the fast scanner using AArch64 Advanced SIMD byte
   compares.  Returns a pointer to the first \n, \r, \\ or ? at or
   after S.  */
static const uchar *
search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
{
  const uint8x16_t repl_nl = vdupq_n_u8 ('\n');
  const uint8x16_t repl_cr = vdupq_n_u8 ('\r');
  const uint8x16_t repl_bs = vdupq_n_u8 ('\\');
  const uint8x16_t repl_qm = vdupq_n_u8 ('?');
  /* Gives each byte of an 8-byte half its own bit (0x01 .. 0x80), so
     that summing the masked match bytes yields a bit mask.  */
  const uint8x16_t xmask = (uint8x16_t) vdupq_n_u64 (0x8040201008040201ULL);

  /* Per-lane shift that moves the sums of one 8-byte half up by 8 bits,
     so the two halves occupy different bytes of the 16-bit result.  */
#ifdef __ARM_BIG_ENDIAN
  const int16x8_t shift = {8, 8, 8, 8, 0, 0, 0, 0};
#else
  const int16x8_t shift = {0, 0, 0, 0, 8, 8, 8, 8};
#endif

  unsigned int found;
  const uint8_t *p;
  uint8x16_t data;
  uint8x16_t t;
  uint16x8_t m;
  uint8x16_t u, v, w;

  /* Align the source pointer.  */
  p = (const uint8_t *)((uintptr_t)s & -16);

  /* Assuming random string start positions, with a 4k page size we'll take
     the slow path about 0.37% of the time.  */
  if (__builtin_expect ((AARCH64_MIN_PAGE_SIZE
                         - (((uintptr_t) s) & (AARCH64_MIN_PAGE_SIZE - 1)))
                        < 16, 0))
    {
      /* Slow path: the string starts near a possible page boundary.
         Load the aligned block containing S and ignore, through MASK,
         any matches in the bytes that precede S.  */
      uint32_t misalign, mask;

      misalign = (uintptr_t)s & 15;
      mask = (-1u << misalign) & 0xffff;
      data = vld1q_u8 (p);
      t = vceqq_u8 (data, repl_nl);
      u = vceqq_u8 (data, repl_cr);
      v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
      w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
      t = vorrq_u8 (v, w);
      t = vandq_u8 (t, xmask);
      m = vpaddlq_u8 (t);
      m = vshlq_u16 (m, shift);
      found = vaddvq_u16 (m);
      found &= mask;
      if (found)
        return (const uchar*)p + __builtin_ctz (found);
    }
  else
    {
      /* Fast path: load 16 bytes starting at S itself (unaligned).  On a
         match P is still the aligned-down address, which is at or
         before S; the code at DONE then bases the result on S.  */
      data = vld1q_u8 ((const uint8_t *) s);
      t = vceqq_u8 (data, repl_nl);
      u = vceqq_u8 (data, repl_cr);
      v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
      w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
      t = vorrq_u8 (v, w);
      if (__builtin_expect (vpaddd_u64 ((uint64x2_t)t) != 0, 0))
        goto done;
    }

  /* Main loop: aligned 16-byte blocks, starting with the one after P.  */
  do
    {
      p += 16;
      data = vld1q_u8 (p);
      t = vceqq_u8 (data, repl_nl);
      u = vceqq_u8 (data, repl_cr);
      v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
      w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
      t = vorrq_u8 (v, w);
    } while (!vpaddd_u64 ((uint64x2_t)t));

done:
  /* Now that we've found the terminating substring, work out precisely where
     we need to stop.  FOUND gets one bit per matching byte; the lowest
     set bit identifies the first match.  The base is S when the match
     came from the unaligned first load (P below S), else P.  */
  t = vandq_u8 (t, xmask);
  m = vpaddlq_u8 (t);
  m = vshlq_u16 (m, shift);
  found = vaddvq_u16 (m);
  return (((((uintptr_t) p) < (uintptr_t) s) ? s : (const uchar *)p)
          + __builtin_ctz (found));
}
 852
 853 #elif defined (__ARM_NEON)
 854 #include "arm_neon.h"
 855
/* A version of the fast scanner using 32-bit ARM NEON byte compares.
   Returns a pointer to the first \n, \r, \\ or ? at or after S.  */
static const uchar *
search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
{
  const uint8x16_t repl_nl = vdupq_n_u8 ('\n');
  const uint8x16_t repl_cr = vdupq_n_u8 ('\r');
  const uint8x16_t repl_bs = vdupq_n_u8 ('\\');
  const uint8x16_t repl_qm = vdupq_n_u8 ('?');
  /* Gives each byte of an 8-byte half its own bit (0x01 .. 0x80), so
     that summing the masked match bytes yields a bit mask.  */
  const uint8x16_t xmask = (uint8x16_t) vdupq_n_u64 (0x8040201008040201ULL);

  unsigned int misalign, found, mask;
  const uint8_t *p;
  uint8x16_t data;

  /* Align the source pointer.  */
  misalign = (uintptr_t)s & 15;
  p = (const uint8_t *)((uintptr_t)s & -16);
  data = vld1q_u8 (p);

  /* Create a mask for the bytes that are valid within the first
     16-byte block.  The Idea here is that the AND with the mask
     within the loop is "free", since we need some AND or TEST
     insn in order to set the flags for the branch anyway.  */
  mask = (-1u << misalign) & 0xffff;

  /* Main loop, processing 16 bytes at a time.  The first block is
     already loaded, so jump straight to the comparison.  */
  goto start;

  do
    {
      uint8x8_t l;
      uint16x4_t m;
      uint32x2_t n;
      uint8x16_t t, u, v, w;

      p += 16;
      data = vld1q_u8 (p);
      mask = 0xffff;

    start:
      t = vceqq_u8 (data, repl_nl);
      u = vceqq_u8 (data, repl_cr);
      v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
      w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
      t = vandq_u8 (vorrq_u8 (v, w), xmask);
      /* Successive pairwise additions fold the per-byte bits into two
         32-bit sums, one for each 8-byte half of the block.  */
      l = vpadd_u8 (vget_low_u8 (t), vget_high_u8 (t));
      m = vpaddl_u8 (l);
      n = vpaddl_u16 (m);

      /* Shifting the 64-bit view right by 24 lands the upper sum 8 bits
         above the lower one, so lane 0 ends up holding one bit per byte
         of the block.  */
      found = vget_lane_u32 ((uint32x2_t) vorr_u64 ((uint64x1_t) n,
              vshr_n_u64 ((uint64x1_t) n, 24)), 0);
      found &= mask;
    }
  while (!found);

  /* FOUND contains 1 in bits for which we matched a relevant
     character.  Conversion to the byte index is trivial.  */
  found = __builtin_ctz (found);
  return (const uchar *)p + found;
}
 915
 916 #else
 917
/* None of the target-specific variants above was selected, so we only
   have one accelerated alternative.  Alias search_line_fast to it with
   a macro so that callers make a direct call, which encourages
   inlining.  */

#define search_line_fast  search_line_acc_char
 922
 923 #endif
 924
 925 /* Initialize the lexer if needed.  */
 926
 927 void
 928 _cpp_init_lexer (void)
 929 {
 930 #ifdef HAVE_init_vectorized_lexer
 931   init_vectorized_lexer ();
 932 #endif
 933 }
 934
 935 /* Returns with a logical line that contains no escaped newlines or
 936    trigraphs.  This is a time-critical inner loop.  */
 937 void
 938 _cpp_clean_line (cpp_reader *pfile)
 939 {
 940   cpp_buffer *buffer;
 941   const uchar *s;
 942   uchar c, *d, *p;
 943
 944   buffer = pfile->buffer;
 945   buffer->cur_note = buffer->notes_used = 0;
 946   buffer->cur = buffer->line_base = buffer->next_line;
 947   buffer->need_line = false;
 948   s = buffer->next_line;
 949
 950   if (!buffer->from_stage3)
 951     {
 952       const uchar *pbackslash = NULL;
 953
 954       /* Fast path.  This is the common case of an un-escaped line with
 955          no trigraphs.  The primary win here is by not writing any
 956          data back to memory until we have to.  */
 957       while (1)
 958         {
 959           /* Perform an optimized search for \n, \r, \\, ?.  */
 960           s = search_line_fast (s, buffer->rlimit);
 961
 962           c = *s;
 963           if (c == '\\')
 964             {
 965               /* Record the location of the backslash and continue.  */
 966               pbackslash = s++;
 967             }
 968           else if (__builtin_expect (c == '?', 0))
 969             {
 970               if (__builtin_expect (s[1] == '?', false)
 971                    && _cpp_trigraph_map[s[2]])
 972                 {
 973                   /* Have a trigraph.  We may or may not have to convert
 974                      it.  Add a line note regardless, for -Wtrigraphs.  */
 975                   add_line_note (buffer, s, s[2]);
 976                   if (CPP_OPTION (pfile, trigraphs))
 977                     {
 978                       /* We do, and that means we have to switch to the
 979                          slow path.  */
 980                       d = (uchar *) s;
 981                       *d = _cpp_trigraph_map[s[2]];
 982                       s += 2;
 983                       goto slow_path;
 984                     }
 985                 }
 986               /* Not a trigraph.  Continue on fast-path.  */
 987               s++;
 988             }
 989           else
 990             break;
 991         }
 992
 993       /* This must be \r or \n.  We're either done, or we'll be forced
 994          to write back to the buffer and continue on the slow path.  */
 995       d = (uchar *) s;
 996
 997       if (__builtin_expect (s == buffer->rlimit, false))
 998         goto done;
 999
1000       /* DOS line ending? */
1001       if (__builtin_expect (c == '\r', false) && s[1] == '\n')
1002         {
1003           s++;
1004           if (s == buffer->rlimit)
1005             goto done;
1006         }
1007
1008       if (__builtin_expect (pbackslash == NULL, true))
1009         goto done;
1010
1011       /* Check for escaped newline.  */
1012       p = d;
1013       while (is_nvspace (p[-1]))
1014         p--;
1015       if (p - 1 != pbackslash)
1016         goto done;
1017
1018       /* Have an escaped newline; process it and proceed to
1019          the slow path.  */
1020       add_line_note (buffer, p - 1, p != d ? ' ' : '\\');
1021       d = p - 2;
1022       buffer->next_line = p - 1;
1023
1024     slow_path:
1025       while (1)
1026         {
1027           c = *++s;
1028           *++d = c;
1029
1030           if (c == '\n' || c == '\r')
1031             {
1032               /* Handle DOS line endings.  */
1033               if (c == '\r' && s != buffer->rlimit && s[1] == '\n')
1034                 s++;
1035               if (s == buffer->rlimit)
1036                 break;
1037
1038               /* Escaped?  */
1039               p = d;
1040               while (p != buffer->next_line && is_nvspace (p[-1]))
1041                 p--;
1042               if (p == buffer->next_line || p[-1] != '\\')
1043                 break;
1044
1045               add_line_note (buffer, p - 1, p != d ? ' ': '\\');
1046               d = p - 2;
1047               buffer->next_line = p - 1;
1048             }
1049           else if (c == '?' && s[1] == '?' && _cpp_trigraph_map[s[2]])
1050             {
1051               /* Add a note regardless, for the benefit of -Wtrigraphs.  */
1052               add_line_note (buffer, d, s[2]);
1053               if (CPP_OPTION (pfile, trigraphs))
1054                 {
1055                   *d = _cpp_trigraph_map[s[2]];
1056                   s += 2;
1057                 }
1058             }
1059         }
1060     }
1061   else
1062     {
1063       while (*s != '\n' && *s != '\r')
1064         s++;
1065       d = (uchar *) s;
1066
1067       /* Handle DOS line endings.  */
1068       if (*s == '\r' && s + 1 != buffer->rlimit && s[1] == '\n')
1069         s++;
1070     }
1071
1072  done:
1073   *d = '\n';
1074   /* A sentinel note that should never be processed.  */
1075   add_line_note (buffer, d + 1, '\n');
1076   buffer->next_line = s + 1;
1077 }
1078
1079 template <bool lexing_raw_string>
1080 static bool get_fresh_line_impl (cpp_reader *pfile);
1081
1082 /* Return true if the trigraph indicated by NOTE should be warned
1083    about in a comment.  */
1084 static bool
1085 warn_in_comment (cpp_reader *pfile, _cpp_line_note *note)
1086 {
1087   const uchar *p;
1088
1089   /* Within comments we don't warn about trigraphs, unless the
1090      trigraph forms an escaped newline, as that may change
1091      behavior.  */
1092   if (note->type != '/')
1093     return false;
1094
1095   /* If -trigraphs, then this was an escaped newline iff the next note
1096      is coincident.  */
1097   if (CPP_OPTION (pfile, trigraphs))
1098     return note[1].pos == note->pos;
1099
1100   /* Otherwise, see if this forms an escaped newline.  */
1101   p = note->pos + 3;
1102   while (is_nvspace (*p))
1103     p++;
1104
1105   /* There might have been escaped newlines between the trigraph and the
1106      newline we found.  Hence the position test.  */
1107   return (*p == '\n' && p < note[1].pos);
1108 }
1109
1110 /* Process the notes created by add_line_note as far as the current
1111    location.  */
1112 void
1113 _cpp_process_line_notes (cpp_reader *pfile, int in_comment)
1114 {
1115   cpp_buffer *buffer = pfile->buffer;
1116
1117   for (;;)
1118     {
1119       _cpp_line_note *note = &buffer->notes[buffer->cur_note];
1120       unsigned int col;
1121
1122       if (note->pos > buffer->cur)
1123         break;
1124
1125       buffer->cur_note++;
1126       col = CPP_BUF_COLUMN (buffer, note->pos + 1);
1127
1128       if (note->type == '\\' || note->type == ' ')
1129         {
1130           if (note->type == ' ' && !in_comment)
1131             cpp_error_with_line (pfile, CPP_DL_WARNING, pfile->line_table->highest_line, col,
1132                                  "backslash and newline separated by space");
1133
1134           if (buffer->next_line > buffer->rlimit)
1135             {
1136               cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line, col,
1137                                    "backslash-newline at end of file");
1138               /* Prevent "no newline at end of file" warning.  */
1139               buffer->next_line = buffer->rlimit;
1140             }
1141
1142           buffer->line_base = note->pos;
1143           CPP_INCREMENT_LINE (pfile, 0);
1144         }
1145       else if (_cpp_trigraph_map[note->type])
1146         {
1147           if (CPP_OPTION (pfile, warn_trigraphs)
1148               && (!in_comment || warn_in_comment (pfile, note)))
1149             {
1150               if (CPP_OPTION (pfile, trigraphs))
1151                 cpp_warning_with_line (pfile, CPP_W_TRIGRAPHS,
1152                                        pfile->line_table->highest_line, col,
1153                                        "trigraph ??%c converted to %c",
1154                                        note->type,
1155                                        (int) _cpp_trigraph_map[note->type]);
1156               else
1157                 {
1158                   cpp_warning_with_line
1159                     (pfile, CPP_W_TRIGRAPHS,
1160                      pfile->line_table->highest_line, col,
1161                      "trigraph ??%c ignored, use -trigraphs to enable",
1162                      note->type);
1163                 }
1164             }
1165         }
1166       else if (note->type == 0)
1167         /* Already processed in lex_raw_string.  */;
1168       else
1169         abort ();
1170     }
1171 }
1172
1173 namespace bidi {
1174   enum class kind {
1175     NONE, LRE, RLE, LRO, RLO, LRI, RLI, FSI, PDF, PDI, LTR, RTL
1176   };
1177
1178   /* All the UTF-8 encodings of bidi characters start with E2.  */
1179   constexpr uchar utf8_start = 0xe2;
1180
1181   struct context
1182   {
1183     context () {}
1184     context (location_t loc, kind k, bool pdf, bool ucn)
1185     : m_loc (loc), m_kind (k), m_pdf (pdf), m_ucn (ucn)
1186     {
1187     }
1188
1189     kind get_pop_kind () const
1190     {
1191       return m_pdf ? kind::PDF : kind::PDI;
1192     }
1193     bool ucn_p () const
1194     {
1195       return m_ucn;
1196     }
1197
1198     location_t m_loc;
1199     kind m_kind;
1200     unsigned m_pdf : 1;
1201     unsigned m_ucn : 1;
1202   };
1203
1204   /* A vector holding currently open bidi contexts.  We use a char for
1205      each context, its LSB is 1 if it represents a PDF context, 0 if it
1206      represents a PDI context.  The next bit is 1 if this context was open
1207      by a bidi character written as a UCN, and 0 when it was UTF-8.  */
1208   semi_embedded_vec <context, 16> vec;
1209
1210   /* Close the whole comment/identifier/string literal/character constant
1211      context.  */
1212   void on_close ()
1213   {
1214     vec.truncate (0);
1215   }
1216
1217   /* Pop the last element in the vector.  */
1218   void pop ()
1219   {
1220     unsigned int len = vec.count ();
1221     gcc_checking_assert (len > 0);
1222     vec.truncate (len - 1);
1223   }
1224
1225   /* Return the pop kind of the context of the Ith element.  */
1226   kind pop_kind_at (unsigned int i)
1227   {
1228     return vec[i].get_pop_kind ();
1229   }
1230
1231   /* Return the pop kind of the context that is currently opened.  */
1232   kind current_ctx ()
1233   {
1234     unsigned int len = vec.count ();
1235     if (len == 0)
1236       return kind::NONE;
1237     return vec[len - 1].get_pop_kind ();
1238   }
1239
1240   /* Return true if the current context comes from a UCN origin, that is,
1241      the bidi char which started this bidi context was written as a UCN.  */
1242   bool current_ctx_ucn_p ()
1243   {
1244     unsigned int len = vec.count ();
1245     gcc_checking_assert (len > 0);
1246     return vec[len - 1].m_ucn;
1247   }
1248
1249   location_t current_ctx_loc ()
1250   {
1251     unsigned int len = vec.count ();
1252     gcc_checking_assert (len > 0);
1253     return vec[len - 1].m_loc;
1254   }
1255
1256   /* We've read a bidi char, update the current vector as necessary.
1257      LOC is only valid when K is not kind::NONE.  */
1258   void on_char (kind k, bool ucn_p, location_t loc)
1259   {
1260     switch (k)
1261       {
1262       case kind::LRE:
1263       case kind::RLE:
1264       case kind::LRO:
1265       case kind::RLO:
1266         vec.push (context (loc, k, true, ucn_p));
1267         break;
1268       case kind::LRI:
1269       case kind::RLI:
1270       case kind::FSI:
1271         vec.push (context (loc, k, false, ucn_p));
1272         break;
1273       /* PDF terminates the scope of the last LRE, RLE, LRO, or RLO
1274          whose scope has not yet been terminated.  */
1275       case kind::PDF:
1276         if (current_ctx () == kind::PDF)
1277           pop ();
1278         break;
1279       /* PDI terminates the scope of the last LRI, RLI, or FSI whose
1280          scope has not yet been terminated, as well as the scopes of
1281          any subsequent LREs, RLEs, LROs, or RLOs whose scopes have not
1282          yet been terminated.  */
1283       case kind::PDI:
1284         for (int i = vec.count () - 1; i >= 0; --i)
1285           if (pop_kind_at (i) == kind::PDI)
1286             {
1287               vec.truncate (i);
1288               break;
1289             }
1290         break;
1291       case kind::LTR:
1292       case kind::RTL:
1293         /* These aren't popped by a PDF/PDI.  */
1294         break;
1295       ATTR_LIKELY case kind::NONE:
1296         break;
1297       default:
1298         abort ();
1299       }
1300   }
1301
1302   /* Return a descriptive string for K.  */
1303   const char *to_str (kind k)
1304   {
1305     switch (k)
1306       {
1307       case kind::LRE:
1308         return "U+202A (LEFT-TO-RIGHT EMBEDDING)";
1309       case kind::RLE:
1310         return "U+202B (RIGHT-TO-LEFT EMBEDDING)";
1311       case kind::LRO:
1312         return "U+202D (LEFT-TO-RIGHT OVERRIDE)";
1313       case kind::RLO:
1314         return "U+202E (RIGHT-TO-LEFT OVERRIDE)";
1315       case kind::LRI:
1316         return "U+2066 (LEFT-TO-RIGHT ISOLATE)";
1317       case kind::RLI:
1318         return "U+2067 (RIGHT-TO-LEFT ISOLATE)";
1319       case kind::FSI:
1320         return "U+2068 (FIRST STRONG ISOLATE)";
1321       case kind::PDF:
1322         return "U+202C (POP DIRECTIONAL FORMATTING)";
1323       case kind::PDI:
1324         return "U+2069 (POP DIRECTIONAL ISOLATE)";
1325       case kind::LTR:
1326         return "U+200E (LEFT-TO-RIGHT MARK)";
1327       case kind::RTL:
1328         return "U+200F (RIGHT-TO-LEFT MARK)";
1329       default:
1330         abort ();
1331       }
1332   }
1333 }
1334
1335 /* Get location_t for the range of bytes [START, START + NUM_BYTES)
1336    within the current line in FILE, with the caret at START.  */
1337
1338 static location_t
1339 get_location_for_byte_range_in_cur_line (cpp_reader *pfile,
1340                                          const unsigned char *const start,
1341                                          size_t num_bytes)
1342 {
1343   gcc_checking_assert (num_bytes > 0);
1344
1345   /* CPP_BUF_COLUMN and linemap_position_for_column both refer
1346      to offsets in bytes, but CPP_BUF_COLUMN is 0-based,
1347      whereas linemap_position_for_column is 1-based.  */
1348
1349   /* Get 0-based offsets within the line.  */
1350   size_t start_offset = CPP_BUF_COLUMN (pfile->buffer, start);
1351   size_t end_offset = start_offset + num_bytes - 1;
1352
1353   /* Now convert to location_t, where "columns" are 1-based byte offsets.  */
1354   location_t start_loc = linemap_position_for_column (pfile->line_table,
1355                                                       start_offset + 1);
1356   location_t end_loc = linemap_position_for_column (pfile->line_table,
1357                                                      end_offset + 1);
1358
1359   if (start_loc == end_loc)
1360     return start_loc;
1361
1362   source_range src_range;
1363   src_range.m_start = start_loc;
1364   src_range.m_finish = end_loc;
1365   location_t combined_loc
1366     = pfile->line_table->get_or_create_combined_loc (start_loc,
1367                                                      src_range,
1368                                                      nullptr,
1369                                                      0);
1370   return combined_loc;
1371 }
1372
1373 /* Parse a sequence of 3 bytes starting with P and return its bidi code.  */
1374
1375 static bidi::kind
1376 get_bidi_utf8_1 (const unsigned char *const p)
1377 {
1378   gcc_checking_assert (p[0] == bidi::utf8_start);
1379
1380   if (p[1] == 0x80)
1381     switch (p[2])
1382       {
1383       case 0xaa:
1384         return bidi::kind::LRE;
1385       case 0xab:
1386         return bidi::kind::RLE;
1387       case 0xac:
1388         return bidi::kind::PDF;
1389       case 0xad:
1390         return bidi::kind::LRO;
1391       case 0xae:
1392         return bidi::kind::RLO;
1393       case 0x8e:
1394         return bidi::kind::LTR;
1395       case 0x8f:
1396         return bidi::kind::RTL;
1397       default:
1398         break;
1399       }
1400   else if (p[1] == 0x81)
1401     switch (p[2])
1402       {
1403       case 0xa6:
1404         return bidi::kind::LRI;
1405       case 0xa7:
1406         return bidi::kind::RLI;
1407       case 0xa8:
1408         return bidi::kind::FSI;
1409       case 0xa9:
1410         return bidi::kind::PDI;
1411       default:
1412         break;
1413       }
1414
1415   return bidi::kind::NONE;
1416 }
1417
1418 /* Parse a sequence of 3 bytes starting with P and return its bidi code.
1419    If the kind is not NONE, write the location to *OUT.*/
1420
1421 static bidi::kind
1422 get_bidi_utf8 (cpp_reader *pfile, const unsigned char *const p, location_t *out)
1423 {
1424   bidi::kind result = get_bidi_utf8_1 (p);
1425   if (result != bidi::kind::NONE)
1426     {
1427       /* We have a sequence of 3 bytes starting at P.  */
1428       *out = get_location_for_byte_range_in_cur_line (pfile, p, 3);
1429     }
1430   return result;
1431 }
1432
1433 /* Parse a UCN where P points just past \u or \U and return its bidi code.  */
1434
1435 static bidi::kind
1436 get_bidi_ucn_1 (const unsigned char *p, bool is_U, const unsigned char **end)
1437 {
1438   /* 6.4.3 Universal Character Names
1439       \u hex-quad
1440       \U hex-quad hex-quad
1441       \u { simple-hexadecimal-digit-sequence }
1442      where \unnnn means \U0000nnnn.  */
1443
1444   *end = p + 4;
1445   if (is_U)
1446     {
1447       if (p[0] != '0' || p[1] != '0' || p[2] != '0' || p[3] != '0')
1448         return bidi::kind::NONE;
1449       /* Skip 4B so we can treat \u and \U the same below.  */
1450       p += 4;
1451       *end += 4;
1452     }
1453   else if (p[0] == '{')
1454     {
1455       p++;
1456       while (*p == '0')
1457         p++;
1458       if (p[0] != '2'
1459           || p[1] != '0'
1460           || !ISXDIGIT (p[2])
1461           || !ISXDIGIT (p[3])
1462           || p[4] != '}')
1463         return bidi::kind::NONE;
1464       *end = p + 5;
1465     }
1466
1467   /* All code points we are looking for start with 20xx.  */
1468   if (p[0] != '2' || p[1] != '0')
1469     return bidi::kind::NONE;
1470   else if (p[2] == '2')
1471     switch (p[3])
1472       {
1473       case 'a':
1474       case 'A':
1475         return bidi::kind::LRE;
1476       case 'b':
1477       case 'B':
1478         return bidi::kind::RLE;
1479       case 'c':
1480       case 'C':
1481         return bidi::kind::PDF;
1482       case 'd':
1483       case 'D':
1484         return bidi::kind::LRO;
1485       case 'e':
1486       case 'E':
1487         return bidi::kind::RLO;
1488       default:
1489         break;
1490       }
1491   else if (p[2] == '6')
1492     switch (p[3])
1493       {
1494       case '6':
1495         return bidi::kind::LRI;
1496       case '7':
1497         return bidi::kind::RLI;
1498       case '8':
1499         return bidi::kind::FSI;
1500       case '9':
1501         return bidi::kind::PDI;
1502       default:
1503         break;
1504       }
1505   else if (p[2] == '0')
1506     switch (p[3])
1507       {
1508       case 'e':
1509       case 'E':
1510         return bidi::kind::LTR;
1511       case 'f':
1512       case 'F':
1513         return bidi::kind::RTL;
1514       default:
1515         break;
1516       }
1517
1518   return bidi::kind::NONE;
1519 }
1520
1521 /* Parse a UCN where P points just past \u or \U and return its bidi code.
1522    If the kind is not NONE, write the location to *OUT.  */
1523
1524 static bidi::kind
1525 get_bidi_ucn (cpp_reader *pfile, const unsigned char *p, bool is_U,
1526               location_t *out)
1527 {
1528   const unsigned char *end;
1529   bidi::kind result = get_bidi_ucn_1 (p, is_U, &end);
1530   if (result != bidi::kind::NONE)
1531     {
1532       const unsigned char *start = p - 2;
1533       size_t num_bytes = end - start;
1534       *out = get_location_for_byte_range_in_cur_line (pfile, start, num_bytes);
1535     }
1536   return result;
1537 }
1538
1539 /* Parse a named universal character escape where P points just past \N and
1540    return its bidi code.  If the kind is not NONE, write the location to
1541    *OUT.  */
1542
1543 static bidi::kind
1544 get_bidi_named (cpp_reader *pfile, const unsigned char *p, location_t *out)
1545 {
1546   bidi::kind result = bidi::kind::NONE;
1547   if (*p != '{')
1548     return bidi::kind::NONE;
1549   if (strncmp ((const char *) (p + 1), "LEFT-TO-RIGHT ", 14) == 0)
1550     {
1551       if (strncmp ((const char *) (p + 15), "MARK}", 5) == 0)
1552         result = bidi::kind::LTR;
1553       else if (strncmp ((const char *) (p + 15), "EMBEDDING}", 10) == 0)
1554         result = bidi::kind::LRE;
1555       else if (strncmp ((const char *) (p + 15), "OVERRIDE}", 9) == 0)
1556         result = bidi::kind::LRO;
1557       else if (strncmp ((const char *) (p + 15), "ISOLATE}", 8) == 0)
1558         result = bidi::kind::LRI;
1559     }
1560   else if (strncmp ((const char *) (p + 1), "RIGHT-TO-LEFT ", 14) == 0)
1561     {
1562       if (strncmp ((const char *) (p + 15), "MARK}", 5) == 0)
1563         result = bidi::kind::RTL;
1564       else if (strncmp ((const char *) (p + 15), "EMBEDDING}", 10) == 0)
1565         result = bidi::kind::RLE;
1566       else if (strncmp ((const char *) (p + 15), "OVERRIDE}", 9) == 0)
1567         result = bidi::kind::RLO;
1568       else if (strncmp ((const char *) (p + 15), "ISOLATE}", 8) == 0)
1569         result = bidi::kind::RLI;
1570     }
1571   else if (strncmp ((const char *) (p + 1), "POP DIRECTIONAL ", 16) == 0)
1572     {
1573       if (strncmp ((const char *) (p + 16), "FORMATTING}", 11) == 0)
1574         result = bidi::kind::PDF;
1575       else if (strncmp ((const char *) (p + 16), "ISOLATE}", 8) == 0)
1576         result = bidi::kind::PDI;
1577     }
1578   else if (strncmp ((const char *) (p + 1), "FIRST STRONG ISOLATE}", 21) == 0)
1579     result = bidi::kind::FSI;
1580   if (result != bidi::kind::NONE)
1581     *out = get_location_for_byte_range_in_cur_line (pfile, p - 2,
1582                                                     (strchr ((const char *)
1583                                                              (p + 1), '}')
1584                                                      - (const char *) p)
1585                                                     + 3);
1586   return result;
1587 }
1588
1589 /* Subclass of rich_location for reporting on unpaired UTF-8
1590    bidirectional control character(s).
1591    Escape the source lines on output, and show all unclosed
1592    bidi context, labelling everything.  */
1593
1594 class unpaired_bidi_rich_location : public rich_location
1595 {
1596  public:
1597   class custom_range_label : public range_label
1598   {
1599    public:
1600      label_text get_text (unsigned range_idx) const final override
1601      {
1602        /* range 0 is the primary location; each subsequent range i + 1
1603           is for bidi::vec[i].  */
1604        if (range_idx > 0)
1605          {
1606            const bidi::context &ctxt (bidi::vec[range_idx - 1]);
1607            return label_text::borrow (bidi::to_str (ctxt.m_kind));
1608          }
1609        else
1610          return label_text::borrow (_("end of bidirectional context"));
1611      }
1612   };
1613
1614   unpaired_bidi_rich_location (cpp_reader *pfile, location_t loc)
1615   : rich_location (pfile->line_table, loc, &m_custom_label)
1616   {
1617     set_escape_on_output (true);
1618     for (unsigned i = 0; i < bidi::vec.count (); i++)
1619       add_range (bidi::vec[i].m_loc,
1620                  SHOW_RANGE_WITHOUT_CARET,
1621                  &m_custom_label);
1622   }
1623
1624  private:
1625    custom_range_label m_custom_label;
1626 };
1627
1628 /* We're closing a bidi context, that is, we've encountered a newline,
1629    are closing a C-style comment, or are at the end of a string literal,
1630    character constant, or identifier.  Warn if this context was not
1631    properly terminated by a PDI or PDF.  P points to the last character
1632    in this context.  */
1633
1634 static void
1635 maybe_warn_bidi_on_close (cpp_reader *pfile, const uchar *p)
1636 {
1637   const auto warn_bidi = CPP_OPTION (pfile, cpp_warn_bidirectional);
1638   if (bidi::vec.count () > 0
1639       && (warn_bidi & bidirectional_unpaired
1640           && (!bidi::current_ctx_ucn_p ()
1641               || (warn_bidi & bidirectional_ucn))))
1642     {
1643       const location_t loc
1644         = linemap_position_for_column (pfile->line_table,
1645                                        CPP_BUF_COLUMN (pfile->buffer, p));
1646       unpaired_bidi_rich_location rich_loc (pfile, loc);
1647       /* cpp_callbacks doesn't yet have a way to handle singular vs plural
1648          forms of a diagnostic, so fake it for now.  */
1649       if (bidi::vec.count () > 1)
1650         cpp_warning_at (pfile, CPP_W_BIDIRECTIONAL, &rich_loc,
1651                         "unpaired UTF-8 bidirectional control characters "
1652                         "detected");
1653       else
1654         cpp_warning_at (pfile, CPP_W_BIDIRECTIONAL, &rich_loc,
1655                         "unpaired UTF-8 bidirectional control character "
1656                         "detected");
1657     }
1658   /* We're done with this context.  */
1659   bidi::on_close ();
1660 }
1661
1662 /* We're at the beginning or in the middle of an identifier/comment/string
1663    literal/character constant.  Warn if we've encountered a bidi character.
1664    KIND says which bidi control character it was; UCN_P is true iff this bidi
1665    control character was written as a UCN.  LOC is the location of the
1666    character, but is only valid if KIND != bidi::kind::NONE.  */
1667
1668 static void
1669 maybe_warn_bidi_on_char (cpp_reader *pfile, bidi::kind kind,
1670                          bool ucn_p, location_t loc)
1671 {
1672   if (__builtin_expect (kind == bidi::kind::NONE, 1))
1673     return;
1674
1675   const auto warn_bidi = CPP_OPTION (pfile, cpp_warn_bidirectional);
1676
1677   if (warn_bidi & (bidirectional_unpaired|bidirectional_any))
1678     {
1679       rich_location rich_loc (pfile->line_table, loc);
1680       rich_loc.set_escape_on_output (true);
1681
1682       /* It seems excessive to warn about a PDI/PDF that is closing
1683          an opened context because we've already warned about the
1684          opening character.  Except warn when we have a UCN x UTF-8
1685          mismatch, if UCN checking is enabled.  */
1686       if (kind == bidi::current_ctx ())
1687         {
1688           if (warn_bidi == (bidirectional_unpaired|bidirectional_ucn)
1689               && bidi::current_ctx_ucn_p () != ucn_p)
1690             {
1691               rich_loc.add_range (bidi::current_ctx_loc ());
1692               cpp_warning_at (pfile, CPP_W_BIDIRECTIONAL, &rich_loc,
1693                               "UTF-8 vs UCN mismatch when closing "
1694                               "a context by \"%s\"", bidi::to_str (kind));
1695             }
1696         }
1697       else if (warn_bidi & bidirectional_any
1698                && (!ucn_p || (warn_bidi & bidirectional_ucn)))
1699         {
1700           if (kind == bidi::kind::PDF || kind == bidi::kind::PDI)
1701             cpp_warning_at (pfile, CPP_W_BIDIRECTIONAL, &rich_loc,
1702                             "\"%s\" is closing an unopened context",
1703                             bidi::to_str (kind));
1704           else
1705             cpp_warning_at (pfile, CPP_W_BIDIRECTIONAL, &rich_loc,
1706                             "found problematic Unicode character \"%s\"",
1707                             bidi::to_str (kind));
1708         }
1709     }
1710   /* We're done with this context.  */
1711   bidi::on_char (kind, ucn_p, loc);
1712 }
1713
/* Byte-value thresholds used by the UTF-8 checks below: a byte B with
   utf8_continuation <= B < utf8_signifier is accepted as a continuation
   byte, and a byte >= utf8_signifier is treated as the first byte of a
   multi-byte sequence.  */
static const cppchar_t utf8_continuation = 0x80;
static const cppchar_t utf8_signifier = 0xC0;
1716
1717 /* Emit -Winvalid-utf8 warning on invalid UTF-8 character starting
1718    at PFILE->buffer->cur.  Return a pointer after the diagnosed
1719    invalid character.  */
1720
1721 static const uchar *
1722 _cpp_warn_invalid_utf8 (cpp_reader *pfile)
1723 {
1724   cpp_buffer *buffer = pfile->buffer;
1725   const uchar *cur = buffer->cur;
1726   bool pedantic = (CPP_PEDANTIC (pfile)
1727                    && CPP_OPTION (pfile, cpp_warn_invalid_utf8) == 2);
1728
1729   if (cur[0] < utf8_signifier
1730       || cur[1] < utf8_continuation || cur[1] >= utf8_signifier)
1731     {
1732       if (pedantic)
1733         cpp_error_with_line (pfile, CPP_DL_PEDWARN,
1734                              pfile->line_table->highest_line,
1735                              CPP_BUF_COL (buffer),
1736                              "invalid UTF-8 character <%x>",
1737                              cur[0]);
1738       else
1739         cpp_warning_with_line (pfile, CPP_W_INVALID_UTF8,
1740                                pfile->line_table->highest_line,
1741                                CPP_BUF_COL (buffer),
1742                                "invalid UTF-8 character <%x>",
1743                                cur[0]);
1744       return cur + 1;
1745     }
1746   else if (cur[2] < utf8_continuation || cur[2] >= utf8_signifier)
1747     {
1748       if (pedantic)
1749         cpp_error_with_line (pfile, CPP_DL_PEDWARN,
1750                              pfile->line_table->highest_line,
1751                              CPP_BUF_COL (buffer),
1752                              "invalid UTF-8 character <%x><%x>",
1753                              cur[0], cur[1]);
1754       else
1755         cpp_warning_with_line (pfile, CPP_W_INVALID_UTF8,
1756                                pfile->line_table->highest_line,
1757                                CPP_BUF_COL (buffer),
1758                                "invalid UTF-8 character <%x><%x>",
1759                                cur[0], cur[1]);
1760       return cur + 2;
1761     }
1762   else if (cur[3] < utf8_continuation || cur[3] >= utf8_signifier)
1763     {
1764       if (pedantic)
1765         cpp_error_with_line (pfile, CPP_DL_PEDWARN,
1766                              pfile->line_table->highest_line,
1767                              CPP_BUF_COL (buffer),
1768                              "invalid UTF-8 character <%x><%x><%x>",
1769                              cur[0], cur[1], cur[2]);
1770       else
1771         cpp_warning_with_line (pfile, CPP_W_INVALID_UTF8,
1772                                pfile->line_table->highest_line,
1773                                CPP_BUF_COL (buffer),
1774                                "invalid UTF-8 character <%x><%x><%x>",
1775                                cur[0], cur[1], cur[2]);
1776       return cur + 3;
1777     }
1778   else
1779     {
1780       if (pedantic)
1781         cpp_error_with_line (pfile, CPP_DL_PEDWARN,
1782                              pfile->line_table->highest_line,
1783                              CPP_BUF_COL (buffer),
1784                              "invalid UTF-8 character <%x><%x><%x><%x>",
1785                              cur[0], cur[1], cur[2], cur[3]);
1786       else
1787         cpp_warning_with_line (pfile, CPP_W_INVALID_UTF8,
1788                                pfile->line_table->highest_line,
1789                                CPP_BUF_COL (buffer),
1790                                "invalid UTF-8 character <%x><%x><%x><%x>",
1791                                cur[0], cur[1], cur[2], cur[3]);
1792       return cur + 4;
1793     }
1794 }
1795
1796 /* Helper function of *skip_*_comment and lex*_string.  For C,
1797    character at CUR[-1] with MSB set handle -Wbidi-chars* and
1798    -Winvalid-utf8 diagnostics and return pointer to first character
1799    that should be processed next.  */
1800
1801 static inline const uchar *
1802 _cpp_handle_multibyte_utf8 (cpp_reader *pfile, uchar c,
1803                             const uchar *cur, bool warn_bidi_p,
1804                             bool warn_invalid_utf8_p)
1805 {
1806   /* If this is a beginning of a UTF-8 encoding, it might be
1807      a bidirectional control character.  */
1808   if (c == bidi::utf8_start && warn_bidi_p)
1809     {
1810       location_t loc;
1811       bidi::kind kind = get_bidi_utf8 (pfile, cur - 1, &loc);
1812       maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/false, loc);
1813     }
1814   if (!warn_invalid_utf8_p)
1815     return cur;
1816   if (c >= utf8_signifier)
1817     {
1818       cppchar_t s;
1819       const uchar *pstr = cur - 1;
1820       if (_cpp_valid_utf8 (pfile, &pstr, pfile->buffer->rlimit, 0, NULL, &s)
1821           && s <= UCS_LIMIT)
1822         return pstr;
1823     }
1824   pfile->buffer->cur = cur - 1;
1825   return _cpp_warn_invalid_utf8 (pfile);
1826 }
1827
1828 /* Skip a C-style block comment.  We find the end of the comment by
1829    seeing if an asterisk is before every '/' we encounter.  Returns
1830    nonzero if comment terminated by EOF, zero otherwise.
1831
1832    Buffer->cur points to the initial asterisk of the comment.  */
1833 bool
1834 _cpp_skip_block_comment (cpp_reader *pfile)
1835 {
1836   cpp_buffer *buffer = pfile->buffer;
1837   const uchar *cur = buffer->cur;
1838   uchar c;
1839   const bool warn_bidi_p = pfile->warn_bidi_p ();
1840   const bool warn_invalid_utf8_p = CPP_OPTION (pfile, cpp_warn_invalid_utf8);
1841   const bool warn_bidi_or_invalid_utf8_p = warn_bidi_p | warn_invalid_utf8_p;
1842
1843   cur++;
1844   if (*cur == '/')
1845     cur++;
1846
1847   for (;;)
1848     {
1849       /* People like decorating comments with '*', so check for '/'
1850          instead for efficiency.  */
1851       c = *cur++;
1852
1853       if (c == '/')
1854         {
1855           if (cur[-2] == '*')
1856             {
1857               if (warn_bidi_p)
1858                 maybe_warn_bidi_on_close (pfile, cur);
1859               break;
1860             }
1861
1862           /* Warn about potential nested comments, but not if the '/'
1863              comes immediately before the true comment delimiter.
1864              Don't bother to get it right across escaped newlines.  */
1865           if (CPP_OPTION (pfile, warn_comments)
1866               && cur[0] == '*' && cur[1] != '/')
1867             {
1868               buffer->cur = cur;
1869               cpp_warning_with_line (pfile, CPP_W_COMMENTS,
1870                                      pfile->line_table->highest_line,
1871                                      CPP_BUF_COL (buffer),
1872                                      "\"/*\" within comment");
1873             }
1874         }
1875       else if (c == '\n')
1876         {
1877           unsigned int cols;
1878           buffer->cur = cur - 1;
1879           if (warn_bidi_p)
1880             maybe_warn_bidi_on_close (pfile, cur);
1881           _cpp_process_line_notes (pfile, true);
1882           if (buffer->next_line >= buffer->rlimit)
1883             return true;
1884           _cpp_clean_line (pfile);
1885
1886           cols = buffer->next_line - buffer->line_base;
1887           CPP_INCREMENT_LINE (pfile, cols);
1888
1889           cur = buffer->cur;
1890         }
1891       else if (__builtin_expect (c >= utf8_continuation, 0)
1892                && warn_bidi_or_invalid_utf8_p)
1893         cur = _cpp_handle_multibyte_utf8 (pfile, c, cur, warn_bidi_p,
1894                                           warn_invalid_utf8_p);
1895     }
1896
1897   buffer->cur = cur;
1898   _cpp_process_line_notes (pfile, true);
1899   return false;
1900 }
1901
1902 /* Skip a C++ line comment, leaving buffer->cur pointing to the
1903    terminating newline.  Handles escaped newlines.  Returns nonzero
1904    if a multiline comment.  */
1905 static int
1906 skip_line_comment (cpp_reader *pfile)
1907 {
1908   cpp_buffer *buffer = pfile->buffer;
1909   location_t orig_line = pfile->line_table->highest_line;
1910   const bool warn_bidi_p = pfile->warn_bidi_p ();
1911   const bool warn_invalid_utf8_p = CPP_OPTION (pfile, cpp_warn_invalid_utf8);
1912   const bool warn_bidi_or_invalid_utf8_p = warn_bidi_p | warn_invalid_utf8_p;
1913
1914   if (!warn_bidi_or_invalid_utf8_p)
1915     while (*buffer->cur != '\n')
1916       buffer->cur++;
1917   else if (!warn_invalid_utf8_p)
1918     {
1919       while (*buffer->cur != '\n'
1920              && *buffer->cur != bidi::utf8_start)
1921         buffer->cur++;
1922       if (__builtin_expect (*buffer->cur == bidi::utf8_start, 0))
1923         {
1924           while (*buffer->cur != '\n')
1925             {
1926               if (__builtin_expect (*buffer->cur == bidi::utf8_start, 0))
1927                 {
1928                   location_t loc;
1929                   bidi::kind kind = get_bidi_utf8 (pfile, buffer->cur, &loc);
1930                   maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/false, loc);
1931                 }
1932               buffer->cur++;
1933             }
1934           maybe_warn_bidi_on_close (pfile, buffer->cur);
1935         }
1936     }
1937   else
1938     {
1939       while (*buffer->cur != '\n')
1940         {
1941           if (*buffer->cur < utf8_continuation)
1942             {
1943               buffer->cur++;
1944               continue;
1945             }
1946           buffer->cur
1947             = _cpp_handle_multibyte_utf8 (pfile, *buffer->cur, buffer->cur + 1,
1948                                           warn_bidi_p, warn_invalid_utf8_p);
1949         }
1950       if (warn_bidi_p)
1951         maybe_warn_bidi_on_close (pfile, buffer->cur);
1952     }
1953
1954   _cpp_process_line_notes (pfile, true);
1955   return orig_line != pfile->line_table->highest_line;
1956 }
1957
1958 /* Skips whitespace, saving the next non-whitespace character.  */
1959 static void
1960 skip_whitespace (cpp_reader *pfile, cppchar_t c)
1961 {
1962   cpp_buffer *buffer = pfile->buffer;
1963   bool saw_NUL = false;
1964
1965   do
1966     {
1967       /* Horizontal space always OK.  */
1968       if (c == ' ' || c == '\t')
1969         ;
1970       /* Just \f \v or \0 left.  */
1971       else if (c == '\0')
1972         saw_NUL = true;
1973       else if (pfile->state.in_directive && CPP_PEDANTIC (pfile))
1974         cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line,
1975                              CPP_BUF_COL (buffer),
1976                              "%s in preprocessing directive",
1977                              c == '\f' ? "form feed" : "vertical tab");
1978
1979       c = *buffer->cur++;
1980     }
1981   /* We only want non-vertical space, i.e. ' ' \t \f \v \0.  */
1982   while (is_nvspace (c));
1983
1984   if (saw_NUL)
1985     {
1986       encoding_rich_location rich_loc (pfile);
1987       cpp_error_at (pfile, CPP_DL_WARNING, &rich_loc,
1988                     "null character(s) ignored");
1989     }
1990
1991   buffer->cur--;
1992 }
1993
1994 /* See if the characters of a number token are valid in a name (no
1995    '.', '+' or '-').  */
1996 static int
1997 name_p (cpp_reader *pfile, const cpp_string *string)
1998 {
1999   unsigned int i;
2000
2001   for (i = 0; i < string->len; i++)
2002     if (!is_idchar (string->text[i]))
2003       return 0;
2004
2005   return 1;
2006 }
2007
2008 /* After parsing an identifier or other sequence, produce a warning about
2009    sequences not in NFC/NFKC.  */
2010 static void
2011 warn_about_normalization (cpp_reader *pfile,
2012                           const cpp_token *token,
2013                           const struct normalize_state *s,
2014                           bool identifier)
2015 {
2016   if (CPP_OPTION (pfile, warn_normalize) < NORMALIZE_STATE_RESULT (s)
2017       && !pfile->state.skipping)
2018     {
2019       location_t loc = token->src_loc;
2020
2021       /* If possible, create a location range for the token.  */
2022       if (loc >= RESERVED_LOCATION_COUNT
2023           && token->type != CPP_EOF
2024           /* There must be no line notes to process.  */
2025           && (!(pfile->buffer->cur
2026                 >= pfile->buffer->notes[pfile->buffer->cur_note].pos
2027                 && !pfile->overlaid_buffer)))
2028         {
2029           source_range tok_range;
2030           tok_range.m_start = loc;
2031           tok_range.m_finish
2032             = linemap_position_for_column (pfile->line_table,
2033                                            CPP_BUF_COLUMN (pfile->buffer,
2034                                                            pfile->buffer->cur));
2035           loc = pfile->line_table->get_or_create_combined_loc (loc, tok_range,
2036                                                                nullptr, 0);
2037         }
2038
2039       encoding_rich_location rich_loc (pfile, loc);
2040
2041       /* Make sure that the token is printed using UCNs, even
2042          if we'd otherwise happily print UTF-8.  */
2043       unsigned char *buf = XNEWVEC (unsigned char, cpp_token_len (token));
2044       size_t sz;
2045
2046       sz = cpp_spell_token (pfile, token, buf, false) - buf;
2047       if (NORMALIZE_STATE_RESULT (s) == normalized_C)
2048         cpp_warning_at (pfile, CPP_W_NORMALIZE, &rich_loc,
2049                         "`%.*s' is not in NFKC", (int) sz, buf);
2050       else if (identifier && CPP_OPTION (pfile, xid_identifiers))
2051         cpp_pedwarning_at (pfile, CPP_W_NORMALIZE, &rich_loc,
2052                                   "`%.*s' is not in NFC", (int) sz, buf);
2053       else
2054         cpp_warning_at (pfile, CPP_W_NORMALIZE, &rich_loc,
2055                         "`%.*s' is not in NFC", (int) sz, buf);
2056       free (buf);
2057     }
2058 }
2059
2060 /* Returns TRUE if the byte sequence starting at buffer->cur is a valid
2061    extended character in an identifier.  If FIRST is TRUE, then the character
2062    must be valid at the beginning of an identifier as well.  If the return
2063    value is TRUE, then pfile->buffer->cur has been moved to point to the next
2064    byte after the extended character.  */
2065
2066 static bool
2067 forms_identifier_p (cpp_reader *pfile, int first,
2068                     struct normalize_state *state)
2069 {
2070   cpp_buffer *buffer = pfile->buffer;
2071   const bool warn_bidi_p = pfile->warn_bidi_p ();
2072
2073   if (*buffer->cur == '$')
2074     {
2075       if (!CPP_OPTION (pfile, dollars_in_ident))
2076         return false;
2077
2078       buffer->cur++;
2079       if (CPP_OPTION (pfile, warn_dollars) && !pfile->state.skipping)
2080         {
2081           CPP_OPTION (pfile, warn_dollars) = 0;
2082           cpp_error (pfile, CPP_DL_PEDWARN, "'$' in identifier or number");
2083         }
2084
2085       return true;
2086     }
2087
2088   /* Is this a syntactically valid UCN or a valid UTF-8 char?  */
2089   if (CPP_OPTION (pfile, extended_identifiers))
2090     {
2091       cppchar_t s;
2092       if (*buffer->cur >= utf8_signifier)
2093         {
2094           if (__builtin_expect (*buffer->cur == bidi::utf8_start, 0)
2095               && warn_bidi_p)
2096             {
2097               location_t loc;
2098               bidi::kind kind = get_bidi_utf8 (pfile, buffer->cur, &loc);
2099               maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/false, loc);
2100             }
2101           if (_cpp_valid_utf8 (pfile, &buffer->cur, buffer->rlimit, 1 + !first,
2102                                state, &s))
2103             return true;
2104         }
2105       else if (*buffer->cur == '\\'
2106                && (buffer->cur[1] == 'u'
2107                    || buffer->cur[1] == 'U'
2108                    || buffer->cur[1] == 'N'))
2109         {
2110           buffer->cur += 2;
2111           if (warn_bidi_p)
2112             {
2113               location_t loc;
2114               bidi::kind kind;
2115               if (buffer->cur[-1] == 'N')
2116                 kind = get_bidi_named (pfile, buffer->cur, &loc);
2117               else
2118                 kind = get_bidi_ucn (pfile, buffer->cur,
2119                                      buffer->cur[-1] == 'U', &loc);
2120               maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/true, loc);
2121             }
2122           if (_cpp_valid_ucn (pfile, &buffer->cur, buffer->rlimit, 1 + !first,
2123                               state, &s, NULL, NULL))
2124             return true;
2125           buffer->cur -= 2;
2126         }
2127     }
2128
2129   return false;
2130 }
2131
2132 /* Helper function to issue error about improper __VA_OPT__ use.  */
2133 static void
2134 maybe_va_opt_error (cpp_reader *pfile)
2135 {
2136   if (CPP_PEDANTIC (pfile) && !CPP_OPTION (pfile, va_opt))
2137     {
2138       /* __VA_OPT__ should not be accepted at all, but allow it in
2139          system headers.  */
2140       if (!_cpp_in_system_header (pfile))
2141         {
2142           if (CPP_OPTION (pfile, cplusplus))
2143             cpp_error (pfile, CPP_DL_PEDWARN,
2144                        "__VA_OPT__ is not available until C++20");
2145           else
2146             cpp_error (pfile, CPP_DL_PEDWARN,
2147                        "__VA_OPT__ is not available until C23");
2148         }
2149     }
2150   else if (!pfile->state.va_args_ok)
2151     {
2152       /* __VA_OPT__ should only appear in the replacement list of a
2153          variadic macro.  */
2154       cpp_error (pfile, CPP_DL_PEDWARN,
2155                  "__VA_OPT__ can only appear in the expansion"
2156                  " of a C++20 variadic macro");
2157     }
2158 }
2159
2160 /* Helper function to perform diagnostics that are needed (rarely)
2161    when an identifier is lexed.  */
2162 static void
2163 identifier_diagnostics_on_lex (cpp_reader *pfile, cpp_hashnode *node)
2164 {
2165   if (__builtin_expect (!(node->flags & NODE_DIAGNOSTIC)
2166                         || pfile->state.skipping, 1))
2167     return;
2168
2169   /* It is allowed to poison the same identifier twice.  */
2170   if ((node->flags & NODE_POISONED) && !pfile->state.poisoned_ok)
2171     {
2172       cpp_error (pfile, CPP_DL_ERROR, "attempt to use poisoned \"%s\"",
2173                  NODE_NAME (node));
2174       const auto data = (cpp_hashnode_extra *)
2175         ht_lookup (pfile->extra_hash_table, node->ident, HT_NO_INSERT);
2176       if (data && data->poisoned_loc)
2177         cpp_error_at (pfile, CPP_DL_NOTE, data->poisoned_loc, "poisoned here");
2178     }
2179
2180   /* Constraint 6.10.3.5: __VA_ARGS__ should only appear in the
2181      replacement list of a variadic macro.  */
2182   if (node == pfile->spec_nodes.n__VA_ARGS__
2183       && !pfile->state.va_args_ok)
2184     {
2185       if (CPP_OPTION (pfile, cplusplus))
2186         cpp_error (pfile, CPP_DL_PEDWARN,
2187                    "__VA_ARGS__ can only appear in the expansion"
2188                    " of a C++11 variadic macro");
2189       else
2190         cpp_error (pfile, CPP_DL_PEDWARN,
2191                    "__VA_ARGS__ can only appear in the expansion"
2192                    " of a C99 variadic macro");
2193     }
2194
2195   /* __VA_OPT__ should only appear in the replacement list of a
2196      variadic macro.  */
2197   if (node == pfile->spec_nodes.n__VA_OPT__)
2198     maybe_va_opt_error (pfile);
2199
2200   /* For -Wc++-compat, warn about use of C++ named operators.  */
2201   if (node->flags & NODE_WARN_OPERATOR)
2202     cpp_warning (pfile, CPP_W_CXX_OPERATOR_NAMES,
2203                  "identifier \"%s\" is a special operator name in C++",
2204                  NODE_NAME (node));
2205 }
2206
2207 /* Helper function to get the cpp_hashnode of the identifier BASE.  */
2208 static cpp_hashnode *
2209 lex_identifier_intern (cpp_reader *pfile, const uchar *base)
2210 {
2211   cpp_hashnode *result;
2212   const uchar *cur;
2213   unsigned int len;
2214   unsigned int hash = HT_HASHSTEP (0, *base);
2215
2216   cur = base + 1;
2217   while (ISIDNUM (*cur))
2218     {
2219       hash = HT_HASHSTEP (hash, *cur);
2220       cur++;
2221     }
2222   len = cur - base;
2223   hash = HT_HASHFINISH (hash, len);
2224   result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
2225                                               base, len, hash, HT_ALLOC));
2226   identifier_diagnostics_on_lex (pfile, result);
2227   return result;
2228 }
2229
2230 /* Get the cpp_hashnode of an identifier specified by NAME in
2231    the current cpp_reader object.  If none is found, NULL is returned.  */
2232 cpp_hashnode *
2233 _cpp_lex_identifier (cpp_reader *pfile, const char *name)
2234 {
2235   cpp_hashnode *result;
2236   result = lex_identifier_intern (pfile, (uchar *) name);
2237   return result;
2238 }
2239
2240 /* Lex an identifier starting at BASE.  BUFFER->CUR is expected to point
2241    one past the first character at BASE, which may be a (possibly multi-byte)
2242    character if STARTS_UCN is true.  */
2243 static cpp_hashnode *
2244 lex_identifier (cpp_reader *pfile, const uchar *base, bool starts_ucn,
2245                 struct normalize_state *nst, cpp_hashnode **spelling)
2246 {
2247   cpp_hashnode *result;
2248   const uchar *cur;
2249   unsigned int len;
2250   unsigned int hash = HT_HASHSTEP (0, *base);
2251   const bool warn_bidi_p = pfile->warn_bidi_p ();
2252
2253   cur = pfile->buffer->cur;
2254   if (! starts_ucn)
2255     {
2256       while (ISIDNUM (*cur))
2257         {
2258           hash = HT_HASHSTEP (hash, *cur);
2259           cur++;
2260         }
2261       NORMALIZE_STATE_UPDATE_IDNUM (nst, *(cur - 1));
2262     }
2263   pfile->buffer->cur = cur;
2264   if (starts_ucn || forms_identifier_p (pfile, false, nst))
2265     {
2266       /* Slower version for identifiers containing UCNs
2267          or extended chars (including $).  */
2268       do {
2269         while (ISIDNUM (*pfile->buffer->cur))
2270           {
2271             NORMALIZE_STATE_UPDATE_IDNUM (nst, *pfile->buffer->cur);
2272             pfile->buffer->cur++;
2273           }
2274       } while (forms_identifier_p (pfile, false, nst));
2275       if (warn_bidi_p)
2276         maybe_warn_bidi_on_close (pfile, pfile->buffer->cur);
2277       result = _cpp_interpret_identifier (pfile, base,
2278                                           pfile->buffer->cur - base);
2279       *spelling = cpp_lookup (pfile, base, pfile->buffer->cur - base);
2280     }
2281   else
2282     {
2283       len = cur - base;
2284       hash = HT_HASHFINISH (hash, len);
2285
2286       result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
2287                                                   base, len, hash, HT_ALLOC));
2288       *spelling = result;
2289     }
2290
2291   return result;
2292 }
2293
2294 /* Struct to hold the return value of the scan_cur_identifier () helper
2295    function below.  */
2296
2297 struct scan_id_result
2298 {
2299   cpp_hashnode *node;
2300   normalize_state nst;
2301
2302   scan_id_result ()
2303     : node (nullptr)
2304   {
2305     nst = INITIAL_NORMALIZE_STATE;
2306   }
2307
2308   explicit operator bool () const { return node; }
2309 };
2310
2311 /* Helper function to scan an entire identifier beginning at
2312    pfile->buffer->cur, and possibly containing extended characters (UCNs
2313    and/or UTF-8).  Returns the cpp_hashnode for the identifier on success, or
2314    else nullptr, as well as a normalize_state so that normalization warnings
2315    may be issued once the token lexing is complete.  */
2316
2317 static scan_id_result
2318 scan_cur_identifier (cpp_reader *pfile)
2319 {
2320   const auto buffer = pfile->buffer;
2321   const auto begin = buffer->cur;
2322   scan_id_result result;
2323   if (ISIDST (*buffer->cur))
2324     {
2325       ++buffer->cur;
2326       cpp_hashnode *ignore;
2327       result.node = lex_identifier (pfile, begin, false, &result.nst, &ignore);
2328     }
2329   else if (forms_identifier_p (pfile, true, &result.nst))
2330     {
2331       /* buffer->cur has been moved already by the call
2332          to forms_identifier_p.  */
2333       cpp_hashnode *ignore;
2334       result.node = lex_identifier (pfile, begin, true, &result.nst, &ignore);
2335     }
2336   return result;
2337 }
2338
2339 /* Lex a number to NUMBER starting at BUFFER->CUR - 1.  */
2340 static void
2341 lex_number (cpp_reader *pfile, cpp_string *number,
2342             struct normalize_state *nst)
2343 {
2344   const uchar *cur;
2345   const uchar *base;
2346   uchar *dest;
2347
2348   base = pfile->buffer->cur - 1;
2349   do
2350     {
2351       const uchar *adj_digit_sep = NULL;
2352       cur = pfile->buffer->cur;
2353
2354       /* N.B. ISIDNUM does not include $.  */
2355       while (ISIDNUM (*cur)
2356              || (*cur == '.' && !DIGIT_SEP (cur[-1]))
2357              || DIGIT_SEP (*cur)
2358              || (VALID_SIGN (*cur, cur[-1]) && !DIGIT_SEP (cur[-2])))
2359         {
2360           NORMALIZE_STATE_UPDATE_IDNUM (nst, *cur);
2361           /* Adjacent digit separators do not form part of the pp-number syntax.
2362              However, they can safely be diagnosed here as an error, since '' is
2363              not a valid preprocessing token.  */
2364           if (DIGIT_SEP (*cur) && DIGIT_SEP (cur[-1]) && !adj_digit_sep)
2365             adj_digit_sep = cur;
2366           cur++;
2367         }
2368       /* A number can't end with a digit separator.  */
2369       while (cur > pfile->buffer->cur && DIGIT_SEP (cur[-1]))
2370         --cur;
2371       if (adj_digit_sep && adj_digit_sep < cur)
2372         cpp_error (pfile, CPP_DL_ERROR, "adjacent digit separators");
2373
2374       pfile->buffer->cur = cur;
2375     }
2376   while (forms_identifier_p (pfile, false, nst));
2377
2378   number->len = cur - base;
2379   dest = _cpp_unaligned_alloc (pfile, number->len + 1);
2380   memcpy (dest, base, number->len);
2381   dest[number->len] = '\0';
2382   number->text = dest;
2383 }
2384
2385 /* Create a token of type TYPE with a literal spelling.  */
2386 static void
2387 create_literal (cpp_reader *pfile, cpp_token *token, const uchar *base,
2388                 unsigned int len, enum cpp_ttype type)
2389 {
2390   token->type = type;
2391   token->val.str.len = len;
2392   token->val.str.text = cpp_alloc_token_string (pfile, base, len);
2393 }
2394
2395 /* Like create_literal(), but construct it from two separate strings
2396    which are concatenated.  LEN2 may be 0 if no second string is
2397    required.  */
2398 static void
2399 create_literal2 (cpp_reader *pfile, cpp_token *token, const uchar *base1,
2400                  unsigned int len1, const uchar *base2, unsigned int len2,
2401                  enum cpp_ttype type)
2402 {
2403   token->type = type;
2404   token->val.str.len = len1 + len2;
2405   uchar *const dest = _cpp_unaligned_alloc (pfile, len1 + len2 + 1);
2406   memcpy (dest, base1, len1);
2407   if (len2)
2408     memcpy (dest+len1, base2, len2);
2409   dest[len1 + len2] = 0;
2410   token->val.str.text = dest;
2411 }
2412
2413 const uchar *
2414 cpp_alloc_token_string (cpp_reader *pfile,
2415                         const unsigned char *ptr, unsigned len)
2416 {
2417   uchar *dest = _cpp_unaligned_alloc (pfile, len + 1);
2418
2419   dest[len] = 0;
2420   memcpy (dest, ptr, len);
2421   return dest;
2422 }
2423
2424 /* A pair of raw buffer pointers.  The currently open one is [1], the
2425    first one is [0].  Used for string literal lexing.  */
2426 struct lit_accum {
2427   _cpp_buff *first;
2428   _cpp_buff *last;
2429   const uchar *rpos;
2430   size_t accum;
2431
2432   lit_accum ()
2433     : first (NULL), last (NULL), rpos (0), accum (0)
2434   {
2435   }
2436
2437   void append (cpp_reader *, const uchar *, size_t);
2438
2439   void read_begin (cpp_reader *);
2440   bool reading_p () const
2441   {
2442     return rpos != NULL;
2443   }
2444   char read_char ()
2445   {
2446     char c = *rpos++;
2447     if (rpos == BUFF_FRONT (last))
2448       rpos = NULL;
2449     return c;
2450   }
2451
2452   void create_literal2 (cpp_reader *pfile, cpp_token *token,
2453                         const uchar *base1, unsigned int len1,
2454                         const uchar *base2, unsigned int len2,
2455                         enum cpp_ttype type);
2456 };
2457
2458 /* Subroutine of lex_raw_string: Append LEN chars from BASE to the buffer
2459    sequence from *FIRST_BUFF_P to LAST_BUFF_P.  */
2460
2461 void
2462 lit_accum::append (cpp_reader *pfile, const uchar *base, size_t len)
2463 {
2464   if (!last)
2465     /* Starting.  */
2466     first = last = _cpp_get_buff (pfile, len);
2467   else if (len > BUFF_ROOM (last))
2468     {
2469       /* There is insufficient room in the buffer.  Copy what we can,
2470          and then either extend or create a new one.  */
2471       size_t room = BUFF_ROOM (last);
2472       memcpy (BUFF_FRONT (last), base, room);
2473       BUFF_FRONT (last) += room;
2474       base += room;
2475       len -= room;
2476       accum += room;
2477
2478       gcc_checking_assert (!rpos);
2479
2480       last = _cpp_append_extend_buff (pfile, last, len);
2481     }
2482
2483   memcpy (BUFF_FRONT (last), base, len);
2484   BUFF_FRONT (last) += len;
2485   accum += len;
2486 }
2487
2488 void
2489 lit_accum::read_begin (cpp_reader *pfile)
2490 {
2491   /* We never accumulate more than 4 chars to read.  */
2492   if (BUFF_ROOM (last) < 4)
2493
2494     last = _cpp_append_extend_buff (pfile, last, 4);
2495   rpos = BUFF_FRONT (last);
2496 }
2497
2498 /* Helper function to check if a string format macro, say from inttypes.h, is
2499    placed touching a string literal, in which case it could be parsed as a C++11
2500    user-defined string literal thus breaking the program.  Return TRUE if the
2501    UDL should be ignored for now and preserved for potential macro
2502    expansion.  */
2503
2504 static bool
2505 maybe_ignore_udl_macro_suffix (cpp_reader *pfile, location_t src_loc,
2506                                const uchar *suffix_begin, cpp_hashnode *node)
2507 {
2508   /* User-defined literals outside of namespace std must start with a single
2509      underscore, so assume anything of that form really is a UDL suffix.
2510      We don't need to worry about UDLs defined inside namespace std because
2511      their names are reserved, so cannot be used as macro names in valid
2512      programs.  */
2513   if ((suffix_begin[0] == '_' && suffix_begin[1] != '_')
2514       || !cpp_macro_p (node))
2515     return false;
2516
2517   /* Maybe raise a warning here; caller should arrange not to consume
2518      the tokens.  */
2519   if (CPP_OPTION (pfile, warn_literal_suffix) && !pfile->state.skipping)
2520     cpp_warning_with_line (pfile, CPP_W_LITERAL_SUFFIX, src_loc, 0,
2521                            "invalid suffix on literal; C++11 requires a space "
2522                            "between literal and string macro");
2523   return true;
2524 }
2525
2526 /* Like create_literal2(), but also prepend all the accumulated data from
2527    the lit_accum struct.  */
2528 void
2529 lit_accum::create_literal2 (cpp_reader *pfile, cpp_token *token,
2530                             const uchar *base1, unsigned int len1,
2531                             const uchar *base2, unsigned int len2,
2532                             enum cpp_ttype type)
2533 {
2534   const unsigned int tot_len = accum + len1 + len2;
2535   uchar *dest = _cpp_unaligned_alloc (pfile, tot_len + 1);
2536   token->type = type;
2537   token->val.str.len = tot_len;
2538   token->val.str.text = dest;
2539   for (_cpp_buff *buf = first; buf; buf = buf->next)
2540     {
2541       size_t len = BUFF_FRONT (buf) - buf->base;
2542       memcpy (dest, buf->base, len);
2543       dest += len;
2544     }
2545   memcpy (dest, base1, len1);
2546   dest += len1;
2547   if (len2)
2548     memcpy (dest, base2, len2);
2549   dest += len2;
2550   *dest = '\0';
2551 }
2552
2553 /* Lexes a raw string.  The stored string contains the spelling,
2554    including double quotes, delimiter string, '(' and ')', any leading
2555    'L', 'u', 'U' or 'u8' and 'R' modifier.  The created token contains
2556    the type of the literal, or CPP_OTHER if it was not properly
2557    terminated.
2558
2559    BASE is the start of the token.  Updates pfile->buffer->cur to just
2560    after the lexed string.
2561
2562    The spelling is NUL-terminated, but it is not guaranteed that this
2563    is the first NUL since embedded NULs are preserved.  */
2564
2565 static void
2566 lex_raw_string (cpp_reader *pfile, cpp_token *token, const uchar *base)
2567 {
2568   const uchar *pos = base;
2569   const bool warn_bidi_p = pfile->warn_bidi_p ();
2570   const bool warn_invalid_utf8_p = CPP_OPTION (pfile, cpp_warn_invalid_utf8);
2571   const bool warn_bidi_or_invalid_utf8_p = warn_bidi_p | warn_invalid_utf8_p;
2572
2573   /* 'tis a pity this information isn't passed down from the lexer's
2574      initial categorization of the token.  */
2575   enum cpp_ttype type = CPP_STRING;
2576
2577   if (*pos == 'L')
2578     {
2579       type = CPP_WSTRING;
2580       pos++;
2581     }
2582   else if (*pos == 'U')
2583     {
2584       type = CPP_STRING32;
2585       pos++;
2586     }
2587   else if (*pos == 'u')
2588     {
2589       if (pos[1] == '8')
2590         {
2591           type = CPP_UTF8STRING;
2592           pos++;
2593         }
2594       else
2595         type = CPP_STRING16;
2596       pos++;
2597     }
2598
2599   gcc_checking_assert (pos[0] == 'R' && pos[1] == '"');
2600   pos += 2;
2601
2602   _cpp_line_note *note = &pfile->buffer->notes[pfile->buffer->cur_note];
2603
2604   /* Skip notes before the ".  */
2605   while (note->pos < pos)
2606     ++note;
2607
2608   lit_accum accum;
2609
2610   uchar prefix[17];
2611   unsigned prefix_len = 0;
2612   enum Phase
2613   {
2614    PHASE_PREFIX = -2,
2615    PHASE_NONE = -1,
2616    PHASE_SUFFIX = 0
2617   } phase = PHASE_PREFIX;
2618
2619   for (;;)
2620     {
2621       gcc_checking_assert (note->pos >= pos);
2622
2623       /* Undo any escaped newlines and trigraphs.  */
2624       if (!accum.reading_p () && note->pos == pos)
2625         switch (note->type)
2626           {
2627           case '\\':
2628           case ' ':
2629             /* Restore backslash followed by newline.  */
2630             accum.append (pfile, base, pos - base);
2631             base = pos;
2632             accum.read_begin (pfile);
2633             accum.append (pfile, UC"\\", 1);
2634
2635           after_backslash:
2636             if (note->type == ' ')
2637               /* GNU backslash whitespace newline extension.  FIXME
2638                  could be any sequence of non-vertical space.  When we
2639                  can properly restore any such sequence, we should
2640                  mark this note as handled so _cpp_process_line_notes
2641                  doesn't warn.  */
2642               accum.append (pfile, UC" ", 1);
2643
2644             accum.append (pfile, UC"\n", 1);
2645             note++;
2646             break;
2647
2648           case '\n':
2649             /* This can happen for ??/<NEWLINE> when trigraphs are not
2650                being interpretted.  */
2651             gcc_checking_assert (!CPP_OPTION (pfile, trigraphs));
2652             note->type = 0;
2653             note++;
2654             break;
2655
2656           default:
2657             gcc_checking_assert (_cpp_trigraph_map[note->type]);
2658
2659             /* Don't warn about this trigraph in
2660                _cpp_process_line_notes, since trigraphs show up as
2661                trigraphs in raw strings.  */
2662             uchar type = note->type;
2663             note->type = 0;
2664
2665             if (CPP_OPTION (pfile, trigraphs))
2666               {
2667                 accum.append (pfile, base, pos - base);
2668                 base = pos;
2669                 accum.read_begin (pfile);
2670                 accum.append (pfile, UC"??", 2);
2671                 accum.append (pfile, &type, 1);
2672
2673                 /* ??/ followed by newline gets two line notes, one for
2674                    the trigraph and one for the backslash/newline.  */
2675                 if (type == '/' && note[1].pos == pos)
2676                   {
2677                     note++;
2678                     gcc_assert (note->type == '\\' || note->type == ' ');
2679                     goto after_backslash;
2680                   }
2681                 /* Skip the replacement character.  */
2682                 base = ++pos;
2683               }
2684
2685             note++;
2686             break;
2687           }
2688
2689       /* Now get a char to process.  Either from an expanded note, or
2690          from the line buffer.  */
2691       bool read_note = accum.reading_p ();
2692       char c = read_note ? accum.read_char () : *pos++;
2693
2694       if (phase == PHASE_PREFIX)
2695         {
2696           if (c == '(')
2697             {
2698               /* Done.  */
2699               phase = PHASE_NONE;
2700               prefix[prefix_len++] = '"';
2701             }
2702           else if (prefix_len < 16
2703                    /* Prefix chars are any of the basic character set,
2704                       [lex.charset] except for '
2705                       ()\\\t\v\f\n'. Optimized for a contiguous
2706                       alphabet.  */
2707                    /* Unlike a switch, this collapses down to one or
2708                       two shift and bitmask operations on an ASCII
2709                       system, with an outlier or two.   */
2710                    && (('Z' - 'A' == 25
2711                         ? ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z'))
2712                         : ISIDST (c))
2713                        || (c >= '0' && c <= '9')
2714                        || c == '_' || c == '{' || c == '}'
2715                        || c == '[' || c == ']' || c == '#'
2716                        || c == '<' || c == '>' || c == '%'
2717                        || c == ':' || c == ';' || c == '.' || c == '?'
2718                        || c == '*' || c == '+' || c == '-' || c == '/'
2719                        || c == '^' || c == '&' || c == '|' || c == '~'
2720                        || c == '!' || c == '=' || c == ','
2721                        || c == '"' || c == '\''))
2722             prefix[prefix_len++] = c;
2723           else
2724             {
2725               /* Something is wrong.  */
2726               int col = CPP_BUF_COLUMN (pfile->buffer, pos) + read_note;
2727               if (prefix_len == 16)
2728                 cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc,
2729                                      col, "raw string delimiter longer "
2730                                      "than 16 characters");
2731               else if (c == '\n')
2732                 cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc,
2733                                      col, "invalid new-line in raw "
2734                                      "string delimiter");
2735               else
2736                 cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc,
2737                                      col, "invalid character '%c' in "
2738                                      "raw string delimiter", c);
2739               type = CPP_OTHER;
2740               phase = PHASE_NONE;
2741               /* Continue until we get a close quote, that's probably
2742                  the best failure mode.  */
2743               prefix_len = 0;
2744             }
2745           if (c != '\n')
2746             continue;
2747         }
2748
2749       if (phase != PHASE_NONE)
2750         {
2751           if (prefix[phase] != c)
2752             phase = PHASE_NONE;
2753           else if (unsigned (phase + 1) == prefix_len)
2754             break;
2755           else
2756             {
2757               phase = Phase (phase + 1);
2758               continue;
2759             }
2760         }
2761
2762       if (!prefix_len && c == '"')
2763         /* Failure mode lexing.  */
2764         goto out;
2765       else if (prefix_len && c == ')')
2766         phase = PHASE_SUFFIX;
2767       else if (!read_note && c == '\n')
2768         {
2769           pos--;
2770           pfile->buffer->cur = pos;
2771           if ((pfile->state.in_directive || pfile->state.parsing_args)
2772               && pfile->buffer->next_line >= pfile->buffer->rlimit)
2773             {
2774               cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc, 0,
2775                                    "unterminated raw string");
2776               type = CPP_OTHER;
2777               goto out;
2778             }
2779
2780           accum.append (pfile, base, pos - base + 1);
2781           _cpp_process_line_notes (pfile, false);
2782
2783           if (pfile->buffer->next_line < pfile->buffer->rlimit)
2784             CPP_INCREMENT_LINE (pfile, 0);
2785           pfile->buffer->need_line = true;
2786
2787           if (!get_fresh_line_impl<true> (pfile))
2788             {
2789               /* We ran out of file and failed to get a line.  */
2790               location_t src_loc = token->src_loc;
2791               token->type = CPP_EOF;
2792               /* Tell the compiler the line number of the EOF token.  */
2793               token->src_loc = pfile->line_table->highest_line;
2794               token->flags = BOL;
2795               if (accum.first)
2796                 _cpp_release_buff (pfile, accum.first);
2797               cpp_error_with_line (pfile, CPP_DL_ERROR, src_loc, 0,
2798                                    "unterminated raw string");
2799
2800               /* Now pop the buffer that get_fresh_line_impl() did not.  Popping
2801                  is not safe if processing a directive, however this cannot
2802                  happen as we already checked above that a line would be
2803                  available, and get_fresh_line_impl() can't fail in this
2804                  case.  */
2805               gcc_assert (!pfile->state.in_directive);
2806               _cpp_pop_buffer (pfile);
2807
2808               return;
2809             }
2810
2811           pos = base = pfile->buffer->cur;
2812           note = &pfile->buffer->notes[pfile->buffer->cur_note];
2813         }
2814       else if (__builtin_expect ((unsigned char) c >= utf8_continuation, 0)
2815                && warn_bidi_or_invalid_utf8_p)
2816         pos = _cpp_handle_multibyte_utf8 (pfile, c, pos, warn_bidi_p,
2817                                           warn_invalid_utf8_p);
2818     }
2819
2820   if (warn_bidi_p)
2821     maybe_warn_bidi_on_close (pfile, pos);
2822
2823   if (CPP_OPTION (pfile, user_literals))
2824     {
2825       const uchar *const suffix_begin = pos;
2826       pfile->buffer->cur = pos;
2827
2828       if (const auto sr = scan_cur_identifier (pfile))
2829         {
2830           if (maybe_ignore_udl_macro_suffix (pfile, token->src_loc,
2831                                              suffix_begin, sr.node))
2832               pfile->buffer->cur = suffix_begin;
2833           else
2834             {
2835               type = cpp_userdef_string_add_type (type);
2836               accum.create_literal2 (pfile, token, base, suffix_begin - base,
2837                                      NODE_NAME (sr.node), NODE_LEN (sr.node),
2838                                      type);
2839               if (accum.first)
2840                 _cpp_release_buff (pfile, accum.first);
2841               warn_about_normalization (pfile, token, &sr.nst, true);
2842               return;
2843             }
2844         }
2845     }
2846
2847  out:
2848   pfile->buffer->cur = pos;
2849   if (!accum.accum)
2850     create_literal (pfile, token, base, pos - base, type);
2851   else
2852     {
2853       accum.create_literal2 (pfile, token, base, pos - base, nullptr, 0, type);
2854       _cpp_release_buff (pfile, accum.first);
2855     }
2856 }
2857
2858 /* Lexes a string, character constant, or angle-bracketed header file
2859    name.  The stored string contains the spelling, including opening
2860    quote and any leading 'L', 'u', 'U' or 'u8' and optional
2861    'R' modifier.  It returns the type of the literal, or CPP_OTHER
2862    if it was not properly terminated, or CPP_LESS for an unterminated
2863    header name which must be relexed as normal tokens.
2864
2865    The spelling is NUL-terminated, but it is not guaranteed that this
2866    is the first NUL since embedded NULs are preserved.  */
2867 static void
2868 lex_string (cpp_reader *pfile, cpp_token *token, const uchar *base)
2869 {
2870   bool saw_NUL = false;
2871   const uchar *cur;
2872   cppchar_t terminator;
2873   enum cpp_ttype type;
2874
2875   cur = base;
2876   terminator = *cur++;
2877   if (terminator == 'L' || terminator == 'U')
2878     terminator = *cur++;
2879   else if (terminator == 'u')
2880     {
2881       terminator = *cur++;
2882       if (terminator == '8')
2883         terminator = *cur++;
2884     }
2885   if (terminator == 'R')
2886     {
2887       lex_raw_string (pfile, token, base);
2888       return;
2889     }
2890   if (terminator == '"')
2891     type = (*base == 'L' ? CPP_WSTRING :
2892             *base == 'U' ? CPP_STRING32 :
2893             *base == 'u' ? (base[1] == '8' ? CPP_UTF8STRING : CPP_STRING16)
2894                          : CPP_STRING);
2895   else if (terminator == '\'')
2896     type = (*base == 'L' ? CPP_WCHAR :
2897             *base == 'U' ? CPP_CHAR32 :
2898             *base == 'u' ? (base[1] == '8' ? CPP_UTF8CHAR : CPP_CHAR16)
2899                          : CPP_CHAR);
2900   else
2901     terminator = '>', type = CPP_HEADER_NAME;
2902
2903   const bool warn_bidi_p = pfile->warn_bidi_p ();
2904   const bool warn_invalid_utf8_p = CPP_OPTION (pfile, cpp_warn_invalid_utf8);
2905   const bool warn_bidi_or_invalid_utf8_p = warn_bidi_p | warn_invalid_utf8_p;
2906   for (;;)
2907     {
2908       cppchar_t c = *cur++;
2909
2910       /* In #include-style directives, terminators are not escapable.  */
2911       if (c == '\\' && !pfile->state.angled_headers && *cur != '\n')
2912         {
2913           if ((cur[0] == 'u' || cur[0] == 'U' || cur[0] == 'N') && warn_bidi_p)
2914             {
2915               location_t loc;
2916               bidi::kind kind;
2917               if (cur[0] == 'N')
2918                 kind = get_bidi_named (pfile, cur + 1, &loc);
2919               else
2920                 kind = get_bidi_ucn (pfile, cur + 1, cur[0] == 'U', &loc);
2921               maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/true, loc);
2922             }
2923           cur++;
2924         }
2925       else if (c == terminator)
2926         {
2927           if (warn_bidi_p)
2928             maybe_warn_bidi_on_close (pfile, cur - 1);
2929           break;
2930         }
2931       else if (c == '\n')
2932         {
2933           cur--;
2934           /* Unmatched quotes always yield undefined behavior, but
2935              greedy lexing means that what appears to be an unterminated
2936              header name may actually be a legitimate sequence of tokens.  */
2937           if (terminator == '>')
2938             {
2939               token->type = CPP_LESS;
2940               return;
2941             }
2942           type = CPP_OTHER;
2943           break;
2944         }
2945       else if (c == '\0')
2946         saw_NUL = true;
2947       else if (__builtin_expect (c >= utf8_continuation, 0)
2948                && warn_bidi_or_invalid_utf8_p)
2949         cur = _cpp_handle_multibyte_utf8 (pfile, c, cur, warn_bidi_p,
2950                                           warn_invalid_utf8_p);
2951     }
2952
2953   if (saw_NUL && !pfile->state.skipping)
2954     cpp_error (pfile, CPP_DL_WARNING,
2955                "null character(s) preserved in literal");
2956
2957   if (type == CPP_OTHER && CPP_OPTION (pfile, lang) != CLK_ASM)
2958     cpp_error (pfile, CPP_DL_PEDWARN, "missing terminating %c character",
2959                (int) terminator);
2960
2961   pfile->buffer->cur = cur;
2962   const uchar *const suffix_begin = cur;
2963
2964   if (CPP_OPTION (pfile, user_literals))
2965     {
2966       if (const auto sr = scan_cur_identifier (pfile))
2967         {
2968           if (maybe_ignore_udl_macro_suffix (pfile, token->src_loc,
2969                                              suffix_begin, sr.node))
2970             pfile->buffer->cur = suffix_begin;
2971           else
2972             {
2973               /* Grab user defined literal suffix.  */
2974               type = cpp_userdef_char_add_type (type);
2975               type = cpp_userdef_string_add_type (type);
2976               create_literal2 (pfile, token, base, suffix_begin - base,
2977                                NODE_NAME (sr.node), NODE_LEN (sr.node), type);
2978               warn_about_normalization (pfile, token, &sr.nst, true);
2979               return;
2980             }
2981         }
2982     }
2983   else if (CPP_OPTION (pfile, cpp_warn_cxx11_compat)
2984            && !pfile->state.skipping)
2985     {
2986       const auto sr = scan_cur_identifier (pfile);
2987       /* Maybe raise a warning, but do not consume the tokens.  */
2988       pfile->buffer->cur = suffix_begin;
2989       if (sr && cpp_macro_p (sr.node))
2990         cpp_warning_with_line (pfile, CPP_W_CXX11_COMPAT,
2991                                token->src_loc, 0, "C++11 requires a space "
2992                                "between string literal and macro");
2993     }
2994
2995   create_literal (pfile, token, base, cur - base, type);
2996 }
2997
2998 /* Return the comment table. The client may not make any assumption
2999    about the ordering of the table.  */
3000 cpp_comment_table *
3001 cpp_get_comments (cpp_reader *pfile)
3002 {
3003   return &pfile->comments;
3004 }
3005
3006 /* Append a comment to the end of the comment table. */
3007 static void
3008 store_comment (cpp_reader *pfile, cpp_token *token)
3009 {
3010   int len;
3011
3012   if (pfile->comments.allocated == 0)
3013     {
3014       pfile->comments.allocated = 256;
3015       pfile->comments.entries = (cpp_comment *) xmalloc
3016         (pfile->comments.allocated * sizeof (cpp_comment));
3017     }
3018
3019   if (pfile->comments.count == pfile->comments.allocated)
3020     {
3021       pfile->comments.allocated *= 2;
3022       pfile->comments.entries = (cpp_comment *) xrealloc
3023         (pfile->comments.entries,
3024          pfile->comments.allocated * sizeof (cpp_comment));
3025     }
3026
3027   len = token->val.str.len;
3028
3029   /* Copy comment. Note, token may not be NULL terminated. */
3030   pfile->comments.entries[pfile->comments.count].comment =
3031     (char *) xmalloc (sizeof (char) * (len + 1));
3032   memcpy (pfile->comments.entries[pfile->comments.count].comment,
3033           token->val.str.text, len);
3034   pfile->comments.entries[pfile->comments.count].comment[len] = '\0';
3035
3036   /* Set source location. */
3037   pfile->comments.entries[pfile->comments.count].sloc = token->src_loc;
3038
3039   /* Increment the count of entries in the comment table. */
3040   pfile->comments.count++;
3041 }
3042
3043 /* The stored comment includes the comment start and any terminator.  */
3044 static void
3045 save_comment (cpp_reader *pfile, cpp_token *token, const unsigned char *from,
3046               cppchar_t type)
3047 {
3048   unsigned char *buffer;
3049   unsigned int len, clen, i;
3050
3051   len = pfile->buffer->cur - from + 1; /* + 1 for the initial '/'.  */
3052
3053   /* C++ comments probably (not definitely) have moved past a new
3054      line, which we don't want to save in the comment.  */
3055   if (is_vspace (pfile->buffer->cur[-1]))
3056     len--;
3057
3058   /* If we are currently in a directive or in argument parsing, then
3059      we need to store all C++ comments as C comments internally, and
3060      so we need to allocate a little extra space in that case.
3061
3062      Note that the only time we encounter a directive here is
3063      when we are saving comments in a "#define".  */
3064   clen = ((pfile->state.in_directive || pfile->state.parsing_args)
3065           && type == '/') ? len + 2 : len;
3066
3067   buffer = _cpp_unaligned_alloc (pfile, clen);
3068
3069   token->type = CPP_COMMENT;
3070   token->val.str.len = clen;
3071   token->val.str.text = buffer;
3072
3073   buffer[0] = '/';
3074   memcpy (buffer + 1, from, len - 1);
3075
3076   /* Finish conversion to a C comment, if necessary.  */
3077   if ((pfile->state.in_directive || pfile->state.parsing_args) && type == '/')
3078     {
3079       buffer[1] = '*';
3080       buffer[clen - 2] = '*';
3081       buffer[clen - 1] = '/';
3082       /* As there can be in a C++ comments illegal sequences for C comments
3083          we need to filter them out.  */
3084       for (i = 2; i < (clen - 2); i++)
3085         if (buffer[i] == '/' && (buffer[i - 1] == '*' || buffer[i + 1] == '*'))
3086           buffer[i] = '|';
3087     }
3088
3089   /* Finally store this comment for use by clients of libcpp. */
3090   store_comment (pfile, token);
3091 }
3092
3093 /* Returns true if comment at COMMENT_START is a recognized FALLTHROUGH
3094    comment.  */
3095
3096 static bool
3097 fallthrough_comment_p (cpp_reader *pfile, const unsigned char *comment_start)
3098 {
3099   const unsigned char *from = comment_start + 1;
3100
3101   switch (CPP_OPTION (pfile, cpp_warn_implicit_fallthrough))
3102     {
3103       /* For both -Wimplicit-fallthrough=0 and -Wimplicit-fallthrough=5 we
3104          don't recognize any comments.  The latter only checks attributes,
3105          the former doesn't warn.  */
3106     case 0:
3107     default:
3108       return false;
3109       /* -Wimplicit-fallthrough=1 considers any comment, no matter what
3110          content it has.  */
3111     case 1:
3112       return true;
3113     case 2:
3114       /* -Wimplicit-fallthrough=2 looks for (case insensitive)
3115          .*falls?[ \t-]*thr(u|ough).* regex.  */
3116       for (; (size_t) (pfile->buffer->cur - from) >= sizeof "fallthru" - 1;
3117            from++)
3118         {
3119           /* Is there anything like strpbrk with upper boundary, or
3120              memchr looking for 2 characters rather than just one?  */
3121           if (from[0] != 'f' && from[0] != 'F')
3122             continue;
3123           if (from[1] != 'a' && from[1] != 'A')
3124             continue;
3125           if (from[2] != 'l' && from[2] != 'L')
3126             continue;
3127           if (from[3] != 'l' && from[3] != 'L')
3128             continue;
3129           from += sizeof "fall" - 1;
3130           if (from[0] == 's' || from[0] == 'S')
3131             from++;
3132           while (*from == ' ' || *from == '\t' || *from == '-')
3133             from++;
3134           if (from[0] != 't' && from[0] != 'T')
3135             continue;
3136           if (from[1] != 'h' && from[1] != 'H')
3137             continue;
3138           if (from[2] != 'r' && from[2] != 'R')
3139             continue;
3140           if (from[3] == 'u' || from[3] == 'U')
3141             return true;
3142           if (from[3] != 'o' && from[3] != 'O')
3143             continue;
3144           if (from[4] != 'u' && from[4] != 'U')
3145             continue;
3146           if (from[5] != 'g' && from[5] != 'G')
3147             continue;
3148           if (from[6] != 'h' && from[6] != 'H')
3149             continue;
3150           return true;
3151         }
3152       return false;
3153     case 3:
3154     case 4:
3155       break;
3156     }
3157
3158   /* Whole comment contents:
3159      -fallthrough
3160      @fallthrough@
3161    */
3162   if (*from == '-' || *from == '@')
3163     {
3164       size_t len = sizeof "fallthrough" - 1;
3165       if ((size_t) (pfile->buffer->cur - from - 1) < len)
3166         return false;
3167       if (memcmp (from + 1, "fallthrough", len))
3168         return false;
3169       if (*from == '@')
3170         {
3171           if (from[len + 1] != '@')
3172             return false;
3173           len++;
3174         }
3175       from += 1 + len;
3176     }
3177   /* Whole comment contents (regex):
3178      lint -fallthrough[ \t]*
3179    */
3180   else if (*from == 'l')
3181     {
3182       size_t len = sizeof "int -fallthrough" - 1;
3183       if ((size_t) (pfile->buffer->cur - from - 1) < len)
3184         return false;
3185       if (memcmp (from + 1, "int -fallthrough", len))
3186         return false;
3187       from += 1 + len;
3188       while (*from == ' ' || *from == '\t')
3189         from++;
3190     }
3191   /* Whole comment contents (regex):
3192      [ \t]*FALLTHR(U|OUGH)[ \t]*
3193    */
3194   else if (CPP_OPTION (pfile, cpp_warn_implicit_fallthrough) == 4)
3195     {
3196       while (*from == ' ' || *from == '\t')
3197         from++;
3198       if ((size_t) (pfile->buffer->cur - from)  < sizeof "FALLTHRU" - 1)
3199         return false;
3200       if (memcmp (from, "FALLTHR", sizeof "FALLTHR" - 1))
3201         return false;
3202       from += sizeof "FALLTHR" - 1;
3203       if (*from == 'U')
3204         from++;
3205       else if ((size_t) (pfile->buffer->cur - from)  < sizeof "OUGH" - 1)
3206         return false;
3207       else if (memcmp (from, "OUGH", sizeof "OUGH" - 1))
3208         return false;
3209       else
3210         from += sizeof "OUGH" - 1;
3211       while (*from == ' ' || *from == '\t')
3212         from++;
3213     }
3214   /* Whole comment contents (regex):
3215      [ \t.!]*(ELSE,? |INTENTIONAL(LY)? )?FALL(S | |-)?THR(OUGH|U)[ \t.!]*(-[^\n\r]*)?
3216      [ \t.!]*(Else,? |Intentional(ly)? )?Fall((s | |-)[Tt]|t)hr(ough|u)[ \t.!]*(-[^\n\r]*)?
3217      [ \t.!]*([Ee]lse,? |[Ii]ntentional(ly)? )?fall(s | |-)?thr(ough|u)[ \t.!]*(-[^\n\r]*)?
3218    */
3219   else
3220     {
3221       while (*from == ' ' || *from == '\t' || *from == '.' || *from == '!')
3222         from++;
3223       unsigned char f = *from;
3224       bool all_upper = false;
3225       if (f == 'E' || f == 'e')
3226         {
3227           if ((size_t) (pfile->buffer->cur - from)
3228               < sizeof "else fallthru" - 1)
3229             return false;
3230           if (f == 'E' && memcmp (from + 1, "LSE", sizeof "LSE" - 1) == 0)
3231             all_upper = true;
3232           else if (memcmp (from + 1, "lse", sizeof "lse" - 1))
3233             return false;
3234           from += sizeof "else" - 1;
3235           if (*from == ',')
3236             from++;
3237           if (*from != ' ')
3238             return false;
3239           from++;
3240           if (all_upper && *from == 'f')
3241             return false;
3242           if (f == 'e' && *from == 'F')
3243             return false;
3244           f = *from;
3245         }
3246       else if (f == 'I' || f == 'i')
3247         {
3248           if ((size_t) (pfile->buffer->cur - from)
3249               < sizeof "intentional fallthru" - 1)
3250             return false;
3251           if (f == 'I' && memcmp (from + 1, "NTENTIONAL",
3252                                   sizeof "NTENTIONAL" - 1) == 0)
3253             all_upper = true;
3254           else if (memcmp (from + 1, "ntentional",
3255                            sizeof "ntentional" - 1))
3256             return false;
3257           from += sizeof "intentional" - 1;
3258           if (*from == ' ')
3259             {
3260               from++;
3261               if (all_upper && *from == 'f')
3262                 return false;
3263             }
3264           else if (all_upper)
3265             {
3266               if (memcmp (from, "LY F", sizeof "LY F" - 1))
3267                 return false;
3268               from += sizeof "LY " - 1;
3269             }
3270           else
3271             {
3272               if (memcmp (from, "ly ", sizeof "ly " - 1))
3273                 return false;
3274               from += sizeof "ly " - 1;
3275             }
3276           if (f == 'i' && *from == 'F')
3277             return false;
3278           f = *from;
3279         }
3280       if (f != 'F' && f != 'f')
3281         return false;
3282       if ((size_t) (pfile->buffer->cur - from) < sizeof "fallthru" - 1)
3283         return false;
3284       if (f == 'F' && memcmp (from + 1, "ALL", sizeof "ALL" - 1) == 0)
3285         all_upper = true;
3286       else if (all_upper)
3287         return false;
3288       else if (memcmp (from + 1, "all", sizeof "all" - 1))
3289         return false;
3290       from += sizeof "fall" - 1;
3291       if (*from == (all_upper ? 'S' : 's') && from[1] == ' ')
3292         from += 2;
3293       else if (*from == ' ' || *from == '-')
3294         from++;
3295       else if (*from != (all_upper ? 'T' : 't'))
3296         return false;
3297       if ((f == 'f' || *from != 'T') && (all_upper || *from != 't'))
3298         return false;
3299       if ((size_t) (pfile->buffer->cur - from) < sizeof "thru" - 1)
3300         return false;
3301       if (memcmp (from + 1, all_upper ? "HRU" : "hru", sizeof "hru" - 1))
3302         {
3303           if ((size_t) (pfile->buffer->cur - from) < sizeof "through" - 1)
3304             return false;
3305           if (memcmp (from + 1, all_upper ? "HROUGH" : "hrough",
3306                       sizeof "hrough" - 1))
3307             return false;
3308           from += sizeof "through" - 1;
3309         }
3310       else
3311         from += sizeof "thru" - 1;
3312       while (*from == ' ' || *from == '\t' || *from == '.' || *from == '!')
3313         from++;
3314       if (*from == '-')
3315         {
3316           from++;
3317           if (*comment_start == '*')
3318             {
3319               do
3320                 {
3321                   while (*from && *from != '*'
3322                          && *from != '\n' && *from != '\r')
3323                     from++;
3324                   if (*from != '*' || from[1] == '/')
3325                     break;
3326                   from++;
3327                 }
3328               while (1);
3329             }
3330           else
3331             while (*from && *from != '\n' && *from != '\r')
3332               from++;
3333         }
3334     }
3335   /* C block comment.  */
3336   if (*comment_start == '*')
3337     {
3338       if (*from != '*' || from[1] != '/')
3339         return false;
3340     }
3341   /* C++ line comment.  */
3342   else if (*from != '\n')
3343     return false;
3344
3345   return true;
3346 }
3347
3348 /* Allocate COUNT tokens for RUN.  */
3349 void
3350 _cpp_init_tokenrun (tokenrun *run, unsigned int count)
3351 {
3352   run->base = XNEWVEC (cpp_token, count);
3353   run->limit = run->base + count;
3354   run->next = NULL;
3355 }
3356
3357 /* Returns the next tokenrun, or creates one if there is none.  */
3358 static tokenrun *
3359 next_tokenrun (tokenrun *run)
3360 {
3361   if (run->next == NULL)
3362     {
3363       run->next = XNEW (tokenrun);
3364       run->next->prev = run;
3365       _cpp_init_tokenrun (run->next, 250);
3366     }
3367
3368   return run->next;
3369 }
3370
3371 /* Return the number of not yet processed token in a given
3372    context.  */
3373 int
3374 _cpp_remaining_tokens_num_in_context (cpp_context *context)
3375 {
3376   if (context->tokens_kind == TOKENS_KIND_DIRECT)
3377     return (LAST (context).token - FIRST (context).token);
3378   else if (context->tokens_kind == TOKENS_KIND_INDIRECT
3379            || context->tokens_kind == TOKENS_KIND_EXTENDED)
3380     return (LAST (context).ptoken - FIRST (context).ptoken);
3381   else
3382       abort ();
3383 }
3384
3385 /* Returns the token present at index INDEX in a given context.  If
3386    INDEX is zero, the next token to be processed is returned.  */
3387 static const cpp_token*
3388 _cpp_token_from_context_at (cpp_context *context, int index)
3389 {
3390   if (context->tokens_kind == TOKENS_KIND_DIRECT)
3391     return &(FIRST (context).token[index]);
3392   else if (context->tokens_kind == TOKENS_KIND_INDIRECT
3393            || context->tokens_kind == TOKENS_KIND_EXTENDED)
3394     return FIRST (context).ptoken[index];
3395  else
3396    abort ();
3397 }
3398
3399 /* Look ahead in the input stream.  */
3400 const cpp_token *
3401 cpp_peek_token (cpp_reader *pfile, int index)
3402 {
3403   cpp_context *context = pfile->context;
3404   const cpp_token *peektok;
3405   int count;
3406
3407   /* First, scan through any pending cpp_context objects.  */
3408   while (context->prev)
3409     {
3410       ptrdiff_t sz = _cpp_remaining_tokens_num_in_context (context);
3411
3412       if (index < (int) sz)
3413         return _cpp_token_from_context_at (context, index);
3414       index -= (int) sz;
3415       context = context->prev;
3416     }
3417
3418   /* We will have to read some new tokens after all (and do so
3419      without invalidating preceding tokens).  */
3420   count = index;
3421   pfile->keep_tokens++;
3422
3423   /* For peeked tokens temporarily disable line_change reporting,
3424      until the tokens are parsed for real.  */
3425   void (*line_change) (cpp_reader *, const cpp_token *, int)
3426     = pfile->cb.line_change;
3427   pfile->cb.line_change = NULL;
3428
3429   do
3430     {
3431       peektok = _cpp_lex_token (pfile);
3432       if (peektok->type == CPP_EOF)
3433         {
3434           index--;
3435           break;
3436         }
3437       else if (peektok->type == CPP_PRAGMA)
3438         {
3439           /* Don't peek past a pragma.  */
3440           if (peektok == &pfile->directive_result)
3441             /* Save the pragma in the buffer.  */
3442             *pfile->cur_token++ = *peektok;
3443           index--;
3444           break;
3445         }
3446     }
3447   while (index--);
3448
3449   _cpp_backup_tokens_direct (pfile, count - index);
3450   pfile->keep_tokens--;
3451   pfile->cb.line_change = line_change;
3452
3453   return peektok;
3454 }
3455
3456 /* Allocate a single token that is invalidated at the same time as the
3457    rest of the tokens on the line.  Has its line and col set to the
3458    same as the last lexed token, so that diagnostics appear in the
3459    right place.  */
3460 cpp_token *
3461 _cpp_temp_token (cpp_reader *pfile)
3462 {
3463   cpp_token *old, *result;
3464   ptrdiff_t sz = pfile->cur_run->limit - pfile->cur_token;
3465   ptrdiff_t la = (ptrdiff_t) pfile->lookaheads;
3466
3467   old = pfile->cur_token - 1;
3468   /* Any pre-existing lookaheads must not be clobbered.  */
3469   if (la)
3470     {
3471       if (sz <= la)
3472         {
3473           tokenrun *next = next_tokenrun (pfile->cur_run);
3474
3475           if (sz < la)
3476             memmove (next->base + 1, next->base,
3477                      (la - sz) * sizeof (cpp_token));
3478
3479           next->base[0] = pfile->cur_run->limit[-1];
3480         }
3481
3482       if (sz > 1)
3483         memmove (pfile->cur_token + 1, pfile->cur_token,
3484                  MIN (la, sz - 1) * sizeof (cpp_token));
3485     }
3486
3487   if (!sz && pfile->cur_token == pfile->cur_run->limit)
3488     {
3489       pfile->cur_run = next_tokenrun (pfile->cur_run);
3490       pfile->cur_token = pfile->cur_run->base;
3491     }
3492
3493   result = pfile->cur_token++;
3494   result->src_loc = old->src_loc;
3495   return result;
3496 }
3497
3498 /* We're at the beginning of a logical line (so not in
3499   directives-mode) and RESULT is a CPP_NAME with NODE_MODULE set.  See
3500   if we should enter deferred_pragma mode to tokenize the rest of the
3501   line as a module control-line.  */
3502
3503 static void
3504 cpp_maybe_module_directive (cpp_reader *pfile, cpp_token *result)
3505 {
3506   unsigned backup = 0; /* Tokens we peeked.  */
3507   cpp_hashnode *node = result->val.node.node;
3508   cpp_token *peek = result;
3509   cpp_token *keyword = peek;
3510   cpp_hashnode *(&n_modules)[spec_nodes::M_HWM][2] = pfile->spec_nodes.n_modules;
3511   int header_count = 0;
3512
3513   /* Make sure the incoming state is as we expect it.  This way we
3514      can restore it using constants.  */
3515   gcc_checking_assert (!pfile->state.in_deferred_pragma
3516                        && !pfile->state.skipping
3517                        && !pfile->state.parsing_args
3518                        && !pfile->state.angled_headers
3519                        && (pfile->state.save_comments
3520                            == !CPP_OPTION (pfile, discard_comments)));
3521
3522   /* Enter directives mode sufficiently for peeking.  We don't have
3523      to actually set in_directive.  */
3524   pfile->state.in_deferred_pragma = true;
3525
3526   /* These two fields are needed to process tokenization in deferred
3527      pragma mode.  They are not used outside deferred pragma mode or
3528      directives mode.  */
3529   pfile->state.pragma_allow_expansion = true;
3530   pfile->directive_line = result->src_loc;
3531
3532   /* Saving comments is incompatible with directives mode.   */
3533   pfile->state.save_comments = 0;
3534
3535   if (node == n_modules[spec_nodes::M_EXPORT][0])
3536     {
3537       peek = _cpp_lex_direct (pfile);
3538       keyword = peek;
3539       backup++;
3540       if (keyword->type != CPP_NAME)
3541         goto not_module;
3542       node = keyword->val.node.node;
3543       if (!(node->flags & NODE_MODULE))
3544         goto not_module;
3545     }
3546
3547   if (node == n_modules[spec_nodes::M__IMPORT][0])
3548     /* __import  */
3549     header_count = backup + 2 + 16;
3550   else if (node == n_modules[spec_nodes::M_IMPORT][0])
3551     /* import  */
3552     header_count = backup + 2 + (CPP_OPTION (pfile, preprocessed) ? 16 : 0);
3553   else if (node == n_modules[spec_nodes::M_MODULE][0])
3554     ; /* module  */
3555   else
3556     goto not_module;
3557
3558   /* We've seen [export] {module|import|__import}.  Check the next token.  */
3559   if (header_count)
3560     /* After '{,__}import' a header name may appear.  */
3561     pfile->state.angled_headers = true;
3562   peek = _cpp_lex_direct (pfile);
3563   backup++;
3564
3565   /* ... import followed by identifier, ':', '<' or
3566      header-name preprocessing tokens, or module
3567      followed by cpp-identifier, ':' or ';' preprocessing
3568      tokens.  C++ keywords are not yet relevant.  */
3569   if (peek->type == CPP_NAME
3570       || peek->type == CPP_COLON
3571       ||  (header_count
3572            ? (peek->type == CPP_LESS
3573               || (peek->type == CPP_STRING && peek->val.str.text[0] != 'R')
3574               || peek->type == CPP_HEADER_NAME)
3575            : peek->type == CPP_SEMICOLON))
3576     {
3577       pfile->state.pragma_allow_expansion = !CPP_OPTION (pfile, preprocessed);
3578       if (!pfile->state.pragma_allow_expansion)
3579         pfile->state.prevent_expansion++;
3580
3581       if (!header_count && linemap_included_from
3582           (LINEMAPS_LAST_ORDINARY_MAP (pfile->line_table)))
3583         cpp_error_with_line (pfile, CPP_DL_ERROR, keyword->src_loc, 0,
3584                              "module control-line cannot be in included file");
3585
3586       /* The first one or two tokens cannot be macro names.  */
3587       for (int ix = backup; ix--;)
3588         {
3589           cpp_token *tok = ix ? keyword : result;
3590           cpp_hashnode *node = tok->val.node.node;
3591
3592           /* Don't attempt to expand the token.  */
3593           tok->flags |= NO_EXPAND;
3594           if (_cpp_defined_macro_p (node)
3595               && _cpp_maybe_notify_macro_use (pfile, node, tok->src_loc)
3596               && !cpp_fun_like_macro_p (node))
3597             cpp_error_with_line (pfile, CPP_DL_ERROR, tok->src_loc, 0,
3598                                  "module control-line \"%s\" cannot be"
3599                                  " an object-like macro",
3600                                  NODE_NAME (node));
3601         }
3602
3603       /* Map to underbar variants.  */
3604       keyword->val.node.node = n_modules[header_count
3605                                          ? spec_nodes::M_IMPORT
3606                                          : spec_nodes::M_MODULE][1];
3607       if (backup != 1)
3608         result->val.node.node = n_modules[spec_nodes::M_EXPORT][1];
3609
3610       /* Maybe tell the tokenizer we expect a header-name down the
3611          road.  */
3612       pfile->state.directive_file_token = header_count;
3613     }
3614   else
3615     {
3616     not_module:
3617       /* Drop out of directive mode.  */
3618       /* We aaserted save_comments had this value upon entry.  */
3619       pfile->state.save_comments
3620         = !CPP_OPTION (pfile, discard_comments);
3621       pfile->state.in_deferred_pragma = false;
3622       /* Do not let this remain on.  */
3623       pfile->state.angled_headers = false;
3624     }
3625
3626   /* In either case we want to backup the peeked tokens.  */
3627   if (backup)
3628     {
3629       /* If we saw EOL, we should drop it, because this isn't a module
3630          control-line after all.  */
3631       bool eol = peek->type == CPP_PRAGMA_EOL;
3632       if (!eol || backup > 1)
3633         {
3634           /* Put put the peeked tokens back  */
3635           _cpp_backup_tokens_direct (pfile, backup);
3636           /* But if the last one was an EOL, forget it.  */
3637           if (eol)
3638             pfile->lookaheads--;
3639         }
3640     }
3641 }
3642
3643 /* Lex a token into RESULT (external interface).  Takes care of issues
3644    like directive handling, token lookahead, multiple include
3645    optimization and skipping.  */
3646 const cpp_token *
3647 _cpp_lex_token (cpp_reader *pfile)
3648 {
3649   cpp_token *result;
3650
3651   for (;;)
3652     {
3653       if (pfile->cur_token == pfile->cur_run->limit)
3654         {
3655           pfile->cur_run = next_tokenrun (pfile->cur_run);
3656           pfile->cur_token = pfile->cur_run->base;
3657         }
3658       /* We assume that the current token is somewhere in the current
3659          run.  */
3660       if (pfile->cur_token < pfile->cur_run->base
3661           || pfile->cur_token >= pfile->cur_run->limit)
3662         abort ();
3663
3664       if (pfile->lookaheads)
3665         {
3666           pfile->lookaheads--;
3667           result = pfile->cur_token++;
3668         }
3669       else
3670         result = _cpp_lex_direct (pfile);
3671
3672       if (result->flags & BOL)
3673         {
3674           /* Is this a directive.  If _cpp_handle_directive returns
3675              false, it is an assembler #.  */
3676           if (result->type == CPP_HASH
3677               /* 6.10.3 p 11: Directives in a list of macro arguments
3678                  gives undefined behavior.  This implementation
3679                  handles the directive as normal.  */
3680               && pfile->state.parsing_args != 1)
3681             {
3682               if (_cpp_handle_directive (pfile, result->flags & PREV_WHITE))
3683                 {
3684                   if (pfile->directive_result.type == CPP_PADDING)
3685                     continue;
3686                   result = &pfile->directive_result;
3687                 }
3688             }
3689           else if (pfile->state.in_deferred_pragma)
3690             result = &pfile->directive_result;
3691           else if (result->type == CPP_NAME
3692                    && (result->val.node.node->flags & NODE_MODULE)
3693                    && !pfile->state.skipping
3694                    /* Unlike regular directives, we do not deal with
3695                       tokenizing module directives as macro arguments.
3696                       That's not permitted.  */
3697                    && !pfile->state.parsing_args)
3698             {
3699               /* P1857.  Before macro expansion, At start of logical
3700                  line ... */
3701               /* We don't have to consider lookaheads at this point.  */
3702               gcc_checking_assert (!pfile->lookaheads);
3703
3704               cpp_maybe_module_directive (pfile, result);
3705             }
3706
3707           if (pfile->cb.line_change && !pfile->state.skipping)
3708             pfile->cb.line_change (pfile, result, pfile->state.parsing_args);
3709         }
3710
3711       /* We don't skip tokens in directives.  */
3712       if (pfile->state.in_directive || pfile->state.in_deferred_pragma)
3713         break;
3714
3715       /* Outside a directive, invalidate controlling macros.  At file
3716          EOF, _cpp_lex_direct takes care of popping the buffer, so we never
3717          get here and MI optimization works.  */
3718       pfile->mi_valid = false;
3719
3720       if (!pfile->state.skipping || result->type == CPP_EOF)
3721         break;
3722     }
3723
3724   return result;
3725 }
3726
3727 /* Returns true if a fresh line has been loaded.  */
3728 template <bool lexing_raw_string>
3729 static bool
3730 get_fresh_line_impl (cpp_reader *pfile)
3731 {
3732   /* We can't get a new line until we leave the current directive, unless we
3733      are lexing a raw string, in which case it will be OK as long as we don't
3734      pop the current buffer.  */
3735   if (!lexing_raw_string && pfile->state.in_directive)
3736     return false;
3737
3738   for (;;)
3739     {
3740       cpp_buffer *buffer = pfile->buffer;
3741
3742       if (!buffer->need_line)
3743         return true;
3744
3745       if (buffer->next_line < buffer->rlimit)
3746         {
3747           _cpp_clean_line (pfile);
3748           return true;
3749         }
3750
3751       /* We can't change buffers until we leave the current directive.  */
3752       if (lexing_raw_string && pfile->state.in_directive)
3753         return false;
3754
3755       /* First, get out of parsing arguments state.  */
3756       if (pfile->state.parsing_args)
3757         return false;
3758
3759       /* End of buffer.  Non-empty files should end in a newline.  */
3760       if (buffer->buf != buffer->rlimit
3761           && buffer->next_line > buffer->rlimit
3762           && !buffer->from_stage3)
3763         {
3764           /* Clip to buffer size.  */
3765           buffer->next_line = buffer->rlimit;
3766         }
3767
3768       if (buffer->prev && !buffer->return_at_eof)
3769         _cpp_pop_buffer (pfile);
3770       else
3771         {
3772           /* End of translation.  Do not pop the buffer yet. Increment
3773              line number so that the EOF token is on a line of its own
3774              (_cpp_lex_direct doesn't increment in that case, because
3775              it's hard for it to distinguish this special case). */
3776           CPP_INCREMENT_LINE (pfile, 0);
3777           return false;
3778         }
3779     }
3780 }
3781
3782 bool
3783 _cpp_get_fresh_line (cpp_reader *pfile)
3784 {
3785   return get_fresh_line_impl<false> (pfile);
3786 }
3787
3788
/* If the next character in BUFFER is CHAR, consume it and give RESULT
   the type THEN_TYPE; otherwise leave the buffer position alone and
   give RESULT the type ELSE_TYPE.  Relies on `buffer' and `result'
   being in scope at the point of use.  */
#define IF_NEXT_IS(CHAR, THEN_TYPE, ELSE_TYPE)          \
  do                                                    \
    {                                                   \
      if (*buffer->cur == (CHAR))                       \
        {                                               \
          buffer->cur++;                                \
          result->type = THEN_TYPE;                     \
        }                                               \
      else                                              \
        result->type = ELSE_TYPE;                       \
    }                                                   \
  while (0)
3797
3798 /* Lex a token into pfile->cur_token, which is also incremented, to
3799    get diagnostics pointing to the correct location.
3800
3801    Does not handle issues such as token lookahead, multiple-include
3802    optimization, directives, skipping etc.  This function is only
3803    suitable for use by _cpp_lex_token, and in special cases like
3804    lex_expansion_token which doesn't care for any of these issues.
3805
3806    When meeting a newline, returns CPP_EOF if parsing a directive,
3807    otherwise returns to the start of the token buffer if permissible.
3808    Returns the location of the lexed token.  */
3809 cpp_token *
3810 _cpp_lex_direct (cpp_reader *pfile)
3811 {
3812   cppchar_t c = 0;
3813   cpp_buffer *buffer;
3814   const unsigned char *comment_start;
3815   bool fallthrough_comment = false;
3816   cpp_token *result = pfile->cur_token++;
3817
3818  fresh_line:
3819   result->flags = 0;
3820   buffer = pfile->buffer;
3821   if (buffer->need_line)
3822     {
3823       if (pfile->state.in_deferred_pragma)
3824         {
3825           /* This can happen in cases like:
3826              #define loop(x) whatever
3827              #pragma omp loop
3828              where when trying to expand loop we need to peek
3829              next token after loop, but aren't still in_deferred_pragma
3830              mode but are in in_directive mode, so buffer->need_line
3831              is set, a CPP_EOF is peeked.  */
3832           result->type = CPP_PRAGMA_EOL;
3833           pfile->state.in_deferred_pragma = false;
3834           if (!pfile->state.pragma_allow_expansion)
3835             pfile->state.prevent_expansion--;
3836           result->src_loc = pfile->line_table->highest_line;
3837           return result;
3838         }
3839       if (!_cpp_get_fresh_line (pfile))
3840         {
3841           result->type = CPP_EOF;
3842           /* Not a real EOF in a directive or arg parsing -- we refuse
3843              to advance to the next file now, and will once we're out
3844              of those modes.  */
3845           if (!pfile->state.in_directive && !pfile->state.parsing_args)
3846             {
3847               /* Tell the compiler the line number of the EOF token.  */
3848               result->src_loc = pfile->line_table->highest_line;
3849               result->flags = BOL;
3850               /* Now pop the buffer that _cpp_get_fresh_line did not.  */
3851               _cpp_pop_buffer (pfile);
3852             }
3853           else if (c == 0)
3854             result->src_loc = pfile->line_table->highest_line;
3855           return result;
3856         }
3857       if (buffer != pfile->buffer)
3858         fallthrough_comment = false;
3859       if (!pfile->keep_tokens)
3860         {
3861           pfile->cur_run = &pfile->base_run;
3862           result = pfile->base_run.base;
3863           pfile->cur_token = result + 1;
3864         }
3865       result->flags = BOL;
3866       if (pfile->state.parsing_args == 2)
3867         result->flags |= PREV_WHITE;
3868     }
3869   buffer = pfile->buffer;
3870  update_tokens_line:
3871   result->src_loc = pfile->line_table->highest_line;
3872
3873  skipped_white:
3874   if (buffer->cur >= buffer->notes[buffer->cur_note].pos
3875       && !pfile->overlaid_buffer)
3876     {
3877       _cpp_process_line_notes (pfile, false);
3878       result->src_loc = pfile->line_table->highest_line;
3879     }
3880   c = *buffer->cur++;
3881
3882   if (pfile->forced_token_location)
3883     result->src_loc = pfile->forced_token_location;
3884   else
3885     result->src_loc = linemap_position_for_column (pfile->line_table,
3886                                           CPP_BUF_COLUMN (buffer, buffer->cur));
3887
3888   switch (c)
3889     {
3890     case ' ': case '\t': case '\f': case '\v': case '\0':
3891       result->flags |= PREV_WHITE;
3892       skip_whitespace (pfile, c);
3893       goto skipped_white;
3894
3895     case '\n':
3896       /* Increment the line, unless this is the last line ...  */
3897       if (buffer->cur < buffer->rlimit
3898           /* ... or this is a #include, (where _cpp_stack_file needs to
3899              unwind by one line) ...  */
3900           || (pfile->state.in_directive > 1
3901               /* ... except traditional-cpp increments this elsewhere.  */
3902               && !CPP_OPTION (pfile, traditional)))
3903         CPP_INCREMENT_LINE (pfile, 0);
3904       buffer->need_line = true;
3905       if (pfile->state.in_deferred_pragma)
3906         {
3907           /* Produce the PRAGMA_EOL on this line.  File reading
3908              ensures there is always a \n at end of the buffer, thus
3909              in a deferred pragma we always see CPP_PRAGMA_EOL before
3910              any CPP_EOF.  */
3911           result->type = CPP_PRAGMA_EOL;
3912           result->flags &= ~PREV_WHITE;
3913           pfile->state.in_deferred_pragma = false;
3914           if (!pfile->state.pragma_allow_expansion)
3915             pfile->state.prevent_expansion--;
3916           return result;
3917         }
3918       goto fresh_line;
3919
3920     case '0': case '1': case '2': case '3': case '4':
3921     case '5': case '6': case '7': case '8': case '9':
3922       {
3923         struct normalize_state nst = INITIAL_NORMALIZE_STATE;
3924         result->type = CPP_NUMBER;
3925         lex_number (pfile, &result->val.str, &nst);
3926         warn_about_normalization (pfile, result, &nst, false);
3927         break;
3928       }
3929
3930     case 'L':
3931     case 'u':
3932     case 'U':
3933     case 'R':
3934       /* 'L', 'u', 'U', 'u8' or 'R' may introduce wide characters,
3935          wide strings or raw strings.  */
3936       if (c == 'L' || CPP_OPTION (pfile, rliterals)
3937           || (c != 'R' && CPP_OPTION (pfile, uliterals)))
3938         {
3939           if ((*buffer->cur == '\'' && c != 'R')
3940               || *buffer->cur == '"'
3941               || (*buffer->cur == 'R'
3942                   && c != 'R'
3943                   && buffer->cur[1] == '"'
3944                   && CPP_OPTION (pfile, rliterals))
3945               || (*buffer->cur == '8'
3946                   && c == 'u'
3947                   && ((buffer->cur[1] == '"' || (buffer->cur[1] == '\''
3948                                 && CPP_OPTION (pfile, utf8_char_literals)))
3949                       || (buffer->cur[1] == 'R' && buffer->cur[2] == '"'
3950                           && CPP_OPTION (pfile, rliterals)))))
3951             {
3952               lex_string (pfile, result, buffer->cur - 1);
3953               break;
3954             }
3955         }
3956       /* Fall through.  */
3957
3958     case '_':
3959     case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
3960     case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
3961     case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
3962     case 's': case 't':           case 'v': case 'w': case 'x':
3963     case 'y': case 'z':
3964     case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
3965     case 'G': case 'H': case 'I': case 'J': case 'K':
3966     case 'M': case 'N': case 'O': case 'P': case 'Q':
3967     case 'S': case 'T':           case 'V': case 'W': case 'X':
3968     case 'Y': case 'Z':
3969       result->type = CPP_NAME;
3970       {
3971         struct normalize_state nst = INITIAL_NORMALIZE_STATE;
3972         const auto node = lex_identifier (pfile, buffer->cur - 1, false, &nst,
3973                                           &result->val.node.spelling);
3974         result->val.node.node = node;
3975         identifier_diagnostics_on_lex (pfile, node);
3976         warn_about_normalization (pfile, result, &nst, true);
3977       }
3978
3979       /* Convert named operators to their proper types.  */
3980       if (result->val.node.node->flags & NODE_OPERATOR)
3981         {
3982           result->flags |= NAMED_OP;
3983           result->type = (enum cpp_ttype) result->val.node.node->directive_index;
3984         }
3985
3986       /* Signal FALLTHROUGH comment followed by another token.  */
3987       if (fallthrough_comment)
3988         result->flags |= PREV_FALLTHROUGH;
3989       break;
3990
3991     case '\'':
3992     case '"':
3993       lex_string (pfile, result, buffer->cur - 1);
3994       break;
3995
3996     case '/':
3997       /* A potential block or line comment.  */
3998       comment_start = buffer->cur;
3999       c = *buffer->cur;
4000
4001       if (c == '*')
4002         {
4003           if (_cpp_skip_block_comment (pfile))
4004             cpp_error (pfile, CPP_DL_ERROR, "unterminated comment");
4005         }
4006       else if (c == '/' && ! CPP_OPTION (pfile, traditional))
4007         {
4008           /* Don't warn for system headers.  */
4009           if (_cpp_in_system_header (pfile))
4010             ;
4011           /* Warn about comments if pedantically GNUC89, and not
4012              in system headers.  */
4013           else if (CPP_OPTION (pfile, lang) == CLK_GNUC89
4014                    && CPP_PEDANTIC (pfile)
4015                    && ! buffer->warned_cplusplus_comments)
4016             {
4017               if (cpp_error (pfile, CPP_DL_PEDWARN,
4018                              "C++ style comments are not allowed in ISO C90"))
4019                 cpp_error (pfile, CPP_DL_NOTE,
4020                            "(this will be reported only once per input file)");
4021               buffer->warned_cplusplus_comments = 1;
4022             }
4023           /* Or if specifically desired via -Wc90-c99-compat.  */
4024           else if (CPP_OPTION (pfile, cpp_warn_c90_c99_compat) > 0
4025                    && ! CPP_OPTION (pfile, cplusplus)
4026                    && ! buffer->warned_cplusplus_comments)
4027             {
4028               if (cpp_error (pfile, CPP_DL_WARNING,
4029                              "C++ style comments are incompatible with C90"))
4030                 cpp_error (pfile, CPP_DL_NOTE,
4031                            "(this will be reported only once per input file)");
4032               buffer->warned_cplusplus_comments = 1;
4033             }
4034           /* In C89/C94, C++ style comments are forbidden.  */
4035           else if ((CPP_OPTION (pfile, lang) == CLK_STDC89
4036                     || CPP_OPTION (pfile, lang) == CLK_STDC94))
4037             {
4038               /* But don't be confused about valid code such as
4039                  - // immediately followed by *,
4040                  - // in a preprocessing directive,
4041                  - // in an #if 0 block.  */
4042               if (buffer->cur[1] == '*'
4043                   || pfile->state.in_directive
4044                   || pfile->state.skipping)
4045                 {
4046                   result->type = CPP_DIV;
4047                   break;
4048                 }
4049               else if (! buffer->warned_cplusplus_comments)
4050                 {
4051                   if (cpp_error (pfile, CPP_DL_ERROR,
4052                                  "C++ style comments are not allowed in "
4053                                  "ISO C90"))
4054                     cpp_error (pfile, CPP_DL_NOTE,
4055                                "(this will be reported only once per input "
4056                                "file)");
4057                   buffer->warned_cplusplus_comments = 1;
4058                 }
4059             }
4060           if (skip_line_comment (pfile) && CPP_OPTION (pfile, warn_comments))
4061             cpp_warning (pfile, CPP_W_COMMENTS, "multi-line comment");
4062         }
4063       else if (c == '=')
4064         {
4065           buffer->cur++;
4066           result->type = CPP_DIV_EQ;
4067           break;
4068         }
4069       else
4070         {
4071           result->type = CPP_DIV;
4072           break;
4073         }
4074
4075       if (fallthrough_comment_p (pfile, comment_start))
4076         fallthrough_comment = true;
4077
4078       if (pfile->cb.comment)
4079         {
4080           size_t len = pfile->buffer->cur - comment_start;
4081           pfile->cb.comment (pfile, result->src_loc, comment_start - 1,
4082                              len + 1);
4083         }
4084
4085       if (!pfile->state.save_comments)
4086         {
4087           result->flags |= PREV_WHITE;
4088           goto update_tokens_line;
4089         }
4090
4091       if (fallthrough_comment)
4092         result->flags |= PREV_FALLTHROUGH;
4093
4094       /* Save the comment as a token in its own right.  */
4095       save_comment (pfile, result, comment_start, c);
4096       break;
4097
4098     case '<':
4099       if (pfile->state.angled_headers)
4100         {
4101           lex_string (pfile, result, buffer->cur - 1);
4102           if (result->type != CPP_LESS)
4103             break;
4104         }
4105
4106       result->type = CPP_LESS;
4107       if (*buffer->cur == '=')
4108         {
4109           buffer->cur++, result->type = CPP_LESS_EQ;
4110           if (*buffer->cur == '>'
4111               && CPP_OPTION (pfile, cplusplus)
4112               && CPP_OPTION (pfile, lang) >= CLK_GNUCXX20)
4113             buffer->cur++, result->type = CPP_SPACESHIP;
4114         }
4115       else if (*buffer->cur == '<')
4116         {
4117           buffer->cur++;
4118           IF_NEXT_IS ('=', CPP_LSHIFT_EQ, CPP_LSHIFT);
4119         }
4120       else if (CPP_OPTION (pfile, digraphs))
4121         {
4122           if (*buffer->cur == ':')
4123             {
4124               /* C++11 [2.5/3 lex.pptoken], "Otherwise, if the next
4125                  three characters are <:: and the subsequent character
4126                  is neither : nor >, the < is treated as a preprocessor
4127                  token by itself".  */
4128               if (CPP_OPTION (pfile, cplusplus)
4129                   && CPP_OPTION (pfile, lang) != CLK_CXX98
4130                   && CPP_OPTION (pfile, lang) != CLK_GNUCXX
4131                   && buffer->cur[1] == ':'
4132                   && buffer->cur[2] != ':' && buffer->cur[2] != '>')
4133                 break;
4134
4135               buffer->cur++;
4136               result->flags |= DIGRAPH;
4137               result->type = CPP_OPEN_SQUARE;
4138             }
4139           else if (*buffer->cur == '%')
4140             {
4141               buffer->cur++;
4142               result->flags |= DIGRAPH;
4143               result->type = CPP_OPEN_BRACE;
4144             }
4145         }
4146       break;
4147
4148     case '>':
4149       result->type = CPP_GREATER;
4150       if (*buffer->cur == '=')
4151         buffer->cur++, result->type = CPP_GREATER_EQ;
4152       else if (*buffer->cur == '>')
4153         {
4154           buffer->cur++;
4155           IF_NEXT_IS ('=', CPP_RSHIFT_EQ, CPP_RSHIFT);
4156         }
4157       break;
4158
4159     case '%':
4160       result->type = CPP_MOD;
4161       if (*buffer->cur == '=')
4162         buffer->cur++, result->type = CPP_MOD_EQ;
4163       else if (CPP_OPTION (pfile, digraphs))
4164         {
4165           if (*buffer->cur == ':')
4166             {
4167               buffer->cur++;
4168               result->flags |= DIGRAPH;
4169               result->type = CPP_HASH;
4170               if (*buffer->cur == '%' && buffer->cur[1] == ':')
4171                 buffer->cur += 2, result->type = CPP_PASTE, result->val.token_no = 0;
4172             }
4173           else if (*buffer->cur == '>')
4174             {
4175               buffer->cur++;
4176               result->flags |= DIGRAPH;
4177               result->type = CPP_CLOSE_BRACE;
4178             }
4179         }
4180       break;
4181
4182     case '.':
4183       result->type = CPP_DOT;
4184       if (ISDIGIT (*buffer->cur))
4185         {
4186           struct normalize_state nst = INITIAL_NORMALIZE_STATE;
4187           result->type = CPP_NUMBER;
4188           lex_number (pfile, &result->val.str, &nst);
4189           warn_about_normalization (pfile, result, &nst, false);
4190         }
4191       else if (*buffer->cur == '.' && buffer->cur[1] == '.')
4192         buffer->cur += 2, result->type = CPP_ELLIPSIS;
4193       else if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
4194         buffer->cur++, result->type = CPP_DOT_STAR;
4195       break;
4196
4197     case '+':
4198       result->type = CPP_PLUS;
4199       if (*buffer->cur == '+')
4200         buffer->cur++, result->type = CPP_PLUS_PLUS;
4201       else if (*buffer->cur == '=')
4202         buffer->cur++, result->type = CPP_PLUS_EQ;
4203       break;
4204
4205     case '-':
4206       result->type = CPP_MINUS;
4207       if (*buffer->cur == '>')
4208         {
4209           buffer->cur++;
4210           result->type = CPP_DEREF;
4211           if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
4212             buffer->cur++, result->type = CPP_DEREF_STAR;
4213         }
4214       else if (*buffer->cur == '-')
4215         buffer->cur++, result->type = CPP_MINUS_MINUS;
4216       else if (*buffer->cur == '=')
4217         buffer->cur++, result->type = CPP_MINUS_EQ;
4218       break;
4219
4220     case '&':
4221       result->type = CPP_AND;
4222       if (*buffer->cur == '&')
4223         buffer->cur++, result->type = CPP_AND_AND;
4224       else if (*buffer->cur == '=')
4225         buffer->cur++, result->type = CPP_AND_EQ;
4226       break;
4227
4228     case '|':
4229       result->type = CPP_OR;
4230       if (*buffer->cur == '|')
4231         buffer->cur++, result->type = CPP_OR_OR;
4232       else if (*buffer->cur == '=')
4233         buffer->cur++, result->type = CPP_OR_EQ;
4234       break;
4235
4236     case ':':
4237       result->type = CPP_COLON;
4238       if (*buffer->cur == ':' && CPP_OPTION (pfile, scope))
4239         buffer->cur++, result->type = CPP_SCOPE;
4240       else if (*buffer->cur == '>' && CPP_OPTION (pfile, digraphs))
4241         {
4242           buffer->cur++;
4243           result->flags |= DIGRAPH;
4244           result->type = CPP_CLOSE_SQUARE;
4245         }
4246       break;
4247
4248     case '*': IF_NEXT_IS ('=', CPP_MULT_EQ, CPP_MULT); break;
4249     case '=': IF_NEXT_IS ('=', CPP_EQ_EQ, CPP_EQ); break;
4250     case '!': IF_NEXT_IS ('=', CPP_NOT_EQ, CPP_NOT); break;
4251     case '^': IF_NEXT_IS ('=', CPP_XOR_EQ, CPP_XOR); break;
4252     case '#': IF_NEXT_IS ('#', CPP_PASTE, CPP_HASH); result->val.token_no = 0; break;
4253
4254     case '?': result->type = CPP_QUERY; break;
4255     case '~': result->type = CPP_COMPL; break;
4256     case ',': result->type = CPP_COMMA; break;
4257     case '(': result->type = CPP_OPEN_PAREN; break;
4258     case ')': result->type = CPP_CLOSE_PAREN; break;
4259     case '[': result->type = CPP_OPEN_SQUARE; break;
4260     case ']': result->type = CPP_CLOSE_SQUARE; break;
4261     case '{': result->type = CPP_OPEN_BRACE; break;
4262     case '}': result->type = CPP_CLOSE_BRACE; break;
4263     case ';': result->type = CPP_SEMICOLON; break;
4264
4265       /* @ is a punctuator in Objective-C.  */
4266     case '@': result->type = CPP_ATSIGN; break;
4267
4268     default:
4269       {
4270         const uchar *base = --buffer->cur;
4271         static int no_warn_cnt;
4272
4273         /* Check for an extended identifier ($ or UCN or UTF-8).  */
4274         struct normalize_state nst = INITIAL_NORMALIZE_STATE;
4275         if (forms_identifier_p (pfile, true, &nst))
4276           {
4277             result->type = CPP_NAME;
4278             const auto node = lex_identifier (pfile, base, true, &nst,
4279                                               &result->val.node.spelling);
4280             result->val.node.node = node;
4281             identifier_diagnostics_on_lex (pfile, node);
4282             warn_about_normalization (pfile, result, &nst, true);
4283             break;
4284           }
4285
4286         /* Otherwise this will form a CPP_OTHER token.  Parse valid UTF-8 as a
4287            single token.  */
4288         buffer->cur++;
4289         if (c >= utf8_signifier)
4290           {
4291             const uchar *pstr = base;
4292             cppchar_t s;
4293             if (_cpp_valid_utf8 (pfile, &pstr, buffer->rlimit, 0, NULL, &s))
4294               {
4295                 if (s > UCS_LIMIT && CPP_OPTION (pfile, cpp_warn_invalid_utf8))
4296                   {
4297                     buffer->cur = base;
4298                     _cpp_warn_invalid_utf8 (pfile);
4299                   }
4300                 buffer->cur = pstr;
4301               }
4302             else if (CPP_OPTION (pfile, cpp_warn_invalid_utf8))
4303               {
4304                 buffer->cur = base;
4305                 const uchar *end = _cpp_warn_invalid_utf8 (pfile);
4306                 buffer->cur = base + 1;
4307                 no_warn_cnt = end - buffer->cur;
4308               }
4309           }
4310         else if (c >= utf8_continuation
4311                  && CPP_OPTION (pfile, cpp_warn_invalid_utf8))
4312           {
4313             if (no_warn_cnt)
4314               --no_warn_cnt;
4315             else
4316               {
4317                 buffer->cur = base;
4318                 _cpp_warn_invalid_utf8 (pfile);
4319                 buffer->cur = base + 1;
4320               }
4321           }
4322         create_literal (pfile, result, base, buffer->cur - base, CPP_OTHER);
4323         break;
4324       }
4325
4326     }
4327
4328   /* Potentially convert the location of the token to a range.  */
4329   if (result->src_loc >= RESERVED_LOCATION_COUNT
4330       && result->type != CPP_EOF)
4331     {
4332       /* Ensure that any line notes are processed, so that we have the
4333          correct physical line/column for the end-point of the token even
4334          when a logical line is split via one or more backslashes.  */
4335       if (buffer->cur >= buffer->notes[buffer->cur_note].pos
4336           && !pfile->overlaid_buffer)
4337         _cpp_process_line_notes (pfile, false);
4338
4339       source_range tok_range;
4340       tok_range.m_start = result->src_loc;
4341       tok_range.m_finish
4342         = linemap_position_for_column (pfile->line_table,
4343                                        CPP_BUF_COLUMN (buffer, buffer->cur));
4344
4345       result->src_loc
4346         = pfile->line_table->get_or_create_combined_loc (result->src_loc,
4347                                                          tok_range, nullptr, 0);
4348     }
4349
4350   return result;
4351 }
4352
4353 /* An upper bound on the number of bytes needed to spell TOKEN.
4354    Does not include preceding whitespace.  */
4355 unsigned int
4356 cpp_token_len (const cpp_token *token)
4357 {
4358   unsigned int len;
4359
4360   switch (TOKEN_SPELL (token))
4361     {
4362     default:            len = 6;                                break;
4363     case SPELL_LITERAL: len = token->val.str.len;               break;
4364     case SPELL_IDENT:   len = NODE_LEN (token->val.node.node) * 10;     break;
4365     }
4366
4367   return len;
4368 }
4369
/* Decode the UTF-8 sequence starting at NAME and store the matching
   \UXXXXXXXX escape (lower-case hex digits) in BUFFER.  Exactly 10
   bytes are written to BUFFER and no terminator is added.  Returns
   the number of bytes consumed from NAME.  Aborts if a byte that
   should be a continuation byte is not of the form 10xxxxxx.  */

static size_t
utf8_to_ucn (unsigned char *buffer, const unsigned char *name)
{
  static const char hex_digits[] = "0123456789abcdef";
  int seq_len = 0;
  unsigned lead = *name;
  unsigned long code_point;

  /* The number of leading one bits in the first byte gives the length
     of the whole sequence.  */
  while (lead & 0x80)
    {
      seq_len++;
      lead <<= 1;
    }

  /* The payload of the first byte is whatever follows the length
     marker.  */
  code_point = *name & (0x7F >> seq_len);

  /* Every further byte contributes six payload bits.  */
  for (int k = 1; k < seq_len; k++)
    {
      const unsigned char trail = name[k];

      /* Ill-formed UTF-8.  */
      if ((trail & ~0x3F) != 0x80)
        abort ();

      code_point = (code_point << 6) | (trail & 0x3F);
    }

  buffer[0] = '\\';
  buffer[1] = 'U';
  buffer += 2;

  /* Eight hex digits, most significant nibble first.  */
  for (int shift = 28; shift >= 0; shift -= 4)
    *buffer++ = hex_digits[(code_point >> shift) & 0xF];

  return seq_len;
}
4403
4404 /* Given a token TYPE corresponding to a digraph, return a pointer to
4405    the spelling of the digraph.  */
4406 static const unsigned char *
4407 cpp_digraph2name (enum cpp_ttype type)
4408 {
4409   return digraph_spellings[(int) type - (int) CPP_FIRST_DIGRAPH];
4410 }
4411
4412 /* Write the spelling of an identifier IDENT, using UCNs, to BUFFER.
4413    The buffer must already contain enough space to hold the
4414    token's spelling.  Returns a pointer to the character after the
4415    last character written.  */
4416 unsigned char *
4417 _cpp_spell_ident_ucns (unsigned char *buffer, cpp_hashnode *ident)
4418 {
4419   size_t i;
4420   const unsigned char *name = NODE_NAME (ident);
4421
4422   for (i = 0; i < NODE_LEN (ident); i++)
4423     if (name[i] & ~0x7F)
4424       {
4425         i += utf8_to_ucn (buffer, name + i) - 1;
4426         buffer += 10;
4427       }
4428     else
4429       *buffer++ = name[i];
4430
4431   return buffer;
4432 }
4433
4434 /* Write the spelling of a token TOKEN to BUFFER.  The buffer must
4435    already contain enough space to hold the token's spelling.
4436    Returns a pointer to the character after the last character written.
4437    FORSTRING is true if this is to be the spelling after translation
4438    phase 1 (with the original spelling of extended identifiers), false
4439    if extended identifiers should always be written using UCNs (there is
4440    no option for always writing them in the internal UTF-8 form).
4441    FIXME: Would be nice if we didn't need the PFILE argument.  */
4442 unsigned char *
4443 cpp_spell_token (cpp_reader *pfile, const cpp_token *token,
4444                  unsigned char *buffer, bool forstring)
4445 {
4446   switch (TOKEN_SPELL (token))
4447     {
4448     case SPELL_OPERATOR:
4449       {
4450         const unsigned char *spelling;
4451         unsigned char c;
4452
4453         if (token->flags & DIGRAPH)
4454           spelling = cpp_digraph2name (token->type);
4455         else if (token->flags & NAMED_OP)
4456           goto spell_ident;
4457         else
4458           spelling = TOKEN_NAME (token);
4459
4460         while ((c = *spelling++) != '\0')
4461           *buffer++ = c;
4462       }
4463       break;
4464
4465     spell_ident:
4466     case SPELL_IDENT:
4467       if (forstring)
4468         {
4469           memcpy (buffer, NODE_NAME (token->val.node.spelling),
4470                   NODE_LEN (token->val.node.spelling));
4471           buffer += NODE_LEN (token->val.node.spelling);
4472         }
4473       else
4474         buffer = _cpp_spell_ident_ucns (buffer, token->val.node.node);
4475       break;
4476
4477     case SPELL_LITERAL:
4478       memcpy (buffer, token->val.str.text, token->val.str.len);
4479       buffer += token->val.str.len;
4480       break;
4481
4482     case SPELL_NONE:
4483       cpp_error (pfile, CPP_DL_ICE,
4484                  "unspellable token %s", TOKEN_NAME (token));
4485       break;
4486     }
4487
4488   return buffer;
4489 }
4490
4491 /* Returns TOKEN spelt as a null-terminated string.  The string is
4492    freed when the reader is destroyed.  Useful for diagnostics.  */
4493 unsigned char *
4494 cpp_token_as_text (cpp_reader *pfile, const cpp_token *token)
4495 {
4496   unsigned int len = cpp_token_len (token) + 1;
4497   unsigned char *start = _cpp_unaligned_alloc (pfile, len), *end;
4498
4499   end = cpp_spell_token (pfile, token, start, false);
4500   end[0] = '\0';
4501
4502   return start;
4503 }
4504
4505 /* Returns a pointer to a string which spells the token defined by
4506    TYPE and FLAGS.  Used by C front ends, which really should move to
4507    using cpp_token_as_text.  */
4508 const char *
4509 cpp_type2name (enum cpp_ttype type, unsigned char flags)
4510 {
4511   if (flags & DIGRAPH)
4512     return (const char *) cpp_digraph2name (type);
4513   else if (flags & NAMED_OP)
4514     return cpp_named_operator2name (type);
4515
4516   return (const char *) token_spellings[type].name;
4517 }
4518
4519 /* Writes the spelling of token to FP, without any preceding space.
4520    Separated from cpp_spell_token for efficiency - to avoid stdio
4521    double-buffering.  */
4522 void
4523 cpp_output_token (const cpp_token *token, FILE *fp)
4524 {
4525   switch (TOKEN_SPELL (token))
4526     {
4527     case SPELL_OPERATOR:
4528       {
4529         const unsigned char *spelling;
4530         int c;
4531
4532         if (token->flags & DIGRAPH)
4533           spelling = cpp_digraph2name (token->type);
4534         else if (token->flags & NAMED_OP)
4535           goto spell_ident;
4536         else
4537           spelling = TOKEN_NAME (token);
4538
4539         c = *spelling;
4540         do
4541           putc (c, fp);
4542         while ((c = *++spelling) != '\0');
4543       }
4544       break;
4545
4546     spell_ident:
4547     case SPELL_IDENT:
4548       {
4549         size_t i;
4550         const unsigned char * name = NODE_NAME (token->val.node.node);
4551
4552         for (i = 0; i < NODE_LEN (token->val.node.node); i++)
4553           if (name[i] & ~0x7F)
4554             {
4555               unsigned char buffer[10];
4556               i += utf8_to_ucn (buffer, name + i) - 1;
4557               fwrite (buffer, 1, 10, fp);
4558             }
4559           else
4560             fputc (NODE_NAME (token->val.node.node)[i], fp);
4561       }
4562       break;
4563
4564     case SPELL_LITERAL:
4565       if (token->type == CPP_HEADER_NAME)
4566         fputc ('"', fp);
4567       fwrite (token->val.str.text, 1, token->val.str.len, fp);
4568       if (token->type == CPP_HEADER_NAME)
4569         fputc ('"', fp);
4570       break;
4571
4572     case SPELL_NONE:
4573       /* An error, most probably.  */
4574       break;
4575     }
4576 }
4577
4578 /* Compare two tokens.  */
4579 int
4580 _cpp_equiv_tokens (const cpp_token *a, const cpp_token *b)
4581 {
4582   if (a->type == b->type && a->flags == b->flags)
4583     switch (TOKEN_SPELL (a))
4584       {
4585       default:                  /* Keep compiler happy.  */
4586       case SPELL_OPERATOR:
4587         /* token_no is used to track where multiple consecutive ##
4588            tokens were originally located.  */
4589         return (a->type != CPP_PASTE || a->val.token_no == b->val.token_no);
4590       case SPELL_NONE:
4591         return (a->type != CPP_MACRO_ARG
4592                 || (a->val.macro_arg.arg_no == b->val.macro_arg.arg_no
4593                     && a->val.macro_arg.spelling == b->val.macro_arg.spelling));
4594       case SPELL_IDENT:
4595         return (a->val.node.node == b->val.node.node
4596                 && a->val.node.spelling == b->val.node.spelling);
4597       case SPELL_LITERAL:
4598         return (a->val.str.len == b->val.str.len
4599                 && !memcmp (a->val.str.text, b->val.str.text,
4600                             a->val.str.len));
4601       }
4602
4603   return 0;
4604 }
4605
4606 /* Returns nonzero if a space should be inserted to avoid an
4607    accidental token paste for output.  For simplicity, it is
4608    conservative, and occasionally advises a space where one is not
4609    needed, e.g. "." and ".2".  */
4610 int
4611 cpp_avoid_paste (cpp_reader *pfile, const cpp_token *token1,
4612                  const cpp_token *token2)
4613 {
4614   enum cpp_ttype a = token1->type, b = token2->type;
4615   cppchar_t c;
4616
4617   if (token1->flags & NAMED_OP)
4618     a = CPP_NAME;
4619   if (token2->flags & NAMED_OP)
4620     b = CPP_NAME;
4621
4622   c = EOF;
4623   if (token2->flags & DIGRAPH)
4624     c = digraph_spellings[(int) b - (int) CPP_FIRST_DIGRAPH][0];
4625   else if (token_spellings[b].category == SPELL_OPERATOR)
4626     c = token_spellings[b].name[0];
4627
4628   /* Quickly get everything that can paste with an '='.  */
4629   if ((int) a <= (int) CPP_LAST_EQ && c == '=')
4630     return 1;
4631
4632   switch (a)
4633     {
4634     case CPP_GREATER:   return c == '>';
4635     case CPP_LESS:      return c == '<' || c == '%' || c == ':';
4636     case CPP_PLUS:      return c == '+';
4637     case CPP_MINUS:     return c == '-' || c == '>';
4638     case CPP_DIV:       return c == '/' || c == '*'; /* Comments.  */
4639     case CPP_MOD:       return c == ':' || c == '>';
4640     case CPP_AND:       return c == '&';
4641     case CPP_OR:        return c == '|';
4642     case CPP_COLON:     return c == ':' || c == '>';
4643     case CPP_DEREF:     return c == '*';
4644     case CPP_DOT:       return c == '.' || c == '%' || b == CPP_NUMBER;
4645     case CPP_HASH:      return c == '#' || c == '%'; /* Digraph form.  */
4646     case CPP_PRAGMA:
4647     case CPP_NAME:      return ((b == CPP_NUMBER
4648                                  && name_p (pfile, &token2->val.str))
4649                                 || b == CPP_NAME
4650                                 || b == CPP_CHAR || b == CPP_STRING); /* L */
4651     case CPP_NUMBER:    return (b == CPP_NUMBER || b == CPP_NAME
4652                                 || b == CPP_CHAR
4653                                 || c == '.' || c == '+' || c == '-');
4654                                       /* UCNs */
4655     case CPP_OTHER:     return ((token1->val.str.text[0] == '\\'
4656                                  && b == CPP_NAME)
4657                                 || (CPP_OPTION (pfile, objc)
4658                                     && token1->val.str.text[0] == '@'
4659                                     && (b == CPP_NAME || b == CPP_STRING)));
4660     case CPP_LESS_EQ:   return c == '>';
4661     case CPP_STRING:
4662     case CPP_WSTRING:
4663     case CPP_UTF8STRING:
4664     case CPP_STRING16:
4665     case CPP_STRING32:  return (CPP_OPTION (pfile, user_literals)
4666                                 && (b == CPP_NAME
4667                                     || (TOKEN_SPELL (token2) == SPELL_LITERAL
4668                                         && ISIDST (token2->val.str.text[0]))));
4669
4670     default:            break;
4671     }
4672
4673   return 0;
4674 }
4675
4676 /* Output all the remaining tokens on the current line, and a newline
4677    character, to FP.  Leading whitespace is removed.  If there are
4678    macros, special token padding is not performed.  */
4679 void
4680 cpp_output_line (cpp_reader *pfile, FILE *fp)
4681 {
4682   const cpp_token *token;
4683
4684   token = cpp_get_token (pfile);
4685   while (token->type != CPP_EOF)
4686     {
4687       cpp_output_token (token, fp);
4688       token = cpp_get_token (pfile);
4689       if (token->flags & PREV_WHITE)
4690         putc (' ', fp);
4691     }
4692
4693   putc ('\n', fp);
4694 }
4695
4696 /* Return a string representation of all the remaining tokens on the
4697    current line.  The result is allocated using xmalloc and must be
4698    freed by the caller.  */
4699 unsigned char *
4700 cpp_output_line_to_string (cpp_reader *pfile, const unsigned char *dir_name)
4701 {
4702   const cpp_token *token;
4703   unsigned int out = dir_name ? ustrlen (dir_name) : 0;
4704   unsigned int alloced = 120 + out;
4705   unsigned char *result = (unsigned char *) xmalloc (alloced);
4706
4707   /* If DIR_NAME is empty, there are no initial contents.  */
4708   if (dir_name)
4709     {
4710       sprintf ((char *) result, "#%s ", dir_name);
4711       out += 2;
4712     }
4713
4714   token = cpp_get_token (pfile);
4715   while (token->type != CPP_EOF)
4716     {
4717       unsigned char *last;
4718       /* Include room for a possible space and the terminating nul.  */
4719       unsigned int len = cpp_token_len (token) + 2;
4720
4721       if (out + len > alloced)
4722         {
4723           alloced *= 2;
4724           if (out + len > alloced)
4725             alloced = out + len;
4726           result = (unsigned char *) xrealloc (result, alloced);
4727         }
4728
4729       last = cpp_spell_token (pfile, token, &result[out], 0);
4730       out = last - result;
4731
4732       token = cpp_get_token (pfile);
4733       if (token->flags & PREV_WHITE)
4734         result[out++] = ' ';
4735     }
4736
4737   result[out] = '\0';
4738   return result;
4739 }
4740
/* Memory buffers.  Changing these three constants can have a dramatic
   effect on performance.  The values here are reasonable defaults,
   but might be tuned.  If you adjust them, be sure to test across a
   range of uses of cpplib, including heavy nested function-like macro
   expansion.  Also check the change in peak memory usage (NJAMD is a
   good tool for this).

   MIN_BUFF_SIZE is the smallest buffer new_buff will create.
   BUFF_SIZE_UPPER_BOUND (MIN_SIZE) is the largest buffer
   _cpp_get_buff will hand out for a request of MIN_SIZE bytes.
   EXTENDED_BUFF_SIZE (BUFF, MIN_EXTRA) is the size requested when
   extending BUFF: MIN_EXTRA plus twice the room left in BUFF.  Every
   macro argument is parenthesized so that an argument containing a
   low-precedence operator expands as intended.  */
#define MIN_BUFF_SIZE 8000
#define BUFF_SIZE_UPPER_BOUND(MIN_SIZE) (MIN_BUFF_SIZE + (MIN_SIZE) * 3 / 2)
#define EXTENDED_BUFF_SIZE(BUFF, MIN_EXTRA) \
        ((MIN_EXTRA) + ((BUFF)->limit - (BUFF)->cur) * 2)

#if MIN_BUFF_SIZE > BUFF_SIZE_UPPER_BOUND (0)
  #error BUFF_SIZE_UPPER_BOUND must be at least as large as MIN_BUFF_SIZE!
#endif
4755
4756 /* Create a new allocation buffer.  Place the control block at the end
4757    of the buffer, so that buffer overflows will cause immediate chaos.  */
4758 static _cpp_buff *
4759 new_buff (size_t len)
4760 {
4761   _cpp_buff *result;
4762   unsigned char *base;
4763
4764   if (len < MIN_BUFF_SIZE)
4765     len = MIN_BUFF_SIZE;
4766   len = CPP_ALIGN (len);
4767
4768 #ifdef ENABLE_VALGRIND_WORKAROUNDS
4769   /* Valgrind warns about uses of interior pointers, so put _cpp_buff
4770      struct first.  */
4771   size_t slen = CPP_ALIGN2 (sizeof (_cpp_buff), 2 * DEFAULT_ALIGNMENT);
4772   base = XNEWVEC (unsigned char, len + slen);
4773   result = (_cpp_buff *) base;
4774   base += slen;
4775 #else
4776   base = XNEWVEC (unsigned char, len + sizeof (_cpp_buff));
4777   result = (_cpp_buff *) (base + len);
4778 #endif
4779   result->base = base;
4780   result->cur = base;
4781   result->limit = base + len;
4782   result->next = NULL;
4783   return result;
4784 }
4785
4786 /* Place a chain of unwanted allocation buffers on the free list.  */
4787 void
4788 _cpp_release_buff (cpp_reader *pfile, _cpp_buff *buff)
4789 {
4790   _cpp_buff *end = buff;
4791
4792   while (end->next)
4793     end = end->next;
4794   end->next = pfile->free_buffs;
4795   pfile->free_buffs = buff;
4796 }
4797
4798 /* Return a free buffer of size at least MIN_SIZE.  */
4799 _cpp_buff *
4800 _cpp_get_buff (cpp_reader *pfile, size_t min_size)
4801 {
4802   _cpp_buff *result, **p;
4803
4804   for (p = &pfile->free_buffs;; p = &(*p)->next)
4805     {
4806       size_t size;
4807
4808       if (*p == NULL)
4809         return new_buff (min_size);
4810       result = *p;
4811       size = result->limit - result->base;
4812       /* Return a buffer that's big enough, but don't waste one that's
4813          way too big.  */
4814       if (size >= min_size && size <= BUFF_SIZE_UPPER_BOUND (min_size))
4815         break;
4816     }
4817
4818   *p = result->next;
4819   result->next = NULL;
4820   result->cur = result->base;
4821   return result;
4822 }
4823
4824 /* Creates a new buffer with enough space to hold the uncommitted
4825    remaining bytes of BUFF, and at least MIN_EXTRA more bytes.  Copies
4826    the excess bytes to the new buffer.  Chains the new buffer after
4827    BUFF, and returns the new buffer.  */
4828 _cpp_buff *
4829 _cpp_append_extend_buff (cpp_reader *pfile, _cpp_buff *buff, size_t min_extra)
4830 {
4831   size_t size = EXTENDED_BUFF_SIZE (buff, min_extra);
4832   _cpp_buff *new_buff = _cpp_get_buff (pfile, size);
4833
4834   buff->next = new_buff;
4835   memcpy (new_buff->base, buff->cur, BUFF_ROOM (buff));
4836   return new_buff;
4837 }
4838
4839 /* Creates a new buffer with enough space to hold the uncommitted
4840    remaining bytes of the buffer pointed to by BUFF, and at least
4841    MIN_EXTRA more bytes.  Copies the excess bytes to the new buffer.
4842    Chains the new buffer before the buffer pointed to by BUFF, and
4843    updates the pointer to point to the new buffer.  */
4844 void
4845 _cpp_extend_buff (cpp_reader *pfile, _cpp_buff **pbuff, size_t min_extra)
4846 {
4847   _cpp_buff *new_buff, *old_buff = *pbuff;
4848   size_t size = EXTENDED_BUFF_SIZE (old_buff, min_extra);
4849
4850   new_buff = _cpp_get_buff (pfile, size);
4851   memcpy (new_buff->base, old_buff->cur, BUFF_ROOM (old_buff));
4852   new_buff->next = old_buff;
4853   *pbuff = new_buff;
4854 }
4855
4856 /* Free a chain of buffers starting at BUFF.  */
4857 void
4858 _cpp_free_buff (_cpp_buff *buff)
4859 {
4860   _cpp_buff *next;
4861
4862   for (; buff; buff = next)
4863     {
4864       next = buff->next;
4865 #ifdef ENABLE_VALGRIND_WORKAROUNDS
4866       free (buff);
4867 #else
4868       free (buff->base);
4869 #endif
4870     }
4871 }
4872
4873 /* Allocate permanent, unaligned storage of length LEN.  */
4874 unsigned char *
4875 _cpp_unaligned_alloc (cpp_reader *pfile, size_t len)
4876 {
4877   _cpp_buff *buff = pfile->u_buff;
4878   unsigned char *result = buff->cur;
4879
4880   if (len > (size_t) (buff->limit - result))
4881     {
4882       buff = _cpp_get_buff (pfile, len);
4883       buff->next = pfile->u_buff;
4884       pfile->u_buff = buff;
4885       result = buff->cur;
4886     }
4887
4888   buff->cur = result + len;
4889   return result;
4890 }
4891
4892 /* Allocate permanent, unaligned storage of length LEN from a_buff.
4893    That buffer is used for growing allocations when saving macro
4894    replacement lists in a #define, and when parsing an answer to an
4895    assertion in #assert, #unassert or #if (and therefore possibly
4896    whilst expanding macros).  It therefore must not be used by any
4897    code that they might call: specifically the lexer and the guts of
4898    the macro expander.
4899
4900    All existing other uses clearly fit this restriction: storing
4901    registered pragmas during initialization.  */
4902 unsigned char *
4903 _cpp_aligned_alloc (cpp_reader *pfile, size_t len)
4904 {
4905   _cpp_buff *buff = pfile->a_buff;
4906   unsigned char *result = buff->cur;
4907
4908   if (len > (size_t) (buff->limit - result))
4909     {
4910       buff = _cpp_get_buff (pfile, len);
4911       buff->next = pfile->a_buff;
4912       pfile->a_buff = buff;
4913       result = buff->cur;
4914     }
4915
4916   buff->cur = result + len;
4917   return result;
4918 }
4919
4920 /* Commit or allocate storage from a buffer.  */
4921
4922 void *
4923 _cpp_commit_buff (cpp_reader *pfile, size_t size)
4924 {
4925   void *ptr = BUFF_FRONT (pfile->a_buff);
4926
4927   if (pfile->hash_table->alloc_subobject)
4928     {
4929       void *copy = pfile->hash_table->alloc_subobject (size);
4930       memcpy (copy, ptr, size);
4931       ptr = copy;
4932     }
4933   else
4934     BUFF_FRONT (pfile->a_buff) += size;
4935
4936   return ptr;
4937 }
4938
4939 /* Say which field of TOK is in use.  */
4940
4941 enum cpp_token_fld_kind
4942 cpp_token_val_index (const cpp_token *tok)
4943 {
4944   switch (TOKEN_SPELL (tok))
4945     {
4946     case SPELL_IDENT:
4947       return CPP_TOKEN_FLD_NODE;
4948     case SPELL_LITERAL:
4949       return CPP_TOKEN_FLD_STR;
4950     case SPELL_OPERATOR:
4951       /* Operands which were originally spelled as ident keep around
4952          the node for the exact spelling.  */
4953       if (tok->flags & NAMED_OP)
4954         return CPP_TOKEN_FLD_NODE;
4955       else if (tok->type == CPP_PASTE)
4956         return CPP_TOKEN_FLD_TOKEN_NO;
4957       else
4958         return CPP_TOKEN_FLD_NONE;
4959     case SPELL_NONE:
4960       if (tok->type == CPP_MACRO_ARG)
4961         return CPP_TOKEN_FLD_ARG_NO;
4962       else if (tok->type == CPP_PADDING)
4963         return CPP_TOKEN_FLD_SOURCE;
4964       else if (tok->type == CPP_PRAGMA)
4965         return CPP_TOKEN_FLD_PRAGMA;
4966       /* fall through */
4967     default:
4968       return CPP_TOKEN_FLD_NONE;
4969     }
4970 }
4971
4972 /* All tokens lexed in R after calling this function will be forced to
4973    have their location_t to be P, until
4974    cpp_stop_forcing_token_locations is called for R.  */
4975
4976 void
4977 cpp_force_token_locations (cpp_reader *r, location_t loc)
4978 {
4979   r->forced_token_location = loc;
4980 }
4981
4982 /* Go back to assigning locations naturally for lexed tokens.  */
4983
4984 void
4985 cpp_stop_forcing_token_locations (cpp_reader *r)
4986 {
4987   r->forced_token_location = 0;
4988 }
4989
/* PEEK points at a backslash.  While that backslash escapes an end of
   line ("\n", "\r\n" or a lone "\r"), step past the backslash and the
   line ending, repeating if the character found there is again a
   backslash.  A step that would reach or pass LIMIT is not taken.
   Returns the resulting position, which is PEEK itself when the
   backslash does not escape an end of line.  */

static const unsigned char *
do_peek_backslash (const unsigned char *peek, const unsigned char *limit)
{
  for (;;)
    {
      const unsigned char *after;

      if (__builtin_expect (peek[1] == '\n', true))
        after = peek + 2;
      else if (__builtin_expect (peek[1] == '\r', false))
        /* A carriage return may be followed by a newline.  */
        after = peek + (peek[2] == '\n' ? 3 : 2);
      else
        /* Not an escaped end of line.  */
        return peek;

      /* At LIMIT, don't advance.  */
      if (!__builtin_expect (after < limit, true))
        return peek;

      peek = after;

      /* The user might be perverse and escape several line endings in
         a row; go round again if so.  */
      if (*peek != '\\')
        return peek;
    }
}
5019
5020 static const unsigned char *
5021 do_peek_next (const unsigned char *peek, const unsigned char *limit)
5022 {
5023   if (__builtin_expect (*peek == '\\', false))
5024     peek = do_peek_backslash (peek, limit);
5025   return peek;
5026 }
5027
/* Step back from PEEK to the previous character, never going below
   BOUND.  Returns NULL if PEEK is already at BOUND.  Otherwise returns
   a pointer to the character before PEEK, except that when that
   character is a line ending ('\n', '\r', or the '\n' of "\r\n")
   escaped by a preceding backslash, the search continues backwards
   from the backslash (and so may itself return NULL when the
   backslash sits at BOUND).  An unescaped line ending is returned as
   it is.  */

static const unsigned char *
do_peek_prev (const unsigned char *peek, const unsigned char *bound)
{
  if (peek == bound)
    return NULL;

  unsigned char c = *--peek;

  /* Both newline and carriage return can end a line; this mirrors the
     line endings do_peek_backslash accepts when scanning forwards.  */
  if (__builtin_expect (c == '\n', false)
      || __builtin_expect (c == '\r', false))
    {
      /* Nothing before the line ending that could escape it.  */
      if (peek == bound)
        return peek;

      /* IX is the offset from PEEK of the candidate backslash.  */
      int ix = -1;
      if (c == '\n' && peek[ix] == '\r')
        {
          /* "\r\n": the backslash, if any, is one character further
             back.  */
          if (peek + ix == bound)
            return peek;
          ix--;
        }

      /* Escaped line ending: carry on from the backslash.  */
      if (peek[ix] == '\\')
        return do_peek_prev (peek + ix, bound);
    }

  return peek;
}
5056
5057 /* If PEEK[-1] is identifier MATCH, scan past it and trailing white
5058    space.  Otherwise return NULL.  */
5059
5060 static const unsigned char *
5061 do_peek_ident (const char *match, const unsigned char *peek,
5062                const unsigned char *limit)
5063 {
5064   for (; *++match; peek++)
5065     if (*peek != *match)
5066       {
5067         peek = do_peek_next (peek, limit);
5068         if (*peek != *match)
5069           return NULL;
5070       }
5071
5072   /* Must now not be looking at an identifier char.  */
5073   peek = do_peek_next (peek, limit);
5074   if (ISIDNUM (*peek))
5075     return NULL;
5076
5077   /* Skip control-line whitespace.  */
5078  ws:
5079   while (*peek == ' ' || *peek == '\t')
5080     peek++;
5081   if (__builtin_expect (*peek == '\\', false))
5082     {
5083       peek = do_peek_backslash (peek, limit);
5084       if (*peek != '\\')
5085         goto ws;
5086     }
5087
5088   return peek;
5089 }
5090
5091 /* Are we looking at a module control line starting as PEEK - 1?  */
5092
5093 static bool
5094 do_peek_module (cpp_reader *pfile, unsigned char c,
5095                 const unsigned char *peek, const unsigned char *limit)
5096 {
5097   bool import = false;
5098
5099   if (__builtin_expect (c == 'e', false))
5100     {
5101       if (!((peek[0] == 'x' || peek[0] == '\\')
5102             && (peek = do_peek_ident ("export", peek, limit))))
5103         return false;
5104
5105       /* export, peek for import or module.  No need to peek __import
5106          here.  */
5107       if (peek[0] == 'i')
5108         {
5109           if (!((peek[1] == 'm' || peek[1] == '\\')
5110                 && (peek = do_peek_ident ("import", peek + 1, limit))))
5111             return false;
5112           import = true;
5113         }
5114       else if (peek[0] == 'm')
5115         {
5116           if (!((peek[1] == 'o' || peek[1] == '\\')
5117                 && (peek = do_peek_ident ("module", peek + 1, limit))))
5118             return false;
5119         }
5120       else
5121         return false;
5122     }
5123   else if (__builtin_expect (c == 'i', false))
5124     {
5125       if (!((peek[0] == 'm' || peek[0] == '\\')
5126             && (peek = do_peek_ident ("import", peek, limit))))
5127         return false;
5128       import = true;
5129     }
5130   else if (__builtin_expect (c == '_', false))
5131     {
5132       /* Needed for translated includes.   */
5133       if (!((peek[0] == '_' || peek[0] == '\\')
5134             && (peek = do_peek_ident ("__import", peek, limit))))
5135         return false;
5136       import = true;
5137     }
5138   else if (__builtin_expect (c == 'm', false))
5139     {
5140       if (!((peek[0] == 'o' || peek[0] == '\\')
5141             && (peek = do_peek_ident ("module", peek, limit))))
5142         return false;
5143     }
5144   else
5145     return false;
5146
5147   /* Peek the next character to see if it's good enough.  We'll be at
5148      the first non-whitespace char, including skipping an escaped
5149      newline.  */
5150   /* ... import followed by identifier, ':', '<' or header-name
5151      preprocessing tokens, or module followed by identifier, ':' or
5152      ';' preprocessing tokens.  */
5153   unsigned char p = *peek++;
5154
5155   /* A character literal is ... single quotes, ... optionally preceded
5156      by u8, u, U, or L */
5157   /* A string-literal is a ... double quotes, optionally prefixed by
5158      R, u8, u8R, u, uR, U, UR, L, or LR */
5159   if (p == 'u')
5160     {
5161       peek = do_peek_next (peek, limit);
5162       if (*peek == '8')
5163         {
5164           peek++;
5165           goto peek_u8;
5166         }
5167       goto peek_u;
5168     }
5169   else if (p == 'U' || p == 'L')
5170     {
5171     peek_u8:
5172       peek = do_peek_next (peek, limit);
5173     peek_u:
5174       if (*peek == '\"' || *peek == '\'')
5175         return false;
5176
5177       if (*peek == 'R')
5178         goto peek_R;
5179       /* Identifier. Ok.  */
5180     }
5181   else if (p == 'R')
5182     {
5183     peek_R:
5184       if (CPP_OPTION (pfile, rliterals))
5185         {
5186           peek = do_peek_next (peek, limit);
5187           if (*peek == '\"')
5188             return false;
5189         }
5190       /* Identifier. Ok.  */
5191     }
5192   else if ('Z' - 'A' == 25
5193            ? ((p >= 'A' && p <= 'Z') || (p >= 'a' && p <= 'z') || p == '_')
5194            : ISIDST (p))
5195     {
5196       /* Identifier.  Ok. */
5197     }
5198   else if (p == '<')
5199     {
5200       /* Maybe angle header, ok for import.  Reject
5201          '<=', '<<' digraph:'<:'.  */
5202       if (!import)
5203         return false;
5204       peek = do_peek_next (peek, limit);
5205       if (*peek == '=' || *peek == '<'
5206           || (*peek == ':' && CPP_OPTION (pfile, digraphs)))
5207         return false;
5208     }
5209   else if (p == ';')
5210     {
5211       /* SEMICOLON, ok for module.  */
5212       if (import)
5213         return false;
5214     }
5215   else if (p == '"')
5216     {
5217       /* STRING, ok for import.  */
5218       if (!import)
5219         return false;
5220     }
5221   else if (p == ':')
5222     {
5223       /* Maybe COLON, ok.  Reject '::', digraph:':>'.  */
5224       peek = do_peek_next (peek, limit);
5225       if (*peek == ':' || (*peek == '>' && CPP_OPTION (pfile, digraphs)))
5226         return false;
5227     }
5228   else
5229     /* FIXME: Detect a unicode character, excluding those not
5230        permitted as the initial character. [lex.name]/1.  I presume
5231        we need to check the \[uU] spellings, and directly using
5232        Unicode in say UTF8 form?  Or perhaps we do the phase-1
5233        conversion of UTF8 to universal-character-names?  */
5234     return false;
5235
5236   return true;
5237 }
5238
5239 /* Directives-only scanning.  Somewhat more relaxed than correct
5240    parsing -- some ill-formed programs will not be rejected.
        Scan the text of each buffer of PFILE, starting at the buffer's
        next_line, and report what is found through CB.  CB is always
        given PFILE, a CPP_DO_task code and DATA, followed by arguments
        that depend on the task:
          CPP_DO_print     a line count, a start pointer and a byte
                           length describing a run of scanned text;
          CPP_DO_location  pfile->line_table->highest_line, sent after
                           a directive or module control line when the
                           buffer's next_line is still before rlimit;
          CPP_DO_token     a token and its location, once per token of
                           a module control line.
        A '#' found at the beginning of a line is handed to
        _cpp_handle_directive.  Comments and character or string
        literals (raw strings included) are stepped over as a unit.
        When a buffer has been scanned to its end it is popped, and the
        function returns once pfile->buffer is null.  */
5241
5242 void
5243 cpp_directive_only_process (cpp_reader *pfile,
5244                             void *data,
5245                             void (*cb) (cpp_reader *, CPP_DO_task, void *, ...))
5246 {
       /* Whether module control lines should be looked for as well.  */
5247   bool module_p = CPP_OPTION (pfile, module_directives);
5248
5249   do
5250     {
5251     restart:
5252       /* Buffer initialization, but no line cleaning.  Control comes
              back here after each directive or module control line, and
              pfile->buffer is read afresh every time.  */
5253       cpp_buffer *buffer = pfile->buffer;
5254       buffer->cur_note = buffer->notes_used = 0;
5255       buffer->cur = buffer->line_base = buffer->next_line;
5256       buffer->need_line = false;
5257       /* Files always end in a newline or carriage return.  We rely on this for
5258          character peeking safety.  */
5259       gcc_assert (buffer->rlimit[0] == '\n' || buffer->rlimit[0] == '\r');
5260
           /* BASE is the start of the text not yet reported to CB and
              LINE_COUNT the number of line ends consumed since then.
              LINE_START is where the current physical line begins.  */
5261       const unsigned char *base = buffer->cur;
5262       unsigned line_count = 0;
5263       const unsigned char *line_start = base;
5264
           /* BOL is true while only whitespace and comments have been
              seen since the start of the line.  RAW is set once an 'R'
              has been found directly in front of a double quote.  */
5265       bool bol = true;
5266       bool raw = false;
5267
           /* LWM is the bound given to do_peek_prev when looking
              backwards from a quote (presumably it never looks below
              it -- confirm against do_peek_prev).  It is moved forward
              after each comment and literal.  */
5268       const unsigned char *lwm = base;
5269       for (const unsigned char *pos = base, *limit = buffer->rlimit;
5270            pos < limit;)
5271         {
5272           unsigned char c = *pos++;
5273           /* This matches the switch in _cpp_lex_direct.  */
5274           switch (c)
5275             {
5276             case ' ': case '\t': case '\f': case '\v':
5277               /* Whitespace, do nothing.  */
5278               break;
5279
5280             case '\r': /* MAC line ending, or Windows \r\n  */
5281               if (*pos == '\n')
5282                 pos++;
5283               /* FALLTHROUGH */
5284
5285             case '\n':
5286               bol = true;
5287
                   /* Escaped newlines jump straight to this label, so
                      they are counted without changing BOL.  */
5288             next_line:
5289               CPP_INCREMENT_LINE (pfile, 0);
5290               line_count++;
5291               line_start = pos;
5292               break;
5293
5294             case '\\':
5295               /* <backslash><newline> is removed, and doesn't undo any
5296                  preceding escape or whatnot.  */
5297               if (*pos == '\n')
5298                 {
5299                   pos++;
5300                   goto next_line;
5301                 }
5302               else if (*pos == '\r')
5303                 {
5304                   if (pos[1] == '\n')
5305                     pos++;
5306                   pos++;
5307                   goto next_line;
5308                 }
5309               goto dflt;
5310
5311             case '#':
5312               if (bol)
5313                 {
5314                   /* Line directive.  Unless skipping, first report
                          the text that precedes the '#', if any.  */
5315                   if (pos - 1 > base && !pfile->state.skipping)
5316                     cb (pfile, CPP_DO_print, data,
5317                         line_count, base, pos - 1 - base);
5318
5319                   /* Prep things for directive handling. */
5320                   buffer->next_line = pos;
5321                   buffer->need_line = true;
5322                   bool ok = _cpp_get_fresh_line (pfile);
5323                   gcc_checking_assert (ok);
5324
5325                   /* Ensure proper column numbering for generated
5326                      error messages. */
5327                   buffer->line_base -= pos - line_start;
5328
                       /* POS is one past the '#', so the flag is true
                          when the '#' was not at LINE_START.  */
5329                   _cpp_handle_directive (pfile, line_start + 1 != pos);
5330
5331                   /* Sanitize the line settings.  Duplicate #include's can
5332                      mess things up. */
5333                   // FIXME: Necessary?
5334                   pfile->line_table->highest_location
5335                     = pfile->line_table->highest_line;
5336
5337                   if (!pfile->state.skipping
5338                       && pfile->buffer->next_line < pfile->buffer->rlimit)
5339                     cb (pfile, CPP_DO_location, data,
5340                         pfile->line_table->highest_line);
5341
                       /* Resume with whichever buffer is current now,
                          from its next_line.  */
5342                   goto restart;
5343                 }
5344               goto dflt;
5345
5346             case '/':
5347               {
                     /* PEEK is where do_peek_next finds the character
                        following the slash.  */
5348                 const unsigned char *peek = do_peek_next (pos, limit);
5349                 if (!(*peek == '/' || *peek == '*'))
5350                   goto dflt;
5351
5352                 /* Line or block comment.  STAR records that the
                        previous character was a '*' inside a block
                        comment, ESC that it was a backslash.  SLOC is
                        used for the unterminated-comment diagnostic.  */
5353                 bool is_block = *peek == '*';
5354                 bool star = false;
5355                 bool esc = false;
5356                 location_t sloc
5357                   = linemap_position_for_column (pfile->line_table,
5358                                                  pos - line_start);
5359
5360                 while (pos < limit)
5361                   {
5362                     char c = *pos++;
5363                     switch (c)
5364                       {
5365                       case '\\':
5366                         esc = true;
5367                         break;
5368
5369                       case '\r':
5370                         if (*pos == '\n')
5371                           pos++;
5372                         /* FALLTHROUGH  */
5373
5374                       case '\n':
5375                         {
5376                           CPP_INCREMENT_LINE (pfile, 0);
5377                           line_count++;
5378                           line_start = pos;
                               /* An unescaped newline ends a line
                                  comment and starts a fresh line.  */
5379                           if (!esc && !is_block)
5380                             {
5381                               bol = true;
5382                               goto done_comment;
5383                             }
5384                         }
5385                         if (!esc)
5386                           star = false;
5387                         esc = false;
5388                         break;
5389
5390                       case '*':
                             /* NOTE(review): POS has already been
                                advanced past this '*'.  If do_peek_next
                                returned a pointer to the opening '*'
                                itself, this test also holds for that
                                opening '*', so a slash, star, slash
                                sequence would count as a complete
                                comment -- confirm against
                                do_peek_next.  */
5391                         if (pos > peek)
5392                           star = is_block;
5393                         esc = false;
5394                         break;
5395
5396                       case '/':
5397                         if (star)
5398                           goto done_comment;
5399                         /* FALLTHROUGH  */
5400
5401                       default:
5402                         star = false;
5403                         esc = false;
5404                         break;
5405                       }
5406                   }
                     /* Only reached when the loop above ran out of
                        input; both normal exits jump to done_comment.  */
5407                 if (pos < limit || is_block)
5408                   cpp_error_with_line (pfile, CPP_DL_ERROR, sloc, 0,
5409                                        "unterminated comment");
5410               done_comment:
5411                 lwm = pos;
5412                 break;
5413               }
5414
5415             case '\'':
5416               if (!CPP_OPTION (pfile, digit_separators))
5417                 goto delimited_string;
5418
5419               /* Possibly a number punctuator.  */
5420               if (!ISIDNUM (*do_peek_next (pos, limit)))
5421                 goto delimited_string;
5422
5423               goto quote_peek;
5424
5425             case '\"':
5426               if (!CPP_OPTION (pfile, rliterals))
5427                 goto delimited_string;
5428
5429             quote_peek:
5430               {
5431                 /* For ' see if it's a number punctuator
5432                    \.?<digit>(<digit>|<identifier-nondigit>
5433                    |'<digit>|'<nondigit>|[eEpP]<sign>|\.)* */
5434                 /* For " see if it's a raw string
5435                    {U,L,u,u8}R.  This includes CPP_NUMBER detection,
5436                    because that could be 0e+R.  */
                     /* The loop below walks backwards from the quote.
                        QUOTE_FIRST is true while the characters right
                        in front of a double quote are still being
                        matched as a raw-string prefix.  QUOTE_EIGHT
                        means an '8' was seen and a 'u' must precede it.
                        MAYBE_NUMBER_START is true when the earliest
                        character examined so far is a digit, and
                        WANT_NUMBER once a '.' or an exponent sign has
                        been seen.  */
5437                 const unsigned char *peek = pos - 1;
5438                 bool quote_first = c == '"';
5439                 bool quote_eight = false;
5440                 bool maybe_number_start = false;
5441                 bool want_number = false;
5442
5443                 while ((peek = do_peek_prev (peek, lwm)))
5444                   {
5445                     unsigned char p = *peek;
5446                     if (quote_first)
5447                       {
                             /* The character right before the quote
                                must be 'R' for a raw string.  */
5448                         if (!raw)
5449                           {
5450                             if (p != 'R')
5451                               break;
5452                             raw = true;
5453                             continue;
5454                           }
5455
                             /* Now looking at the character in front
                                of the 'R'.  */
5456                         quote_first = false;
5457                         if (p == 'L' || p == 'U' || p == 'u')
5458                           ;
5459                         else if (p == '8')
5460                           quote_eight = true;
5461                         else
5462                           goto second_raw;
5463                       }
5464                     else if (quote_eight)
5465                       {
5466                         if (p != 'u')
5467                           {
5468                             raw = false;
5469                             break;
5470                           }
5471                         quote_eight = false;
5472                       }
5473                     else if (c == '"')
5474                       {
                           /* An identifier or number character here
                              means the 'R' was only the tail of a
                              longer token, unless a number is still
                              wanted.  */
5475                       second_raw:;
5476                         if (!want_number && ISIDNUM (p))
5477                           {
5478                             raw = false;
5479                             break;
5480                           }
5481                       }
5482
5483                     if (ISDIGIT (p))
5484                       maybe_number_start = true;
5485                     else if (p == '.')
5486                       want_number = true;
5487                     else if (ISIDNUM (p))
5488                       maybe_number_start = false;
5489                     else if (p == '+' || p == '-')
5490                       {
                             /* A sign only continues a number when an
                                exponent letter comes before it.  */
5491                         if (const unsigned char *peek_prev
5492                             = do_peek_prev (peek, lwm))
5493                           {
5494                             p = *peek_prev;
5495                             if (p == 'e' || p == 'E'
5496                                 || p == 'p' || p == 'P')
5497                               {
5498                                 want_number = true;
5499                                 maybe_number_start = false;
5500                               }
5501                             else
5502                               break;
5503                           }
5504                         else
5505                           break;
5506                       }
5507                     else if (p == '\'' || p == '\"')
5508                       {
5509                         /* If this is lwm, this must be the end of a
5510                            previous string.  So this is a trailing
5511                            literal type, (a) if those are allowed,
5512                              and (b) maybe_number_start is false.  Otherwise
5513                              this must be a CPP_NUMBER because we've
5514                              met another ', and we'd have checked that
5515                              in its own right.  */
5516                         if (peek == lwm && CPP_OPTION (pfile, uliterals))
5517                           {
5518                             if  (!maybe_number_start && !want_number)
5519                               /* Must be a literal type.  */
5520                               raw = false;
5521                           }
5522                         else if (p == '\''
5523                                  && CPP_OPTION (pfile, digit_separators))
5524                           maybe_number_start = true;
5525                         break;
5526                       }
5527                     else if (c == '\'')
5528                       break;
5529                     else if (!quote_first && !quote_eight)
5530                       break;
5531                   }
5532
5533                 if (maybe_number_start)
5534                   {
5535                     if (c == '\'')
5536                       /* A CPP NUMBER.  */
5537                       goto dflt;
5538                     raw = false;
5539                   }
5540
5541                 goto delimited_string;
5542               }
5543
5544             delimited_string:
5545               {
5546                 /* (Possibly raw) string or char literal.  END is the
                        closing quote character.  DELIM and DELIM_LEN
                        describe a raw string's delimiter.  ESC counts
                        the backslashes seen in a row in a non-raw
                        literal.  */
5547                 unsigned char end = c;
5548                 int delim_len = -1;
5549                 const unsigned char *delim = NULL;
5550                 location_t sloc = linemap_position_for_column (pfile->line_table,
5551                                                                pos - line_start);
5552                 int esc = 0;
5553
5554                 if (raw)
5555                   {
5556                     /* There can be no line breaks in the delimiter.
                            On either error below RAW is cleared and POS
                            is put back to DELIM, so the text is then
                            scanned as an ordinary literal.  */
5557                     delim = pos;
5558                     for (delim_len = 0; (c = *pos++) != '('; delim_len++)
5559                       {
5560                         if (delim_len == 16)
5561                           {
5562                             cpp_error_with_line (pfile, CPP_DL_ERROR,
5563                                                  sloc, 0,
5564                                                  "raw string delimiter"
5565                                                  " longer than %d"
5566                                                  " characters",
5567                                                  delim_len);
5568                             raw = false;
5569                             pos = delim;
5570                             break;
5571                           }
5572                         if (strchr (") \\\t\v\f\n", c))
5573                           {
5574                             cpp_error_with_line (pfile, CPP_DL_ERROR,
5575                                                  sloc, 0,
5576                                                  "invalid character '%c'"
5577                                                  " in raw string"
5578                                                  " delimiter", c);
5579                             raw = false;
5580                             pos = delim;
5581                             break;
5582                           }
5583                         if (pos >= limit)
5584                           goto bad_string;
5585                       }
5586                   }
5587
5588                 while (pos < limit)
5589                   {
5590                     char c = *pos++;
5591                     switch (c)
5592                       {
5593                       case '\\':
5594                         if (!raw)
5595                           esc++;
5596                         break;
5597
5598                       case '\r':
5599                         if (*pos == '\n')
5600                           pos++;
5601                         /* FALLTHROUGH  */
5602
5603                       case '\n':
5604                         {
5605                           CPP_INCREMENT_LINE (pfile, 0);
5606                           line_count++;
5607                           line_start = pos;
5608                         }
                             /* A newline uses up one pending
                                backslash.  */
5609                         if (esc)
5610                           esc--;
5611                         break;
5612
5613                       case ')':
                             /* A raw string ends at ')' followed by the
                                delimiter and the closing quote.  The
                                bound is strict: no match is made when
                                the closing quote is the last byte
                                before LIMIT.  */
5614                         if (raw
5615                             && pos + delim_len + 1 < limit
5616                             && pos[delim_len] == end
5617                             && !memcmp (delim, pos, delim_len))
5618                           {
5619                             pos += delim_len + 1;
5620                             raw = false;
5621                             goto done_string;
5622                           }
5623                         break;
5624
5625                       default:
                             /* In a non-raw literal END closes it when
                                an even number of backslashes comes
                                right before it.  */
5626                         if (!raw && !(esc & 1) && c == end)
5627                           goto done_string;
5628                         esc = 0;
5629                         break;
5630                       }
5631                   }
                   /* Reached when input ran out first; after the
                      diagnostic this falls through to done_string.  */
5632               bad_string:
5633                 cpp_error_with_line (pfile, CPP_DL_ERROR, sloc, 0,
5634                                      "unterminated literal");
5635
5636               done_string:
5637                 raw = false;
5638                 lwm = pos - 1;
5639               }
                   /* A literal is ordinary text as far as BOL is
                      concerned.  */
5640               goto dflt;
5641
                 /* The characters with which a module control line is
                    allowed to start here (presumably the initials of
                    export, import and module plus an underscore-prefixed
                    spelling -- confirm against do_peek_module).  */
5642             case '_':
5643             case 'e':
5644             case 'i':
5645             case 'm':
5646               if (bol && module_p && !pfile->state.skipping
5647                   && do_peek_module (pfile, c, pos, limit))
5648                 {
5649                   /* We've seen the start of a module control line.
5650                      Start up the tokenizer.  */
5651                   pos--; /* Backup over the first character.  */
5652
5653                   /* Backup over whitespace to start of line.  */
5654                   while (pos > line_start
5655                          && (pos[-1] == ' ' || pos[-1] == '\t'))
5656                     pos--;
5657
                       /* Report the text in front of the module
                          control line, if any.  */
5658                   if (pos > base)
5659                     cb (pfile, CPP_DO_print, data, line_count, base, pos - base);
5660
5661                   /* Prep things for directive handling. */
5662                   buffer->next_line = pos;
5663                   buffer->need_line = true;
5664
5665                   /* Now get tokens until the PRAGMA_EOL.  Every token
                          is passed to CB.  Once in_deferred_pragma has
                          been cleared, the token just read has to be
                          the CPP_PRAGMA_EOL.  */
5666                   do
5667                     {
5668                       location_t spelling;
5669                       const cpp_token *tok
5670                         = cpp_get_token_with_location (pfile, &spelling);
5671
5672                       gcc_assert (pfile->state.in_deferred_pragma
5673                                   || tok->type == CPP_PRAGMA_EOL);
5674                       cb (pfile, CPP_DO_token, data, tok, spelling);
5675                     }
5676                   while (pfile->state.in_deferred_pragma);
5677
5678                   if (pfile->buffer->next_line < pfile->buffer->rlimit)
5679                     cb (pfile, CPP_DO_location, data,
5680                         pfile->line_table->highest_line);
5681
5682                   pfile->mi_valid = false;
5683                   goto restart;
5684                 }
5685               goto dflt;
5686
                 /* Any other character: the line is no longer at its
                    beginning.  Several cases above jump to dflt for the
                    same effect.  */
5687             default:
5688             dflt:
5689               bol = false;
5690               pfile->mi_valid = false;
5691               break;
5692             }
5693         }
5694
           /* The end of the buffer was reached.  Unless skipping,
              report everything from BASE onwards.  */
5695       if (buffer->rlimit > base && !pfile->state.skipping)
5696         {
5697           const unsigned char *limit = buffer->rlimit;
5698           /* If the file was not newline terminated, add rlimit, which is
5699              guaranteed to point to a newline, to the end of our range.  */
5700           if (limit[-1] != '\n')
5701             {
5702               limit++;
5703               CPP_INCREMENT_LINE (pfile, 0);
5704               line_count++;
5705             }
5706           cb (pfile, CPP_DO_print, data, line_count, base, limit - base);
5707         }
5708
5709       _cpp_pop_buffer (pfile);
5710     }
5711   while (pfile->buffer);
5712 }