libcpp/lex.cc

   1 /* CPP Library - lexical analysis.
   2    Copyright (C) 2000-2023 Free Software Foundation, Inc.
   3    Contributed by Per Bothner, 1994-95.
   4    Based on CCCP program by Paul Rubin, June 1986
   5    Adapted to ANSI C, Richard Stallman, Jan 1987
   6    Broken out to separate file, Zack Weinberg, Mar 2000
   7
   8 This program is free software; you can redistribute it and/or modify it
   9 under the terms of the GNU General Public License as published by the
  10 Free Software Foundation; either version 3, or (at your option) any
  11 later version.
  12
  13 This program is distributed in the hope that it will be useful,
  14 but WITHOUT ANY WARRANTY; without even the implied warranty of
  15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  16 GNU General Public License for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with this program; see the file COPYING3.  If not see
  20 <http://www.gnu.org/licenses/>.  */
  21
  22 #include "config.h"
  23 #include "system.h"
  24 #include "cpplib.h"
  25 #include "internal.h"
  26
  /* Classification of how a token type is spelled.  One of these values
     is stored for every token type in the token_spellings table.  */
  27 enum spell_type
  28 {
  29   SPELL_OPERATOR = 0,  /* Used by OP() table entries, whose spelling is
                               the string given in the table.  */
  30   SPELL_IDENT,         /* Chosen by TK() entries through SPELL_ ## s.
                               NOTE(review): presumably tokens spelled by an
                               identifier node; confirm against TTYPE_TABLE.  */
  31   SPELL_LITERAL,       /* Chosen by TK() entries.  NOTE(review): presumably
                               tokens that carry their own text; confirm.  */
  32   SPELL_NONE           /* Chosen by TK() entries.  NOTE(review): presumably
                               tokens with no spelling at all; confirm.  */
  33 };
  34
  /* One entry of the token_spellings table: the spelling class of a token
     type together with a printable name for it.  */
  35 struct token_spelling
  36 {
  37   enum spell_type category;   /* Spelling class of this token type.  */
  38   const unsigned char *name;  /* OP() entries: the operator text.
                                      TK() entries: the stringified
                                      enumerator name (#e).  */
  39 };
  40
  /* Spellings of the digraph forms.  NOTE(review): the code that indexes
     this table is not visible here; the order presumably matters.  */
  41 static const unsigned char *const digraph_spellings[] =
  42 { UC"%:", UC"%:%:", UC"<:", UC":>", UC"<%", UC"%>" };
  43
       /* Build token_spellings[] by expanding TTYPE_TABLE with OP and TK
          temporarily defined to produce one initializer per token type:
          OP entries record the operator string S, TK entries record the
          spelling class S and the enumerator name E as a string.  */
  44 #define OP(e, s) { SPELL_OPERATOR, UC s  },
  45 #define TK(e, s) { SPELL_ ## s,    UC #e },
  46 static const struct token_spelling token_spellings[N_TTYPES] = { TTYPE_TABLE };
  47 #undef OP
  48 #undef TK
  49
  /* Look up the spelling class / printable name of TOKEN (a pointer to a
     token) by indexing token_spellings with the token's type.  */
  50 #define TOKEN_SPELL(token) (token_spellings[(token)->type].category)
  51 #define TOKEN_NAME(token) (token_spellings[(token)->type].name)
  52
  53 /* ISO 10646 defines the UCS codespace as the range 0-0x10FFFF inclusive.  */
  54 #define UCS_LIMIT 0x10FFFF
  55
  /* Forward declarations of this file's static helper functions.  */
  56 static void add_line_note (cpp_buffer *, const uchar *, unsigned int);
  57 static int skip_line_comment (cpp_reader *);
  58 static void skip_whitespace (cpp_reader *, cppchar_t);
  59 static void lex_string (cpp_reader *, cpp_token *, const uchar *);
  60 static void save_comment (cpp_reader *, cpp_token *, const uchar *, cppchar_t);
  61 static void store_comment (cpp_reader *, cpp_token *);
  62 static void create_literal (cpp_reader *, cpp_token *, const uchar *,
  63                             unsigned int, enum cpp_ttype);
  64 static bool warn_in_comment (cpp_reader *, _cpp_line_note *);
  65 static int name_p (cpp_reader *, const cpp_string *);
  66 static tokenrun *next_tokenrun (tokenrun *);
  67
  68 static _cpp_buff *new_buff (size_t);
  69
  70
  71 /* Utility routine:
  72
  73    Compares, the token TOKEN to the NUL-terminated string STRING.
  74    TOKEN must be a CPP_NAME.  Returns 1 for equal, 0 for unequal.  */
  75 int
  76 cpp_ideq (const cpp_token *token, const char *string)
  77 {
  78   if (token->type != CPP_NAME)
  79     return 0;
  80
  81   return !ustrcmp (NODE_NAME (token->val.node.node), (const uchar *) string);
  82 }
  83
  84 /* Record a note TYPE at byte POS into the current cleaned logical
  85    line.  */
  86 static void
  87 add_line_note (cpp_buffer *buffer, const uchar *pos, unsigned int type)
  88 {
  89   if (buffer->notes_used == buffer->notes_cap)
  90     {
  91       buffer->notes_cap = buffer->notes_cap * 2 + 200;
  92       buffer->notes = XRESIZEVEC (_cpp_line_note, buffer->notes,
  93                                   buffer->notes_cap);
  94     }
  95
  96   buffer->notes[buffer->notes_used].pos = pos;
  97   buffer->notes[buffer->notes_used].type = type;
  98   buffer->notes_used++;
  99 }
 100
 101 \f
 102 /* Fast path to find line special characters using optimized character
 103    scanning algorithms.  Anything complicated falls back to the slow
 104    path below.  Since this loop is very hot it's worth doing these kinds
 105    of optimizations.
 106
 107    One of the paths through the ifdefs should provide
 108
 109      const uchar *search_line_fast (const uchar *s, const uchar *end);
 110
 111    Between S and END, search for \n, \r, \\, ?.  Return a pointer to
 112    the found character.
 113
 114    Note that the last character of the buffer is *always* a newline,
 115    as forced by _cpp_convert_input.  This fact can be used to avoid
 116    explicitly looking for the end of the buffer.  */
 117
 118 /* Configure gives us an ifdef test.  Give the macro the value 0 when
        configure left it undefined, so that the code below can test it
        in ordinary `if' conditions.  */
 119 #ifndef WORDS_BIGENDIAN
 120 #define WORDS_BIGENDIAN 0
 121 #endif
 122
 123 /* We'd like the largest integer that fits into a register.  There's nothing
 124    in <stdint.h> that gives us that.  For most hosts this is unsigned long,
 125    but MS decided on an LLP64 model.  Thankfully when building with GCC we
 126    can get the "real" word size.  */
 127 #ifdef __GNUC__
 128 typedef unsigned int word_type __attribute__((__mode__(__word__)));
 129 #else
 130 typedef unsigned long word_type;
 131 #endif
 132
 133 /* The code below is only expecting sizes 4 or 8.
 134    Die at compile-time if this expectation is violated: the array size
        below evaluates to 1 when the size is acceptable and to -1 (an
        invalid array size) otherwise.  */
 135 typedef char check_word_type_size
 136   [(sizeof(word_type) == 8 || sizeof(word_type) == 4) * 2 - 1];
 137
 138 /* Return X with the first N bytes forced to values that won't match one
 139    of the interesting characters.  Note that NUL is not interesting.  */
 140
 141 static inline word_type
 142 acc_char_mask_misalign (word_type val, unsigned int n)
 143 {
 144   word_type mask = -1;
 145   if (WORDS_BIGENDIAN)
 146     mask >>= n * 8;
 147   else
 148     mask <<= n * 8;
 149   return val & mask;
 150 }
 151
 152 /* Return X replicated to all byte positions within WORD_TYPE.  */
 153
 154 static inline word_type
 155 acc_char_replicate (uchar x)
 156 {
 157   word_type ret;
 158
 159   ret = (x << 24) | (x << 16) | (x << 8) | x;
 160   if (sizeof(word_type) == 8)
 161     ret = (ret << 16 << 16) | ret;
 162   return ret;
 163 }
 164
 165 /* Return non-zero if some byte of VAL is (probably) C.  */
 166
 167 static inline word_type
 168 acc_char_cmp (word_type val, word_type c)
 169 {
 170 #if defined(__GNUC__) && defined(__alpha__)
 171   /* We can get exact results using a compare-bytes instruction.
 172      Get (val == c) via (0 >= (val ^ c)).  */
 173   return __builtin_alpha_cmpbge (0, val ^ c);
 174 #else
 175   word_type magic = 0x7efefefeU;
 176   if (sizeof(word_type) == 8)
 177     magic = (magic << 16 << 16) | 0xfefefefeU;
 178   magic |= 1;
 179
 180   val ^= c;
 181   return ((val + magic) ^ ~val) & ~magic;
 182 #endif
 183 }
 184
 185 /* Given the result of acc_char_cmp is non-zero, return the index of
 186    the found character.  If this was a false positive, return -1.  */
 187
 188 static inline int
 189 acc_char_index (word_type cmp ATTRIBUTE_UNUSED,
 190                 word_type val ATTRIBUTE_UNUSED)
 191 {
 192 #if defined(__GNUC__) && defined(__alpha__) && !WORDS_BIGENDIAN
 193   /* The cmpbge instruction sets *bits* of the result corresponding to
 194      matches in the bytes with no false positives.  */
 195   return __builtin_ctzl (cmp);
 196 #else
 197   unsigned int i;
 198
 199   /* ??? It would be nice to force unrolling here,
 200      and have all of these constants folded.  */
 201   for (i = 0; i < sizeof(word_type); ++i)
 202     {
 203       uchar c;
 204       if (WORDS_BIGENDIAN)
 205         c = (val >> (sizeof(word_type) - i - 1) * 8) & 0xff;
 206       else
 207         c = (val >> i * 8) & 0xff;
 208
 209       if (c == '\n' || c == '\r' || c == '\\' || c == '?')
 210         return i;
 211     }
 212
 213   return -1;
 214 #endif
 215 }
 216
 217 /* A version of the fast scanner using bit fiddling techniques.
 218
 219    For 32-bit words, one would normally perform 16 comparisons and
 220    16 branches.  With this algorithm one performs 24 arithmetic
 221    operations and one branch.  Whether this is faster with a 32-bit
 222    word size is going to be somewhat system dependent.
 223
 224    For 64-bit words, we eliminate twice the number of comparisons
 225    and branches without increasing the number of arithmetic operations.
 226    It's almost certainly going to be a win with 64-bit word size.  */
 227
 228 static const uchar * search_line_acc_char (const uchar *, const uchar *)
 229   ATTRIBUTE_UNUSED;
 230
 231 static const uchar *
 232 search_line_acc_char (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 233 {
 234   const word_type repl_nl = acc_char_replicate ('\n');
 235   const word_type repl_cr = acc_char_replicate ('\r');
 236   const word_type repl_bs = acc_char_replicate ('\\');
 237   const word_type repl_qm = acc_char_replicate ('?');
 238
 239   unsigned int misalign;
 240   const word_type *p;
 241   word_type val, t;
 242
 243   /* Align the buffer.  Mask out any bytes from before the beginning.  */
 244   p = (word_type *)((uintptr_t)s & -sizeof(word_type));
 245   val = *p;
 246   misalign = (uintptr_t)s & (sizeof(word_type) - 1);
 247   if (misalign)
 248     val = acc_char_mask_misalign (val, misalign);
 249
 250   /* Main loop.  */
 251   while (1)
 252     {
 253       t  = acc_char_cmp (val, repl_nl);
 254       t |= acc_char_cmp (val, repl_cr);
 255       t |= acc_char_cmp (val, repl_bs);
 256       t |= acc_char_cmp (val, repl_qm);
 257
 258       if (__builtin_expect (t != 0, 0))
 259         {
 260           int i = acc_char_index (t, val);
 261           if (i >= 0)
 262             return (const uchar *)p + i;
 263         }
 264
 265       val = *++p;
 266     }
 267 }
 268
 269 /* Disable on Solaris 2/x86 until the following problem can be properly
 270    autoconfed:
 271
 272    The Solaris 10+ assembler tags objects with the instruction set
 273    extensions used, so SSE4.2 executables cannot run on machines that
 274    don't support that extension.  */
 275
 276 #if (GCC_VERSION >= 4005) && (__GNUC__ >= 5 || !defined(__PIC__)) && (defined(__i386__) || defined(__x86_64__)) && !(defined(__sun__) && defined(__svr4__))
 277
 278 /* Replicated character data to be shared between implementations.
 279    Recall that outside of a context with vector support we can't
 280    define compatible vector types, therefore these are all defined
 281    in terms of raw characters.  Each row holds 16 copies of one of the
        characters searched for, and the array is 16-byte aligned so that a
        row can be loaded as a vector.  */
 282 static const char repl_chars[4][16] __attribute__((aligned(16))) = {
 283   { '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
 284     '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n' },  /* [0]: newline.  */
 285   { '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
 286     '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r' },  /* [1]: carriage return.  */
 287   { '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
 288     '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\' },  /* [2]: backslash.  */
 289   { '?', '?', '?', '?', '?', '?', '?', '?',
 290     '?', '?', '?', '?', '?', '?', '?', '?' },          /* [3]: question mark.  */
 291 };
 292
 293 /* A version of the fast scanner using MMX vectorized byte compare insns.
 294
 295    This uses the PMOVMSKB instruction which was introduced with "MMX2",
 296    which was packaged into SSE1; it is also present in the AMD MMX
 297    extension.  Mark the function as using "sse" so that we emit a real
 298    "emms" instruction, rather than the 3dNOW "femms" instruction.

        Returns a pointer to the first \n, \r, \\ or ? at or after S.  END is
        not consulted; the loop runs until a match is found.  */
 299
 300 static const uchar *
 301 #ifndef __SSE__
 302 __attribute__((__target__("sse")))
 303 #endif
 304 search_line_mmx (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 305 {
 306   typedef char v8qi __attribute__ ((__vector_size__ (8)));
 307   typedef int __m64 __attribute__ ((__vector_size__ (8), __may_alias__));
 308
       /* Only the first 8 bytes of each 16-byte repl_chars row are used.  */
 309   const v8qi repl_nl = *(const v8qi *)repl_chars[0];
 310   const v8qi repl_cr = *(const v8qi *)repl_chars[1];
 311   const v8qi repl_bs = *(const v8qi *)repl_chars[2];
 312   const v8qi repl_qm = *(const v8qi *)repl_chars[3];
 313
 314   unsigned int misalign, found, mask;
 315   const v8qi *p;
 316   v8qi data, t, c;
 317
 318   /* Align the source pointer.  While MMX doesn't generate unaligned data
 319      faults, this allows us to safely scan to the end of the buffer without
 320      reading beyond the end of the last page.  */
 321   misalign = (uintptr_t)s & 7;
 322   p = (const v8qi *)((uintptr_t)s & -8);
 323   data = *p;
 324
 325   /* Create a mask for the bytes that are valid within the first
 326      8-byte block: the low MISALIGN bits, which correspond to bytes
          before S, are cleared.  The Idea here is that the AND with the mask
 327      within the loop is "free", since we need some AND or TEST
 328      insn in order to set the flags for the branch anyway.  */
 329   mask = -1u << misalign;
 330
 331   /* Main loop processing 8 bytes at a time.  The loop is entered at
          START so that the first block is tested with the misalignment
          mask; every later block is tested with an all-ones mask.  */
 332   goto start;
 333   do
 334     {
 335       data = *++p;
 336       mask = -1;
 337
 338     start:
           /* OR together the byte-wise equality results for the four
              characters, then collect them into FOUND with pmovmskb.  */
 339       t = __builtin_ia32_pcmpeqb(data, repl_nl);
 340       c = __builtin_ia32_pcmpeqb(data, repl_cr);
 341       t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
 342       c = __builtin_ia32_pcmpeqb(data, repl_bs);
 343       t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
 344       c = __builtin_ia32_pcmpeqb(data, repl_qm);
 345       t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
 346       found = __builtin_ia32_pmovmskb (t);
 347       found &= mask;
 348     }
 349   while (!found);
 350
 351   __builtin_ia32_emms ();
 352
 353   /* FOUND contains 1 in bits for which we matched a relevant
 354      character.  Conversion to the byte index is trivial.  */
 355   found = __builtin_ctz(found);
 356   return (const uchar *)p + found;
 357 }
 358
 359 /* A version of the fast scanner using SSE2 vectorized byte compare insns.
        Returns a pointer to the first \n, \r, \\ or ? at or after S.  END is
        not consulted; the loop runs until a match is found.  */
 360
 361 static const uchar *
 362 #ifndef __SSE2__
 363 __attribute__((__target__("sse2")))
 364 #endif
 365 search_line_sse2 (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 366 {
 367   typedef char v16qi __attribute__ ((__vector_size__ (16)));
 368
 369   const v16qi repl_nl = *(const v16qi *)repl_chars[0];
 370   const v16qi repl_cr = *(const v16qi *)repl_chars[1];
 371   const v16qi repl_bs = *(const v16qi *)repl_chars[2];
 372   const v16qi repl_qm = *(const v16qi *)repl_chars[3];
 373
 374   unsigned int misalign, found, mask;
 375   const v16qi *p;
 376   v16qi data, t;
 377
 378   /* Align the source pointer: the first load is the aligned 16-byte
          block that contains S.  */
 379   misalign = (uintptr_t)s & 15;
 380   p = (const v16qi *)((uintptr_t)s & -16);
 381   data = *p;
 382
 383   /* Create a mask for the bytes that are valid within the first
 384      16-byte block: the low MISALIGN bits, which correspond to bytes
          before S, are cleared.  The Idea here is that the AND with the mask
 385      within the loop is "free", since we need some AND or TEST
 386      insn in order to set the flags for the branch anyway.  */
 387   mask = -1u << misalign;
 388
 389   /* Main loop processing 16 bytes at a time.  The loop is entered at
          START so that the first block is tested with the misalignment
          mask; every later block is tested with an all-ones mask.  */
 390   goto start;
 391   do
 392     {
 393       data = *++p;
 394       mask = -1;
 395
 396     start:
           /* Vector comparisons yield a per-byte match mask; pmovmskb128
              then gathers one bit per byte into FOUND.  */
 397       t  = data == repl_nl;
 398       t |= data == repl_cr;
 399       t |= data == repl_bs;
 400       t |= data == repl_qm;
 401       found = __builtin_ia32_pmovmskb128 (t);
 402       found &= mask;
 403     }
 404   while (!found);
 405
 406   /* FOUND contains 1 in bits for which we matched a relevant
 407      character.  Conversion to the byte index is trivial.  */
 408   found = __builtin_ctz(found);
 409   return (const uchar *)p + found;
 410 }
 411
 412 #ifdef HAVE_SSE4
 413 /* A version of the fast scanner using SSE 4.2 vectorized string insns.
        Returns a pointer to the first \n, \r, \\ or ? at or after S.  END is
        only used to decide whether an unaligned 16-byte load at S is safe.  */
 414
 415 static const uchar *
 416 #ifndef __SSE4_2__
 417 __attribute__((__target__("sse4.2")))
 418 #endif
 419 search_line_sse42 (const uchar *s, const uchar *end)
 420 {
 421   typedef char v16qi __attribute__ ((__vector_size__ (16)));
       /* The set of characters to look for; only the first four elements
          are meaningful, matching the length 4 passed to PCMPESTRI.  */
 422   static const v16qi search = { '\n', '\r', '?', '\\' };
 423
 424   uintptr_t si = (uintptr_t)s;
 425   uintptr_t index;
 426
 427   /* Check for unaligned input.  */
 428   if (si & 15)
 429     {
 430       v16qi sv;
 431
 432       if (__builtin_expect (end - s < 16, 0)
 433           && __builtin_expect ((si & 0xfff) > 0xff0, 0))
 434         {
 435           /* There are less than 16 bytes left in the buffer, and less
 436              than 16 bytes left on the page.  Reading 16 bytes at this
 437              point might generate a spurious page fault.  Defer to the
 438              SSE2 implementation, which already handles alignment.  */
 439           return search_line_sse2 (s, end);
 440         }
 441
 442       /* ??? The builtin doesn't understand that the PCMPESTRI read from
 443          memory need not be aligned.  */
 444       sv = __builtin_ia32_loaddqu ((const char *) s);
 445       index = __builtin_ia32_pcmpestri128 (search, 4, sv, 16, 0);
 446
           /* An index below 16 is the offset of a match within SV.  */
 447       if (__builtin_expect (index < 16, 0))
 448         goto found;
 449
 450       /* Advance the pointer to an aligned address.  We will re-scan a
 451          few bytes, but we no longer need care for reading past the
 452          end of a page, since we're guaranteed a match.  */
 453       s = (const uchar *)((si + 15) & -16);
 454     }
 455
 456   /* Main loop, processing 16 bytes at a time.  */
 457 #ifdef __GCC_ASM_FLAG_OUTPUTS__
 458   while (1)
 459     {
 460       char f;
 461
 462       /* By using inline assembly instead of the builtin,
 463          we can use the result, as well as the flags set.  INDEX is the
              instruction's ECX result and F captures the carry flag; the
              loop stops as soon as F is set.  */
 464       __asm ("%vpcmpestri\t$0, %2, %3"
 465              : "=c"(index), "=@ccc"(f)
 466              : "m"(*s), "x"(search), "a"(4), "d"(16));
 467       if (f)
 468         break;
 469
 470       s += 16;
 471     }
 472 #else
       /* Pre-decrement S because the assembly loop below advances it by
          16 before every comparison, including the first.  */
 473   s -= 16;
 474   /* By doing the whole loop in inline assembly,
 475      we can make proper use of the flags set.  The loop repeats while
          the carry flag is clear (jnc).  */
 476   __asm (      ".balign 16\n"
 477         "0:     add $16, %1\n"
 478         "       %vpcmpestri\t$0, (%1), %2\n"
 479         "       jnc 0b"
 480         : "=&c"(index), "+r"(s)
 481         : "x"(search), "a"(4), "d"(16));
 482 #endif
 483
 484  found:
       /* S is the start of the 16-byte block holding the match and INDEX
          the offset of the match within it.  */
 485   return s + index;
 486 }
 487
 488 #else
 489 /* Work around out-dated assemblers without sse4 support.  */
 490 #define search_line_sse42 search_line_sse2
 491 #endif
 492
 493 /* Check the CPU capabilities.  */
 494
 495 #include "../gcc/config/i386/cpuid.h"
 496
 /* Signature shared by the scanner implementations above, and the pointer
    through which the selected one is called.  The pointer is assigned by
    init_vectorized_lexer.  */
 497 typedef const uchar * (*search_line_fast_type) (const uchar *, const uchar *);
 498 static search_line_fast_type search_line_fast;
 499
 500 #define HAVE_init_vectorized_lexer 1
 501 static inline void
 502 init_vectorized_lexer (void)
 503 {
 504   unsigned dummy, ecx = 0, edx = 0;
 505   search_line_fast_type impl = search_line_acc_char;
 506   int minimum = 0;
 507
 508 #if defined(__SSE4_2__)
 509   minimum = 3;
 510 #elif defined(__SSE2__)
 511   minimum = 2;
 512 #elif defined(__SSE__)
 513   minimum = 1;
 514 #endif
 515
 516   if (minimum == 3)
 517     impl = search_line_sse42;
 518   else if (__get_cpuid (1, &dummy, &dummy, &ecx, &edx) || minimum == 2)
 519     {
 520       if (minimum == 3 || (ecx & bit_SSE4_2))
 521         impl = search_line_sse42;
 522       else if (minimum == 2 || (edx & bit_SSE2))
 523         impl = search_line_sse2;
 524       else if (minimum == 1 || (edx & bit_SSE))
 525         impl = search_line_mmx;
 526     }
 527   else if (__get_cpuid (0x80000001, &dummy, &dummy, &dummy, &edx))
 528     {
 529       if (minimum == 1
 530           || (edx & (bit_MMXEXT | bit_CMOV)) == (bit_MMXEXT | bit_CMOV))
 531         impl = search_line_mmx;
 532     }
 533
 534   search_line_fast = impl;
 535 }
 536
 537 #elif (GCC_VERSION >= 4005) && defined(_ARCH_PWR8) && defined(__ALTIVEC__)
 538
 539 /* A version of the fast scanner using AltiVec vectorized byte compares
 540    and VSX unaligned loads (when VSX is available).  This is otherwise
 541    the same as the AltiVec version.  Returns a pointer to the first
        \n, \r, \\ or ? at or after S; END is not consulted.  */
 542
 543 ATTRIBUTE_NO_SANITIZE_UNDEFINED
 544 static const uchar *
 545 search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 546 {
 547   typedef __attribute__((altivec(vector))) unsigned char vc;
 548
 549   const vc repl_nl = {
 550     '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
 551     '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n'
 552   };
 553   const vc repl_cr = {
 554     '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
 555     '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r'
 556   };
 557   const vc repl_bs = {
 558     '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
 559     '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\'
 560   };
 561   const vc repl_qm = {
 562     '?', '?', '?', '?', '?', '?', '?', '?',
 563     '?', '?', '?', '?', '?', '?', '?', '?',
 564   };
 565   const vc zero = { 0 };
 566
 567   vc data, t;
 568
 569   /* Main loop processing 16 bytes at a time.  The load is taken directly
          from S, so no alignment or masking of leading bytes is done.  */
 570   do
 571     {
 572       vc m_nl, m_cr, m_bs, m_qm;
 573
 574       data = __builtin_vec_vsx_ld (0, s);
 575       s += 16;
 576
 577       m_nl = (vc) __builtin_vec_cmpeq(data, repl_nl);
 578       m_cr = (vc) __builtin_vec_cmpeq(data, repl_cr);
 579       m_bs = (vc) __builtin_vec_cmpeq(data, repl_bs);
 580       m_qm = (vc) __builtin_vec_cmpeq(data, repl_qm);
 581       t = (m_nl | m_cr) | (m_bs | m_qm);
 582
 583       /* T now contains 0xff in bytes for which we matched one of the relevant
 584          characters.  We want to exit the loop if any byte in T is non-zero.
 585          Below is the expansion of vec_any_ne(t, zero).  */
 586     }
 587   while (!__builtin_vec_vcmpeq_p(/*__CR6_LT_REV*/3, t, zero));
 588
 589   /* Restore s to point to the 16 bytes we just processed.  */
 590   s -= 16;
 591
 592   {
     /* N is the number of longs that make up one vector.  */
 593 #define N  (sizeof(vc) / sizeof(long))
 594
 595     union {
 596       vc v;
 597       /* Statically assert that N is 2 or 4.  */
 598       unsigned long l[(N == 2 || N == 4) ? N : -1];
 599     } u;
 600     unsigned long l, i = 0;
 601
 602     u.v = t;
 603
 604     /* Find the first word of T that is non-zero.  S is advanced past
            every all-zero word.  The last word is taken without a test,
            because the loop above only exits when T has a non-zero byte.  */
 605     switch (N)
 606       {
 607       case 4:
 608         l = u.l[i++];
 609         if (l != 0)
 610           break;
 611         s += sizeof(unsigned long);
 612         l = u.l[i++];
 613         if (l != 0)
 614           break;
 615         s += sizeof(unsigned long);
 616         /* FALLTHRU */
 617       case 2:
 618         l = u.l[i++];
 619         if (l != 0)
 620           break;
 621         s += sizeof(unsigned long);
 622         l = u.l[i];
 623       }
 624
 625     /* L now contains 0xff in bytes for which we matched one of the
 626        relevant characters.  We can find the byte index by finding
 627        its bit index and dividing by 8.  The first byte in memory is
            the most significant one on big-endian, hence the leading-zero
            count there and the trailing-zero count otherwise.  */
 628 #ifdef __BIG_ENDIAN__
 629     l = __builtin_clzl(l) >> 3;
 630 #else
 631     l = __builtin_ctzl(l) >> 3;
 632 #endif
 633     return s + l;
 634
 635 #undef N
 636   }
 637 }
 638
 639 #elif (GCC_VERSION >= 4005) && defined(__ALTIVEC__) && defined (__BIG_ENDIAN__)
 640
 641 /* A version of the fast scanner using AltiVec vectorized byte compares.
 642    This cannot be used for little endian because vec_lvsl/lvsr are
 643    deprecated for little endian and the code won't work properly.
        Returns a pointer to the first \n, \r, \\ or ? at or after S; END is
        not consulted.  */
 644 /* ??? Unfortunately, attribute(target("altivec")) is not yet supported,
 645    so we can't compile this function without -maltivec on the command line
 646    (or implied by some other switch).  */
 647
 648 static const uchar *
 649 search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 650 {
 651   typedef __attribute__((altivec(vector))) unsigned char vc;
 652
 653   const vc repl_nl = {
 654     '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
 655     '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n'
 656   };
 657   const vc repl_cr = {
 658     '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
 659     '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r'
 660   };
 661   const vc repl_bs = {
 662     '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
 663     '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\'
 664   };
 665   const vc repl_qm = {
 666     '?', '?', '?', '?', '?', '?', '?', '?',
 667     '?', '?', '?', '?', '?', '?', '?', '?',
 668   };
 669   const vc ones = {
 670     -1, -1, -1, -1, -1, -1, -1, -1,
 671     -1, -1, -1, -1, -1, -1, -1, -1,
 672   };
 673   const vc zero = { 0 };
 674
 675   vc data, mask, t;
 676
 677   /* Altivec loads automatically mask addresses with -16.  This lets us
 678      issue the first load as early as possible.  */
 679   data = __builtin_vec_ld(0, (const vc *)s);
 680
 681   /* Discard bytes before the beginning of the buffer.  Do this by
 682      beginning with all ones and shifting in zeros according to the
 683      mis-alignment.  The LVSR instruction pulls the exact shift we
 684      want from the address.  */
 685   mask = __builtin_vec_lvsr(0, s);
 686   mask = __builtin_vec_perm(zero, ones, mask);
 687   data &= mask;
 688
 689   /* While altivec loads mask addresses, we still need to align S so
 690      that the offset we compute at the end is correct.  */
 691   s = (const uchar *)((uintptr_t)s & -16);
 692
 693   /* Main loop processing 16 bytes at a time.  The loop is entered at
          START so that the first, already masked, block is examined before
          any further block is loaded.  */
 694   goto start;
 695   do
 696     {
 697       vc m_nl, m_cr, m_bs, m_qm;
 698
 699       s += 16;
 700       data = __builtin_vec_ld(0, (const vc *)s);
 701
 702     start:
 703       m_nl = (vc) __builtin_vec_cmpeq(data, repl_nl);
 704       m_cr = (vc) __builtin_vec_cmpeq(data, repl_cr);
 705       m_bs = (vc) __builtin_vec_cmpeq(data, repl_bs);
 706       m_qm = (vc) __builtin_vec_cmpeq(data, repl_qm);
 707       t = (m_nl | m_cr) | (m_bs | m_qm);
 708
 709       /* T now contains 0xff in bytes for which we matched one of the relevant
 710          characters.  We want to exit the loop if any byte in T is non-zero.
 711          Below is the expansion of vec_any_ne(t, zero).  */
 712     }
 713   while (!__builtin_vec_vcmpeq_p(/*__CR6_LT_REV*/3, t, zero));
 714
 715   {
     /* N is the number of longs that make up one vector.  */
 716 #define N  (sizeof(vc) / sizeof(long))
 717
 718     union {
 719       vc v;
 720       /* Statically assert that N is 2 or 4.  */
 721       unsigned long l[(N == 2 || N == 4) ? N : -1];
 722     } u;
 723     unsigned long l, i = 0;
 724
 725     u.v = t;
 726
 727     /* Find the first word of T that is non-zero.  S is advanced past
            every all-zero word.  The last word is taken without a test,
            because the loop above only exits when T has a non-zero byte.  */
 728     switch (N)
 729       {
 730       case 4:
 731         l = u.l[i++];
 732         if (l != 0)
 733           break;
 734         s += sizeof(unsigned long);
 735         l = u.l[i++];
 736         if (l != 0)
 737           break;
 738         s += sizeof(unsigned long);
 739         /* FALLTHROUGH */
 740       case 2:
 741         l = u.l[i++];
 742         if (l != 0)
 743           break;
 744         s += sizeof(unsigned long);
 745         l = u.l[i];
 746       }
 747
 748     /* L now contains 0xff in bytes for which we matched one of the
 749        relevant characters.  We can find the byte index by finding
 750        its bit index and dividing by 8.  This variant is big-endian
            only, so the first byte in memory is the most significant one
            and a leading-zero count is used.  */
 751     l = __builtin_clzl(l) >> 3;
 752     return s + l;
 753
 754 #undef N
 755   }
 756 }
 757
 758 #elif defined (__ARM_NEON) && defined (__ARM_64BIT_STATE)
 759 #include "arm_neon.h"
 760
 761 /* This doesn't have to be the exact page size, but no system may use
 762    a size smaller than this.  ARMv8 requires a minimum page size of
 763    4k.  The impact of being conservative here is a small number of
 764    cases will take the slightly slower entry path into the main
 765    loop.  */
 766
 767 #define AARCH64_MIN_PAGE_SIZE 4096
 768
     /* AArch64 NEON version of the fast scanner.  Returns a pointer to the
        first \n, \r, \\ or ? at or after S; END is not consulted.  */
 769 static const uchar *
 770 search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 771 {
 772   const uint8x16_t repl_nl = vdupq_n_u8 ('\n');
 773   const uint8x16_t repl_cr = vdupq_n_u8 ('\r');
 774   const uint8x16_t repl_bs = vdupq_n_u8 ('\\');
 775   const uint8x16_t repl_qm = vdupq_n_u8 ('?');
       /* XMASK and SHIFT are used to condense a per-byte match vector into
          the integer FOUND, whose lowest set bit gives the byte offset of
          the first match (see the uses of __builtin_ctz below).  */
 776   const uint8x16_t xmask = (uint8x16_t) vdupq_n_u64 (0x8040201008040201ULL);
 777
 778 #ifdef __ARM_BIG_ENDIAN
 779   const int16x8_t shift = {8, 8, 8, 8, 0, 0, 0, 0};
 780 #else
 781   const int16x8_t shift = {0, 0, 0, 0, 8, 8, 8, 8};
 782 #endif
 783
 784   unsigned int found;
 785   const uint8_t *p;
 786   uint8x16_t data;
 787   uint8x16_t t;
 788   uint16x8_t m;
 789   uint8x16_t u, v, w;
 790
 791   /* Align the source pointer.  */
 792   p = (const uint8_t *)((uintptr_t)s & -16);
 793
 794   /* Assuming random string start positions, with a 4k page size we'll take
 795      the slow path about 0.37% of the time.  */
 796   if (__builtin_expect ((AARCH64_MIN_PAGE_SIZE
 797                          - (((uintptr_t) s) & (AARCH64_MIN_PAGE_SIZE - 1)))
 798                         < 16, 0))
 799     {
 800       /* Slow path: the string starts near a possible page boundary.
              Load the aligned block containing S and use MASK to ignore
              matches in the bytes that precede S.  */
 801       uint32_t misalign, mask;
 802
 803       misalign = (uintptr_t)s & 15;
 804       mask = (-1u << misalign) & 0xffff;
 805       data = vld1q_u8 (p);
 806       t = vceqq_u8 (data, repl_nl);
 807       u = vceqq_u8 (data, repl_cr);
 808       v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
 809       w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
 810       t = vorrq_u8 (v, w);
 811       t = vandq_u8 (t, xmask);
 812       m = vpaddlq_u8 (t);
 813       m = vshlq_u16 (m, shift);
 814       found = vaddvq_u16 (m);
 815       found &= mask;
 816       if (found)
 817         return (const uchar*)p + __builtin_ctz (found);
 818     }
 819   else
 820     {
           /* Fast path: at least 16 bytes remain before the next possible
              page boundary, so load 16 bytes directly from S.  P is left
              at the aligned address at or below S.  */
 821       data = vld1q_u8 ((const uint8_t *) s);
 822       t = vceqq_u8 (data, repl_nl);
 823       u = vceqq_u8 (data, repl_cr);
 824       v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
 825       w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
 826       t = vorrq_u8 (v, w);
 827       if (__builtin_expect (vpaddd_u64 ((uint64x2_t)t) != 0, 0))
 828         goto done;
 829     }
 830
       /* Main loop: aligned 16-byte blocks starting with the one after P,
          until some byte of T is non-zero.  */
 831   do
 832     {
 833       p += 16;
 834       data = vld1q_u8 (p);
 835       t = vceqq_u8 (data, repl_nl);
 836       u = vceqq_u8 (data, repl_cr);
 837       v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
 838       w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
 839       t = vorrq_u8 (v, w);
 840     } while (!vpaddd_u64 ((uint64x2_t)t));
 841
 842 done:
 843   /* Now that we've found the terminating substring, work out precisely where
 844      we need to stop.  */
 845   t = vandq_u8 (t, xmask);
 846   m = vpaddlq_u8 (t);
 847   m = vshlq_u16 (m, shift);
 848   found = vaddvq_u16 (m);
       /* P is below S only when the match came from the unaligned first
          load at S; in that case the offset is relative to S, not P.  */
 849   return (((((uintptr_t) p) < (uintptr_t) s) ? s : (const uchar *)p)
 850           + __builtin_ctz (found));
 851 }
 852
 853 #elif defined (__ARM_NEON)
 854 #include "arm_neon.h"
 855
 /* ARM NEON version of the fast scanner.  Returns a pointer to the first
    \n, \r, \\ or ? at or after S; END is not consulted.  */
 856 static const uchar *
 857 search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 858 {
 859   const uint8x16_t repl_nl = vdupq_n_u8 ('\n');
 860   const uint8x16_t repl_cr = vdupq_n_u8 ('\r');
 861   const uint8x16_t repl_bs = vdupq_n_u8 ('\\');
 862   const uint8x16_t repl_qm = vdupq_n_u8 ('?');
       /* XMASK is ANDed with the per-byte match vector before the pairwise
          additions below condense it into the integer FOUND.  */
 863   const uint8x16_t xmask = (uint8x16_t) vdupq_n_u64 (0x8040201008040201ULL);
 864
 865   unsigned int misalign, found, mask;
 866   const uint8_t *p;
 867   uint8x16_t data;
 868
 869   /* Align the source pointer: the first load is the aligned 16-byte
          block that contains S.  */
 870   misalign = (uintptr_t)s & 15;
 871   p = (const uint8_t *)((uintptr_t)s & -16);
 872   data = vld1q_u8 (p);
 873
 874   /* Create a mask for the bytes that are valid within the first
 875      16-byte block: the low MISALIGN bits, which correspond to bytes
          before S, are cleared.  The Idea here is that the AND with the mask
 876      within the loop is "free", since we need some AND or TEST
 877      insn in order to set the flags for the branch anyway.  */
 878   mask = (-1u << misalign) & 0xffff;
 879
 880   /* Main loop, processing 16 bytes at a time.  The loop is entered at
          START so that the first block is tested with the misalignment
          mask; every later block is tested with mask 0xffff.  */
 881   goto start;
 882
 883   do
 884     {
 885       uint8x8_t l;
 886       uint16x4_t m;
 887       uint32x2_t n;
 888       uint8x16_t t, u, v, w;
 889
 890       p += 16;
 891       data = vld1q_u8 (p);
 892       mask = 0xffff;
 893
 894     start:
 895       t = vceqq_u8 (data, repl_nl);
 896       u = vceqq_u8 (data, repl_cr);
 897       v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
 898       w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
 899       t = vandq_u8 (vorrq_u8 (v, w), xmask);
 900       l = vpadd_u8 (vget_low_u8 (t), vget_high_u8 (t));
 901       m = vpaddl_u8 (l);
 902       n = vpaddl_u16 (m);
 903
 904       found = vget_lane_u32 ((uint32x2_t) vorr_u64 ((uint64x1_t) n,
 905               vshr_n_u64 ((uint64x1_t) n, 24)), 0);
 906       found &= mask;
 907     }
 908   while (!found);
 909
 910   /* FOUND contains 1 in bits for which we matched a relevant
 911      character.  Conversion to the byte index is trivial.  */
 912   found = __builtin_ctz (found);
 913   return (const uchar *)p + found;
 914 }
 915
 916 #else
 917
/* Fallback used when none of the preceding target-specific branches of
   this conditional applies.  We only have one accelerated alternative.
   Use a direct call so that we encourage inlining.  */

#define search_line_fast  search_line_acc_char
 922
 923 #endif
 924
/* Initialize the lexer if needed.  When HAVE_init_vectorized_lexer is
   defined this calls init_vectorized_lexer; otherwise there is nothing
   to do and the function is empty.  */

void
_cpp_init_lexer (void)
{
#ifdef HAVE_init_vectorized_lexer
  init_vectorized_lexer ();
#endif
}
 934
/* Returns with a logical line that contains no escaped newlines or
   trigraphs.  This is a time-critical inner loop.

   The line starts at PFILE->buffer->next_line.  On return:
     - buffer->cur and buffer->line_base point at the start of the line;
     - the logical line is terminated in place by a '\n' written at D;
     - line notes have been recorded for every escaped newline and
       trigraph seen, followed by a sentinel note;
     - buffer->next_line points just past the last source character
       consumed.

   Three cursors are used: S reads the source, D is the write position
   (it trails S once anything has been removed or replaced), and P is a
   scratch pointer used when looking backwards for a backslash.  */
void
_cpp_clean_line (cpp_reader *pfile)
{
  cpp_buffer *buffer;
  const uchar *s;
  uchar c, *d, *p;

  buffer = pfile->buffer;
  /* Start a fresh set of notes for this line.  */
  buffer->cur_note = buffer->notes_used = 0;
  buffer->cur = buffer->line_base = buffer->next_line;
  buffer->need_line = false;
  s = buffer->next_line;

  if (!buffer->from_stage3)
    {
      /* Most recent backslash seen on the fast path, if any.  */
      const uchar *pbackslash = NULL;

      /* Fast path.  This is the common case of an un-escaped line with
         no trigraphs.  The primary win here is by not writing any
         data back to memory until we have to.  */
      while (1)
        {
          /* Perform an optimized search for \n, \r, \\, ?.  */
          s = search_line_fast (s, buffer->rlimit);

          c = *s;
          if (c == '\\')
            {
              /* Record the location of the backslash and continue.  */
              pbackslash = s++;
            }
          else if (__builtin_expect (c == '?', 0))
            {
              if (__builtin_expect (s[1] == '?', false)
                   && _cpp_trigraph_map[s[2]])
                {
                  /* Have a trigraph.  We may or may not have to convert
                     it.  Add a line note regardless, for -Wtrigraphs.  */
                  add_line_note (buffer, s, s[2]);
                  if (CPP_OPTION (pfile, trigraphs))
                    {
                      /* We do, and that means we have to switch to the
                         slow path.  Overwrite the first '?' with the
                         replacement and leave S on the last character
                         of the trigraph; the slow loop pre-increments
                         both S and D before using them.  */
                      d = (uchar *) s;
                      *d = _cpp_trigraph_map[s[2]];
                      s += 2;
                      goto slow_path;
                    }
                }
              /* Not a trigraph.  Continue on fast-path.  */
              s++;
            }
          else
            break;
        }

      /* This must be \r or \n.  We're either done, or we'll be forced
         to write back to the buffer and continue on the slow path.  */
      d = (uchar *) s;

      /* Stopped exactly at the buffer limit: nothing more to examine.  */
      if (__builtin_expect (s == buffer->rlimit, false))
        goto done;

      /* DOS line ending? */
      if (__builtin_expect (c == '\r', false) && s[1] == '\n')
        {
          s++;
          if (s == buffer->rlimit)
            goto done;
        }

      /* No backslash anywhere on the line, so it cannot be escaped.  */
      if (__builtin_expect (pbackslash == NULL, true))
        goto done;

      /* Check for escaped newline.  Skip back over any horizontal
         whitespace before the newline; the newline is escaped only if
         the character before that is the last backslash we saw.  */
      p = d;
      while (is_nvspace (p[-1]))
        p--;
      if (p - 1 != pbackslash)
        goto done;

      /* Have an escaped newline; process it and proceed to
         the slow path.  The note type records whether whitespace
         separated the backslash from the newline.  */
      add_line_note (buffer, p - 1, p != d ? ' ' : '\\');
      /* The slow loop writes to *++D, so this makes the next write land
         on the backslash itself, removing it from the logical line.  */
      d = p - 2;
      buffer->next_line = p - 1;

    slow_path:
      /* Slow path: copy the rest of the logical line down over the
         removed characters, one byte at a time.  */
      while (1)
        {
          c = *++s;
          *++d = c;

          if (c == '\n' || c == '\r')
            {
              /* Handle DOS line endings.  */
              if (c == '\r' && s != buffer->rlimit && s[1] == '\n')
                s++;
              if (s == buffer->rlimit)
                break;

              /* Escaped?  Look backwards in the already-copied text,
                 never going further back than the position recorded in
                 buffer->next_line by the previous escaped newline.  */
              p = d;
              while (p != buffer->next_line && is_nvspace (p[-1]))
                p--;
              if (p == buffer->next_line || p[-1] != '\\')
                break;

              add_line_note (buffer, p - 1, p != d ? ' ': '\\');
              /* As above: the next write overwrites the backslash.  */
              d = p - 2;
              buffer->next_line = p - 1;
            }
          else if (c == '?' && s[1] == '?' && _cpp_trigraph_map[s[2]])
            {
              /* Add a note regardless, for the benefit of -Wtrigraphs.  */
              add_line_note (buffer, d, s[2]);
              if (CPP_OPTION (pfile, trigraphs))
                {
                  /* Replace the '?' just copied with the character the
                     trigraph stands for and skip its other two bytes.  */
                  *d = _cpp_trigraph_map[s[2]];
                  s += 2;
                }
            }
        }
    }
  else
    {
      /* Buffers flagged from_stage3 get no trigraph or escaped-newline
         processing here; simply find the end of the line.  */
      while (*s != '\n' && *s != '\r')
        s++;
      d = (uchar *) s;

      /* Handle DOS line endings.  */
      if (*s == '\r' && s + 1 != buffer->rlimit && s[1] == '\n')
        s++;
    }

 done:
  /* Terminate the logical line; D may be well before S if characters
     were removed.  */
  *d = '\n';
  /* A sentinel note that should never be processed.  */
  add_line_note (buffer, d + 1, '\n');
  buffer->next_line = s + 1;
}
1078
1079 template <bool lexing_raw_string>
1080 static bool get_fresh_line_impl (cpp_reader *pfile);
1081
1082 /* Return true if the trigraph indicated by NOTE should be warned
1083    about in a comment.  */
1084 static bool
1085 warn_in_comment (cpp_reader *pfile, _cpp_line_note *note)
1086 {
1087   const uchar *p;
1088
1089   /* Within comments we don't warn about trigraphs, unless the
1090      trigraph forms an escaped newline, as that may change
1091      behavior.  */
1092   if (note->type != '/')
1093     return false;
1094
1095   /* If -trigraphs, then this was an escaped newline iff the next note
1096      is coincident.  */
1097   if (CPP_OPTION (pfile, trigraphs))
1098     return note[1].pos == note->pos;
1099
1100   /* Otherwise, see if this forms an escaped newline.  */
1101   p = note->pos + 3;
1102   while (is_nvspace (*p))
1103     p++;
1104
1105   /* There might have been escaped newlines between the trigraph and the
1106      newline we found.  Hence the position test.  */
1107   return (*p == '\n' && p < note[1].pos);
1108 }
1109
1110 /* Process the notes created by add_line_note as far as the current
1111    location.  */
1112 void
1113 _cpp_process_line_notes (cpp_reader *pfile, int in_comment)
1114 {
1115   cpp_buffer *buffer = pfile->buffer;
1116
1117   for (;;)
1118     {
1119       _cpp_line_note *note = &buffer->notes[buffer->cur_note];
1120       unsigned int col;
1121
1122       if (note->pos > buffer->cur)
1123         break;
1124
1125       buffer->cur_note++;
1126       col = CPP_BUF_COLUMN (buffer, note->pos + 1);
1127
1128       if (note->type == '\\' || note->type == ' ')
1129         {
1130           if (note->type == ' ' && !in_comment)
1131             cpp_error_with_line (pfile, CPP_DL_WARNING, pfile->line_table->highest_line, col,
1132                                  "backslash and newline separated by space");
1133
1134           if (buffer->next_line > buffer->rlimit)
1135             {
1136               cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line, col,
1137                                    "backslash-newline at end of file");
1138               /* Prevent "no newline at end of file" warning.  */
1139               buffer->next_line = buffer->rlimit;
1140             }
1141
1142           buffer->line_base = note->pos;
1143           CPP_INCREMENT_LINE (pfile, 0);
1144         }
1145       else if (_cpp_trigraph_map[note->type])
1146         {
1147           if (CPP_OPTION (pfile, warn_trigraphs)
1148               && (!in_comment || warn_in_comment (pfile, note)))
1149             {
1150               if (CPP_OPTION (pfile, trigraphs))
1151                 cpp_warning_with_line (pfile, CPP_W_TRIGRAPHS,
1152                                        pfile->line_table->highest_line, col,
1153                                        "trigraph ??%c converted to %c",
1154                                        note->type,
1155                                        (int) _cpp_trigraph_map[note->type]);
1156               else
1157                 {
1158                   cpp_warning_with_line
1159                     (pfile, CPP_W_TRIGRAPHS,
1160                      pfile->line_table->highest_line, col,
1161                      "trigraph ??%c ignored, use -trigraphs to enable",
1162                      note->type);
1163                 }
1164             }
1165         }
1166       else if (note->type == 0)
1167         /* Already processed in lex_raw_string.  */;
1168       else
1169         abort ();
1170     }
1171 }
1172
1173 namespace bidi {
1174   enum class kind {
1175     NONE, LRE, RLE, LRO, RLO, LRI, RLI, FSI, PDF, PDI, LTR, RTL
1176   };
1177
1178   /* All the UTF-8 encodings of bidi characters start with E2.  */
1179   constexpr uchar utf8_start = 0xe2;
1180
1181   struct context
1182   {
1183     context () {}
1184     context (location_t loc, kind k, bool pdf, bool ucn)
1185     : m_loc (loc), m_kind (k), m_pdf (pdf), m_ucn (ucn)
1186     {
1187     }
1188
1189     kind get_pop_kind () const
1190     {
1191       return m_pdf ? kind::PDF : kind::PDI;
1192     }
1193     bool ucn_p () const
1194     {
1195       return m_ucn;
1196     }
1197
1198     location_t m_loc;
1199     kind m_kind;
1200     unsigned m_pdf : 1;
1201     unsigned m_ucn : 1;
1202   };
1203
1204   /* A vector holding currently open bidi contexts.  We use a char for
1205      each context, its LSB is 1 if it represents a PDF context, 0 if it
1206      represents a PDI context.  The next bit is 1 if this context was open
1207      by a bidi character written as a UCN, and 0 when it was UTF-8.  */
1208   semi_embedded_vec <context, 16> vec;
1209
1210   /* Close the whole comment/identifier/string literal/character constant
1211      context.  */
1212   void on_close ()
1213   {
1214     vec.truncate (0);
1215   }
1216
1217   /* Pop the last element in the vector.  */
1218   void pop ()
1219   {
1220     unsigned int len = vec.count ();
1221     gcc_checking_assert (len > 0);
1222     vec.truncate (len - 1);
1223   }
1224
1225   /* Return the pop kind of the context of the Ith element.  */
1226   kind pop_kind_at (unsigned int i)
1227   {
1228     return vec[i].get_pop_kind ();
1229   }
1230
1231   /* Return the pop kind of the context that is currently opened.  */
1232   kind current_ctx ()
1233   {
1234     unsigned int len = vec.count ();
1235     if (len == 0)
1236       return kind::NONE;
1237     return vec[len - 1].get_pop_kind ();
1238   }
1239
1240   /* Return true if the current context comes from a UCN origin, that is,
1241      the bidi char which started this bidi context was written as a UCN.  */
1242   bool current_ctx_ucn_p ()
1243   {
1244     unsigned int len = vec.count ();
1245     gcc_checking_assert (len > 0);
1246     return vec[len - 1].m_ucn;
1247   }
1248
1249   location_t current_ctx_loc ()
1250   {
1251     unsigned int len = vec.count ();
1252     gcc_checking_assert (len > 0);
1253     return vec[len - 1].m_loc;
1254   }
1255
1256   /* We've read a bidi char, update the current vector as necessary.
1257      LOC is only valid when K is not kind::NONE.  */
1258   void on_char (kind k, bool ucn_p, location_t loc)
1259   {
1260     switch (k)
1261       {
1262       case kind::LRE:
1263       case kind::RLE:
1264       case kind::LRO:
1265       case kind::RLO:
1266         vec.push (context (loc, k, true, ucn_p));
1267         break;
1268       case kind::LRI:
1269       case kind::RLI:
1270       case kind::FSI:
1271         vec.push (context (loc, k, false, ucn_p));
1272         break;
1273       /* PDF terminates the scope of the last LRE, RLE, LRO, or RLO
1274          whose scope has not yet been terminated.  */
1275       case kind::PDF:
1276         if (current_ctx () == kind::PDF)
1277           pop ();
1278         break;
1279       /* PDI terminates the scope of the last LRI, RLI, or FSI whose
1280          scope has not yet been terminated, as well as the scopes of
1281          any subsequent LREs, RLEs, LROs, or RLOs whose scopes have not
1282          yet been terminated.  */
1283       case kind::PDI:
1284         for (int i = vec.count () - 1; i >= 0; --i)
1285           if (pop_kind_at (i) == kind::PDI)
1286             {
1287               vec.truncate (i);
1288               break;
1289             }
1290         break;
1291       case kind::LTR:
1292       case kind::RTL:
1293         /* These aren't popped by a PDF/PDI.  */
1294         break;
1295       ATTR_LIKELY case kind::NONE:
1296         break;
1297       default:
1298         abort ();
1299       }
1300   }
1301
1302   /* Return a descriptive string for K.  */
1303   const char *to_str (kind k)
1304   {
1305     switch (k)
1306       {
1307       case kind::LRE:
1308         return "U+202A (LEFT-TO-RIGHT EMBEDDING)";
1309       case kind::RLE:
1310         return "U+202B (RIGHT-TO-LEFT EMBEDDING)";
1311       case kind::LRO:
1312         return "U+202D (LEFT-TO-RIGHT OVERRIDE)";
1313       case kind::RLO:
1314         return "U+202E (RIGHT-TO-LEFT OVERRIDE)";
1315       case kind::LRI:
1316         return "U+2066 (LEFT-TO-RIGHT ISOLATE)";
1317       case kind::RLI:
1318         return "U+2067 (RIGHT-TO-LEFT ISOLATE)";
1319       case kind::FSI:
1320         return "U+2068 (FIRST STRONG ISOLATE)";
1321       case kind::PDF:
1322         return "U+202C (POP DIRECTIONAL FORMATTING)";
1323       case kind::PDI:
1324         return "U+2069 (POP DIRECTIONAL ISOLATE)";
1325       case kind::LTR:
1326         return "U+200E (LEFT-TO-RIGHT MARK)";
1327       case kind::RTL:
1328         return "U+200F (RIGHT-TO-LEFT MARK)";
1329       default:
1330         abort ();
1331       }
1332   }
1333 }
1334
1335 /* Get location_t for the range of bytes [START, START + NUM_BYTES)
1336    within the current line in FILE, with the caret at START.  */
1337
1338 static location_t
1339 get_location_for_byte_range_in_cur_line (cpp_reader *pfile,
1340                                          const unsigned char *const start,
1341                                          size_t num_bytes)
1342 {
1343   gcc_checking_assert (num_bytes > 0);
1344
1345   /* CPP_BUF_COLUMN and linemap_position_for_column both refer
1346      to offsets in bytes, but CPP_BUF_COLUMN is 0-based,
1347      whereas linemap_position_for_column is 1-based.  */
1348
1349   /* Get 0-based offsets within the line.  */
1350   size_t start_offset = CPP_BUF_COLUMN (pfile->buffer, start);
1351   size_t end_offset = start_offset + num_bytes - 1;
1352
1353   /* Now convert to location_t, where "columns" are 1-based byte offsets.  */
1354   location_t start_loc = linemap_position_for_column (pfile->line_table,
1355                                                       start_offset + 1);
1356   location_t end_loc = linemap_position_for_column (pfile->line_table,
1357                                                      end_offset + 1);
1358
1359   if (start_loc == end_loc)
1360     return start_loc;
1361
1362   source_range src_range;
1363   src_range.m_start = start_loc;
1364   src_range.m_finish = end_loc;
1365   location_t combined_loc
1366     = pfile->line_table->get_or_create_combined_loc (start_loc,
1367                                                      src_range,
1368                                                      nullptr,
1369                                                      0);
1370   return combined_loc;
1371 }
1372
1373 /* Parse a sequence of 3 bytes starting with P and return its bidi code.  */
1374
1375 static bidi::kind
1376 get_bidi_utf8_1 (const unsigned char *const p)
1377 {
1378   gcc_checking_assert (p[0] == bidi::utf8_start);
1379
1380   if (p[1] == 0x80)
1381     switch (p[2])
1382       {
1383       case 0xaa:
1384         return bidi::kind::LRE;
1385       case 0xab:
1386         return bidi::kind::RLE;
1387       case 0xac:
1388         return bidi::kind::PDF;
1389       case 0xad:
1390         return bidi::kind::LRO;
1391       case 0xae:
1392         return bidi::kind::RLO;
1393       case 0x8e:
1394         return bidi::kind::LTR;
1395       case 0x8f:
1396         return bidi::kind::RTL;
1397       default:
1398         break;
1399       }
1400   else if (p[1] == 0x81)
1401     switch (p[2])
1402       {
1403       case 0xa6:
1404         return bidi::kind::LRI;
1405       case 0xa7:
1406         return bidi::kind::RLI;
1407       case 0xa8:
1408         return bidi::kind::FSI;
1409       case 0xa9:
1410         return bidi::kind::PDI;
1411       default:
1412         break;
1413       }
1414
1415   return bidi::kind::NONE;
1416 }
1417
1418 /* Parse a sequence of 3 bytes starting with P and return its bidi code.
1419    If the kind is not NONE, write the location to *OUT.*/
1420
1421 static bidi::kind
1422 get_bidi_utf8 (cpp_reader *pfile, const unsigned char *const p, location_t *out)
1423 {
1424   bidi::kind result = get_bidi_utf8_1 (p);
1425   if (result != bidi::kind::NONE)
1426     {
1427       /* We have a sequence of 3 bytes starting at P.  */
1428       *out = get_location_for_byte_range_in_cur_line (pfile, p, 3);
1429     }
1430   return result;
1431 }
1432
1433 /* Parse a UCN where P points just past \u or \U and return its bidi code.  */
1434
1435 static bidi::kind
1436 get_bidi_ucn_1 (const unsigned char *p, bool is_U, const unsigned char **end)
1437 {
1438   /* 6.4.3 Universal Character Names
1439       \u hex-quad
1440       \U hex-quad hex-quad
1441       \u { simple-hexadecimal-digit-sequence }
1442      where \unnnn means \U0000nnnn.  */
1443
1444   *end = p + 4;
1445   if (is_U)
1446     {
1447       if (p[0] != '0' || p[1] != '0' || p[2] != '0' || p[3] != '0')
1448         return bidi::kind::NONE;
1449       /* Skip 4B so we can treat \u and \U the same below.  */
1450       p += 4;
1451       *end += 4;
1452     }
1453   else if (p[0] == '{')
1454     {
1455       p++;
1456       while (*p == '0')
1457         p++;
1458       if (p[0] != '2'
1459           || p[1] != '0'
1460           || !ISXDIGIT (p[2])
1461           || !ISXDIGIT (p[3])
1462           || p[4] != '}')
1463         return bidi::kind::NONE;
1464       *end = p + 5;
1465     }
1466
1467   /* All code points we are looking for start with 20xx.  */
1468   if (p[0] != '2' || p[1] != '0')
1469     return bidi::kind::NONE;
1470   else if (p[2] == '2')
1471     switch (p[3])
1472       {
1473       case 'a':
1474       case 'A':
1475         return bidi::kind::LRE;
1476       case 'b':
1477       case 'B':
1478         return bidi::kind::RLE;
1479       case 'c':
1480       case 'C':
1481         return bidi::kind::PDF;
1482       case 'd':
1483       case 'D':
1484         return bidi::kind::LRO;
1485       case 'e':
1486       case 'E':
1487         return bidi::kind::RLO;
1488       default:
1489         break;
1490       }
1491   else if (p[2] == '6')
1492     switch (p[3])
1493       {
1494       case '6':
1495         return bidi::kind::LRI;
1496       case '7':
1497         return bidi::kind::RLI;
1498       case '8':
1499         return bidi::kind::FSI;
1500       case '9':
1501         return bidi::kind::PDI;
1502       default:
1503         break;
1504       }
1505   else if (p[2] == '0')
1506     switch (p[3])
1507       {
1508       case 'e':
1509       case 'E':
1510         return bidi::kind::LTR;
1511       case 'f':
1512       case 'F':
1513         return bidi::kind::RTL;
1514       default:
1515         break;
1516       }
1517
1518   return bidi::kind::NONE;
1519 }
1520
1521 /* Parse a UCN where P points just past \u or \U and return its bidi code.
1522    If the kind is not NONE, write the location to *OUT.  */
1523
1524 static bidi::kind
1525 get_bidi_ucn (cpp_reader *pfile, const unsigned char *p, bool is_U,
1526               location_t *out)
1527 {
1528   const unsigned char *end;
1529   bidi::kind result = get_bidi_ucn_1 (p, is_U, &end);
1530   if (result != bidi::kind::NONE)
1531     {
1532       const unsigned char *start = p - 2;
1533       size_t num_bytes = end - start;
1534       *out = get_location_for_byte_range_in_cur_line (pfile, start, num_bytes);
1535     }
1536   return result;
1537 }
1538
1539 /* Parse a named universal character escape where P points just past \N and
1540    return its bidi code.  If the kind is not NONE, write the location to
1541    *OUT.  */
1542
1543 static bidi::kind
1544 get_bidi_named (cpp_reader *pfile, const unsigned char *p, location_t *out)
1545 {
1546   bidi::kind result = bidi::kind::NONE;
1547   if (*p != '{')
1548     return bidi::kind::NONE;
1549   if (strncmp ((const char *) (p + 1), "LEFT-TO-RIGHT ", 14) == 0)
1550     {
1551       if (strncmp ((const char *) (p + 15), "MARK}", 5) == 0)
1552         result = bidi::kind::LTR;
1553       else if (strncmp ((const char *) (p + 15), "EMBEDDING}", 10) == 0)
1554         result = bidi::kind::LRE;
1555       else if (strncmp ((const char *) (p + 15), "OVERRIDE}", 9) == 0)
1556         result = bidi::kind::LRO;
1557       else if (strncmp ((const char *) (p + 15), "ISOLATE}", 8) == 0)
1558         result = bidi::kind::LRI;
1559     }
1560   else if (strncmp ((const char *) (p + 1), "RIGHT-TO-LEFT ", 14) == 0)
1561     {
1562       if (strncmp ((const char *) (p + 15), "MARK}", 5) == 0)
1563         result = bidi::kind::RTL;
1564       else if (strncmp ((const char *) (p + 15), "EMBEDDING}", 10) == 0)
1565         result = bidi::kind::RLE;
1566       else if (strncmp ((const char *) (p + 15), "OVERRIDE}", 9) == 0)
1567         result = bidi::kind::RLO;
1568       else if (strncmp ((const char *) (p + 15), "ISOLATE}", 8) == 0)
1569         result = bidi::kind::RLI;
1570     }
1571   else if (strncmp ((const char *) (p + 1), "POP DIRECTIONAL ", 16) == 0)
1572     {
1573       if (strncmp ((const char *) (p + 16), "FORMATTING}", 11) == 0)
1574         result = bidi::kind::PDF;
1575       else if (strncmp ((const char *) (p + 16), "ISOLATE}", 8) == 0)
1576         result = bidi::kind::PDI;
1577     }
1578   else if (strncmp ((const char *) (p + 1), "FIRST STRONG ISOLATE}", 21) == 0)
1579     result = bidi::kind::FSI;
1580   if (result != bidi::kind::NONE)
1581     *out = get_location_for_byte_range_in_cur_line (pfile, p - 2,
1582                                                     (strchr ((const char *)
1583                                                              (p + 1), '}')
1584                                                      - (const char *) p)
1585                                                     + 3);
1586   return result;
1587 }
1588
1589 /* Subclass of rich_location for reporting on unpaired UTF-8
1590    bidirectional control character(s).
1591    Escape the source lines on output, and show all unclosed
1592    bidi context, labelling everything.  */
1593
1594 class unpaired_bidi_rich_location : public rich_location
1595 {
1596  public:
1597   class custom_range_label : public range_label
1598   {
1599    public:
1600      label_text get_text (unsigned range_idx) const final override
1601      {
1602        /* range 0 is the primary location; each subsequent range i + 1
1603           is for bidi::vec[i].  */
1604        if (range_idx > 0)
1605          {
1606            const bidi::context &ctxt (bidi::vec[range_idx - 1]);
1607            return label_text::borrow (bidi::to_str (ctxt.m_kind));
1608          }
1609        else
1610          return label_text::borrow (_("end of bidirectional context"));
1611      }
1612   };
1613
1614   unpaired_bidi_rich_location (cpp_reader *pfile, location_t loc)
1615   : rich_location (pfile->line_table, loc, &m_custom_label)
1616   {
1617     set_escape_on_output (true);
1618     for (unsigned i = 0; i < bidi::vec.count (); i++)
1619       add_range (bidi::vec[i].m_loc,
1620                  SHOW_RANGE_WITHOUT_CARET,
1621                  &m_custom_label);
1622   }
1623
1624  private:
1625    custom_range_label m_custom_label;
1626 };
1627
1628 /* We're closing a bidi context, that is, we've encountered a newline,
1629    are closing a C-style comment, or are at the end of a string literal,
1630    character constant, or identifier.  Warn if this context was not
1631    properly terminated by a PDI or PDF.  P points to the last character
1632    in this context.  */
1633
1634 static void
1635 maybe_warn_bidi_on_close (cpp_reader *pfile, const uchar *p)
1636 {
1637   const auto warn_bidi = CPP_OPTION (pfile, cpp_warn_bidirectional);
1638   if (bidi::vec.count () > 0
1639       && (warn_bidi & bidirectional_unpaired
1640           && (!bidi::current_ctx_ucn_p ()
1641               || (warn_bidi & bidirectional_ucn))))
1642     {
1643       const location_t loc
1644         = linemap_position_for_column (pfile->line_table,
1645                                        CPP_BUF_COLUMN (pfile->buffer, p));
1646       unpaired_bidi_rich_location rich_loc (pfile, loc);
1647       /* cpp_callbacks doesn't yet have a way to handle singular vs plural
1648          forms of a diagnostic, so fake it for now.  */
1649       if (bidi::vec.count () > 1)
1650         cpp_warning_at (pfile, CPP_W_BIDIRECTIONAL, &rich_loc,
1651                         "unpaired UTF-8 bidirectional control characters "
1652                         "detected");
1653       else
1654         cpp_warning_at (pfile, CPP_W_BIDIRECTIONAL, &rich_loc,
1655                         "unpaired UTF-8 bidirectional control character "
1656                         "detected");
1657     }
1658   /* We're done with this context.  */
1659   bidi::on_close ();
1660 }
1661
1662 /* We're at the beginning or in the middle of an identifier/comment/string
1663    literal/character constant.  Warn if we've encountered a bidi character.
1664    KIND says which bidi control character it was; UCN_P is true iff this bidi
1665    control character was written as a UCN.  LOC is the location of the
1666    character, but is only valid if KIND != bidi::kind::NONE.  */
1667
1668 static void
1669 maybe_warn_bidi_on_char (cpp_reader *pfile, bidi::kind kind,
1670                          bool ucn_p, location_t loc)
1671 {
1672   if (__builtin_expect (kind == bidi::kind::NONE, 1))
1673     return;
1674
1675   const auto warn_bidi = CPP_OPTION (pfile, cpp_warn_bidirectional);
1676
1677   if (warn_bidi & (bidirectional_unpaired|bidirectional_any))
1678     {
1679       rich_location rich_loc (pfile->line_table, loc);
1680       rich_loc.set_escape_on_output (true);
1681
1682       /* It seems excessive to warn about a PDI/PDF that is closing
1683          an opened context because we've already warned about the
1684          opening character.  Except warn when we have a UCN x UTF-8
1685          mismatch, if UCN checking is enabled.  */
1686       if (kind == bidi::current_ctx ())
1687         {
1688           if (warn_bidi == (bidirectional_unpaired|bidirectional_ucn)
1689               && bidi::current_ctx_ucn_p () != ucn_p)
1690             {
1691               rich_loc.add_range (bidi::current_ctx_loc ());
1692               cpp_warning_at (pfile, CPP_W_BIDIRECTIONAL, &rich_loc,
1693                               "UTF-8 vs UCN mismatch when closing "
1694                               "a context by \"%s\"", bidi::to_str (kind));
1695             }
1696         }
1697       else if (warn_bidi & bidirectional_any
1698                && (!ucn_p || (warn_bidi & bidirectional_ucn)))
1699         {
1700           if (kind == bidi::kind::PDF || kind == bidi::kind::PDI)
1701             cpp_warning_at (pfile, CPP_W_BIDIRECTIONAL, &rich_loc,
1702                             "\"%s\" is closing an unopened context",
1703                             bidi::to_str (kind));
1704           else
1705             cpp_warning_at (pfile, CPP_W_BIDIRECTIONAL, &rich_loc,
1706                             "found problematic Unicode character \"%s\"",
1707                             bidi::to_str (kind));
1708         }
1709     }
1710   /* We're done with this context.  */
1711   bidi::on_char (kind, ucn_p, loc);
1712 }
1713
/* Byte-value thresholds used when classifying UTF-8 bytes: a byte in
   [utf8_continuation, utf8_signifier) is treated as a continuation
   byte, and a byte >= utf8_signifier as the start of a multi-byte
   sequence.  */
static const cppchar_t utf8_continuation = 0x80;
static const cppchar_t utf8_signifier = 0xC0;
1716
1717 /* Emit -Winvalid-utf8 warning on invalid UTF-8 character starting
1718    at PFILE->buffer->cur.  Return a pointer after the diagnosed
1719    invalid character.  */
1720
1721 static const uchar *
1722 _cpp_warn_invalid_utf8 (cpp_reader *pfile)
1723 {
1724   cpp_buffer *buffer = pfile->buffer;
1725   const uchar *cur = buffer->cur;
1726   bool pedantic = (CPP_PEDANTIC (pfile)
1727                    && CPP_OPTION (pfile, cpp_warn_invalid_utf8) == 2);
1728
1729   if (cur[0] < utf8_signifier
1730       || cur[1] < utf8_continuation || cur[1] >= utf8_signifier)
1731     {
1732       if (pedantic)
1733         cpp_error_with_line (pfile, CPP_DL_PEDWARN,
1734                              pfile->line_table->highest_line,
1735                              CPP_BUF_COL (buffer),
1736                              "invalid UTF-8 character <%x>",
1737                              cur[0]);
1738       else
1739         cpp_warning_with_line (pfile, CPP_W_INVALID_UTF8,
1740                                pfile->line_table->highest_line,
1741                                CPP_BUF_COL (buffer),
1742                                "invalid UTF-8 character <%x>",
1743                                cur[0]);
1744       return cur + 1;
1745     }
1746   else if (cur[2] < utf8_continuation || cur[2] >= utf8_signifier)
1747     {
1748       if (pedantic)
1749         cpp_error_with_line (pfile, CPP_DL_PEDWARN,
1750                              pfile->line_table->highest_line,
1751                              CPP_BUF_COL (buffer),
1752                              "invalid UTF-8 character <%x><%x>",
1753                              cur[0], cur[1]);
1754       else
1755         cpp_warning_with_line (pfile, CPP_W_INVALID_UTF8,
1756                                pfile->line_table->highest_line,
1757                                CPP_BUF_COL (buffer),
1758                                "invalid UTF-8 character <%x><%x>",
1759                                cur[0], cur[1]);
1760       return cur + 2;
1761     }
1762   else if (cur[3] < utf8_continuation || cur[3] >= utf8_signifier)
1763     {
1764       if (pedantic)
1765         cpp_error_with_line (pfile, CPP_DL_PEDWARN,
1766                              pfile->line_table->highest_line,
1767                              CPP_BUF_COL (buffer),
1768                              "invalid UTF-8 character <%x><%x><%x>",
1769                              cur[0], cur[1], cur[2]);
1770       else
1771         cpp_warning_with_line (pfile, CPP_W_INVALID_UTF8,
1772                                pfile->line_table->highest_line,
1773                                CPP_BUF_COL (buffer),
1774                                "invalid UTF-8 character <%x><%x><%x>",
1775                                cur[0], cur[1], cur[2]);
1776       return cur + 3;
1777     }
1778   else
1779     {
1780       if (pedantic)
1781         cpp_error_with_line (pfile, CPP_DL_PEDWARN,
1782                              pfile->line_table->highest_line,
1783                              CPP_BUF_COL (buffer),
1784                              "invalid UTF-8 character <%x><%x><%x><%x>",
1785                              cur[0], cur[1], cur[2], cur[3]);
1786       else
1787         cpp_warning_with_line (pfile, CPP_W_INVALID_UTF8,
1788                                pfile->line_table->highest_line,
1789                                CPP_BUF_COL (buffer),
1790                                "invalid UTF-8 character <%x><%x><%x><%x>",
1791                                cur[0], cur[1], cur[2], cur[3]);
1792       return cur + 4;
1793     }
1794 }
1795
1796 /* Helper function of *skip_*_comment and lex*_string.  For C,
1797    character at CUR[-1] with MSB set handle -Wbidi-chars* and
1798    -Winvalid-utf8 diagnostics and return pointer to first character
1799    that should be processed next.  */
1800
1801 static inline const uchar *
1802 _cpp_handle_multibyte_utf8 (cpp_reader *pfile, uchar c,
1803                             const uchar *cur, bool warn_bidi_p,
1804                             bool warn_invalid_utf8_p)
1805 {
1806   /* If this is a beginning of a UTF-8 encoding, it might be
1807      a bidirectional control character.  */
1808   if (c == bidi::utf8_start && warn_bidi_p)
1809     {
1810       location_t loc;
1811       bidi::kind kind = get_bidi_utf8 (pfile, cur - 1, &loc);
1812       maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/false, loc);
1813     }
1814   if (!warn_invalid_utf8_p)
1815     return cur;
1816   if (c >= utf8_signifier)
1817     {
1818       cppchar_t s;
1819       const uchar *pstr = cur - 1;
1820       if (_cpp_valid_utf8 (pfile, &pstr, pfile->buffer->rlimit, 0, NULL, &s)
1821           && s <= UCS_LIMIT)
1822         return pstr;
1823     }
1824   pfile->buffer->cur = cur - 1;
1825   return _cpp_warn_invalid_utf8 (pfile);
1826 }
1827
/* Skip a C-style block comment.  The end of the comment is found by
   checking whether an asterisk precedes each '/' that is encountered.

   On entry buffer->cur points to the initial asterisk of the comment.

   Returns true if the end of the buffer is reached (buffer->next_line
   >= buffer->rlimit after a newline) before the comment is closed.
   Returns false once the closing delimiter is found, in which case
   buffer->cur is left just past the closing '/'.

   While scanning, nested-comment warnings (CPP_W_COMMENTS) are issued
   when the warn_comments option is set, and bidirectional-control and
   invalid-UTF-8 checks are made when the corresponding options are
   enabled.  */
bool
_cpp_skip_block_comment (cpp_reader *pfile)
{
  cpp_buffer *buffer = pfile->buffer;
  const uchar *cur = buffer->cur;
  uchar c;
  const bool warn_bidi_p = pfile->warn_bidi_p ();
  const bool warn_invalid_utf8_p = CPP_OPTION (pfile, cpp_warn_invalid_utf8);
  const bool warn_bidi_or_invalid_utf8_p = warn_bidi_p | warn_invalid_utf8_p;

  /* Step over the opening asterisk.  A '/' directly after it is
     skipped too, so that the cur[-2] == '*' test below can never pair
     the opening asterisk with that slash.  */
  cur++;
  if (*cur == '/')
    cur++;

  for (;;)
    {
      /* People like decorating comments with '*', so check for '/'
         instead for efficiency.  */
      c = *cur++;

      if (c == '/')
        {
          /* CUR is already one past the '/', so cur[-2] is the
             character that preceded it.  */
          if (cur[-2] == '*')
            {
              if (warn_bidi_p)
                maybe_warn_bidi_on_close (pfile, cur);
              break;
            }

          /* Warn about potential nested comments, but not if the '/'
             comes immediately before the true comment delimiter.
             Don't bother to get it right across escaped newlines.  */
          if (CPP_OPTION (pfile, warn_comments)
              && cur[0] == '*' && cur[1] != '/')
            {
              /* Publish the scan position before warning.
                 NOTE(review): presumably CPP_BUF_COL derives the
                 column from buffer->cur -- confirm.  */
              buffer->cur = cur;
              cpp_warning_with_line (pfile, CPP_W_COMMENTS,
                                     pfile->line_table->highest_line,
                                     CPP_BUF_COL (buffer),
                                     "\"/*\" within comment");
            }
        }
      else if (c == '\n')
        {
          unsigned int cols;
          /* Leave buffer->cur on the newline itself while the line
             notes are processed.  */
          buffer->cur = cur - 1;
          if (warn_bidi_p)
            maybe_warn_bidi_on_close (pfile, cur);
          _cpp_process_line_notes (pfile, true);
          /* No further line available: the comment is unterminated.  */
          if (buffer->next_line >= buffer->rlimit)
            return true;
          _cpp_clean_line (pfile);

          cols = buffer->next_line - buffer->line_base;
          CPP_INCREMENT_LINE (pfile, cols);

          /* Resume scanning from buffer->cur as it stands after the
             calls above.  */
          cur = buffer->cur;
        }
      /* A byte at or above utf8_continuation: hand it to the shared
         helper for the bidi / invalid UTF-8 checks, but only when at
         least one of the two warnings is enabled.  */
      else if (__builtin_expect (c >= utf8_continuation, 0)
               && warn_bidi_or_invalid_utf8_p)
        cur = _cpp_handle_multibyte_utf8 (pfile, c, cur, warn_bidi_p,
                                          warn_invalid_utf8_p);
    }

  buffer->cur = cur;
  _cpp_process_line_notes (pfile, true);
  return false;
}
1901
1902 /* Skip a C++ line comment, leaving buffer->cur pointing to the
1903    terminating newline.  Handles escaped newlines.  Returns nonzero
1904    if a multiline comment.  */
1905 static int
1906 skip_line_comment (cpp_reader *pfile)
1907 {
1908   cpp_buffer *buffer = pfile->buffer;
1909   location_t orig_line = pfile->line_table->highest_line;
1910   const bool warn_bidi_p = pfile->warn_bidi_p ();
1911   const bool warn_invalid_utf8_p = CPP_OPTION (pfile, cpp_warn_invalid_utf8);
1912   const bool warn_bidi_or_invalid_utf8_p = warn_bidi_p | warn_invalid_utf8_p;
1913
1914   if (!warn_bidi_or_invalid_utf8_p)
1915     while (*buffer->cur != '\n')
1916       buffer->cur++;
1917   else if (!warn_invalid_utf8_p)
1918     {
1919       while (*buffer->cur != '\n'
1920              && *buffer->cur != bidi::utf8_start)
1921         buffer->cur++;
1922       if (__builtin_expect (*buffer->cur == bidi::utf8_start, 0))
1923         {
1924           while (*buffer->cur != '\n')
1925             {
1926               if (__builtin_expect (*buffer->cur == bidi::utf8_start, 0))
1927                 {
1928                   location_t loc;
1929                   bidi::kind kind = get_bidi_utf8 (pfile, buffer->cur, &loc);
1930                   maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/false, loc);
1931                 }
1932               buffer->cur++;
1933             }
1934           maybe_warn_bidi_on_close (pfile, buffer->cur);
1935         }
1936     }
1937   else
1938     {
1939       while (*buffer->cur != '\n')
1940         {
1941           if (*buffer->cur < utf8_continuation)
1942             {
1943               buffer->cur++;
1944               continue;
1945             }
1946           buffer->cur
1947             = _cpp_handle_multibyte_utf8 (pfile, *buffer->cur, buffer->cur + 1,
1948                                           warn_bidi_p, warn_invalid_utf8_p);
1949         }
1950       if (warn_bidi_p)
1951         maybe_warn_bidi_on_close (pfile, buffer->cur);
1952     }
1953
1954   _cpp_process_line_notes (pfile, true);
1955   return orig_line != pfile->line_table->highest_line;
1956 }
1957
1958 /* Skips whitespace, saving the next non-whitespace character.  */
1959 static void
1960 skip_whitespace (cpp_reader *pfile, cppchar_t c)
1961 {
1962   cpp_buffer *buffer = pfile->buffer;
1963   bool saw_NUL = false;
1964
1965   do
1966     {
1967       /* Horizontal space always OK.  */
1968       if (c == ' ' || c == '\t')
1969         ;
1970       /* Just \f \v or \0 left.  */
1971       else if (c == '\0')
1972         saw_NUL = true;
1973       else if (pfile->state.in_directive && CPP_PEDANTIC (pfile))
1974         cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line,
1975                              CPP_BUF_COL (buffer),
1976                              "%s in preprocessing directive",
1977                              c == '\f' ? "form feed" : "vertical tab");
1978
1979       c = *buffer->cur++;
1980     }
1981   /* We only want non-vertical space, i.e. ' ' \t \f \v \0.  */
1982   while (is_nvspace (c));
1983
1984   if (saw_NUL)
1985     {
1986       encoding_rich_location rich_loc (pfile);
1987       cpp_error_at (pfile, CPP_DL_WARNING, &rich_loc,
1988                     "null character(s) ignored");
1989     }
1990
1991   buffer->cur--;
1992 }
1993
1994 /* See if the characters of a number token are valid in a name (no
1995    '.', '+' or '-').  */
1996 static int
1997 name_p (cpp_reader *pfile, const cpp_string *string)
1998 {
1999   unsigned int i;
2000
2001   for (i = 0; i < string->len; i++)
2002     if (!is_idchar (string->text[i]))
2003       return 0;
2004
2005   return 1;
2006 }
2007
2008 /* After parsing an identifier or other sequence, produce a warning about
2009    sequences not in NFC/NFKC.  */
2010 static void
2011 warn_about_normalization (cpp_reader *pfile,
2012                           const cpp_token *token,
2013                           const struct normalize_state *s,
2014                           bool identifier)
2015 {
2016   if (CPP_OPTION (pfile, warn_normalize) < NORMALIZE_STATE_RESULT (s)
2017       && !pfile->state.skipping)
2018     {
2019       location_t loc = token->src_loc;
2020
2021       /* If possible, create a location range for the token.  */
2022       if (loc >= RESERVED_LOCATION_COUNT
2023           && token->type != CPP_EOF
2024           /* There must be no line notes to process.  */
2025           && (!(pfile->buffer->cur
2026                 >= pfile->buffer->notes[pfile->buffer->cur_note].pos
2027                 && !pfile->overlaid_buffer)))
2028         {
2029           source_range tok_range;
2030           tok_range.m_start = loc;
2031           tok_range.m_finish
2032             = linemap_position_for_column (pfile->line_table,
2033                                            CPP_BUF_COLUMN (pfile->buffer,
2034                                                            pfile->buffer->cur));
2035           loc = pfile->line_table->get_or_create_combined_loc (loc, tok_range,
2036                                                                nullptr, 0);
2037         }
2038
2039       encoding_rich_location rich_loc (pfile, loc);
2040
2041       /* Make sure that the token is printed using UCNs, even
2042          if we'd otherwise happily print UTF-8.  */
2043       unsigned char *buf = XNEWVEC (unsigned char, cpp_token_len (token));
2044       size_t sz;
2045
2046       sz = cpp_spell_token (pfile, token, buf, false) - buf;
2047       if (NORMALIZE_STATE_RESULT (s) == normalized_C)
2048         cpp_warning_at (pfile, CPP_W_NORMALIZE, &rich_loc,
2049                         "`%.*s' is not in NFKC", (int) sz, buf);
2050       else if (identifier && CPP_OPTION (pfile, xid_identifiers))
2051         cpp_pedwarning_at (pfile, CPP_W_NORMALIZE, &rich_loc,
2052                                   "`%.*s' is not in NFC", (int) sz, buf);
2053       else
2054         cpp_warning_at (pfile, CPP_W_NORMALIZE, &rich_loc,
2055                         "`%.*s' is not in NFC", (int) sz, buf);
2056       free (buf);
2057     }
2058 }
2059
/* Returns TRUE if the byte sequence starting at buffer->cur is a valid
   extended character in an identifier: a '$' (when the dollars_in_ident
   option is set), or, when the extended_identifiers option is set, a
   UTF-8 encoded character or a \u, \U or \N escape.  If FIRST is TRUE,
   then the character must be valid at the beginning of an identifier as
   well.  STATE is passed on to the validators so that the normalization
   state can be tracked.

   If the return value is TRUE, then pfile->buffer->cur has been moved to
   point to the next byte after the extended character.  If a '\' escape
   is rejected, buffer->cur is restored to point at the backslash.  */

static bool
forms_identifier_p (cpp_reader *pfile, int first,
                    struct normalize_state *state)
{
  cpp_buffer *buffer = pfile->buffer;
  const bool warn_bidi_p = pfile->warn_bidi_p ();

  if (*buffer->cur == '$')
    {
      if (!CPP_OPTION (pfile, dollars_in_ident))
        return false;

      buffer->cur++;
      if (CPP_OPTION (pfile, warn_dollars) && !pfile->state.skipping)
        {
          /* Clear the option first, so that this pedwarn is given at
             most once.  */
          CPP_OPTION (pfile, warn_dollars) = 0;
          cpp_error (pfile, CPP_DL_PEDWARN, "'$' in identifier or number");
        }

      return true;
    }

  /* Is this a syntactically valid UCN or a valid UTF-8 char?  */
  if (CPP_OPTION (pfile, extended_identifiers))
    {
      cppchar_t s;
      if (*buffer->cur >= utf8_signifier)
        {
          /* The bidi check is made before validation, so it happens
             whether or not the character is then accepted.  */
          if (__builtin_expect (*buffer->cur == bidi::utf8_start, 0)
              && warn_bidi_p)
            {
              location_t loc;
              bidi::kind kind = get_bidi_utf8 (pfile, buffer->cur, &loc);
              maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/false, loc);
            }
          /* The fourth argument is 1 when FIRST is nonzero and 2
             otherwise.  NOTE(review): buffer->cur is not restored here
             on failure; presumably _cpp_valid_utf8 leaves it untouched
             when it returns false -- confirm.  */
          if (_cpp_valid_utf8 (pfile, &buffer->cur, buffer->rlimit, 1 + !first,
                               state, &s))
            return true;
        }
      else if (*buffer->cur == '\\'
               && (buffer->cur[1] == 'u'
                   || buffer->cur[1] == 'U'
                   || buffer->cur[1] == 'N'))
        {
          /* Step over the backslash and the escape letter; from here
             on buffer->cur[-1] is that letter.  */
          buffer->cur += 2;
          if (warn_bidi_p)
            {
              location_t loc;
              bidi::kind kind;
              if (buffer->cur[-1] == 'N')
                kind = get_bidi_named (pfile, buffer->cur, &loc);
              else
                kind = get_bidi_ucn (pfile, buffer->cur,
                                     buffer->cur[-1] == 'U', &loc);
              maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/true, loc);
            }
          if (_cpp_valid_ucn (pfile, &buffer->cur, buffer->rlimit, 1 + !first,
                              state, &s, NULL, NULL))
            return true;
          /* Not a valid UCN here: undo the two-byte advance made
             above.  */
          buffer->cur -= 2;
        }
    }

  return false;
}
2131
2132 /* Helper function to issue error about improper __VA_OPT__ use.  */
2133 static void
2134 maybe_va_opt_error (cpp_reader *pfile)
2135 {
2136   if (CPP_PEDANTIC (pfile) && !CPP_OPTION (pfile, va_opt))
2137     {
2138       /* __VA_OPT__ should not be accepted at all, but allow it in
2139          system headers.  */
2140       if (!_cpp_in_system_header (pfile))
2141         {
2142           if (CPP_OPTION (pfile, cplusplus))
2143             cpp_error (pfile, CPP_DL_PEDWARN,
2144                        "__VA_OPT__ is not available until C++20");
2145           else
2146             cpp_error (pfile, CPP_DL_PEDWARN,
2147                        "__VA_OPT__ is not available until C23");
2148         }
2149     }
2150   else if (!pfile->state.va_args_ok)
2151     {
2152       /* __VA_OPT__ should only appear in the replacement list of a
2153          variadic macro.  */
2154       cpp_error (pfile, CPP_DL_PEDWARN,
2155                  "__VA_OPT__ can only appear in the expansion"
2156                  " of a C++20 variadic macro");
2157     }
2158 }
2159
2160 /* Helper function to perform diagnostics that are needed (rarely)
2161    when an identifier is lexed.  */
2162 static void
2163 identifier_diagnostics_on_lex (cpp_reader *pfile, cpp_hashnode *node)
2164 {
2165   if (__builtin_expect (!(node->flags & NODE_DIAGNOSTIC)
2166                         || pfile->state.skipping, 1))
2167     return;
2168
2169   /* It is allowed to poison the same identifier twice.  */
2170   if ((node->flags & NODE_POISONED) && !pfile->state.poisoned_ok)
2171     {
2172       cpp_error (pfile, CPP_DL_ERROR, "attempt to use poisoned \"%s\"",
2173                  NODE_NAME (node));
2174       const auto data = (cpp_hashnode_extra *)
2175         ht_lookup (pfile->extra_hash_table, node->ident, HT_NO_INSERT);
2176       if (data && data->poisoned_loc)
2177         cpp_error_at (pfile, CPP_DL_NOTE, data->poisoned_loc, "poisoned here");
2178     }
2179
2180   /* Constraint 6.10.3.5: __VA_ARGS__ should only appear in the
2181      replacement list of a variadic macro.  */
2182   if (node == pfile->spec_nodes.n__VA_ARGS__
2183       && !pfile->state.va_args_ok)
2184     {
2185       if (CPP_OPTION (pfile, cplusplus))
2186         cpp_error (pfile, CPP_DL_PEDWARN,
2187                    "__VA_ARGS__ can only appear in the expansion"
2188                    " of a C++11 variadic macro");
2189       else
2190         cpp_error (pfile, CPP_DL_PEDWARN,
2191                    "__VA_ARGS__ can only appear in the expansion"
2192                    " of a C99 variadic macro");
2193     }
2194
2195   /* __VA_OPT__ should only appear in the replacement list of a
2196      variadic macro.  */
2197   if (node == pfile->spec_nodes.n__VA_OPT__)
2198     maybe_va_opt_error (pfile);
2199
2200   /* For -Wc++-compat, warn about use of C++ named operators.  */
2201   if (node->flags & NODE_WARN_OPERATOR)
2202     cpp_warning (pfile, CPP_W_CXX_OPERATOR_NAMES,
2203                  "identifier \"%s\" is a special operator name in C++",
2204                  NODE_NAME (node));
2205 }
2206
2207 /* Helper function to get the cpp_hashnode of the identifier BASE.  */
2208 static cpp_hashnode *
2209 lex_identifier_intern (cpp_reader *pfile, const uchar *base)
2210 {
2211   cpp_hashnode *result;
2212   const uchar *cur;
2213   unsigned int len;
2214   unsigned int hash = HT_HASHSTEP (0, *base);
2215
2216   cur = base + 1;
2217   while (ISIDNUM (*cur))
2218     {
2219       hash = HT_HASHSTEP (hash, *cur);
2220       cur++;
2221     }
2222   len = cur - base;
2223   hash = HT_HASHFINISH (hash, len);
2224   result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
2225                                               base, len, hash, HT_ALLOC));
2226   identifier_diagnostics_on_lex (pfile, result);
2227   return result;
2228 }
2229
2230 /* Get the cpp_hashnode of an identifier specified by NAME in
2231    the current cpp_reader object.  If none is found, NULL is returned.  */
2232 cpp_hashnode *
2233 _cpp_lex_identifier (cpp_reader *pfile, const char *name)
2234 {
2235   cpp_hashnode *result;
2236   result = lex_identifier_intern (pfile, (uchar *) name);
2237   return result;
2238 }
2239
2240 /* Lex an identifier starting at BASE.  BUFFER->CUR is expected to point
2241    one past the first character at BASE, which may be a (possibly multi-byte)
2242    character if STARTS_UCN is true.  */
2243 static cpp_hashnode *
2244 lex_identifier (cpp_reader *pfile, const uchar *base, bool starts_ucn,
2245                 struct normalize_state *nst, cpp_hashnode **spelling)
2246 {
2247   cpp_hashnode *result;
2248   const uchar *cur;
2249   unsigned int len;
2250   unsigned int hash = HT_HASHSTEP (0, *base);
2251   const bool warn_bidi_p = pfile->warn_bidi_p ();
2252
2253   cur = pfile->buffer->cur;
2254   if (! starts_ucn)
2255     {
2256       while (ISIDNUM (*cur))
2257         {
2258           hash = HT_HASHSTEP (hash, *cur);
2259           cur++;
2260         }
2261       NORMALIZE_STATE_UPDATE_IDNUM (nst, *(cur - 1));
2262     }
2263   pfile->buffer->cur = cur;
2264   if (starts_ucn || forms_identifier_p (pfile, false, nst))
2265     {
2266       /* Slower version for identifiers containing UCNs
2267          or extended chars (including $).  */
2268       do {
2269         while (ISIDNUM (*pfile->buffer->cur))
2270           {
2271             NORMALIZE_STATE_UPDATE_IDNUM (nst, *pfile->buffer->cur);
2272             pfile->buffer->cur++;
2273           }
2274       } while (forms_identifier_p (pfile, false, nst));
2275       if (warn_bidi_p)
2276         maybe_warn_bidi_on_close (pfile, pfile->buffer->cur);
2277       result = _cpp_interpret_identifier (pfile, base,
2278                                           pfile->buffer->cur - base);
2279       *spelling = cpp_lookup (pfile, base, pfile->buffer->cur - base);
2280     }
2281   else
2282     {
2283       len = cur - base;
2284       hash = HT_HASHFINISH (hash, len);
2285
2286       result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
2287                                                   base, len, hash, HT_ALLOC));
2288       *spelling = result;
2289     }
2290
2291   return result;
2292 }
2293
2294 /* Struct to hold the return value of the scan_cur_identifier () helper
2295    function below.  */
2296
2297 struct scan_id_result
2298 {
2299   cpp_hashnode *node;
2300   normalize_state nst;
2301
2302   scan_id_result ()
2303     : node (nullptr)
2304   {
2305     nst = INITIAL_NORMALIZE_STATE;
2306   }
2307
2308   explicit operator bool () const { return node; }
2309 };
2310
2311 /* Helper function to scan an entire identifier beginning at
2312    pfile->buffer->cur, and possibly containing extended characters (UCNs
2313    and/or UTF-8).  Returns the cpp_hashnode for the identifier on success, or
2314    else nullptr, as well as a normalize_state so that normalization warnings
2315    may be issued once the token lexing is complete.  */
2316
2317 static scan_id_result
2318 scan_cur_identifier (cpp_reader *pfile)
2319 {
2320   const auto buffer = pfile->buffer;
2321   const auto begin = buffer->cur;
2322   scan_id_result result;
2323   if (ISIDST (*buffer->cur))
2324     {
2325       ++buffer->cur;
2326       cpp_hashnode *ignore;
2327       result.node = lex_identifier (pfile, begin, false, &result.nst, &ignore);
2328     }
2329   else if (forms_identifier_p (pfile, true, &result.nst))
2330     {
2331       /* buffer->cur has been moved already by the call
2332          to forms_identifier_p.  */
2333       cpp_hashnode *ignore;
2334       result.node = lex_identifier (pfile, begin, true, &result.nst, &ignore);
2335     }
2336   return result;
2337 }
2338
/* Lex a number to NUMBER starting at BUFFER->CUR - 1, i.e. the first
   character of the number has already been consumed by the caller.
   NST accumulates the normalization state of the characters seen.

   On return NUMBER->text is a freshly allocated, NUL-terminated copy
   of the spelling, NUMBER->len is its length without the NUL, and
   BUFFER->CUR points just past the number.  An error is given for
   adjacent digit separators inside the number.  */
static void
lex_number (cpp_reader *pfile, cpp_string *number,
            struct normalize_state *nst)
{
  const uchar *cur;
  const uchar *base;
  uchar *dest;

  base = pfile->buffer->cur - 1;
  /* Each iteration scans one run of ordinary pp-number characters.
     The loop goes round again whenever forms_identifier_p accepts
     (and steps over) an extended character following the run.  */
  do
    {
      /* First place in this run where two digit separators were seen
         side by side, if any.  */
      const uchar *adj_digit_sep = NULL;
      cur = pfile->buffer->cur;

      /* N.B. ISIDNUM does not include $.  */
      while (ISIDNUM (*cur)
             || (*cur == '.' && !DIGIT_SEP (cur[-1]))
             || DIGIT_SEP (*cur)
             || (VALID_SIGN (*cur, cur[-1]) && !DIGIT_SEP (cur[-2])))
        {
          NORMALIZE_STATE_UPDATE_IDNUM (nst, *cur);
          /* Adjacent digit separators do not form part of the pp-number syntax.
             However, they can safely be diagnosed here as an error, since '' is
             not a valid preprocessing token.  */
          if (DIGIT_SEP (*cur) && DIGIT_SEP (cur[-1]) && !adj_digit_sep)
            adj_digit_sep = cur;
          cur++;
        }
      /* A number can't end with a digit separator.  Only separators
         consumed in this run are given back.  */
      while (cur > pfile->buffer->cur && DIGIT_SEP (cur[-1]))
        --cur;
      /* Stay silent if the adjacent separators were all trimmed off
         again just above.  */
      if (adj_digit_sep && adj_digit_sep < cur)
        cpp_error (pfile, CPP_DL_ERROR, "adjacent digit separators");

      pfile->buffer->cur = cur;
    }
  while (forms_identifier_p (pfile, false, nst));

  /* Copy the spelling, adding a terminating NUL.  */
  number->len = cur - base;
  dest = _cpp_unaligned_alloc (pfile, number->len + 1);
  memcpy (dest, base, number->len);
  dest[number->len] = '\0';
  number->text = dest;
}
2384
2385 /* Create a token of type TYPE with a literal spelling.  */
2386 static void
2387 create_literal (cpp_reader *pfile, cpp_token *token, const uchar *base,
2388                 unsigned int len, enum cpp_ttype type)
2389 {
2390   token->type = type;
2391   token->val.str.len = len;
2392   token->val.str.text = cpp_alloc_token_string (pfile, base, len);
2393 }
2394
2395 /* Like create_literal(), but construct it from two separate strings
2396    which are concatenated.  LEN2 may be 0 if no second string is
2397    required.  */
2398 static void
2399 create_literal2 (cpp_reader *pfile, cpp_token *token, const uchar *base1,
2400                  unsigned int len1, const uchar *base2, unsigned int len2,
2401                  enum cpp_ttype type)
2402 {
2403   token->type = type;
2404   token->val.str.len = len1 + len2;
2405   uchar *const dest = _cpp_unaligned_alloc (pfile, len1 + len2 + 1);
2406   memcpy (dest, base1, len1);
2407   if (len2)
2408     memcpy (dest+len1, base2, len2);
2409   dest[len1 + len2] = 0;
2410   token->val.str.text = dest;
2411 }
2412
2413 const uchar *
2414 cpp_alloc_token_string (cpp_reader *pfile,
2415                         const unsigned char *ptr, unsigned len)
2416 {
2417   uchar *dest = _cpp_unaligned_alloc (pfile, len + 1);
2418
2419   dest[len] = 0;
2420   memcpy (dest, ptr, len);
2421   return dest;
2422 }
2423
2424 /* A pair of raw buffer pointers.  The currently open one is [1], the
2425    first one is [0].  Used for string literal lexing.  */
2426 struct lit_accum {
2427   _cpp_buff *first;
2428   _cpp_buff *last;
2429   const uchar *rpos;
2430   size_t accum;
2431
2432   lit_accum ()
2433     : first (NULL), last (NULL), rpos (0), accum (0)
2434   {
2435   }
2436
2437   void append (cpp_reader *, const uchar *, size_t);
2438
2439   void read_begin (cpp_reader *);
2440   bool reading_p () const
2441   {
2442     return rpos != NULL;
2443   }
2444   char read_char ()
2445   {
2446     char c = *rpos++;
2447     if (rpos == BUFF_FRONT (last))
2448       rpos = NULL;
2449     return c;
2450   }
2451
2452   void create_literal2 (cpp_reader *pfile, cpp_token *token,
2453                         const uchar *base1, unsigned int len1,
2454                         const uchar *base2, unsigned int len2,
2455                         enum cpp_ttype type);
2456 };
2457
2458 /* Subroutine of lex_raw_string: Append LEN chars from BASE to the buffer
2459    sequence from *FIRST_BUFF_P to LAST_BUFF_P.  */
2460
2461 void
2462 lit_accum::append (cpp_reader *pfile, const uchar *base, size_t len)
2463 {
2464   if (!last)
2465     /* Starting.  */
2466     first = last = _cpp_get_buff (pfile, len);
2467   else if (len > BUFF_ROOM (last))
2468     {
2469       /* There is insufficient room in the buffer.  Copy what we can,
2470          and then either extend or create a new one.  */
2471       size_t room = BUFF_ROOM (last);
2472       memcpy (BUFF_FRONT (last), base, room);
2473       BUFF_FRONT (last) += room;
2474       base += room;
2475       len -= room;
2476       accum += room;
2477
2478       gcc_checking_assert (!rpos);
2479
2480       last = _cpp_append_extend_buff (pfile, last, len);
2481     }
2482
2483   memcpy (BUFF_FRONT (last), base, len);
2484   BUFF_FRONT (last) += len;
2485   accum += len;
2486 }
2487
2488 void
2489 lit_accum::read_begin (cpp_reader *pfile)
2490 {
2491   /* We never accumulate more than 4 chars to read.  */
2492   if (BUFF_ROOM (last) < 4)
2493
2494     last = _cpp_append_extend_buff (pfile, last, 4);
2495   rpos = BUFF_FRONT (last);
2496 }
2497
2498 /* Helper function to check if a string format macro, say from inttypes.h, is
2499    placed touching a string literal, in which case it could be parsed as a C++11
2500    user-defined string literal thus breaking the program.  Return TRUE if the
2501    UDL should be ignored for now and preserved for potential macro
2502    expansion.  */
2503
2504 static bool
2505 maybe_ignore_udl_macro_suffix (cpp_reader *pfile, location_t src_loc,
2506                                const uchar *suffix_begin, cpp_hashnode *node)
2507 {
2508   /* User-defined literals outside of namespace std must start with a single
2509      underscore, so assume anything of that form really is a UDL suffix.
2510      We don't need to worry about UDLs defined inside namespace std because
2511      their names are reserved, so cannot be used as macro names in valid
2512      programs.  */
2513   if ((suffix_begin[0] == '_' && suffix_begin[1] != '_')
2514       || !cpp_macro_p (node))
2515     return false;
2516
2517   /* Maybe raise a warning here; caller should arrange not to consume
2518      the tokens.  */
2519   if (CPP_OPTION (pfile, warn_literal_suffix) && !pfile->state.skipping)
2520     cpp_warning_with_line (pfile, CPP_W_LITERAL_SUFFIX, src_loc, 0,
2521                            "invalid suffix on literal; C++11 requires a space "
2522                            "between literal and string macro");
2523   return true;
2524 }
2525
2526 /* Like create_literal2(), but also prepend all the accumulated data from
2527    the lit_accum struct.  */
2528 void
2529 lit_accum::create_literal2 (cpp_reader *pfile, cpp_token *token,
2530                             const uchar *base1, unsigned int len1,
2531                             const uchar *base2, unsigned int len2,
2532                             enum cpp_ttype type)
2533 {
2534   const unsigned int tot_len = accum + len1 + len2;
2535   uchar *dest = _cpp_unaligned_alloc (pfile, tot_len + 1);
2536   token->type = type;
2537   token->val.str.len = tot_len;
2538   token->val.str.text = dest;
2539   for (_cpp_buff *buf = first; buf; buf = buf->next)
2540     {
2541       size_t len = BUFF_FRONT (buf) - buf->base;
2542       memcpy (dest, buf->base, len);
2543       dest += len;
2544     }
2545   memcpy (dest, base1, len1);
2546   dest += len1;
2547   if (len2)
2548     memcpy (dest, base2, len2);
2549   dest += len2;
2550   *dest = '\0';
2551 }
2552
2553 /* Lexes a raw string.  The stored string contains the spelling,
2554    including double quotes, delimiter string, '(' and ')', any leading
2555    'L', 'u', 'U' or 'u8' and 'R' modifier.  The created token contains
2556    the type of the literal, or CPP_OTHER if it was not properly
2557    terminated.
2558
2559    BASE is the start of the token.  Updates pfile->buffer->cur to just
2560    after the lexed string.
2561
2562    The spelling is NUL-terminated, but it is not guaranteed that this
2563    is the first NUL since embedded NULs are preserved.  */
2564
2565 static void
2566 lex_raw_string (cpp_reader *pfile, cpp_token *token, const uchar *base)
2567 {
2568   const uchar *pos = base;
2569   const bool warn_bidi_p = pfile->warn_bidi_p ();
2570   const bool warn_invalid_utf8_p = CPP_OPTION (pfile, cpp_warn_invalid_utf8);
2571   const bool warn_bidi_or_invalid_utf8_p = warn_bidi_p | warn_invalid_utf8_p;
2572
2573   /* 'tis a pity this information isn't passed down from the lexer's
2574      initial categorization of the token.  */
2575   enum cpp_ttype type = CPP_STRING;
2576
2577   if (*pos == 'L')
2578     {
2579       type = CPP_WSTRING;
2580       pos++;
2581     }
2582   else if (*pos == 'U')
2583     {
2584       type = CPP_STRING32;
2585       pos++;
2586     }
2587   else if (*pos == 'u')
2588     {
2589       if (pos[1] == '8')
2590         {
2591           type = CPP_UTF8STRING;
2592           pos++;
2593         }
2594       else
2595         type = CPP_STRING16;
2596       pos++;
2597     }
2598
2599   gcc_checking_assert (pos[0] == 'R' && pos[1] == '"');
2600   pos += 2;
2601
2602   _cpp_line_note *note = &pfile->buffer->notes[pfile->buffer->cur_note];
2603
2604   /* Skip notes before the ".  */
2605   while (note->pos < pos)
2606     ++note;
2607
2608   lit_accum accum;
2609
2610   uchar prefix[17];
2611   unsigned prefix_len = 0;
2612   enum Phase
2613   {
2614    PHASE_PREFIX = -2,
2615    PHASE_NONE = -1,
2616    PHASE_SUFFIX = 0
2617   } phase = PHASE_PREFIX;
2618
2619   for (;;)
2620     {
2621       gcc_checking_assert (note->pos >= pos);
2622
2623       /* Undo any escaped newlines and trigraphs.  */
2624       if (!accum.reading_p () && note->pos == pos)
2625         switch (note->type)
2626           {
2627           case '\\':
2628           case ' ':
2629             /* Restore backslash followed by newline.  */
2630             accum.append (pfile, base, pos - base);
2631             base = pos;
2632             accum.read_begin (pfile);
2633             accum.append (pfile, UC"\\", 1);
2634
2635           after_backslash:
2636             if (note->type == ' ')
2637               /* GNU backslash whitespace newline extension.  FIXME
2638                  could be any sequence of non-vertical space.  When we
2639                  can properly restore any such sequence, we should
2640                  mark this note as handled so _cpp_process_line_notes
2641                  doesn't warn.  */
2642               accum.append (pfile, UC" ", 1);
2643
2644             accum.append (pfile, UC"\n", 1);
2645             note++;
2646             break;
2647
2648           case '\n':
2649             /* This can happen for ??/<NEWLINE> when trigraphs are not
2650                being interpretted.  */
2651             gcc_checking_assert (!CPP_OPTION (pfile, trigraphs));
2652             note->type = 0;
2653             note++;
2654             break;
2655
2656           default:
2657             gcc_checking_assert (_cpp_trigraph_map[note->type]);
2658
2659             /* Don't warn about this trigraph in
2660                _cpp_process_line_notes, since trigraphs show up as
2661                trigraphs in raw strings.  */
2662             uchar type = note->type;
2663             note->type = 0;
2664
2665             if (CPP_OPTION (pfile, trigraphs))
2666               {
2667                 accum.append (pfile, base, pos - base);
2668                 base = pos;
2669                 accum.read_begin (pfile);
2670                 accum.append (pfile, UC"??", 2);
2671                 accum.append (pfile, &type, 1);
2672
2673                 /* ??/ followed by newline gets two line notes, one for
2674                    the trigraph and one for the backslash/newline.  */
2675                 if (type == '/' && note[1].pos == pos)
2676                   {
2677                     note++;
2678                     gcc_assert (note->type == '\\' || note->type == ' ');
2679                     goto after_backslash;
2680                   }
2681                 /* Skip the replacement character.  */
2682                 base = ++pos;
2683               }
2684
2685             note++;
2686             break;
2687           }
2688
2689       /* Now get a char to process.  Either from an expanded note, or
2690          from the line buffer.  */
2691       bool read_note = accum.reading_p ();
2692       char c = read_note ? accum.read_char () : *pos++;
2693
2694       if (phase == PHASE_PREFIX)
2695         {
2696           if (c == '(')
2697             {
2698               /* Done.  */
2699               phase = PHASE_NONE;
2700               prefix[prefix_len++] = '"';
2701             }
2702           else if (prefix_len < 16
2703                    /* Prefix chars are any of the basic character set,
2704                       [lex.charset] except for '
2705                       ()\\\t\v\f\n'. Optimized for a contiguous
2706                       alphabet.  */
2707                    /* Unlike a switch, this collapses down to one or
2708                       two shift and bitmask operations on an ASCII
2709                       system, with an outlier or two.   */
2710                    && (('Z' - 'A' == 25
2711                         ? ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z'))
2712                         : ISIDST (c))
2713                        || (c >= '0' && c <= '9')
2714                        || c == '_' || c == '{' || c == '}'
2715                        || c == '[' || c == ']' || c == '#'
2716                        || c == '<' || c == '>' || c == '%'
2717                        || c == ':' || c == ';' || c == '.' || c == '?'
2718                        || c == '*' || c == '+' || c == '-' || c == '/'
2719                        || c == '^' || c == '&' || c == '|' || c == '~'
2720                        || c == '!' || c == '=' || c == ','
2721                        || c == '"' || c == '\''))
2722             prefix[prefix_len++] = c;
2723           else
2724             {
2725               /* Something is wrong.  */
2726               int col = CPP_BUF_COLUMN (pfile->buffer, pos) + read_note;
2727               if (prefix_len == 16)
2728                 cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc,
2729                                      col, "raw string delimiter longer "
2730                                      "than 16 characters");
2731               else if (c == '\n')
2732                 cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc,
2733                                      col, "invalid new-line in raw "
2734                                      "string delimiter");
2735               else
2736                 cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc,
2737                                      col, "invalid character '%c' in "
2738                                      "raw string delimiter", c);
2739               type = CPP_OTHER;
2740               phase = PHASE_NONE;
2741               /* Continue until we get a close quote, that's probably
2742                  the best failure mode.  */
2743               prefix_len = 0;
2744             }
2745           if (c != '\n')
2746             continue;
2747         }
2748
2749       if (phase != PHASE_NONE)
2750         {
2751           if (prefix[phase] != c)
2752             phase = PHASE_NONE;
2753           else if (unsigned (phase + 1) == prefix_len)
2754             break;
2755           else
2756             {
2757               phase = Phase (phase + 1);
2758               continue;
2759             }
2760         }
2761
2762       if (!prefix_len && c == '"')
2763         /* Failure mode lexing.  */
2764         goto out;
2765       else if (prefix_len && c == ')')
2766         phase = PHASE_SUFFIX;
2767       else if (!read_note && c == '\n')
2768         {
2769           pos--;
2770           pfile->buffer->cur = pos;
2771           if ((pfile->state.in_directive || pfile->state.parsing_args)
2772               && pfile->buffer->next_line >= pfile->buffer->rlimit)
2773             {
2774               cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc, 0,
2775                                    "unterminated raw string");
2776               type = CPP_OTHER;
2777               goto out;
2778             }
2779
2780           accum.append (pfile, base, pos - base + 1);
2781           _cpp_process_line_notes (pfile, false);
2782
2783           if (pfile->buffer->next_line < pfile->buffer->rlimit)
2784             CPP_INCREMENT_LINE (pfile, 0);
2785           pfile->buffer->need_line = true;
2786
2787           if (!get_fresh_line_impl<true> (pfile))
2788             {
2789               /* We ran out of file and failed to get a line.  */
2790               location_t src_loc = token->src_loc;
2791               token->type = CPP_EOF;
2792               /* Tell the compiler the line number of the EOF token.  */
2793               token->src_loc = pfile->line_table->highest_line;
2794               token->flags = BOL;
2795               if (accum.first)
2796                 _cpp_release_buff (pfile, accum.first);
2797               cpp_error_with_line (pfile, CPP_DL_ERROR, src_loc, 0,
2798                                    "unterminated raw string");
2799
2800               /* Now pop the buffer that get_fresh_line_impl() did not.  Popping
2801                  is not safe if processing a directive, however this cannot
2802                  happen as we already checked above that a line would be
2803                  available, and get_fresh_line_impl() can't fail in this
2804                  case.  */
2805               gcc_assert (!pfile->state.in_directive);
2806               _cpp_pop_buffer (pfile);
2807
2808               return;
2809             }
2810
2811           pos = base = pfile->buffer->cur;
2812           note = &pfile->buffer->notes[pfile->buffer->cur_note];
2813         }
2814       else if (__builtin_expect ((unsigned char) c >= utf8_continuation, 0)
2815                && warn_bidi_or_invalid_utf8_p)
2816         pos = _cpp_handle_multibyte_utf8 (pfile, c, pos, warn_bidi_p,
2817                                           warn_invalid_utf8_p);
2818     }
2819
2820   if (warn_bidi_p)
2821     maybe_warn_bidi_on_close (pfile, pos);
2822
2823   if (CPP_OPTION (pfile, user_literals))
2824     {
2825       const uchar *const suffix_begin = pos;
2826       pfile->buffer->cur = pos;
2827
2828       if (const auto sr = scan_cur_identifier (pfile))
2829         {
2830           if (maybe_ignore_udl_macro_suffix (pfile, token->src_loc,
2831                                              suffix_begin, sr.node))
2832               pfile->buffer->cur = suffix_begin;
2833           else
2834             {
2835               type = cpp_userdef_string_add_type (type);
2836               accum.create_literal2 (pfile, token, base, suffix_begin - base,
2837                                      NODE_NAME (sr.node), NODE_LEN (sr.node),
2838                                      type);
2839               if (accum.first)
2840                 _cpp_release_buff (pfile, accum.first);
2841               warn_about_normalization (pfile, token, &sr.nst, true);
2842               return;
2843             }
2844         }
2845     }
2846
2847  out:
2848   pfile->buffer->cur = pos;
2849   if (!accum.accum)
2850     create_literal (pfile, token, base, pos - base, type);
2851   else
2852     {
2853       accum.create_literal2 (pfile, token, base, pos - base, nullptr, 0, type);
2854       _cpp_release_buff (pfile, accum.first);
2855     }
2856 }
2857
2858 /* Lexes a string, character constant, or angle-bracketed header file
2859    name.  The stored string contains the spelling, including opening
2860    quote and any leading 'L', 'u', 'U' or 'u8' and optional
2861    'R' modifier.  It returns the type of the literal, or CPP_OTHER
2862    if it was not properly terminated, or CPP_LESS for an unterminated
2863    header name which must be relexed as normal tokens.
2864
2865    The spelling is NUL-terminated, but it is not guaranteed that this
2866    is the first NUL since embedded NULs are preserved.

        PFILE is the reader, TOKEN the token being filled in, and BASE
        points at the first character of the spelling (the encoding
        prefix if there is one, otherwise the opening delimiter).

        The function itself is void: "returns" above means that the type
        is handed, together with TOKEN, to create_literal or
        create_literal2.  The exception is the unterminated header name,
        where token->type is set to CPP_LESS directly and the function
        returns at once without updating pfile->buffer->cur.  When an 'R'
        follows the optional prefix the whole job is delegated to
        lex_raw_string.  */
2867 static void
2868 lex_string (cpp_reader *pfile, cpp_token *token, const uchar *base)
2869 {
2870   bool saw_NUL = false;
2871   const uchar *cur;
2872   cppchar_t terminator;
2873   enum cpp_ttype type;
2874
       /* Step over an optional encoding prefix (L, U, u or u8).  Afterwards
          TERMINATOR holds the first character following the prefix and CUR
          points just past that character.  */
2875   cur = base;
2876   terminator = *cur++;
2877   if (terminator == 'L' || terminator == 'U')
2878     terminator = *cur++;
2879   else if (terminator == 'u')
2880     {
2881       terminator = *cur++;
2882       if (terminator == '8')
2883         terminator = *cur++;
2884     }
2885   if (terminator == 'R')
2886     {
2887       lex_raw_string (pfile, token, base);
2888       return;
2889     }
       /* Choose the token type from the delimiter and from the prefix
          re-read at BASE.  Anything that is neither '"' nor '\'' is
          treated as a header name closed by '>'.  */
2890   if (terminator == '"')
2891     type = (*base == 'L' ? CPP_WSTRING :
2892             *base == 'U' ? CPP_STRING32 :
2893             *base == 'u' ? (base[1] == '8' ? CPP_UTF8STRING : CPP_STRING16)
2894                          : CPP_STRING);
2895   else if (terminator == '\'')
2896     type = (*base == 'L' ? CPP_WCHAR :
2897             *base == 'U' ? CPP_CHAR32 :
2898             *base == 'u' ? (base[1] == '8' ? CPP_UTF8CHAR : CPP_CHAR16)
2899                          : CPP_CHAR);
2900   else
2901     terminator = '>', type = CPP_HEADER_NAME;
2902
2903   const bool warn_bidi_p = pfile->warn_bidi_p ();
2904   const bool warn_invalid_utf8_p = CPP_OPTION (pfile, cpp_warn_invalid_utf8);
2905   const bool warn_bidi_or_invalid_utf8_p = warn_bidi_p | warn_invalid_utf8_p;
       /* Scan forward until the terminator or a newline is found.  */
2906   for (;;)
2907     {
2908       cppchar_t c = *cur++;
2909
2910       /* In #include-style directives, terminators are not escapable.  */
2911       if (c == '\\' && !pfile->state.angled_headers && *cur != '\n')
2912         {
2913           if ((cur[0] == 'u' || cur[0] == 'U' || cur[0] == 'N') && warn_bidi_p)
2914             {
2915               location_t loc;
2916               bidi::kind kind;
2917               if (cur[0] == 'N')
2918                 kind = get_bidi_named (pfile, cur + 1, &loc);
2919               else
2920                 kind = get_bidi_ucn (pfile, cur + 1, cur[0] == 'U', &loc);
2921               maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/true, loc);
2922             }
               /* Skip the character after the backslash, so that an escaped
                  terminator does not end the literal.  */
2923           cur++;
2924         }
2925       else if (c == terminator)
2926         {
2927           if (warn_bidi_p)
2928             maybe_warn_bidi_on_close (pfile, cur - 1);
2929           break;
2930         }
2931       else if (c == '\n')
2932         {
               /* Leave CUR pointing at the newline itself.  */
2933           cur--;
2934           /* Unmatched quotes always yield undefined behavior, but
2935              greedy lexing means that what appears to be an unterminated
2936              header name may actually be a legitimate sequence of tokens.  */
2937           if (terminator == '>')
2938             {
2939               token->type = CPP_LESS;
2940               return;
2941             }
2942           type = CPP_OTHER;
2943           break;
2944         }
2945       else if (c == '\0')
2946         saw_NUL = true;
2947       else if (__builtin_expect (c >= utf8_continuation, 0)
2948                && warn_bidi_or_invalid_utf8_p)
2949         cur = _cpp_handle_multibyte_utf8 (pfile, c, cur, warn_bidi_p,
2950                                           warn_invalid_utf8_p);
2951     }
2952
2953   if (saw_NUL && !pfile->state.skipping)
2954     cpp_error (pfile, CPP_DL_WARNING,
2955                "null character(s) preserved in literal");
2956
       /* CPP_OTHER here means the newline was reached before the
          terminator; this is not diagnosed when the language is CLK_ASM.  */
2957   if (type == CPP_OTHER && CPP_OPTION (pfile, lang) != CLK_ASM)
2958     cpp_error (pfile, CPP_DL_PEDWARN, "missing terminating %c character",
2959                (int) terminator);
2960
       /* Commit the position reached.  SUFFIX_BEGIN remembers it so that the
          buffer can be rewound after scan_cur_identifier has looked at a
          possible suffix.  */
2961   pfile->buffer->cur = cur;
2962   const uchar *const suffix_begin = cur;
2963
2964   if (CPP_OPTION (pfile, user_literals))
2965     {
2966       if (const auto sr = scan_cur_identifier (pfile))
2967         {
2968           if (maybe_ignore_udl_macro_suffix (pfile, token->src_loc,
2969                                              suffix_begin, sr.node))
2970             pfile->buffer->cur = suffix_begin;
2971           else
2972             {
2973               /* Grab user defined literal suffix.  */
                   /* NOTE(review): both helpers are applied in sequence;
                      presumably each only alters the kind of type it is
                      responsible for -- confirm against their definitions.  */
2974               type = cpp_userdef_char_add_type (type);
2975               type = cpp_userdef_string_add_type (type);
2976               create_literal2 (pfile, token, base, suffix_begin - base,
2977                                NODE_NAME (sr.node), NODE_LEN (sr.node), type);
2978               warn_about_normalization (pfile, token, &sr.nst, true);
2979               return;
2980             }
2981         }
2982     }
2983   else if (CPP_OPTION (pfile, cpp_warn_cxx11_compat)
2984            && !pfile->state.skipping)
2985     {
2986       const auto sr = scan_cur_identifier (pfile);
2987       /* Maybe raise a warning, but do not consume the tokens.  */
2988       pfile->buffer->cur = suffix_begin;
2989       if (sr && cpp_macro_p (sr.node))
2990         cpp_warning_with_line (pfile, CPP_W_CXX11_COMPAT,
2991                                token->src_loc, 0, "C++11 requires a space "
2992                                "between string literal and macro");
2993     }
2994
       /* No suffix was taken: the spelling runs from BASE up to CUR.  */
2995   create_literal (pfile, token, base, cur - base, type);
2996 }
2997
2998 /* Return the comment table. The client may not make any assumption
2999    about the ordering of the table.

        The result is the address of PFILE's own comments member, not a
        copy of it.  */
3000 cpp_comment_table *
3001 cpp_get_comments (cpp_reader *pfile)
3002 {
3003   return &pfile->comments;
3004 }
3005
3006 /* Append a comment to the end of the comment table.

        A copy of TOKEN's string value (token->val.str) is stored in
        PFILE's comment table together with token->src_loc.  The table
        starts with room for 256 entries and doubles each time it is full.
        The copy lives in its own xmalloc'd buffer and is NUL-terminated.  */
3007 static void
3008 store_comment (cpp_reader *pfile, cpp_token *token)
3009 {
3010   int len;
3011
       /* First use: create the initial table.  */
3012   if (pfile->comments.allocated == 0)
3013     {
3014       pfile->comments.allocated = 256;
3015       pfile->comments.entries = (cpp_comment *) xmalloc
3016         (pfile->comments.allocated * sizeof (cpp_comment));
3017     }
3018
       /* Table full: double its capacity.  */
3019   if (pfile->comments.count == pfile->comments.allocated)
3020     {
3021       pfile->comments.allocated *= 2;
3022       pfile->comments.entries = (cpp_comment *) xrealloc
3023         (pfile->comments.entries,
3024          pfile->comments.allocated * sizeof (cpp_comment));
3025     }
3026
3027   len = token->val.str.len;
3028
3029   /* Copy comment.  Note, the token text may not be NUL-terminated, so
          exactly LEN bytes are copied and the terminator is added by hand.  */
3030   pfile->comments.entries[pfile->comments.count].comment =
3031     (char *) xmalloc (sizeof (char) * (len + 1));
3032   memcpy (pfile->comments.entries[pfile->comments.count].comment,
3033           token->val.str.text, len);
3034   pfile->comments.entries[pfile->comments.count].comment[len] = '\0';
3035
3036   /* Set source location. */
3037   pfile->comments.entries[pfile->comments.count].sloc = token->src_loc;
3038
3039   /* Increment the count of entries in the comment table. */
3040   pfile->comments.count++;
3041 }
3042
3043 /* The stored comment includes the comment start and any terminator.

        Turns TOKEN into a CPP_COMMENT whose text is a fresh copy of the
        comment that ends at pfile->buffer->cur.  FROM points just past the
        comment's initial '/': that first byte is written by hand and the
        rest is copied from FROM.  TYPE is compared against '/' to detect a
        C++ line comment.  The finished token is also recorded through
        store_comment.  */
3044 static void
3045 save_comment (cpp_reader *pfile, cpp_token *token, const unsigned char *from,
3046               cppchar_t type)
3047 {
3048   unsigned char *buffer;
3049   unsigned int len, clen, i;
3050
3051   len = pfile->buffer->cur - from + 1; /* + 1 for the initial '/'.  */
3052
3053   /* C++ comments probably (not definitely) have moved past a new
3054      line, which we don't want to save in the comment.  */
3055   if (is_vspace (pfile->buffer->cur[-1]))
3056     len--;
3057
3058   /* If we are currently in a directive or in argument parsing, then
3059      we need to store all C++ comments as C comments internally, and
3060      so we need to allocate a little extra space in that case.
3061
3062      Note that the only time we encounter a directive here is
3063      when we are saving comments in a "#define".  */
       /* The two extra bytes hold the closing "*" and "/" written below.  */
3064   clen = ((pfile->state.in_directive || pfile->state.parsing_args)
3065           && type == '/') ? len + 2 : len;
3066
3067   buffer = _cpp_unaligned_alloc (pfile, clen);
3068
3069   token->type = CPP_COMMENT;
3070   token->val.str.len = clen;
3071   token->val.str.text = buffer;
3072
3073   buffer[0] = '/';
3074   memcpy (buffer + 1, from, len - 1);
3075
3076   /* Finish conversion to a C comment, if necessary.  */
3077   if ((pfile->state.in_directive || pfile->state.parsing_args) && type == '/')
3078     {
           /* buffer[1] is the first byte copied from FROM (presumably the
              second '/' of the opener); it becomes '*', and the last two
              bytes become the C comment terminator.  */
3079       buffer[1] = '*';
3080       buffer[clen - 2] = '*';
3081       buffer[clen - 1] = '/';
3082       /* As there can be in a C++ comments illegal sequences for C comments
3083          we need to filter them out.  */
           /* Any '/' in the body that has a '*' immediately before or after
              it is replaced by '|'.  */
3084       for (i = 2; i < (clen - 2); i++)
3085         if (buffer[i] == '/' && (buffer[i - 1] == '*' || buffer[i + 1] == '*'))
3086           buffer[i] = '|';
3087     }
3088
3089   /* Finally store this comment for use by clients of libcpp. */
3090   store_comment (pfile, token);
3091 }
3092
3093 /* Returns true if comment at COMMENT_START is a recognized FALLTHROUGH
3094    comment.

        *COMMENT_START is tested against '*' to tell a C block comment
        from a C++ line comment; matching starts at COMMENT_START + 1 and
        pfile->buffer->cur serves as the upper bound in the length checks.

        What is accepted depends on cpp_warn_implicit_fallthrough:
          0, or any value not listed below: nothing is recognized;
          1: every comment is recognized;
          2: the fallthrough regex below may match anywhere in the comment;
          3, 4: the whole comment must match one of the fixed forms below,
             with 4 allowing a narrower set of spellings than 3.  */
3095
3096 static bool
3097 fallthrough_comment_p (cpp_reader *pfile, const unsigned char *comment_start)
3098 {
3099   const unsigned char *from = comment_start + 1;
3100
3101   switch (CPP_OPTION (pfile, cpp_warn_implicit_fallthrough))
3102     {
3103       /* For both -Wimplicit-fallthrough=0 and -Wimplicit-fallthrough=5 we
3104          don't recognize any comments.  The latter only checks attributes,
3105          the former doesn't warn.  */
3106     case 0:
3107     default:
3108       return false;
3109       /* -Wimplicit-fallthrough=1 considers any comment, no matter what
3110          content it has.  */
3111     case 1:
3112       return true;
3113     case 2:
3114       /* -Wimplicit-fallthrough=2 looks for (case insensitive)
3115          .*falls?[ \t-]*thr(u|ough).* regex.  */
           /* Each candidate start must leave at least strlen ("fallthru")
              bytes before pfile->buffer->cur.  */
3116       for (; (size_t) (pfile->buffer->cur - from) >= sizeof "fallthru" - 1;
3117            from++)
3118         {
3119           /* Is there anything like strpbrk with upper boundary, or
3120              memchr looking for 2 characters rather than just one?  */
3121           if (from[0] != 'f' && from[0] != 'F')
3122             continue;
3123           if (from[1] != 'a' && from[1] != 'A')
3124             continue;
3125           if (from[2] != 'l' && from[2] != 'L')
3126             continue;
3127           if (from[3] != 'l' && from[3] != 'L')
3128             continue;
3129           from += sizeof "fall" - 1;
3130           if (from[0] == 's' || from[0] == 'S')
3131             from++;
3132           while (*from == ' ' || *from == '\t' || *from == '-')
3133             from++;
3134           if (from[0] != 't' && from[0] != 'T')
3135             continue;
3136           if (from[1] != 'h' && from[1] != 'H')
3137             continue;
3138           if (from[2] != 'r' && from[2] != 'R')
3139             continue;
3140           if (from[3] == 'u' || from[3] == 'U')
3141             return true;
3142           if (from[3] != 'o' && from[3] != 'O')
3143             continue;
3144           if (from[4] != 'u' && from[4] != 'U')
3145             continue;
3146           if (from[5] != 'g' && from[5] != 'G')
3147             continue;
3148           if (from[6] != 'h' && from[6] != 'H')
3149             continue;
3150           return true;
3151         }
3152       return false;
3153     case 3:
3154     case 4:
3155       break;
3156     }
3157
       /* Levels 3 and 4 only.  Each branch below advances FROM past what it
          matched; the check at the end of the function then requires the
          comment terminator to follow immediately.  */
3158   /* Whole comment contents:
3159      -fallthrough
3160      @fallthrough@
3161    */
3162   if (*from == '-' || *from == '@')
3163     {
3164       size_t len = sizeof "fallthrough" - 1;
3165       if ((size_t) (pfile->buffer->cur - from - 1) < len)
3166         return false;
3167       if (memcmp (from + 1, "fallthrough", len))
3168         return false;
3169       if (*from == '@')
3170         {
3171           if (from[len + 1] != '@')
3172             return false;
3173           len++;
3174         }
3175       from += 1 + len;
3176     }
3177   /* Whole comment contents (regex):
3178      lint -fallthrough[ \t]*
3179    */
3180   else if (*from == 'l')
3181     {
3182       size_t len = sizeof "int -fallthrough" - 1;
3183       if ((size_t) (pfile->buffer->cur - from - 1) < len)
3184         return false;
3185       if (memcmp (from + 1, "int -fallthrough", len))
3186         return false;
3187       from += 1 + len;
3188       while (*from == ' ' || *from == '\t')
3189         from++;
3190     }
3191   /* Whole comment contents (regex):
3192      [ \t]*FALLTHR(U|OUGH)[ \t]*
3193    */
3194   else if (CPP_OPTION (pfile, cpp_warn_implicit_fallthrough) == 4)
3195     {
3196       while (*from == ' ' || *from == '\t')
3197         from++;
3198       if ((size_t) (pfile->buffer->cur - from)  < sizeof "FALLTHRU" - 1)
3199         return false;
3200       if (memcmp (from, "FALLTHR", sizeof "FALLTHR" - 1))
3201         return false;
3202       from += sizeof "FALLTHR" - 1;
3203       if (*from == 'U')
3204         from++;
3205       else if ((size_t) (pfile->buffer->cur - from)  < sizeof "OUGH" - 1)
3206         return false;
3207       else if (memcmp (from, "OUGH", sizeof "OUGH" - 1))
3208         return false;
3209       else
3210         from += sizeof "OUGH" - 1;
3211       while (*from == ' ' || *from == '\t')
3212         from++;
3213     }
3214   /* Whole comment contents (regex):
3215      [ \t.!]*(ELSE,? |INTENTIONAL(LY)? )?FALL(S | |-)?THR(OUGH|U)[ \t.!]*(-[^\n\r]*)?
3216      [ \t.!]*(Else,? |Intentional(ly)? )?Fall((s | |-)[Tt]|t)hr(ough|u)[ \t.!]*(-[^\n\r]*)?
3217      [ \t.!]*([Ee]lse,? |[Ii]ntentional(ly)? )?fall(s | |-)?thr(ough|u)[ \t.!]*(-[^\n\r]*)?
3218    */
3219   else
3220     {
3221       while (*from == ' ' || *from == '\t' || *from == '.' || *from == '!')
3222         from++;
           /* F holds the first letter of the word currently being matched.
              ALL_UPPER is set once an entirely upper-case word has been
              seen; the comparisons that follow then use upper-case
              spellings.  */
3223       unsigned char f = *from;
3224       bool all_upper = false;
3225       if (f == 'E' || f == 'e')
3226         {
3227           if ((size_t) (pfile->buffer->cur - from)
3228               < sizeof "else fallthru" - 1)
3229             return false;
3230           if (f == 'E' && memcmp (from + 1, "LSE", sizeof "LSE" - 1) == 0)
3231             all_upper = true;
3232           else if (memcmp (from + 1, "lse", sizeof "lse" - 1))
3233             return false;
3234           from += sizeof "else" - 1;
3235           if (*from == ',')
3236             from++;
3237           if (*from != ' ')
3238             return false;
3239           from++;
3240           if (all_upper && *from == 'f')
3241             return false;
3242           if (f == 'e' && *from == 'F')
3243             return false;
3244           f = *from;
3245         }
3246       else if (f == 'I' || f == 'i')
3247         {
3248           if ((size_t) (pfile->buffer->cur - from)
3249               < sizeof "intentional fallthru" - 1)
3250             return false;
3251           if (f == 'I' && memcmp (from + 1, "NTENTIONAL",
3252                                   sizeof "NTENTIONAL" - 1) == 0)
3253             all_upper = true;
3254           else if (memcmp (from + 1, "ntentional",
3255                            sizeof "ntentional" - 1))
3256             return false;
3257           from += sizeof "intentional" - 1;
3258           if (*from == ' ')
3259             {
3260               from++;
3261               if (all_upper && *from == 'f')
3262                 return false;
3263             }
3264           else if (all_upper)
3265             {
3266               if (memcmp (from, "LY F", sizeof "LY F" - 1))
3267                 return false;
3268               from += sizeof "LY " - 1;
3269             }
3270           else
3271             {
3272               if (memcmp (from, "ly ", sizeof "ly " - 1))
3273                 return false;
3274               from += sizeof "ly " - 1;
3275             }
3276           if (f == 'i' && *from == 'F')
3277             return false;
3278           f = *from;
3279         }
3280       if (f != 'F' && f != 'f')
3281         return false;
3282       if ((size_t) (pfile->buffer->cur - from) < sizeof "fallthru" - 1)
3283         return false;
3284       if (f == 'F' && memcmp (from + 1, "ALL", sizeof "ALL" - 1) == 0)
3285         all_upper = true;
3286       else if (all_upper)
3287         return false;
3288       else if (memcmp (from + 1, "all", sizeof "all" - 1))
3289         return false;
3290       from += sizeof "fall" - 1;
3291       if (*from == (all_upper ? 'S' : 's') && from[1] == ' ')
3292         from += 2;
3293       else if (*from == ' ' || *from == '-')
3294         from++;
3295       else if (*from != (all_upper ? 'T' : 't'))
3296         return false;
3297       if ((f == 'f' || *from != 'T') && (all_upper || *from != 't'))
3298         return false;
3299       if ((size_t) (pfile->buffer->cur - from) < sizeof "thru" - 1)
3300         return false;
3301       if (memcmp (from + 1, all_upper ? "HRU" : "hru", sizeof "hru" - 1))
3302         {
3303           if ((size_t) (pfile->buffer->cur - from) < sizeof "through" - 1)
3304             return false;
3305           if (memcmp (from + 1, all_upper ? "HROUGH" : "hrough",
3306                       sizeof "hrough" - 1))
3307             return false;
3308           from += sizeof "through" - 1;
3309         }
3310       else
3311         from += sizeof "thru" - 1;
3312       while (*from == ' ' || *from == '\t' || *from == '.' || *from == '!')
3313         from++;
           /* Optional trailing "-..." part: skip to the end of the line or,
              in a block comment, up to the closing star-slash.  */
3314       if (*from == '-')
3315         {
3316           from++;
3317           if (*comment_start == '*')
3318             {
3319               do
3320                 {
3321                   while (*from && *from != '*'
3322                          && *from != '\n' && *from != '\r')
3323                     from++;
3324                   if (*from != '*' || from[1] == '/')
3325                     break;
3326                   from++;
3327                 }
3328               while (1);
3329             }
3330           else
3331             while (*from && *from != '\n' && *from != '\r')
3332               from++;
3333         }
3334     }
       /* Whatever was matched must be followed directly by the end of the
          comment.  */
3335   /* C block comment.  */
3336   if (*comment_start == '*')
3337     {
3338       if (*from != '*' || from[1] != '/')
3339         return false;
3340     }
3341   /* C++ line comment.  */
3342   else if (*from != '\n')
3343     return false;
3344
3345   return true;
3346 }
3347
3348 /* Allocate COUNT tokens for RUN.

        run->base is set to a new vector of COUNT cpp_tokens, run->limit to
        one past its last element, and run->next to NULL.  run->prev is not
        touched here.  */
3349 void
3350 _cpp_init_tokenrun (tokenrun *run, unsigned int count)
3351 {
3352   run->base = XNEWVEC (cpp_token, count);
3353   run->limit = run->base + count;
3354   run->next = NULL;
3355 }
3356
3357 /* Returns the next tokenrun, or creates one if there is none.

        A newly created run is linked in after RUN, with its prev pointer
        set to RUN, and is initialized with room for 250 tokens.  */
3358 static tokenrun *
3359 next_tokenrun (tokenrun *run)
3360 {
3361   if (run->next == NULL)
3362     {
3363       run->next = XNEW (tokenrun);
3364       run->next->prev = run;
3365       _cpp_init_tokenrun (run->next, 250);
3366     }
3367
3368   return run->next;
3369 }
3370
3371 /* Return the number of not yet processed tokens in a given
3372    context.

        The count is LAST (context) minus FIRST (context), taken on the
        token pointers for TOKENS_KIND_DIRECT and on the ptoken pointers
        for TOKENS_KIND_INDIRECT and TOKENS_KIND_EXTENDED.  Any other
        tokens_kind aborts.  */
3373 int
3374 _cpp_remaining_tokens_num_in_context (cpp_context *context)
3375 {
3376   if (context->tokens_kind == TOKENS_KIND_DIRECT)
3377     return (LAST (context).token - FIRST (context).token);
3378   else if (context->tokens_kind == TOKENS_KIND_INDIRECT
3379            || context->tokens_kind == TOKENS_KIND_EXTENDED)
3380     return (LAST (context).ptoken - FIRST (context).ptoken);
3381   else
3382       abort ();
3383 }
3384
3385 /* Returns the token present at index INDEX in a given context.  If
3386    INDEX is zero, the next token to be processed is returned.

        INDEX is applied relative to FIRST (context).  For
        TOKENS_KIND_DIRECT the address of the indexed token is returned;
        for TOKENS_KIND_INDIRECT and TOKENS_KIND_EXTENDED the indexed
        ptoken entry is returned as it is.  Any other tokens_kind aborts.
        INDEX is not range-checked here.  */
3387 static const cpp_token*
3388 _cpp_token_from_context_at (cpp_context *context, int index)
3389 {
3390   if (context->tokens_kind == TOKENS_KIND_DIRECT)
3391     return &(FIRST (context).token[index]);
3392   else if (context->tokens_kind == TOKENS_KIND_INDIRECT
3393            || context->tokens_kind == TOKENS_KIND_EXTENDED)
3394     return FIRST (context).ptoken[index];
3395  else
3396    abort ();
3397 }
3398
3399 /* Look ahead in the input stream.

        Returns the token INDEX positions ahead of the next token to be
        processed (INDEX zero being that next token).  Pending contexts
        (those with a non-null prev) are searched first.  If the token is
        not found there, new tokens are lexed with pfile->keep_tokens
        raised and are then pushed back with _cpp_backup_tokens_direct.
        The look-ahead stops early at CPP_EOF or CPP_PRAGMA, and that
        token is returned instead.  */
3400 const cpp_token *
3401 cpp_peek_token (cpp_reader *pfile, int index)
3402 {
3403   cpp_context *context = pfile->context;
3404   const cpp_token *peektok;
3405   int count;
3406
3407   /* First, scan through any pending cpp_context objects.  */
3408   while (context->prev)
3409     {
3410       ptrdiff_t sz = _cpp_remaining_tokens_num_in_context (context);
3411
3412       if (index < (int) sz)
3413         return _cpp_token_from_context_at (context, index);
           /* Not in this context: make INDEX relative to the next one.  */
3414       index -= (int) sz;
3415       context = context->prev;
3416     }
3417
3418   /* We will have to read some new tokens after all (and do so
3419      without invalidating preceding tokens).  */
3420   count = index;
3421   pfile->keep_tokens++;
3422
3423   /* For peeked tokens temporarily disable line_change reporting,
3424      until the tokens are parsed for real.  */
3425   void (*line_change) (cpp_reader *, const cpp_token *, int)
3426     = pfile->cb.line_change;
3427   pfile->cb.line_change = NULL;
3428
       /* Each pass lexes one token.  INDEX is decremented once per token
          lexed, either by the loop condition or by hand just before a
          break, so that COUNT - INDEX below equals the number of tokens
          lexed.  */
3429   do
3430     {
3431       peektok = _cpp_lex_token (pfile);
3432       if (peektok->type == CPP_EOF)
3433         {
3434           index--;
3435           break;
3436         }
3437       else if (peektok->type == CPP_PRAGMA)
3438         {
3439           /* Don't peek past a pragma.  */
3440           if (peektok == &pfile->directive_result)
3441             /* Save the pragma in the buffer.  */
3442             *pfile->cur_token++ = *peektok;
3443           index--;
3444           break;
3445         }
3446     }
3447   while (index--);
3448
       /* Push back everything lexed above and restore the saved state.  */
3449   _cpp_backup_tokens_direct (pfile, count - index);
3450   pfile->keep_tokens--;
3451   pfile->cb.line_change = line_change;
3452
3453   return peektok;
3454 }
3455
3456 /* Allocate a single token that is invalidated at the same time as the
3457    rest of the tokens on the line.  Has its line and col set to the
3458    same as the last lexed token, so that diagnostics appear in the
3459    right place.

        The token returned is the slot at pfile->cur_token, which is then
        advanced past it.  Only src_loc is initialized here, copied from
        the token just before the old cur_token.  */
3460 cpp_token *
3461 _cpp_temp_token (cpp_reader *pfile)
3462 {
3463   cpp_token *old, *result;
       /* SZ is the number of slots between cur_token and the end of the
          current run; LA is pfile->lookaheads.
          NOTE(review): LA is presumably the number of already-lexed tokens
          stored from cur_token onwards -- confirm against the users of
          pfile->lookaheads.  */
3464   ptrdiff_t sz = pfile->cur_run->limit - pfile->cur_token;
3465   ptrdiff_t la = (ptrdiff_t) pfile->lookaheads;
3466
3467   old = pfile->cur_token - 1;
3468   /* Any pre-existing lookaheads must not be clobbered.  */
3469   if (la)
3470     {
           /* The lookaheads reach the end of the current run, so moving
              them up by one slot spills into the next run: shift that
              run's first LA - SZ tokens up by one and copy the current
              run's last token into its first slot.  */
3471       if (sz <= la)
3472         {
3473           tokenrun *next = next_tokenrun (pfile->cur_run);
3474
3475           if (sz < la)
3476             memmove (next->base + 1, next->base,
3477                      (la - sz) * sizeof (cpp_token));
3478
3479           next->base[0] = pfile->cur_run->limit[-1];
3480         }
3481
           /* Shift the lookaheads that stay in the current run up by one
              slot, leaving cur_token free.  */
3482       if (sz > 1)
3483         memmove (pfile->cur_token + 1, pfile->cur_token,
3484                  MIN (la, sz - 1) * sizeof (cpp_token));
3485     }
3486
       /* No room left in the current run: continue in the next one.  */
3487   if (!sz && pfile->cur_token == pfile->cur_run->limit)
3488     {
3489       pfile->cur_run = next_tokenrun (pfile->cur_run);
3490       pfile->cur_token = pfile->cur_run->base;
3491     }
3492
3493   result = pfile->cur_token++;
3494   result->src_loc = old->src_loc;
3495   return result;
3496 }
3497
3498 /* We're at the beginning of a logical line (so not in
3499   directives-mode) and RESULT is a CPP_NAME with NODE_MODULE set.  See
3500   if we should enter deferred_pragma mode to tokenize the rest of the
3501   line as a module control-line.

       PFILE is the reader; RESULT is the first token of the line.  Up to
       two further tokens are peeked with _cpp_lex_direct.

       If the line is accepted, the leading keyword token(s) are flagged
       NO_EXPAND and re-pointed at the n_modules[...][1] nodes, and the
       reader stays in deferred-pragma mode.  Otherwise save_comments,
       in_deferred_pragma and angled_headers are put back to the values
       asserted on entry.  In both cases the peeked tokens are backed up,
       except that a trailing CPP_PRAGMA_EOL is dropped.  */
3502
3503 static void
3504 cpp_maybe_module_directive (cpp_reader *pfile, cpp_token *result)
3505 {
3506   unsigned backup = 0; /* Tokens we peeked.  */
3507   cpp_hashnode *node = result->val.node.node;
3508   cpp_token *peek = result;
       /* The module/import/__import token; differs from RESULT only when
          RESULT is 'export'.  */
3509   cpp_token *keyword = peek;
3510   cpp_hashnode *(&n_modules)[spec_nodes::M_HWM][2] = pfile->spec_nodes.n_modules;
       /* Nonzero once import or __import has been recognized; later stored
          in state.directive_file_token.
          NOTE(review): the meaning of the "+ 2" and "+ 16" components is
          defined by the consumer of directive_file_token, which is not
          visible here -- confirm before changing them.  */
3511   int header_count = 0;
3512
3513   /* Make sure the incoming state is as we expect it.  This way we
3514      can restore it using constants.  */
3515   gcc_checking_assert (!pfile->state.in_deferred_pragma
3516                        && !pfile->state.skipping
3517                        && !pfile->state.parsing_args
3518                        && !pfile->state.angled_headers
3519                        && (pfile->state.save_comments
3520                            == !CPP_OPTION (pfile, discard_comments)));
3521
3522   /* Enter directives mode sufficiently for peeking.  We don't have
3523      to actually set in_directive.  */
3524   pfile->state.in_deferred_pragma = true;
3525
3526   /* These two fields are needed to process tokenization in deferred
3527      pragma mode.  They are not used outside deferred pragma mode or
3528      directives mode.  */
3529   pfile->state.pragma_allow_expansion = true;
3530   pfile->directive_line = result->src_loc;
3531
3532   /* Saving comments is incompatible with directives mode.   */
3533   pfile->state.save_comments = 0;
3534
       /* A leading 'export' must be followed by another NODE_MODULE name,
          which then becomes the keyword under test.  */
3535   if (node == n_modules[spec_nodes::M_EXPORT][0])
3536     {
3537       peek = _cpp_lex_direct (pfile);
3538       keyword = peek;
3539       backup++;
3540       if (keyword->type != CPP_NAME)
3541         goto not_module;
3542       node = keyword->val.node.node;
3543       if (!(node->flags & NODE_MODULE))
3544         goto not_module;
3545     }
3546
3547   if (node == n_modules[spec_nodes::M__IMPORT][0])
3548     /* __import  */
3549     header_count = backup + 2 + 16;
3550   else if (node == n_modules[spec_nodes::M_IMPORT][0])
3551     /* import  */
3552     header_count = backup + 2 + (CPP_OPTION (pfile, preprocessed) ? 16 : 0);
3553   else if (node == n_modules[spec_nodes::M_MODULE][0])
3554     ; /* module  */
3555   else
3556     goto not_module;
3557
3558   /* We've seen [export] {module|import|__import}.  Check the next token.  */
3559   if (header_count)
3560     /* After '{,__}import' a header name may appear.  */
3561     pfile->state.angled_headers = true;
3562   peek = _cpp_lex_direct (pfile);
3563   backup++;
3564
3565   /* ... import followed by identifier, ':', '<' or
3566      header-name preprocessing tokens, or module
3567      followed by cpp-identifier, ':' or ';' preprocessing
3568      tokens.  C++ keywords are not yet relevant.  */
3569   if (peek->type == CPP_NAME
3570       || peek->type == CPP_COLON
3571       ||  (header_count
3572            ? (peek->type == CPP_LESS
3573               || (peek->type == CPP_STRING && peek->val.str.text[0] != 'R')
3574               || peek->type == CPP_HEADER_NAME)
3575            : peek->type == CPP_SEMICOLON))
3576     {
           /* Expansion on the rest of the line is allowed only when the
              input is not already preprocessed.  */
3577       pfile->state.pragma_allow_expansion = !CPP_OPTION (pfile, preprocessed);
3578       if (!pfile->state.pragma_allow_expansion)
3579         pfile->state.prevent_expansion++;
3580
           /* HEADER_COUNT is zero only for 'module'.  */
3581       if (!header_count && linemap_included_from
3582           (LINEMAPS_LAST_ORDINARY_MAP (pfile->line_table)))
3583         cpp_error_with_line (pfile, CPP_DL_ERROR, keyword->src_loc, 0,
3584                              "module control-line cannot be in included file");
3585
3586       /* The first one or two tokens cannot be macro names.  */
           /* IX == 0 selects RESULT, a nonzero IX selects KEYWORD.  Without
              an 'export' prefix BACKUP is 1 here, so only RESULT (which is
              then the keyword itself) is visited.  */
3587       for (int ix = backup; ix--;)
3588         {
3589           cpp_token *tok = ix ? keyword : result;
               /* Shadows the function-level NODE.  */
3590           cpp_hashnode *node = tok->val.node.node;
3591
3592           /* Don't attempt to expand the token.  */
3593           tok->flags |= NO_EXPAND;
3594           if (_cpp_defined_macro_p (node)
3595               && _cpp_maybe_notify_macro_use (pfile, node, tok->src_loc)
3596               && !cpp_fun_like_macro_p (node))
3597             cpp_error_with_line (pfile, CPP_DL_ERROR, tok->src_loc, 0,
3598                                  "module control-line \"%s\" cannot be"
3599                                  " an object-like macro",
3600                                  NODE_NAME (node));
3601         }
3602
3603       /* Map to underbar variants.  */
3604       keyword->val.node.node = n_modules[header_count
3605                                          ? spec_nodes::M_IMPORT
3606                                          : spec_nodes::M_MODULE][1];
           /* BACKUP != 1 means an 'export' prefix was consumed, so RESULT is
              the 'export' token.  */
3607       if (backup != 1)
3608         result->val.node.node = n_modules[spec_nodes::M_EXPORT][1];
3609
3610       /* Maybe tell the tokenizer we expect a header-name down the
3611          road.  */
3612       pfile->state.directive_file_token = header_count;
3613     }
3614   else
3615     {
         /* Also reached by goto from the keyword checks above.  */
3616     not_module:
3617       /* Drop out of directive mode.  */
3618       /* We asserted save_comments had this value upon entry.  */
3619       pfile->state.save_comments
3620         = !CPP_OPTION (pfile, discard_comments);
3621       pfile->state.in_deferred_pragma = false;
3622       /* Do not let this remain on.  */
3623       pfile->state.angled_headers = false;
3624     }
3625
3626   /* In either case we want to backup the peeked tokens.  */
3627   if (backup)
3628     {
3629       /* If we saw EOL, we should drop it, because this isn't a module
3630          control-line after all.  */
3631       bool eol = peek->type == CPP_PRAGMA_EOL;
           /* When the EOL was the only peeked token nothing is backed up.  */
3632       if (!eol || backup > 1)
3633         {
3634           /* Put the peeked tokens back.  */
3635           _cpp_backup_tokens_direct (pfile, backup);
3636           /* But if the last one was an EOL, forget it.  */
3637           if (eol)
3638             pfile->lookaheads--;
3639         }
3640     }
3641 }
3642
3643 /* Lex a token into RESULT (external interface).  Takes care of issues
3644    like directive handling, token lookahead, multiple include
3645    optimization and skipping.

        Returns the next token: either a slot in the current token run or
        &pfile->directive_result when a directive (or a beginning-of-line
        token seen while in_deferred_pragma is set) supplies the result.
        The loop goes round again after a directive that leaves
        CPP_PADDING in directive_result, and, outside directives, for
        every token other than CPP_EOF while pfile->state.skipping is
        set.  */
3646 const cpp_token *
3647 _cpp_lex_token (cpp_reader *pfile)
3648 {
3649   cpp_token *result;
3650
3651   for (;;)
3652     {
           /* Hop to the next token run once the current one is used up.  */
3653       if (pfile->cur_token == pfile->cur_run->limit)
3654         {
3655           pfile->cur_run = next_tokenrun (pfile->cur_run);
3656           pfile->cur_token = pfile->cur_run->base;
3657         }
3658       /* We assume that the current token is somewhere in the current
3659          run.  */
3660       if (pfile->cur_token < pfile->cur_run->base
3661           || pfile->cur_token >= pfile->cur_run->limit)
3662         abort ();
3663
           /* A pending lookahead is a token already sitting in the run:
              hand it out instead of lexing a new one.  */
3664       if (pfile->lookaheads)
3665         {
3666           pfile->lookaheads--;
3667           result = pfile->cur_token++;
3668         }
3669       else
3670         result = _cpp_lex_direct (pfile);
3671
           /* Tokens flagged BOL may start a directive or a module
              control-line.  */
3672       if (result->flags & BOL)
3673         {
3674           /* Is this a directive.  If _cpp_handle_directive returns
3675              false, it is an assembler #.  */
3676           if (result->type == CPP_HASH
3677               /* 6.10.3 p 11: Directives in a list of macro arguments
3678                  gives undefined behavior.  This implementation
3679                  handles the directive as normal.  */
3680               && pfile->state.parsing_args != 1)
3681             {
3682               if (_cpp_handle_directive (pfile, result->flags & PREV_WHITE))
3683                 {
                       /* CPP_PADDING means the directive produced no token
                          to return; lex the next one.  */
3684                   if (pfile->directive_result.type == CPP_PADDING)
3685                     continue;
3686                   result = &pfile->directive_result;
3687                 }
3688             }
3689           else if (pfile->state.in_deferred_pragma)
3690             result = &pfile->directive_result;
3691           else if (result->type == CPP_NAME
3692                    && (result->val.node.node->flags & NODE_MODULE)
3693                    && !pfile->state.skipping
3694                    /* Unlike regular directives, we do not deal with
3695                       tokenizing module directives as macro arguments.
3696                       That's not permitted.  */
3697                    && !pfile->state.parsing_args)
3698             {
3699               /* P1857.  Before macro expansion, At start of logical
3700                  line ... */
3701               /* We don't have to consider lookaheads at this point.  */
3702               gcc_checking_assert (!pfile->lookaheads);
3703
3704               cpp_maybe_module_directive (pfile, result);
3705             }
3706
               /* Tell the client, if it registered a callback, that a new
                  line has started -- but not while skipping.  */
3707           if (pfile->cb.line_change && !pfile->state.skipping)
3708             pfile->cb.line_change (pfile, result, pfile->state.parsing_args);
3709         }
3710
3711       /* We don't skip tokens in directives.  */
3712       if (pfile->state.in_directive || pfile->state.in_deferred_pragma)
3713         break;
3714
3715       /* Outside a directive, invalidate controlling macros.  At file
3716          EOF, _cpp_lex_direct takes care of popping the buffer, so we never
3717          get here and MI optimization works.  */
3718       pfile->mi_valid = false;
3719
           /* While skipping, only CPP_EOF is returned to the caller; every
              other token is discarded by going round the loop again.  */
3720       if (!pfile->state.skipping || result->type == CPP_EOF)
3721         break;
3722     }
3723
3724   return result;
3725 }
3726
3727 /* Returns true if a fresh line has been loaded.

        Also returns true, without doing anything, when the current buffer
        does not need a line (buffer->need_line is clear).  Returns false
        when no line can be supplied: inside a directive, while parsing
        macro arguments at the end of a buffer, or at the end of the last
        buffer (or of one marked return_at_eof).

        LEXING_RAW_STRING selects how pfile->state.in_directive is treated:
        when false, being in a directive fails immediately; when true, a
        further line from the same buffer may still be loaded, and only
        moving past the end of that buffer is refused.  */
3728 template <bool lexing_raw_string>
3729 static bool
3730 get_fresh_line_impl (cpp_reader *pfile)
3731 {
3732   /* We can't get a new line until we leave the current directive, unless we
3733      are lexing a raw string, in which case it will be OK as long as we don't
3734      pop the current buffer.  */
3735   if (!lexing_raw_string && pfile->state.in_directive)
3736     return false;
3737
       /* Each iteration looks at the buffer on top of the stack; the loop
          repeats only after a finished buffer has been popped.  */
3738   for (;;)
3739     {
3740       cpp_buffer *buffer = pfile->buffer;
3741
3742       if (!buffer->need_line)
3743         return true;
3744
           /* More text remains in this buffer: prepare its next line.  */
3745       if (buffer->next_line < buffer->rlimit)
3746         {
3747           _cpp_clean_line (pfile);
3748           return true;
3749         }
3750
3751       /* We can't change buffers until we leave the current directive.  */
3752       if (lexing_raw_string && pfile->state.in_directive)
3753         return false;
3754
3755       /* First, get out of parsing arguments state.  */
3756       if (pfile->state.parsing_args)
3757         return false;
3758
3759       /* End of buffer.  Non-empty files should end in a newline.  */
3760       if (buffer->buf != buffer->rlimit
3761           && buffer->next_line > buffer->rlimit
3762           && !buffer->from_stage3)
3763         {
3764           /* Clip to buffer size.  */
3765           buffer->next_line = buffer->rlimit;
3766         }
3767
           /* A buffer with a predecessor is popped here and the loop retries
              with the buffer underneath, unless it is marked
              return_at_eof.  */
3768       if (buffer->prev && !buffer->return_at_eof)
3769         _cpp_pop_buffer (pfile);
3770       else
3771         {
3772           /* End of translation.  Do not pop the buffer yet. Increment
3773              line number so that the EOF token is on a line of its own
3774              (_cpp_lex_direct doesn't increment in that case, because
3775              it's hard for it to distinguish this special case). */
3776           CPP_INCREMENT_LINE (pfile, 0);
3777           return false;
3778         }
3779     }
3780 }
3781
     /* Entry point for loading a fresh line when not lexing a raw string:
        forwards to get_fresh_line_impl with LEXING_RAW_STRING false and
        returns its result unchanged.  */
3782 bool
3783 _cpp_get_fresh_line (cpp_reader *pfile)
3784 {
3785   return get_fresh_line_impl<false> (pfile);
3786 }
3787
3788
     /* Helper for two-character operators.  Sets result->type to
        ELSE_TYPE; if the next unread character (*buffer->cur) equals CHAR,
        consumes it and sets result->type to THEN_TYPE instead.  Relies on
        variables named `result' and `buffer' being in scope where the
        macro is expanded.  */
3789 #define IF_NEXT_IS(CHAR, THEN_TYPE, ELSE_TYPE)          \
3790   do                                                    \
3791     {                                                   \
3792       result->type = ELSE_TYPE;                         \
3793       if (*buffer->cur == CHAR)                         \
3794         buffer->cur++, result->type = THEN_TYPE;        \
3795     }                                                   \
3796   while (0)
3797
3798 /* Lex a token into pfile->cur_token, which is also incremented, to
3799    get diagnostics pointing to the correct location.
3800
3801    Does not handle issues such as token lookahead, multiple-include
3802    optimization, directives, skipping etc.  This function is only
3803    suitable for use by _cpp_lex_token, and in special cases like
3804    lex_expansion_token which doesn't care for any of these issues.
3805
3806    When meeting a newline, returns CPP_EOF if parsing a directive,
3807    otherwise returns to the start of the token buffer if permissible.
3808    Returns the location of the lexed token.  */
3809 cpp_token *
3810 _cpp_lex_direct (cpp_reader *pfile)
3811 {
3812   cppchar_t c;
3813   cpp_buffer *buffer;
3814   const unsigned char *comment_start;
3815   bool fallthrough_comment = false;
3816   cpp_token *result = pfile->cur_token++;
3817
3818  fresh_line:
3819   result->flags = 0;
3820   buffer = pfile->buffer;
3821   if (buffer->need_line)
3822     {
3823       if (pfile->state.in_deferred_pragma)
3824         {
3825           /* This can happen in cases like:
3826              #define loop(x) whatever
3827              #pragma omp loop
3828              where when trying to expand loop we need to peek
3829              next token after loop, but aren't still in_deferred_pragma
3830              mode but are in in_directive mode, so buffer->need_line
3831              is set, a CPP_EOF is peeked.  */
3832           result->type = CPP_PRAGMA_EOL;
3833           pfile->state.in_deferred_pragma = false;
3834           if (!pfile->state.pragma_allow_expansion)
3835             pfile->state.prevent_expansion--;
3836           return result;
3837         }
3838       if (!_cpp_get_fresh_line (pfile))
3839         {
3840           result->type = CPP_EOF;
3841           /* Not a real EOF in a directive or arg parsing -- we refuse
3842              to advance to the next file now, and will once we're out
3843              of those modes.  */
3844           if (!pfile->state.in_directive && !pfile->state.parsing_args)
3845             {
3846               /* Tell the compiler the line number of the EOF token.  */
3847               result->src_loc = pfile->line_table->highest_line;
3848               result->flags = BOL;
3849               /* Now pop the buffer that _cpp_get_fresh_line did not.  */
3850               _cpp_pop_buffer (pfile);
3851             }
3852           return result;
3853         }
3854       if (buffer != pfile->buffer)
3855         fallthrough_comment = false;
3856       if (!pfile->keep_tokens)
3857         {
3858           pfile->cur_run = &pfile->base_run;
3859           result = pfile->base_run.base;
3860           pfile->cur_token = result + 1;
3861         }
3862       result->flags = BOL;
3863       if (pfile->state.parsing_args == 2)
3864         result->flags |= PREV_WHITE;
3865     }
3866   buffer = pfile->buffer;
3867  update_tokens_line:
3868   result->src_loc = pfile->line_table->highest_line;
3869
3870  skipped_white:
3871   if (buffer->cur >= buffer->notes[buffer->cur_note].pos
3872       && !pfile->overlaid_buffer)
3873     {
3874       _cpp_process_line_notes (pfile, false);
3875       result->src_loc = pfile->line_table->highest_line;
3876     }
3877   c = *buffer->cur++;
3878
3879   if (pfile->forced_token_location)
3880     result->src_loc = pfile->forced_token_location;
3881   else
3882     result->src_loc = linemap_position_for_column (pfile->line_table,
3883                                           CPP_BUF_COLUMN (buffer, buffer->cur));
3884
3885   switch (c)
3886     {
3887     case ' ': case '\t': case '\f': case '\v': case '\0':
3888       result->flags |= PREV_WHITE;
3889       skip_whitespace (pfile, c);
3890       goto skipped_white;
3891
3892     case '\n':
3893       /* Increment the line, unless this is the last line ...  */
3894       if (buffer->cur < buffer->rlimit
3895           /* ... or this is a #include, (where _cpp_stack_file needs to
3896              unwind by one line) ...  */
3897           || (pfile->state.in_directive > 1
3898               /* ... except traditional-cpp increments this elsewhere.  */
3899               && !CPP_OPTION (pfile, traditional)))
3900         CPP_INCREMENT_LINE (pfile, 0);
3901       buffer->need_line = true;
3902       if (pfile->state.in_deferred_pragma)
3903         {
3904           /* Produce the PRAGMA_EOL on this line.  File reading
3905              ensures there is always a \n at end of the buffer, thus
3906              in a deferred pragma we always see CPP_PRAGMA_EOL before
3907              any CPP_EOF.  */
3908           result->type = CPP_PRAGMA_EOL;
3909           result->flags &= ~PREV_WHITE;
3910           pfile->state.in_deferred_pragma = false;
3911           if (!pfile->state.pragma_allow_expansion)
3912             pfile->state.prevent_expansion--;
3913           return result;
3914         }
3915       goto fresh_line;
3916
3917     case '0': case '1': case '2': case '3': case '4':
3918     case '5': case '6': case '7': case '8': case '9':
3919       {
3920         struct normalize_state nst = INITIAL_NORMALIZE_STATE;
3921         result->type = CPP_NUMBER;
3922         lex_number (pfile, &result->val.str, &nst);
3923         warn_about_normalization (pfile, result, &nst, false);
3924         break;
3925       }
3926
3927     case 'L':
3928     case 'u':
3929     case 'U':
3930     case 'R':
3931       /* 'L', 'u', 'U', 'u8' or 'R' may introduce wide characters,
3932          wide strings or raw strings.  */
3933       if (c == 'L' || CPP_OPTION (pfile, rliterals)
3934           || (c != 'R' && CPP_OPTION (pfile, uliterals)))
3935         {
3936           if ((*buffer->cur == '\'' && c != 'R')
3937               || *buffer->cur == '"'
3938               || (*buffer->cur == 'R'
3939                   && c != 'R'
3940                   && buffer->cur[1] == '"'
3941                   && CPP_OPTION (pfile, rliterals))
3942               || (*buffer->cur == '8'
3943                   && c == 'u'
3944                   && ((buffer->cur[1] == '"' || (buffer->cur[1] == '\''
3945                                 && CPP_OPTION (pfile, utf8_char_literals)))
3946                       || (buffer->cur[1] == 'R' && buffer->cur[2] == '"'
3947                           && CPP_OPTION (pfile, rliterals)))))
3948             {
3949               lex_string (pfile, result, buffer->cur - 1);
3950               break;
3951             }
3952         }
3953       /* Fall through.  */
3954
3955     case '_':
3956     case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
3957     case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
3958     case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
3959     case 's': case 't':           case 'v': case 'w': case 'x':
3960     case 'y': case 'z':
3961     case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
3962     case 'G': case 'H': case 'I': case 'J': case 'K':
3963     case 'M': case 'N': case 'O': case 'P': case 'Q':
3964     case 'S': case 'T':           case 'V': case 'W': case 'X':
3965     case 'Y': case 'Z':
3966       result->type = CPP_NAME;
3967       {
3968         struct normalize_state nst = INITIAL_NORMALIZE_STATE;
3969         const auto node = lex_identifier (pfile, buffer->cur - 1, false, &nst,
3970                                           &result->val.node.spelling);
3971         result->val.node.node = node;
3972         identifier_diagnostics_on_lex (pfile, node);
3973         warn_about_normalization (pfile, result, &nst, true);
3974       }
3975
3976       /* Convert named operators to their proper types.  */
3977       if (result->val.node.node->flags & NODE_OPERATOR)
3978         {
3979           result->flags |= NAMED_OP;
3980           result->type = (enum cpp_ttype) result->val.node.node->directive_index;
3981         }
3982
3983       /* Signal FALLTHROUGH comment followed by another token.  */
3984       if (fallthrough_comment)
3985         result->flags |= PREV_FALLTHROUGH;
3986       break;
3987
3988     case '\'':
3989     case '"':
3990       lex_string (pfile, result, buffer->cur - 1);
3991       break;
3992
3993     case '/':
3994       /* A potential block or line comment.  */
3995       comment_start = buffer->cur;
3996       c = *buffer->cur;
3997
3998       if (c == '*')
3999         {
4000           if (_cpp_skip_block_comment (pfile))
4001             cpp_error (pfile, CPP_DL_ERROR, "unterminated comment");
4002         }
4003       else if (c == '/' && ! CPP_OPTION (pfile, traditional))
4004         {
4005           /* Don't warn for system headers.  */
4006           if (_cpp_in_system_header (pfile))
4007             ;
4008           /* Warn about comments if pedantically GNUC89, and not
4009              in system headers.  */
4010           else if (CPP_OPTION (pfile, lang) == CLK_GNUC89
4011                    && CPP_PEDANTIC (pfile)
4012                    && ! buffer->warned_cplusplus_comments)
4013             {
4014               if (cpp_error (pfile, CPP_DL_PEDWARN,
4015                              "C++ style comments are not allowed in ISO C90"))
4016                 cpp_error (pfile, CPP_DL_NOTE,
4017                            "(this will be reported only once per input file)");
4018               buffer->warned_cplusplus_comments = 1;
4019             }
4020           /* Or if specifically desired via -Wc90-c99-compat.  */
4021           else if (CPP_OPTION (pfile, cpp_warn_c90_c99_compat) > 0
4022                    && ! CPP_OPTION (pfile, cplusplus)
4023                    && ! buffer->warned_cplusplus_comments)
4024             {
4025               if (cpp_error (pfile, CPP_DL_WARNING,
4026                              "C++ style comments are incompatible with C90"))
4027                 cpp_error (pfile, CPP_DL_NOTE,
4028                            "(this will be reported only once per input file)");
4029               buffer->warned_cplusplus_comments = 1;
4030             }
4031           /* In C89/C94, C++ style comments are forbidden.  */
4032           else if ((CPP_OPTION (pfile, lang) == CLK_STDC89
4033                     || CPP_OPTION (pfile, lang) == CLK_STDC94))
4034             {
4035               /* But don't be confused about valid code such as
4036                  - // immediately followed by *,
4037                  - // in a preprocessing directive,
4038                  - // in an #if 0 block.  */
4039               if (buffer->cur[1] == '*'
4040                   || pfile->state.in_directive
4041                   || pfile->state.skipping)
4042                 {
4043                   result->type = CPP_DIV;
4044                   break;
4045                 }
4046               else if (! buffer->warned_cplusplus_comments)
4047                 {
4048                   if (cpp_error (pfile, CPP_DL_ERROR,
4049                                  "C++ style comments are not allowed in "
4050                                  "ISO C90"))
4051                     cpp_error (pfile, CPP_DL_NOTE,
4052                                "(this will be reported only once per input "
4053                                "file)");
4054                   buffer->warned_cplusplus_comments = 1;
4055                 }
4056             }
4057           if (skip_line_comment (pfile) && CPP_OPTION (pfile, warn_comments))
4058             cpp_warning (pfile, CPP_W_COMMENTS, "multi-line comment");
4059         }
4060       else if (c == '=')
4061         {
4062           buffer->cur++;
4063           result->type = CPP_DIV_EQ;
4064           break;
4065         }
4066       else
4067         {
4068           result->type = CPP_DIV;
4069           break;
4070         }
4071
4072       if (fallthrough_comment_p (pfile, comment_start))
4073         fallthrough_comment = true;
4074
4075       if (pfile->cb.comment)
4076         {
4077           size_t len = pfile->buffer->cur - comment_start;
4078           pfile->cb.comment (pfile, result->src_loc, comment_start - 1,
4079                              len + 1);
4080         }
4081
4082       if (!pfile->state.save_comments)
4083         {
4084           result->flags |= PREV_WHITE;
4085           goto update_tokens_line;
4086         }
4087
4088       if (fallthrough_comment)
4089         result->flags |= PREV_FALLTHROUGH;
4090
4091       /* Save the comment as a token in its own right.  */
4092       save_comment (pfile, result, comment_start, c);
4093       break;
4094
4095     case '<':
4096       if (pfile->state.angled_headers)
4097         {
4098           lex_string (pfile, result, buffer->cur - 1);
4099           if (result->type != CPP_LESS)
4100             break;
4101         }
4102
4103       result->type = CPP_LESS;
4104       if (*buffer->cur == '=')
4105         {
4106           buffer->cur++, result->type = CPP_LESS_EQ;
4107           if (*buffer->cur == '>'
4108               && CPP_OPTION (pfile, cplusplus)
4109               && CPP_OPTION (pfile, lang) >= CLK_GNUCXX20)
4110             buffer->cur++, result->type = CPP_SPACESHIP;
4111         }
4112       else if (*buffer->cur == '<')
4113         {
4114           buffer->cur++;
4115           IF_NEXT_IS ('=', CPP_LSHIFT_EQ, CPP_LSHIFT);
4116         }
4117       else if (CPP_OPTION (pfile, digraphs))
4118         {
4119           if (*buffer->cur == ':')
4120             {
4121               /* C++11 [2.5/3 lex.pptoken], "Otherwise, if the next
4122                  three characters are <:: and the subsequent character
4123                  is neither : nor >, the < is treated as a preprocessor
4124                  token by itself".  */
4125               if (CPP_OPTION (pfile, cplusplus)
4126                   && CPP_OPTION (pfile, lang) != CLK_CXX98
4127                   && CPP_OPTION (pfile, lang) != CLK_GNUCXX
4128                   && buffer->cur[1] == ':'
4129                   && buffer->cur[2] != ':' && buffer->cur[2] != '>')
4130                 break;
4131
4132               buffer->cur++;
4133               result->flags |= DIGRAPH;
4134               result->type = CPP_OPEN_SQUARE;
4135             }
4136           else if (*buffer->cur == '%')
4137             {
4138               buffer->cur++;
4139               result->flags |= DIGRAPH;
4140               result->type = CPP_OPEN_BRACE;
4141             }
4142         }
4143       break;
4144
4145     case '>':
4146       result->type = CPP_GREATER;
4147       if (*buffer->cur == '=')
4148         buffer->cur++, result->type = CPP_GREATER_EQ;
4149       else if (*buffer->cur == '>')
4150         {
4151           buffer->cur++;
4152           IF_NEXT_IS ('=', CPP_RSHIFT_EQ, CPP_RSHIFT);
4153         }
4154       break;
4155
4156     case '%':
4157       result->type = CPP_MOD;
4158       if (*buffer->cur == '=')
4159         buffer->cur++, result->type = CPP_MOD_EQ;
4160       else if (CPP_OPTION (pfile, digraphs))
4161         {
4162           if (*buffer->cur == ':')
4163             {
4164               buffer->cur++;
4165               result->flags |= DIGRAPH;
4166               result->type = CPP_HASH;
4167               if (*buffer->cur == '%' && buffer->cur[1] == ':')
4168                 buffer->cur += 2, result->type = CPP_PASTE, result->val.token_no = 0;
4169             }
4170           else if (*buffer->cur == '>')
4171             {
4172               buffer->cur++;
4173               result->flags |= DIGRAPH;
4174               result->type = CPP_CLOSE_BRACE;
4175             }
4176         }
4177       break;
4178
4179     case '.':
4180       result->type = CPP_DOT;
4181       if (ISDIGIT (*buffer->cur))
4182         {
4183           struct normalize_state nst = INITIAL_NORMALIZE_STATE;
4184           result->type = CPP_NUMBER;
4185           lex_number (pfile, &result->val.str, &nst);
4186           warn_about_normalization (pfile, result, &nst, false);
4187         }
4188       else if (*buffer->cur == '.' && buffer->cur[1] == '.')
4189         buffer->cur += 2, result->type = CPP_ELLIPSIS;
4190       else if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
4191         buffer->cur++, result->type = CPP_DOT_STAR;
4192       break;
4193
4194     case '+':
4195       result->type = CPP_PLUS;
4196       if (*buffer->cur == '+')
4197         buffer->cur++, result->type = CPP_PLUS_PLUS;
4198       else if (*buffer->cur == '=')
4199         buffer->cur++, result->type = CPP_PLUS_EQ;
4200       break;
4201
4202     case '-':
4203       result->type = CPP_MINUS;
4204       if (*buffer->cur == '>')
4205         {
4206           buffer->cur++;
4207           result->type = CPP_DEREF;
4208           if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
4209             buffer->cur++, result->type = CPP_DEREF_STAR;
4210         }
4211       else if (*buffer->cur == '-')
4212         buffer->cur++, result->type = CPP_MINUS_MINUS;
4213       else if (*buffer->cur == '=')
4214         buffer->cur++, result->type = CPP_MINUS_EQ;
4215       break;
4216
4217     case '&':
4218       result->type = CPP_AND;
4219       if (*buffer->cur == '&')
4220         buffer->cur++, result->type = CPP_AND_AND;
4221       else if (*buffer->cur == '=')
4222         buffer->cur++, result->type = CPP_AND_EQ;
4223       break;
4224
4225     case '|':
4226       result->type = CPP_OR;
4227       if (*buffer->cur == '|')
4228         buffer->cur++, result->type = CPP_OR_OR;
4229       else if (*buffer->cur == '=')
4230         buffer->cur++, result->type = CPP_OR_EQ;
4231       break;
4232
4233     case ':':
4234       result->type = CPP_COLON;
4235       if (*buffer->cur == ':' && CPP_OPTION (pfile, scope))
4236         buffer->cur++, result->type = CPP_SCOPE;
4237       else if (*buffer->cur == '>' && CPP_OPTION (pfile, digraphs))
4238         {
4239           buffer->cur++;
4240           result->flags |= DIGRAPH;
4241           result->type = CPP_CLOSE_SQUARE;
4242         }
4243       break;
4244
4245     case '*': IF_NEXT_IS ('=', CPP_MULT_EQ, CPP_MULT); break;
4246     case '=': IF_NEXT_IS ('=', CPP_EQ_EQ, CPP_EQ); break;
4247     case '!': IF_NEXT_IS ('=', CPP_NOT_EQ, CPP_NOT); break;
4248     case '^': IF_NEXT_IS ('=', CPP_XOR_EQ, CPP_XOR); break;
4249     case '#': IF_NEXT_IS ('#', CPP_PASTE, CPP_HASH); result->val.token_no = 0; break;
4250
4251     case '?': result->type = CPP_QUERY; break;
4252     case '~': result->type = CPP_COMPL; break;
4253     case ',': result->type = CPP_COMMA; break;
4254     case '(': result->type = CPP_OPEN_PAREN; break;
4255     case ')': result->type = CPP_CLOSE_PAREN; break;
4256     case '[': result->type = CPP_OPEN_SQUARE; break;
4257     case ']': result->type = CPP_CLOSE_SQUARE; break;
4258     case '{': result->type = CPP_OPEN_BRACE; break;
4259     case '}': result->type = CPP_CLOSE_BRACE; break;
4260     case ';': result->type = CPP_SEMICOLON; break;
4261
4262       /* @ is a punctuator in Objective-C.  */
4263     case '@': result->type = CPP_ATSIGN; break;
4264
4265     default:
4266       {
4267         const uchar *base = --buffer->cur;
4268         static int no_warn_cnt;
4269
4270         /* Check for an extended identifier ($ or UCN or UTF-8).  */
4271         struct normalize_state nst = INITIAL_NORMALIZE_STATE;
4272         if (forms_identifier_p (pfile, true, &nst))
4273           {
4274             result->type = CPP_NAME;
4275             const auto node = lex_identifier (pfile, base, true, &nst,
4276                                               &result->val.node.spelling);
4277             result->val.node.node = node;
4278             identifier_diagnostics_on_lex (pfile, node);
4279             warn_about_normalization (pfile, result, &nst, true);
4280             break;
4281           }
4282
4283         /* Otherwise this will form a CPP_OTHER token.  Parse valid UTF-8 as a
4284            single token.  */
4285         buffer->cur++;
4286         if (c >= utf8_signifier)
4287           {
4288             const uchar *pstr = base;
4289             cppchar_t s;
4290             if (_cpp_valid_utf8 (pfile, &pstr, buffer->rlimit, 0, NULL, &s))
4291               {
4292                 if (s > UCS_LIMIT && CPP_OPTION (pfile, cpp_warn_invalid_utf8))
4293                   {
4294                     buffer->cur = base;
4295                     _cpp_warn_invalid_utf8 (pfile);
4296                   }
4297                 buffer->cur = pstr;
4298               }
4299             else if (CPP_OPTION (pfile, cpp_warn_invalid_utf8))
4300               {
4301                 buffer->cur = base;
4302                 const uchar *end = _cpp_warn_invalid_utf8 (pfile);
4303                 buffer->cur = base + 1;
4304                 no_warn_cnt = end - buffer->cur;
4305               }
4306           }
4307         else if (c >= utf8_continuation
4308                  && CPP_OPTION (pfile, cpp_warn_invalid_utf8))
4309           {
4310             if (no_warn_cnt)
4311               --no_warn_cnt;
4312             else
4313               {
4314                 buffer->cur = base;
4315                 _cpp_warn_invalid_utf8 (pfile);
4316                 buffer->cur = base + 1;
4317               }
4318           }
4319         create_literal (pfile, result, base, buffer->cur - base, CPP_OTHER);
4320         break;
4321       }
4322
4323     }
4324
4325   /* Potentially convert the location of the token to a range.  */
4326   if (result->src_loc >= RESERVED_LOCATION_COUNT
4327       && result->type != CPP_EOF)
4328     {
4329       /* Ensure that any line notes are processed, so that we have the
4330          correct physical line/column for the end-point of the token even
4331          when a logical line is split via one or more backslashes.  */
4332       if (buffer->cur >= buffer->notes[buffer->cur_note].pos
4333           && !pfile->overlaid_buffer)
4334         _cpp_process_line_notes (pfile, false);
4335
4336       source_range tok_range;
4337       tok_range.m_start = result->src_loc;
4338       tok_range.m_finish
4339         = linemap_position_for_column (pfile->line_table,
4340                                        CPP_BUF_COLUMN (buffer, buffer->cur));
4341
4342       result->src_loc
4343         = pfile->line_table->get_or_create_combined_loc (result->src_loc,
4344                                                          tok_range, nullptr, 0);
4345     }
4346
4347   return result;
4348 }
4349
4350 /* An upper bound on the number of bytes needed to spell TOKEN.
4351    Does not include preceding whitespace.  */
4352 unsigned int
4353 cpp_token_len (const cpp_token *token)
4354 {
4355   unsigned int len;
4356
4357   switch (TOKEN_SPELL (token))
4358     {
4359     default:            len = 6;                                break;
4360     case SPELL_LITERAL: len = token->val.str.len;               break;
4361     case SPELL_IDENT:   len = NODE_LEN (token->val.node.node) * 10;     break;
4362     }
4363
4364   return len;
4365 }
4366
/* Decode the UTF-8 sequence that starts at NAME and store the matching
   \UXXXXXXXX escape (eight lower-case hex digits) in BUFFER.  Exactly
   10 bytes are written to BUFFER; no terminating NUL is added.
   Returns the number of bytes of NAME that formed the sequence.
   Aborts if a continuation byte is not of the form 10xxxxxx.  */

static size_t
utf8_to_ucn (unsigned char *buffer, const unsigned char *name)
{
  static const char hex_digits[] = "0123456789abcdef";
  int seq_len = 0;
  unsigned lead = *name;
  unsigned long code_point;

  /* The count of leading one bits in the first byte is the length of
     the whole sequence.  */
  while (lead & 0x80)
    {
      seq_len++;
      lead <<= 1;
    }

  /* Whatever follows the length marker in the first byte is payload.  */
  code_point = *name & (0x7F >> seq_len);

  /* Each trailing byte supplies six more payload bits.  */
  for (int consumed = 1; consumed < seq_len; consumed++)
    {
      const unsigned char byte = *++name;

      code_point = (code_point << 6) | (byte & 0x3F);

      /* Ill-formed UTF-8.  */
      if ((byte & ~0x3F) != 0x80)
        abort ();
    }

  buffer[0] = '\\';
  buffer[1] = 'U';
  for (int digit = 0; digit < 8; digit++)
    buffer[2 + digit] = hex_digits[(code_point >> (4 * (7 - digit))) & 0xF];

  return seq_len;
}
4400
4401 /* Given a token TYPE corresponding to a digraph, return a pointer to
4402    the spelling of the digraph.  */
4403 static const unsigned char *
4404 cpp_digraph2name (enum cpp_ttype type)
4405 {
4406   return digraph_spellings[(int) type - (int) CPP_FIRST_DIGRAPH];
4407 }
4408
4409 /* Write the spelling of an identifier IDENT, using UCNs, to BUFFER.
4410    The buffer must already contain enough space to hold the
4411    token's spelling.  Returns a pointer to the character after the
4412    last character written.  */
4413 unsigned char *
4414 _cpp_spell_ident_ucns (unsigned char *buffer, cpp_hashnode *ident)
4415 {
4416   size_t i;
4417   const unsigned char *name = NODE_NAME (ident);
4418
4419   for (i = 0; i < NODE_LEN (ident); i++)
4420     if (name[i] & ~0x7F)
4421       {
4422         i += utf8_to_ucn (buffer, name + i) - 1;
4423         buffer += 10;
4424       }
4425     else
4426       *buffer++ = name[i];
4427
4428   return buffer;
4429 }
4430
4431 /* Write the spelling of a token TOKEN to BUFFER.  The buffer must
4432    already contain enough space to hold the token's spelling.
4433    Returns a pointer to the character after the last character written.
4434    FORSTRING is true if this is to be the spelling after translation
4435    phase 1 (with the original spelling of extended identifiers), false
4436    if extended identifiers should always be written using UCNs (there is
4437    no option for always writing them in the internal UTF-8 form).
4438    FIXME: Would be nice if we didn't need the PFILE argument.  */
4439 unsigned char *
4440 cpp_spell_token (cpp_reader *pfile, const cpp_token *token,
4441                  unsigned char *buffer, bool forstring)
4442 {
4443   switch (TOKEN_SPELL (token))
4444     {
4445     case SPELL_OPERATOR:
4446       {
4447         const unsigned char *spelling;
4448         unsigned char c;
4449
4450         if (token->flags & DIGRAPH)
4451           spelling = cpp_digraph2name (token->type);
4452         else if (token->flags & NAMED_OP)
4453           goto spell_ident;
4454         else
4455           spelling = TOKEN_NAME (token);
4456
4457         while ((c = *spelling++) != '\0')
4458           *buffer++ = c;
4459       }
4460       break;
4461
4462     spell_ident:
4463     case SPELL_IDENT:
4464       if (forstring)
4465         {
4466           memcpy (buffer, NODE_NAME (token->val.node.spelling),
4467                   NODE_LEN (token->val.node.spelling));
4468           buffer += NODE_LEN (token->val.node.spelling);
4469         }
4470       else
4471         buffer = _cpp_spell_ident_ucns (buffer, token->val.node.node);
4472       break;
4473
4474     case SPELL_LITERAL:
4475       memcpy (buffer, token->val.str.text, token->val.str.len);
4476       buffer += token->val.str.len;
4477       break;
4478
4479     case SPELL_NONE:
4480       cpp_error (pfile, CPP_DL_ICE,
4481                  "unspellable token %s", TOKEN_NAME (token));
4482       break;
4483     }
4484
4485   return buffer;
4486 }
4487
4488 /* Returns TOKEN spelt as a null-terminated string.  The string is
4489    freed when the reader is destroyed.  Useful for diagnostics.  */
4490 unsigned char *
4491 cpp_token_as_text (cpp_reader *pfile, const cpp_token *token)
4492 {
4493   unsigned int len = cpp_token_len (token) + 1;
4494   unsigned char *start = _cpp_unaligned_alloc (pfile, len), *end;
4495
4496   end = cpp_spell_token (pfile, token, start, false);
4497   end[0] = '\0';
4498
4499   return start;
4500 }
4501
4502 /* Returns a pointer to a string which spells the token defined by
4503    TYPE and FLAGS.  Used by C front ends, which really should move to
4504    using cpp_token_as_text.  */
4505 const char *
4506 cpp_type2name (enum cpp_ttype type, unsigned char flags)
4507 {
4508   if (flags & DIGRAPH)
4509     return (const char *) cpp_digraph2name (type);
4510   else if (flags & NAMED_OP)
4511     return cpp_named_operator2name (type);
4512
4513   return (const char *) token_spellings[type].name;
4514 }
4515
4516 /* Writes the spelling of token to FP, without any preceding space.
4517    Separated from cpp_spell_token for efficiency - to avoid stdio
4518    double-buffering.  */
4519 void
4520 cpp_output_token (const cpp_token *token, FILE *fp)
4521 {
4522   switch (TOKEN_SPELL (token))
4523     {
4524     case SPELL_OPERATOR:
4525       {
4526         const unsigned char *spelling;
4527         int c;
4528
4529         if (token->flags & DIGRAPH)
4530           spelling = cpp_digraph2name (token->type);
4531         else if (token->flags & NAMED_OP)
4532           goto spell_ident;
4533         else
4534           spelling = TOKEN_NAME (token);
4535
4536         c = *spelling;
4537         do
4538           putc (c, fp);
4539         while ((c = *++spelling) != '\0');
4540       }
4541       break;
4542
4543     spell_ident:
4544     case SPELL_IDENT:
4545       {
4546         size_t i;
4547         const unsigned char * name = NODE_NAME (token->val.node.node);
4548
4549         for (i = 0; i < NODE_LEN (token->val.node.node); i++)
4550           if (name[i] & ~0x7F)
4551             {
4552               unsigned char buffer[10];
4553               i += utf8_to_ucn (buffer, name + i) - 1;
4554               fwrite (buffer, 1, 10, fp);
4555             }
4556           else
4557             fputc (NODE_NAME (token->val.node.node)[i], fp);
4558       }
4559       break;
4560
4561     case SPELL_LITERAL:
4562       if (token->type == CPP_HEADER_NAME)
4563         fputc ('"', fp);
4564       fwrite (token->val.str.text, 1, token->val.str.len, fp);
4565       if (token->type == CPP_HEADER_NAME)
4566         fputc ('"', fp);
4567       break;
4568
4569     case SPELL_NONE:
4570       /* An error, most probably.  */
4571       break;
4572     }
4573 }
4574
4575 /* Compare two tokens.  */
4576 int
4577 _cpp_equiv_tokens (const cpp_token *a, const cpp_token *b)
4578 {
4579   if (a->type == b->type && a->flags == b->flags)
4580     switch (TOKEN_SPELL (a))
4581       {
4582       default:                  /* Keep compiler happy.  */
4583       case SPELL_OPERATOR:
4584         /* token_no is used to track where multiple consecutive ##
4585            tokens were originally located.  */
4586         return (a->type != CPP_PASTE || a->val.token_no == b->val.token_no);
4587       case SPELL_NONE:
4588         return (a->type != CPP_MACRO_ARG
4589                 || (a->val.macro_arg.arg_no == b->val.macro_arg.arg_no
4590                     && a->val.macro_arg.spelling == b->val.macro_arg.spelling));
4591       case SPELL_IDENT:
4592         return (a->val.node.node == b->val.node.node
4593                 && a->val.node.spelling == b->val.node.spelling);
4594       case SPELL_LITERAL:
4595         return (a->val.str.len == b->val.str.len
4596                 && !memcmp (a->val.str.text, b->val.str.text,
4597                             a->val.str.len));
4598       }
4599
4600   return 0;
4601 }
4602
4603 /* Returns nonzero if a space should be inserted to avoid an
4604    accidental token paste for output.  For simplicity, it is
4605    conservative, and occasionally advises a space where one is not
4606    needed, e.g. "." and ".2".  */
4607 int
4608 cpp_avoid_paste (cpp_reader *pfile, const cpp_token *token1,
4609                  const cpp_token *token2)
4610 {
4611   enum cpp_ttype a = token1->type, b = token2->type;
4612   cppchar_t c;
4613
4614   if (token1->flags & NAMED_OP)
4615     a = CPP_NAME;
4616   if (token2->flags & NAMED_OP)
4617     b = CPP_NAME;
4618
4619   c = EOF;
4620   if (token2->flags & DIGRAPH)
4621     c = digraph_spellings[(int) b - (int) CPP_FIRST_DIGRAPH][0];
4622   else if (token_spellings[b].category == SPELL_OPERATOR)
4623     c = token_spellings[b].name[0];
4624
4625   /* Quickly get everything that can paste with an '='.  */
4626   if ((int) a <= (int) CPP_LAST_EQ && c == '=')
4627     return 1;
4628
4629   switch (a)
4630     {
4631     case CPP_GREATER:   return c == '>';
4632     case CPP_LESS:      return c == '<' || c == '%' || c == ':';
4633     case CPP_PLUS:      return c == '+';
4634     case CPP_MINUS:     return c == '-' || c == '>';
4635     case CPP_DIV:       return c == '/' || c == '*'; /* Comments.  */
4636     case CPP_MOD:       return c == ':' || c == '>';
4637     case CPP_AND:       return c == '&';
4638     case CPP_OR:        return c == '|';
4639     case CPP_COLON:     return c == ':' || c == '>';
4640     case CPP_DEREF:     return c == '*';
4641     case CPP_DOT:       return c == '.' || c == '%' || b == CPP_NUMBER;
4642     case CPP_HASH:      return c == '#' || c == '%'; /* Digraph form.  */
4643     case CPP_PRAGMA:
4644     case CPP_NAME:      return ((b == CPP_NUMBER
4645                                  && name_p (pfile, &token2->val.str))
4646                                 || b == CPP_NAME
4647                                 || b == CPP_CHAR || b == CPP_STRING); /* L */
4648     case CPP_NUMBER:    return (b == CPP_NUMBER || b == CPP_NAME
4649                                 || b == CPP_CHAR
4650                                 || c == '.' || c == '+' || c == '-');
4651                                       /* UCNs */
4652     case CPP_OTHER:     return ((token1->val.str.text[0] == '\\'
4653                                  && b == CPP_NAME)
4654                                 || (CPP_OPTION (pfile, objc)
4655                                     && token1->val.str.text[0] == '@'
4656                                     && (b == CPP_NAME || b == CPP_STRING)));
4657     case CPP_LESS_EQ:   return c == '>';
4658     case CPP_STRING:
4659     case CPP_WSTRING:
4660     case CPP_UTF8STRING:
4661     case CPP_STRING16:
4662     case CPP_STRING32:  return (CPP_OPTION (pfile, user_literals)
4663                                 && (b == CPP_NAME
4664                                     || (TOKEN_SPELL (token2) == SPELL_LITERAL
4665                                         && ISIDST (token2->val.str.text[0]))));
4666
4667     default:            break;
4668     }
4669
4670   return 0;
4671 }
4672
4673 /* Output all the remaining tokens on the current line, and a newline
4674    character, to FP.  Leading whitespace is removed.  If there are
4675    macros, special token padding is not performed.  */
4676 void
4677 cpp_output_line (cpp_reader *pfile, FILE *fp)
4678 {
4679   const cpp_token *token;
4680
4681   token = cpp_get_token (pfile);
4682   while (token->type != CPP_EOF)
4683     {
4684       cpp_output_token (token, fp);
4685       token = cpp_get_token (pfile);
4686       if (token->flags & PREV_WHITE)
4687         putc (' ', fp);
4688     }
4689
4690   putc ('\n', fp);
4691 }
4692
4693 /* Return a string representation of all the remaining tokens on the
4694    current line.  The result is allocated using xmalloc and must be
4695    freed by the caller.  */
4696 unsigned char *
4697 cpp_output_line_to_string (cpp_reader *pfile, const unsigned char *dir_name)
4698 {
4699   const cpp_token *token;
4700   unsigned int out = dir_name ? ustrlen (dir_name) : 0;
4701   unsigned int alloced = 120 + out;
4702   unsigned char *result = (unsigned char *) xmalloc (alloced);
4703
4704   /* If DIR_NAME is empty, there are no initial contents.  */
4705   if (dir_name)
4706     {
4707       sprintf ((char *) result, "#%s ", dir_name);
4708       out += 2;
4709     }
4710
4711   token = cpp_get_token (pfile);
4712   while (token->type != CPP_EOF)
4713     {
4714       unsigned char *last;
4715       /* Include room for a possible space and the terminating nul.  */
4716       unsigned int len = cpp_token_len (token) + 2;
4717
4718       if (out + len > alloced)
4719         {
4720           alloced *= 2;
4721           if (out + len > alloced)
4722             alloced = out + len;
4723           result = (unsigned char *) xrealloc (result, alloced);
4724         }
4725
4726       last = cpp_spell_token (pfile, token, &result[out], 0);
4727       out = last - result;
4728
4729       token = cpp_get_token (pfile);
4730       if (token->flags & PREV_WHITE)
4731         result[out++] = ' ';
4732     }
4733
4734   result[out] = '\0';
4735   return result;
4736 }
4737
/* Memory buffers.  Changing these three constants can have a dramatic
   effect on performance.  The values here are reasonable defaults,
   but might be tuned.  If you adjust them, be sure to test across a
   range of uses of cpplib, including heavy nested function-like macro
   expansion.  Also check the change in peak memory usage (NJAMD is a
   good tool for this).

   MIN_BUFF_SIZE is the smallest payload new_buff will allocate.
   BUFF_SIZE_UPPER_BOUND (MIN_SIZE) is the largest buffer considered an
   acceptable match for a request of MIN_SIZE bytes.
   EXTENDED_BUFF_SIZE (BUFF, MIN_EXTRA) is MIN_EXTRA plus twice the
   number of bytes between BUFF->cur and BUFF->limit.  Every macro
   argument is parenthesized so that any expression may be passed.  */
#define MIN_BUFF_SIZE 8000
#define BUFF_SIZE_UPPER_BOUND(MIN_SIZE) (MIN_BUFF_SIZE + (MIN_SIZE) * 3 / 2)
#define EXTENDED_BUFF_SIZE(BUFF, MIN_EXTRA) \
        ((MIN_EXTRA) + ((BUFF)->limit - (BUFF)->cur) * 2)

#if MIN_BUFF_SIZE > BUFF_SIZE_UPPER_BOUND (0)
  #error BUFF_SIZE_UPPER_BOUND must be at least as large as MIN_BUFF_SIZE!
#endif
4752
4753 /* Create a new allocation buffer.  Place the control block at the end
4754    of the buffer, so that buffer overflows will cause immediate chaos.  */
4755 static _cpp_buff *
4756 new_buff (size_t len)
4757 {
4758   _cpp_buff *result;
4759   unsigned char *base;
4760
4761   if (len < MIN_BUFF_SIZE)
4762     len = MIN_BUFF_SIZE;
4763   len = CPP_ALIGN (len);
4764
4765 #ifdef ENABLE_VALGRIND_WORKAROUNDS
4766   /* Valgrind warns about uses of interior pointers, so put _cpp_buff
4767      struct first.  */
4768   size_t slen = CPP_ALIGN2 (sizeof (_cpp_buff), 2 * DEFAULT_ALIGNMENT);
4769   base = XNEWVEC (unsigned char, len + slen);
4770   result = (_cpp_buff *) base;
4771   base += slen;
4772 #else
4773   base = XNEWVEC (unsigned char, len + sizeof (_cpp_buff));
4774   result = (_cpp_buff *) (base + len);
4775 #endif
4776   result->base = base;
4777   result->cur = base;
4778   result->limit = base + len;
4779   result->next = NULL;
4780   return result;
4781 }
4782
4783 /* Place a chain of unwanted allocation buffers on the free list.  */
4784 void
4785 _cpp_release_buff (cpp_reader *pfile, _cpp_buff *buff)
4786 {
4787   _cpp_buff *end = buff;
4788
4789   while (end->next)
4790     end = end->next;
4791   end->next = pfile->free_buffs;
4792   pfile->free_buffs = buff;
4793 }
4794
4795 /* Return a free buffer of size at least MIN_SIZE.  */
4796 _cpp_buff *
4797 _cpp_get_buff (cpp_reader *pfile, size_t min_size)
4798 {
4799   _cpp_buff *result, **p;
4800
4801   for (p = &pfile->free_buffs;; p = &(*p)->next)
4802     {
4803       size_t size;
4804
4805       if (*p == NULL)
4806         return new_buff (min_size);
4807       result = *p;
4808       size = result->limit - result->base;
4809       /* Return a buffer that's big enough, but don't waste one that's
4810          way too big.  */
4811       if (size >= min_size && size <= BUFF_SIZE_UPPER_BOUND (min_size))
4812         break;
4813     }
4814
4815   *p = result->next;
4816   result->next = NULL;
4817   result->cur = result->base;
4818   return result;
4819 }
4820
4821 /* Creates a new buffer with enough space to hold the uncommitted
4822    remaining bytes of BUFF, and at least MIN_EXTRA more bytes.  Copies
4823    the excess bytes to the new buffer.  Chains the new buffer after
4824    BUFF, and returns the new buffer.  */
4825 _cpp_buff *
4826 _cpp_append_extend_buff (cpp_reader *pfile, _cpp_buff *buff, size_t min_extra)
4827 {
4828   size_t size = EXTENDED_BUFF_SIZE (buff, min_extra);
4829   _cpp_buff *new_buff = _cpp_get_buff (pfile, size);
4830
4831   buff->next = new_buff;
4832   memcpy (new_buff->base, buff->cur, BUFF_ROOM (buff));
4833   return new_buff;
4834 }
4835
4836 /* Creates a new buffer with enough space to hold the uncommitted
4837    remaining bytes of the buffer pointed to by BUFF, and at least
4838    MIN_EXTRA more bytes.  Copies the excess bytes to the new buffer.
4839    Chains the new buffer before the buffer pointed to by BUFF, and
4840    updates the pointer to point to the new buffer.  */
4841 void
4842 _cpp_extend_buff (cpp_reader *pfile, _cpp_buff **pbuff, size_t min_extra)
4843 {
4844   _cpp_buff *new_buff, *old_buff = *pbuff;
4845   size_t size = EXTENDED_BUFF_SIZE (old_buff, min_extra);
4846
4847   new_buff = _cpp_get_buff (pfile, size);
4848   memcpy (new_buff->base, old_buff->cur, BUFF_ROOM (old_buff));
4849   new_buff->next = old_buff;
4850   *pbuff = new_buff;
4851 }
4852
4853 /* Free a chain of buffers starting at BUFF.  */
4854 void
4855 _cpp_free_buff (_cpp_buff *buff)
4856 {
4857   _cpp_buff *next;
4858
4859   for (; buff; buff = next)
4860     {
4861       next = buff->next;
4862 #ifdef ENABLE_VALGRIND_WORKAROUNDS
4863       free (buff);
4864 #else
4865       free (buff->base);
4866 #endif
4867     }
4868 }
4869
4870 /* Allocate permanent, unaligned storage of length LEN.  */
4871 unsigned char *
4872 _cpp_unaligned_alloc (cpp_reader *pfile, size_t len)
4873 {
4874   _cpp_buff *buff = pfile->u_buff;
4875   unsigned char *result = buff->cur;
4876
4877   if (len > (size_t) (buff->limit - result))
4878     {
4879       buff = _cpp_get_buff (pfile, len);
4880       buff->next = pfile->u_buff;
4881       pfile->u_buff = buff;
4882       result = buff->cur;
4883     }
4884
4885   buff->cur = result + len;
4886   return result;
4887 }
4888
4889 /* Allocate permanent, unaligned storage of length LEN from a_buff.
4890    That buffer is used for growing allocations when saving macro
4891    replacement lists in a #define, and when parsing an answer to an
4892    assertion in #assert, #unassert or #if (and therefore possibly
4893    whilst expanding macros).  It therefore must not be used by any
4894    code that they might call: specifically the lexer and the guts of
4895    the macro expander.
4896
4897    All existing other uses clearly fit this restriction: storing
4898    registered pragmas during initialization.  */
4899 unsigned char *
4900 _cpp_aligned_alloc (cpp_reader *pfile, size_t len)
4901 {
4902   _cpp_buff *buff = pfile->a_buff;
4903   unsigned char *result = buff->cur;
4904
4905   if (len > (size_t) (buff->limit - result))
4906     {
4907       buff = _cpp_get_buff (pfile, len);
4908       buff->next = pfile->a_buff;
4909       pfile->a_buff = buff;
4910       result = buff->cur;
4911     }
4912
4913   buff->cur = result + len;
4914   return result;
4915 }
4916
4917 /* Commit or allocate storage from a buffer.  */
4918
4919 void *
4920 _cpp_commit_buff (cpp_reader *pfile, size_t size)
4921 {
4922   void *ptr = BUFF_FRONT (pfile->a_buff);
4923
4924   if (pfile->hash_table->alloc_subobject)
4925     {
4926       void *copy = pfile->hash_table->alloc_subobject (size);
4927       memcpy (copy, ptr, size);
4928       ptr = copy;
4929     }
4930   else
4931     BUFF_FRONT (pfile->a_buff) += size;
4932
4933   return ptr;
4934 }
4935
4936 /* Say which field of TOK is in use.  */
4937
4938 enum cpp_token_fld_kind
4939 cpp_token_val_index (const cpp_token *tok)
4940 {
4941   switch (TOKEN_SPELL (tok))
4942     {
4943     case SPELL_IDENT:
4944       return CPP_TOKEN_FLD_NODE;
4945     case SPELL_LITERAL:
4946       return CPP_TOKEN_FLD_STR;
4947     case SPELL_OPERATOR:
4948       /* Operands which were originally spelled as ident keep around
4949          the node for the exact spelling.  */
4950       if (tok->flags & NAMED_OP)
4951         return CPP_TOKEN_FLD_NODE;
4952       else if (tok->type == CPP_PASTE)
4953         return CPP_TOKEN_FLD_TOKEN_NO;
4954       else
4955         return CPP_TOKEN_FLD_NONE;
4956     case SPELL_NONE:
4957       if (tok->type == CPP_MACRO_ARG)
4958         return CPP_TOKEN_FLD_ARG_NO;
4959       else if (tok->type == CPP_PADDING)
4960         return CPP_TOKEN_FLD_SOURCE;
4961       else if (tok->type == CPP_PRAGMA)
4962         return CPP_TOKEN_FLD_PRAGMA;
4963       /* fall through */
4964     default:
4965       return CPP_TOKEN_FLD_NONE;
4966     }
4967 }
4968
4969 /* All tokens lexed in R after calling this function will be forced to
4970    have their location_t to be P, until
4971    cpp_stop_forcing_token_locations is called for R.  */
4972
4973 void
4974 cpp_force_token_locations (cpp_reader *r, location_t loc)
4975 {
4976   r->forced_token_location = loc;
4977 }
4978
4979 /* Go back to assigning locations naturally for lexed tokens.  */
4980
4981 void
4982 cpp_stop_forcing_token_locations (cpp_reader *r)
4983 {
4984   r->forced_token_location = 0;
4985 }
4986
/* PEEK points at a backslash.  If it escapes an end of line ("\n",
   "\r" or "\r\n"), look past it, and keep going for as long as
   another escaped end of line follows immediately.  Never advance to
   LIMIT or beyond: if the position after a line ending is not below
   LIMIT, the position of the current backslash is returned.  */

static const unsigned char *
do_peek_backslash (const unsigned char *peek, const unsigned char *limit)
{
  for (;;)
    {
      /* Bytes taken up by the backslash and its line ending.  */
      size_t skip;

      if (__builtin_expect (peek[1] == '\n', true))
        skip = 2;
      else if (__builtin_expect (peek[1] == '\r', false))
        skip = peek[2] == '\n' ? 3 : 2;
      else
        /* Not an escaped end of line.  */
        return peek;

      const unsigned char *after = peek + skip;
      if (!__builtin_expect (after < limit, true))
        return peek;

      peek = after;
      /* The user might be perverse and continue the line again.  */
      if (*peek != '\\')
        return peek;
    }
}
5016
5017 static const unsigned char *
5018 do_peek_next (const unsigned char *peek, const unsigned char *limit)
5019 {
5020   if (__builtin_expect (*peek == '\\', false))
5021     peek = do_peek_backslash (peek, limit);
5022   return peek;
5023 }
5024
/* Step back from PEEK to the character before it, never reading below
   BOUND.  Returns NULL if PEEK is already at BOUND.  If the previous
   character is a line ending ('\n', '\r' or "\r\n") that is directly
   preceded by a backslash, the escaped line ending is skipped and the
   search continues backwards from that backslash.  Otherwise a
   pointer to the previous character is returned.  */
static const unsigned char *
do_peek_prev (const unsigned char *peek, const unsigned char *bound)
{
  if (peek == bound)
    return NULL;

  unsigned char c = *--peek;
  /* Only real line endings need the escaped-newline check.  The
     carriage return must be spelled '\r'; comparing against the
     letter 'r' would miss CR line endings and mistake a backslash
     followed by the letter r for a line continuation.  */
  if (__builtin_expect (c == '\n', false)
      || __builtin_expect (c == '\r', false))
    {
      if (peek == bound)
        return peek;
      int ix = -1;
      if (c == '\n' && peek[ix] == '\r')
        {
          /* "\r\n": look in front of the carriage return as well.  */
          if (peek + ix == bound)
            return peek;
          ix--;
        }

      if (peek[ix] == '\\')
        return do_peek_prev (peek + ix, bound);

      return peek;
    }
  else
    return peek;
}
5053
5054 /* If PEEK[-1] is identifier MATCH, scan past it and trailing white
5055    space.  Otherwise return NULL.  */
5056
5057 static const unsigned char *
5058 do_peek_ident (const char *match, const unsigned char *peek,
5059                const unsigned char *limit)
5060 {
5061   for (; *++match; peek++)
5062     if (*peek != *match)
5063       {
5064         peek = do_peek_next (peek, limit);
5065         if (*peek != *match)
5066           return NULL;
5067       }
5068
5069   /* Must now not be looking at an identifier char.  */
5070   peek = do_peek_next (peek, limit);
5071   if (ISIDNUM (*peek))
5072     return NULL;
5073
5074   /* Skip control-line whitespace.  */
5075  ws:
5076   while (*peek == ' ' || *peek == '\t')
5077     peek++;
5078   if (__builtin_expect (*peek == '\\', false))
5079     {
5080       peek = do_peek_backslash (peek, limit);
5081       if (*peek != '\\')
5082         goto ws;
5083     }
5084
5085   return peek;
5086 }
5087
5088 /* Are we looking at a module control line starting as PEEK - 1?  */
5089
5090 static bool
5091 do_peek_module (cpp_reader *pfile, unsigned char c,
5092                 const unsigned char *peek, const unsigned char *limit)
5093 {
5094   bool import = false;
5095
5096   if (__builtin_expect (c == 'e', false))
5097     {
5098       if (!((peek[0] == 'x' || peek[0] == '\\')
5099             && (peek = do_peek_ident ("export", peek, limit))))
5100         return false;
5101
5102       /* export, peek for import or module.  No need to peek __import
5103          here.  */
5104       if (peek[0] == 'i')
5105         {
5106           if (!((peek[1] == 'm' || peek[1] == '\\')
5107                 && (peek = do_peek_ident ("import", peek + 1, limit))))
5108             return false;
5109           import = true;
5110         }
5111       else if (peek[0] == 'm')
5112         {
5113           if (!((peek[1] == 'o' || peek[1] == '\\')
5114                 && (peek = do_peek_ident ("module", peek + 1, limit))))
5115             return false;
5116         }
5117       else
5118         return false;
5119     }
5120   else if (__builtin_expect (c == 'i', false))
5121     {
5122       if (!((peek[0] == 'm' || peek[0] == '\\')
5123             && (peek = do_peek_ident ("import", peek, limit))))
5124         return false;
5125       import = true;
5126     }
5127   else if (__builtin_expect (c == '_', false))
5128     {
5129       /* Needed for translated includes.   */
5130       if (!((peek[0] == '_' || peek[0] == '\\')
5131             && (peek = do_peek_ident ("__import", peek, limit))))
5132         return false;
5133       import = true;
5134     }
5135   else if (__builtin_expect (c == 'm', false))
5136     {
5137       if (!((peek[0] == 'o' || peek[0] == '\\')
5138             && (peek = do_peek_ident ("module", peek, limit))))
5139         return false;
5140     }
5141   else
5142     return false;
5143
5144   /* Peek the next character to see if it's good enough.  We'll be at
5145      the first non-whitespace char, including skipping an escaped
5146      newline.  */
5147   /* ... import followed by identifier, ':', '<' or header-name
5148      preprocessing tokens, or module followed by identifier, ':' or
5149      ';' preprocessing tokens.  */
5150   unsigned char p = *peek++;
5151
5152   /* A character literal is ... single quotes, ... optionally preceded
5153      by u8, u, U, or L */
5154   /* A string-literal is a ... double quotes, optionally prefixed by
5155      R, u8, u8R, u, uR, U, UR, L, or LR */
5156   if (p == 'u')
5157     {
5158       peek = do_peek_next (peek, limit);
5159       if (*peek == '8')
5160         {
5161           peek++;
5162           goto peek_u8;
5163         }
5164       goto peek_u;
5165     }
5166   else if (p == 'U' || p == 'L')
5167     {
5168     peek_u8:
5169       peek = do_peek_next (peek, limit);
5170     peek_u:
5171       if (*peek == '\"' || *peek == '\'')
5172         return false;
5173
5174       if (*peek == 'R')
5175         goto peek_R;
5176       /* Identifier. Ok.  */
5177     }
5178   else if (p == 'R')
5179     {
5180     peek_R:
5181       if (CPP_OPTION (pfile, rliterals))
5182         {
5183           peek = do_peek_next (peek, limit);
5184           if (*peek == '\"')
5185             return false;
5186         }
5187       /* Identifier. Ok.  */
5188     }
5189   else if ('Z' - 'A' == 25
5190            ? ((p >= 'A' && p <= 'Z') || (p >= 'a' && p <= 'z') || p == '_')
5191            : ISIDST (p))
5192     {
5193       /* Identifier.  Ok. */
5194     }
5195   else if (p == '<')
5196     {
5197       /* Maybe angle header, ok for import.  Reject
5198          '<=', '<<' digraph:'<:'.  */
5199       if (!import)
5200         return false;
5201       peek = do_peek_next (peek, limit);
5202       if (*peek == '=' || *peek == '<'
5203           || (*peek == ':' && CPP_OPTION (pfile, digraphs)))
5204         return false;
5205     }
5206   else if (p == ';')
5207     {
5208       /* SEMICOLON, ok for module.  */
5209       if (import)
5210         return false;
5211     }
5212   else if (p == '"')
5213     {
5214       /* STRING, ok for import.  */
5215       if (!import)
5216         return false;
5217     }
5218   else if (p == ':')
5219     {
5220       /* Maybe COLON, ok.  Reject '::', digraph:':>'.  */
5221       peek = do_peek_next (peek, limit);
5222       if (*peek == ':' || (*peek == '>' && CPP_OPTION (pfile, digraphs)))
5223         return false;
5224     }
5225   else
5226     /* FIXME: Detect a unicode character, excluding those not
5227        permitted as the initial character. [lex.name]/1.  I presume
5228        we need to check the \[uU] spellings, and directly using
5229        Unicode in say UTF8 form?  Or perhaps we do the phase-1
5230        conversion of UTF8 to universal-character-names?  */
5231     return false;
5232
5233   return true;
5234 }
5235
5236 /* Directives-only scanning.  Somewhat more relaxed than correct
5237    parsing -- some ill-formed programs will not be rejected.  */
5238
5239 void
5240 cpp_directive_only_process (cpp_reader *pfile,
5241                             void *data,
5242                             void (*cb) (cpp_reader *, CPP_DO_task, void *, ...))
5243 {
5244   bool module_p = CPP_OPTION (pfile, module_directives);
5245
5246   do
5247     {
5248     restart:
5249       /* Buffer initialization, but no line cleaning. */
5250       cpp_buffer *buffer = pfile->buffer;
5251       buffer->cur_note = buffer->notes_used = 0;
5252       buffer->cur = buffer->line_base = buffer->next_line;
5253       buffer->need_line = false;
5254       /* Files always end in a newline or carriage return.  We rely on this for
5255          character peeking safety.  */
5256       gcc_assert (buffer->rlimit[0] == '\n' || buffer->rlimit[0] == '\r');
5257
5258       const unsigned char *base = buffer->cur;
5259       unsigned line_count = 0;
5260       const unsigned char *line_start = base;
5261
5262       bool bol = true;
5263       bool raw = false;
5264
5265       const unsigned char *lwm = base;
5266       for (const unsigned char *pos = base, *limit = buffer->rlimit;
5267            pos < limit;)
5268         {
5269           unsigned char c = *pos++;
5270           /* This matches the switch in _cpp_lex_direct.  */
5271           switch (c)
5272             {
5273             case ' ': case '\t': case '\f': case '\v':
5274               /* Whitespace, do nothing.  */
5275               break;
5276
5277             case '\r': /* MAC line ending, or Windows \r\n  */
5278               if (*pos == '\n')
5279                 pos++;
5280               /* FALLTHROUGH */
5281
5282             case '\n':
5283               bol = true;
5284
5285             next_line:
5286               CPP_INCREMENT_LINE (pfile, 0);
5287               line_count++;
5288               line_start = pos;
5289               break;
5290
5291             case '\\':
5292               /* <backslash><newline> is removed, and doesn't undo any
5293                  preceeding escape or whatnot.  */
5294               if (*pos == '\n')
5295                 {
5296                   pos++;
5297                   goto next_line;
5298                 }
5299               else if (*pos == '\r')
5300                 {
5301                   if (pos[1] == '\n')
5302                     pos++;
5303                   pos++;
5304                   goto next_line;
5305                 }
5306               goto dflt;
5307
5308             case '#':
5309               if (bol)
5310                 {
5311                   /* Line directive.  */
5312                   if (pos - 1 > base && !pfile->state.skipping)
5313                     cb (pfile, CPP_DO_print, data,
5314                         line_count, base, pos - 1 - base);
5315
5316                   /* Prep things for directive handling. */
5317                   buffer->next_line = pos;
5318                   buffer->need_line = true;
5319                   bool ok = _cpp_get_fresh_line (pfile);
5320                   gcc_checking_assert (ok);
5321
5322                   /* Ensure proper column numbering for generated
5323                      error messages. */
5324                   buffer->line_base -= pos - line_start;
5325
5326                   _cpp_handle_directive (pfile, line_start + 1 != pos);
5327
5328                   /* Sanitize the line settings.  Duplicate #include's can
5329                      mess things up. */
5330                   // FIXME: Necessary?
5331                   pfile->line_table->highest_location
5332                     = pfile->line_table->highest_line;
5333
5334                   if (!pfile->state.skipping
5335                       && pfile->buffer->next_line < pfile->buffer->rlimit)
5336                     cb (pfile, CPP_DO_location, data,
5337                         pfile->line_table->highest_line);
5338
5339                   goto restart;
5340                 }
5341               goto dflt;
5342
5343             case '/':
5344               {
5345                 const unsigned char *peek = do_peek_next (pos, limit);
5346                 if (!(*peek == '/' || *peek == '*'))
5347                   goto dflt;
5348
5349                 /* Line or block comment  */
5350                 bool is_block = *peek == '*';
5351                 bool star = false;
5352                 bool esc = false;
5353                 location_t sloc
5354                   = linemap_position_for_column (pfile->line_table,
5355                                                  pos - line_start);
5356
5357                 while (pos < limit)
5358                   {
5359                     char c = *pos++;
5360                     switch (c)
5361                       {
5362                       case '\\':
5363                         esc = true;
5364                         break;
5365
5366                       case '\r':
5367                         if (*pos == '\n')
5368                           pos++;
5369                         /* FALLTHROUGH  */
5370
5371                       case '\n':
5372                         {
5373                           CPP_INCREMENT_LINE (pfile, 0);
5374                           line_count++;
5375                           line_start = pos;
5376                           if (!esc && !is_block)
5377                             {
5378                               bol = true;
5379                               goto done_comment;
5380                             }
5381                         }
5382                         if (!esc)
5383                           star = false;
5384                         esc = false;
5385                         break;
5386
5387                       case '*':
5388                         if (pos > peek)
5389                           star = is_block;
5390                         esc = false;
5391                         break;
5392
5393                       case '/':
5394                         if (star)
5395                           goto done_comment;
5396                         /* FALLTHROUGH  */
5397
5398                       default:
5399                         star = false;
5400                         esc = false;
5401                         break;
5402                       }
5403                   }
5404                 if (pos < limit || is_block)
5405                   cpp_error_with_line (pfile, CPP_DL_ERROR, sloc, 0,
5406                                        "unterminated comment");
5407               done_comment:
5408                 lwm = pos;
5409                 break;
5410               }
5411
5412             case '\'':
5413               if (!CPP_OPTION (pfile, digit_separators))
5414                 goto delimited_string;
5415
5416               /* Possibly a number punctuator.  */
5417               if (!ISIDNUM (*do_peek_next (pos, limit)))
5418                 goto delimited_string;
5419
5420               goto quote_peek;
5421
5422             case '\"':
5423               if (!CPP_OPTION (pfile, rliterals))
5424                 goto delimited_string;
5425
5426             quote_peek:
5427               {
5428                 /* For ' see if it's a number punctuator
5429                    \.?<digit>(<digit>|<identifier-nondigit>
5430                    |'<digit>|'<nondigit>|[eEpP]<sign>|\.)* */
5431                 /* For " see if it's a raw string
5432                    {U,L,u,u8}R.  This includes CPP_NUMBER detection,
5433                    because that could be 0e+R.  */
5434                 const unsigned char *peek = pos - 1;
5435                 bool quote_first = c == '"';
5436                 bool quote_eight = false;
5437                 bool maybe_number_start = false;
5438                 bool want_number = false;
5439
5440                 while ((peek = do_peek_prev (peek, lwm)))
5441                   {
5442                     unsigned char p = *peek;
5443                     if (quote_first)
5444                       {
5445                         if (!raw)
5446                           {
5447                             if (p != 'R')
5448                               break;
5449                             raw = true;
5450                             continue;
5451                           }
5452
5453                         quote_first = false;
5454                         if (p == 'L' || p == 'U' || p == 'u')
5455                           ;
5456                         else if (p == '8')
5457                           quote_eight = true;
5458                         else
5459                           goto second_raw;
5460                       }
5461                     else if (quote_eight)
5462                       {
5463                         if (p != 'u')
5464                           {
5465                             raw = false;
5466                             break;
5467                           }
5468                         quote_eight = false;
5469                       }
5470                     else if (c == '"')
5471                       {
5472                       second_raw:;
5473                         if (!want_number && ISIDNUM (p))
5474                           {
5475                             raw = false;
5476                             break;
5477                           }
5478                       }
5479
5480                     if (ISDIGIT (p))
5481                       maybe_number_start = true;
5482                     else if (p == '.')
5483                       want_number = true;
5484                     else if (ISIDNUM (p))
5485                       maybe_number_start = false;
5486                     else if (p == '+' || p == '-')
5487                       {
5488                         if (const unsigned char *peek_prev
5489                             = do_peek_prev (peek, lwm))
5490                           {
5491                             p = *peek_prev;
5492                             if (p == 'e' || p == 'E'
5493                                 || p == 'p' || p == 'P')
5494                               {
5495                                 want_number = true;
5496                                 maybe_number_start = false;
5497                               }
5498                             else
5499                               break;
5500                           }
5501                         else
5502                           break;
5503                       }
5504                     else if (p == '\'' || p == '\"')
5505                       {
5506                         /* If this is lwm, this must be the end of a
5507                            previous string.  So this is a trailing
5508                            literal type, (a) if those are allowed,
5509                              and (b) maybe_start is false.  Otherwise
5510                              this must be a CPP_NUMBER because we've
5511                              met another ', and we'd have checked that
5512                              in its own right.  */
5513                         if (peek == lwm && CPP_OPTION (pfile, uliterals))
5514                           {
5515                             if  (!maybe_number_start && !want_number)
5516                               /* Must be a literal type.  */
5517                               raw = false;
5518                           }
5519                         else if (p == '\''
5520                                  && CPP_OPTION (pfile, digit_separators))
5521                           maybe_number_start = true;
5522                         break;
5523                       }
5524                     else if (c == '\'')
5525                       break;
5526                     else if (!quote_first && !quote_eight)
5527                       break;
5528                   }
5529
5530                 if (maybe_number_start)
5531                   {
5532                     if (c == '\'')
5533                       /* A CPP NUMBER.  */
5534                       goto dflt;
5535                     raw = false;
5536                   }
5537
5538                 goto delimited_string;
5539               }
5540
5541             delimited_string:
5542               {
5543                 /* (Possibly raw) string or char literal.  */
5544                 unsigned char end = c;
5545                 int delim_len = -1;
5546                 const unsigned char *delim = NULL;
5547                 location_t sloc = linemap_position_for_column (pfile->line_table,
5548                                                                pos - line_start);
5549                 int esc = 0;
5550
5551                 if (raw)
5552                   {
5553                     /* There can be no line breaks in the delimiter.  */
5554                     delim = pos;
5555                     for (delim_len = 0; (c = *pos++) != '('; delim_len++)
5556                       {
5557                         if (delim_len == 16)
5558                           {
5559                             cpp_error_with_line (pfile, CPP_DL_ERROR,
5560                                                  sloc, 0,
5561                                                  "raw string delimiter"
5562                                                  " longer than %d"
5563                                                  " characters",
5564                                                  delim_len);
5565                             raw = false;
5566                             pos = delim;
5567                             break;
5568                           }
5569                         if (strchr (") \\\t\v\f\n", c))
5570                           {
5571                             cpp_error_with_line (pfile, CPP_DL_ERROR,
5572                                                  sloc, 0,
5573                                                  "invalid character '%c'"
5574                                                  " in raw string"
5575                                                  " delimiter", c);
5576                             raw = false;
5577                             pos = delim;
5578                             break;
5579                           }
5580                         if (pos >= limit)
5581                           goto bad_string;
5582                       }
5583                   }
5584
5585                 while (pos < limit)
5586                   {
5587                     char c = *pos++;
5588                     switch (c)
5589                       {
5590                       case '\\':
5591                         if (!raw)
5592                           esc++;
5593                         break;
5594
5595                       case '\r':
5596                         if (*pos == '\n')
5597                           pos++;
5598                         /* FALLTHROUGH  */
5599
5600                       case '\n':
5601                         {
5602                           CPP_INCREMENT_LINE (pfile, 0);
5603                           line_count++;
5604                           line_start = pos;
5605                         }
5606                         if (esc)
5607                           esc--;
5608                         break;
5609
5610                       case ')':
5611                         if (raw
5612                             && pos + delim_len + 1 < limit
5613                             && pos[delim_len] == end
5614                             && !memcmp (delim, pos, delim_len))
5615                           {
5616                             pos += delim_len + 1;
5617                             raw = false;
5618                             goto done_string;
5619                           }
5620                         break;
5621
5622                       default:
5623                         if (!raw && !(esc & 1) && c == end)
5624                           goto done_string;
5625                         esc = 0;
5626                         break;
5627                       }
5628                   }
5629               bad_string:
5630                 cpp_error_with_line (pfile, CPP_DL_ERROR, sloc, 0,
5631                                      "unterminated literal");
5632
5633               done_string:
5634                 raw = false;
5635                 lwm = pos - 1;
5636               }
5637               goto dflt;
5638
5639             case '_':
5640             case 'e':
5641             case 'i':
5642             case 'm':
5643               if (bol && module_p && !pfile->state.skipping
5644                   && do_peek_module (pfile, c, pos, limit))
5645                 {
5646                   /* We've seen the start of a module control line.
5647                      Start up the tokenizer.  */
5648                   pos--; /* Backup over the first character.  */
5649
5650                   /* Backup over whitespace to start of line.  */
5651                   while (pos > line_start
5652                          && (pos[-1] == ' ' || pos[-1] == '\t'))
5653                     pos--;
5654
5655                   if (pos > base)
5656                     cb (pfile, CPP_DO_print, data, line_count, base, pos - base);
5657
5658                   /* Prep things for directive handling. */
5659                   buffer->next_line = pos;
5660                   buffer->need_line = true;
5661
5662                   /* Now get tokens until the PRAGMA_EOL.  */
5663                   do
5664                     {
5665                       location_t spelling;
5666                       const cpp_token *tok
5667                         = cpp_get_token_with_location (pfile, &spelling);
5668
5669                       gcc_assert (pfile->state.in_deferred_pragma
5670                                   || tok->type == CPP_PRAGMA_EOL);
5671                       cb (pfile, CPP_DO_token, data, tok, spelling);
5672                     }
5673                   while (pfile->state.in_deferred_pragma);
5674
5675                   if (pfile->buffer->next_line < pfile->buffer->rlimit)
5676                     cb (pfile, CPP_DO_location, data,
5677                         pfile->line_table->highest_line);
5678
5679                   pfile->mi_valid = false;
5680                   goto restart;
5681                 }
5682               goto dflt;
5683
5684             default:
5685             dflt:
5686               bol = false;
5687               pfile->mi_valid = false;
5688               break;
5689             }
5690         }
5691
5692       if (buffer->rlimit > base && !pfile->state.skipping)
5693         {
5694           const unsigned char *limit = buffer->rlimit;
5695           /* If the file was not newline terminated, add rlimit, which is
5696              guaranteed to point to a newline, to the end of our range.  */
5697           if (limit[-1] != '\n')
5698             {
5699               limit++;
5700               CPP_INCREMENT_LINE (pfile, 0);
5701               line_count++;
5702             }
5703           cb (pfile, CPP_DO_print, data, line_count, base, limit - base);
5704         }
5705
5706       _cpp_pop_buffer (pfile);
5707     }
5708   while (pfile->buffer);
5709 }