libcpp/lex.cc

   1 /* CPP Library - lexical analysis.
   2    Copyright (C) 2000-2023 Free Software Foundation, Inc.
   3    Contributed by Per Bothner, 1994-95.
   4    Based on CCCP program by Paul Rubin, June 1986
   5    Adapted to ANSI C, Richard Stallman, Jan 1987
   6    Broken out to separate file, Zack Weinberg, Mar 2000
   7
   8 This program is free software; you can redistribute it and/or modify it
   9 under the terms of the GNU General Public License as published by the
  10 Free Software Foundation; either version 3, or (at your option) any
  11 later version.
  12
  13 This program is distributed in the hope that it will be useful,
  14 but WITHOUT ANY WARRANTY; without even the implied warranty of
  15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  16 GNU General Public License for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with this program; see the file COPYING3.  If not see
  20 <http://www.gnu.org/licenses/>.  */
  21
  22 #include "config.h"
  23 #include "system.h"
  24 #include "cpplib.h"
  25 #include "internal.h"
  26
  27 enum spell_type          /* Spelling category of a token type.  */
  28 {
  29   SPELL_OPERATOR = 0,    /* Category given to every OP table entry.  */
  30   SPELL_IDENT,
  31   SPELL_LITERAL,
  32   SPELL_NONE
  33 };
  34
  35 struct token_spelling        /* One entry of token_spellings.  */
  36 {
  37   enum spell_type category;  /* Read through TOKEN_SPELL.  */
  38   const unsigned char *name; /* Read through TOKEN_NAME.  */
  39 };
  40
  41 static const unsigned char *const digraph_spellings[] =
  42 { UC"%:", UC"%:%:", UC"<:", UC":>", UC"<%", UC"%>" };  /* #, ##, [, ], {, }  */
  43 /* Build the table: OP entries use spelling S, TK entries the stringified E.  */
  44 #define OP(e, s) { SPELL_OPERATOR, UC s  },
  45 #define TK(e, s) { SPELL_ ## s,    UC #e },
  46 static const struct token_spelling token_spellings[N_TTYPES] = { TTYPE_TABLE };
  47 #undef OP
  48 #undef TK
  49 /* Look up TOKEN's spelling category or name in token_spellings.  */
  50 #define TOKEN_SPELL(token) (token_spellings[(token)->type].category)
  51 #define TOKEN_NAME(token) (token_spellings[(token)->type].name)
  52
  53 /* Largest UCS code point: ISO 10646 defines the codespace as 0-0x10FFFF.  */
  54 #define UCS_LIMIT 0x10FFFF
  55 /* Prototypes for static helpers of this file.  */
  56 static void add_line_note (cpp_buffer *, const uchar *, unsigned int);
  57 static int skip_line_comment (cpp_reader *);
  58 static void skip_whitespace (cpp_reader *, cppchar_t);
  59 static void lex_string (cpp_reader *, cpp_token *, const uchar *);
  60 static void save_comment (cpp_reader *, cpp_token *, const uchar *, cppchar_t);
  61 static void store_comment (cpp_reader *, cpp_token *);
  62 static void create_literal (cpp_reader *, cpp_token *, const uchar *,
  63                             unsigned int, enum cpp_ttype);
  64 static bool warn_in_comment (cpp_reader *, _cpp_line_note *);
  65 static int name_p (cpp_reader *, const cpp_string *);
  66 static tokenrun *next_tokenrun (tokenrun *);
  67
  68 static _cpp_buff *new_buff (size_t);
  69
  70
  71 /* Utility routine: test whether TOKEN is an identifier spelled STRING.
  72
  73    STRING is NUL-terminated.  Returns 1 only when TOKEN is a CPP_NAME
  74    whose node name compares equal to STRING; any other token yields 0.  */
  75 int
  76 cpp_ideq (const cpp_token *token, const char *string)
  77 {
  78   const uchar *wanted = (const uchar *) string;
  79
  80   return (token->type == CPP_NAME
  81           && ustrcmp (NODE_NAME (token->val.node.node), wanted) == 0);
  82 }
  83
  84 /* Append a note of kind TYPE, anchored at byte POS, to BUFFER's notes
  85    for the current cleaned logical line, enlarging the array if full.  */
  86 static void
  87 add_line_note (cpp_buffer *buffer, const uchar *pos, unsigned int type)
  88 {
  89   /* Out of room: grow to twice the capacity plus 200 entries.  */
  90   if (buffer->notes_used == buffer->notes_cap)
  91     {
  92       buffer->notes_cap = 2 * buffer->notes_cap + 200;
  93       buffer->notes = XRESIZEVEC (_cpp_line_note, buffer->notes,
  94                                   buffer->notes_cap);
  95     }
  96   _cpp_line_note *slot = &buffer->notes[buffer->notes_used++];
  97   slot->pos = pos;
  98   slot->type = type;
  99 }
 100
 101 \f
 102 /* Fast path to find line special characters using optimized character
 103    scanning algorithms.  Anything complicated falls back to the slow
 104    path below.  Since this loop is very hot it's worth doing these kinds
 105    of optimizations.
 106
 107    One of the paths through the ifdefs should provide
 108
 109      const uchar *search_line_fast (const uchar *s, const uchar *end);
 110
 111    Between S and END, search for \n, \r, \\, ?.  Return a pointer to
 112    the found character.
 113
 114    Note that the last character of the buffer is *always* a newline,
 115    as forced by _cpp_convert_input.  This fact can be used to avoid
 116    explicitly looking for the end of the buffer.  */
 117
 118 /* Configure gives us an ifdef test; default to 0 so `if' can test it too.  */
 119 #ifndef WORDS_BIGENDIAN
 120 #define WORDS_BIGENDIAN 0
 121 #endif
 122
 123 /* We'd like the largest integer that fits into a register.  There's nothing
 124    in <stdint.h> that gives us that.  For most hosts this is unsigned long,
 125    but MS decided on an LLP64 model.  Thankfully when building with GCC we
 126    can get the "real" word size through the __word__ machine mode.  */
 127 #ifdef __GNUC__
 128 typedef unsigned int word_type __attribute__((__mode__(__word__)));
 129 #else
 130 typedef unsigned long word_type;
 131 #endif
 132
 133 /* The code below is only expecting sizes 4 or 8.  Die at compile-time
 134    if this expectation is violated: the array size then becomes -1.  */
 135 typedef char check_word_type_size
 136   [(sizeof(word_type) == 8 || sizeof(word_type) == 4) * 2 - 1];
 137
 138 /* Return VAL with its first N bytes (in memory order) cleared to NUL,
 139    which is never one of the characters the scanner looks for.  */
 140
 141 static inline word_type
 142 acc_char_mask_misalign (word_type val, unsigned int n)
 143 {
 144   const unsigned int bits = n * 8;
 145   const word_type ones = -1;
 146
 147   /* The first bytes in memory are the high-order ones when big-endian.  */
 148   const word_type keep = WORDS_BIGENDIAN ? ones >> bits : ones << bits;
 149   return val & keep;
 150 }
 151
 152 /* Return X replicated to all byte positions within WORD_TYPE.  X is widened
 153    first: shifting the int-promoted value by 24 overflows for X >= 0x80.  */
 154 static inline word_type
 155 acc_char_replicate (uchar x)
 156 {
 157   word_type ret = x;
 158   ret |= ret << 8;
 159   ret |= ret << 16;
 160   if (sizeof(word_type) == 8)
 161     ret |= ret << 16 << 16;
 162   return ret;
 163 }
 164
 165 /* Return non-zero if some byte of VAL (probably) equals the byte at the
 166    same position in C; the generic variant can give false positives.  */
 167 static inline word_type
 168 acc_char_cmp (word_type val, word_type c)
 169 {
 170 #if defined(__GNUC__) && defined(__alpha__)
 171   /* The compare-bytes instruction gives exact results:
 172      (val == c) is obtained via (0 >= (val ^ c)).  */
 173   return __builtin_alpha_cmpbge (0, val ^ c);
 174 #else
 175   /* DIFF holds a zero byte wherever VAL and C agree; the add/xor test
 176      against MAGIC flags words that (probably) contain such a byte.  */
 177   const word_type diff = val ^ c;
 178   const word_type magic = (sizeof(word_type) == 8
 179                            ? ((word_type) 0x7efefefeU << 16 << 16) | 0xfefefeffU
 180                            : (word_type) 0x7efefeffU);
 181   return ((diff + magic) ^ ~diff) & ~magic;
 182 #endif
 183 }
 184
 185 /* CMP is a non-zero acc_char_cmp result for the word VAL.  Return the
 186    index of the first special character in VAL, or -1 if CMP turned out
 187    to be a false positive.  */
 188 static inline int
 189 acc_char_index (word_type cmp ATTRIBUTE_UNUSED,
 190                 word_type val ATTRIBUTE_UNUSED)
 191 {
 192 #if defined(__GNUC__) && defined(__alpha__) && !WORDS_BIGENDIAN
 193   /* cmpbge sets one result *bit* per matching byte, with no false
 194      positives, so the lowest set bit gives the index directly.  */
 195   return __builtin_ctzl (cmp);
 196 #else
 197   /* ??? It would be nice to force unrolling here,
 198      and have all of these constants folded.  */
 199   for (unsigned int idx = 0; idx < sizeof(word_type); ++idx)
 200     {
 201       /* Byte IDX in memory order lives at this bit offset within VAL.  */
 202       const unsigned int shift
 203         = (WORDS_BIGENDIAN ? sizeof(word_type) - idx - 1 : idx) * 8;
 204
 205       switch ((uchar) ((val >> shift) & 0xff))
 206         {
 207         case '\n': case '\r': case '\\': case '?':
 208           return idx;
 209         default:
 210           break;
 211         }
 212     }
 213   return -1;
 214 #endif
 215 }
 216
 217 /* A version of the fast scanner using bit fiddling techniques: whole
 218    words are tested for the four special characters at once.
 219
 220    For 32-bit words, one would normally perform 16 comparisons and
 221    16 branches.  With this algorithm one performs 24 arithmetic
 222    operations and one branch.  Whether this is faster with a 32-bit
 223    word size is going to be somewhat system dependent.  For 64-bit
 224    words, we eliminate twice the number of comparisons and branches
 225    without increasing the number of arithmetic operations, so it is
 226    almost certainly a win with a 64-bit word size.  */
 227
 228 static const uchar * search_line_acc_char (const uchar *, const uchar *)
 229   ATTRIBUTE_UNUSED;
 230
 231 static const uchar *
 232 search_line_acc_char (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 233 {
 234   const word_type nl_word = acc_char_replicate ('\n');
 235   const word_type cr_word = acc_char_replicate ('\r');
 236   const word_type bs_word = acc_char_replicate ('\\');
 237   const word_type qm_word = acc_char_replicate ('?');
 238
 239   /* Round S down to a word boundary and load the word containing it.  */
 240   const unsigned int misalign = (uintptr_t)s & (sizeof(word_type) - 1);
 241   const word_type *wp = (word_type *)((uintptr_t)s & -sizeof(word_type));
 242   word_type cur = *wp;
 243
 244   /* Neutralize any bytes that precede S within that first word.  */
 245   if (misalign)
 246     cur = acc_char_mask_misalign (cur, misalign);
 247
 248   /* Main loop, one word per iteration.  It needs no END check because
 249      the buffer always finishes with a newline.  */
 250   for (;;)
 251     {
 252       const word_type hits = (acc_char_cmp (cur, nl_word)
 253                               | acc_char_cmp (cur, cr_word)
 254                               | acc_char_cmp (cur, bs_word)
 255                               | acc_char_cmp (cur, qm_word));
 256
 257       if (__builtin_expect (hits != 0, 0))
 258         {
 259           /* A false positive yields a negative index: keep scanning.  */
 260           const int idx = acc_char_index (hits, cur);
 261           if (idx >= 0)
 262             return (const uchar *)wp + idx;
 263         }
 264
 265       cur = *++wp;
 266     }
 267 }
 268
 269 /* Disable on Solaris 2/x86 until the following problem can be properly
 270    autoconfed:
 271
 272    The Solaris 10+ assembler tags objects with the instruction set
 273    extensions used, so SSE4.2 executables cannot run on machines that
 274    don't support that extension.  */
 275
 276 #if (GCC_VERSION >= 4005) && (__GNUC__ >= 5 || !defined(__PIC__)) && (defined(__i386__) || defined(__x86_64__)) && !(defined(__sun__) && defined(__svr4__))
 277
 278 /* Replicated character data to be shared between implementations: one
 279    16-byte row each for '\n', '\r', '\\' and '?', in that order.  Outside
 280    of a context with vector support we can't define compatible vector
 281    types, therefore these are all defined in terms of raw characters.  */
 282 static const char repl_chars[4][16] __attribute__((aligned(16))) = {
 283   { '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
 284     '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n' },
 285   { '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
 286     '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r' },
 287   { '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
 288     '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\' },
 289   { '?', '?', '?', '?', '?', '?', '?', '?',
 290     '?', '?', '?', '?', '?', '?', '?', '?' },
 291 };
 292
 293 /* A version of the fast scanner using MMX vectorized byte compare insns.
 294    Returns the first \n, \r, \\ or ? at or after S; END is not consulted.
 295    This uses the PMOVMSKB instruction which was introduced with "MMX2",
 296    which was packaged into SSE1; it is also present in the AMD MMX
 297    extension.  Mark the function as using "sse" so that we emit a real
 298    "emms" instruction, rather than the 3dNOW "femms" instruction.  */
 299
 300 static const uchar *
 301 #ifndef __SSE__
 302 __attribute__((__target__("sse")))
 303 #endif
 304 search_line_mmx (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 305 {
 306   typedef char v8qi __attribute__ ((__vector_size__ (8)));
 307   typedef int __m64 __attribute__ ((__vector_size__ (8), __may_alias__));
 308
 309   const v8qi repl_nl = *(const v8qi *)repl_chars[0];
 310   const v8qi repl_cr = *(const v8qi *)repl_chars[1];
 311   const v8qi repl_bs = *(const v8qi *)repl_chars[2];
 312   const v8qi repl_qm = *(const v8qi *)repl_chars[3];
 313
 314   unsigned int misalign, found, mask;
 315   const v8qi *p;
 316   v8qi data, t, c;
 317
 318   /* Align the source pointer.  While MMX doesn't generate unaligned data
 319      faults, this allows us to safely scan to the end of the buffer without
 320      reading beyond the end of the last page.  */
 321   misalign = (uintptr_t)s & 7;
 322   p = (const v8qi *)((uintptr_t)s & -8);
 323   data = *p;
 324
 325   /* Create a mask for the bytes that are valid within the first
 326      8-byte block.  The idea here is that the AND with the mask
 327      within the loop is "free", since we need some AND or TEST
 328      insn in order to set the flags for the branch anyway.  */
 329   mask = -1u << misalign;
 330
 331   /* Main loop processing 8 bytes at a time; the first pass enters at START.  */
 332   goto start;
 333   do
 334     {
 335       data = *++p;
 336       mask = -1;
 337
 338     start:
 339       t = __builtin_ia32_pcmpeqb(data, repl_nl);
 340       c = __builtin_ia32_pcmpeqb(data, repl_cr);
 341       t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
 342       c = __builtin_ia32_pcmpeqb(data, repl_bs);
 343       t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
 344       c = __builtin_ia32_pcmpeqb(data, repl_qm);
 345       t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
 346       found = __builtin_ia32_pmovmskb (t);
 347       found &= mask;
 348     }
 349   while (!found);
 350
 351   __builtin_ia32_emms ();
 352
 353   /* FOUND contains 1 in bits for which we matched a relevant
 354      character.  Conversion to the byte index is trivial.  */
 355   found = __builtin_ctz(found);
 356   return (const uchar *)p + found;
 357 }
 358
 359 /* A version of the fast scanner using SSE2 vectorized byte compare insns.
 360    Returns the first \n, \r, \\ or ? at or after S; END is not consulted.  */
 361 static const uchar *
 362 #ifndef __SSE2__
 363 __attribute__((__target__("sse2")))
 364 #endif
 365 search_line_sse2 (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 366 {
 367   typedef char v16qi __attribute__ ((__vector_size__ (16)));
 368
 369   const v16qi repl_nl = *(const v16qi *)repl_chars[0];
 370   const v16qi repl_cr = *(const v16qi *)repl_chars[1];
 371   const v16qi repl_bs = *(const v16qi *)repl_chars[2];
 372   const v16qi repl_qm = *(const v16qi *)repl_chars[3];
 373
 374   unsigned int misalign, found, mask;
 375   const v16qi *p;
 376   v16qi data, t;
 377
 378   /* Align the source pointer down to the enclosing 16-byte block.  */
 379   misalign = (uintptr_t)s & 15;
 380   p = (const v16qi *)((uintptr_t)s & -16);
 381   data = *p;
 382
 383   /* Create a mask for the bytes that are valid within the first
 384      16-byte block.  The idea here is that the AND with the mask
 385      within the loop is "free", since we need some AND or TEST
 386      insn in order to set the flags for the branch anyway.  */
 387   mask = -1u << misalign;
 388
 389   /* Main loop processing 16 bytes at a time; the first pass enters at START.  */
 390   goto start;
 391   do
 392     {
 393       data = *++p;
 394       mask = -1;
 395
 396     start:
 397       t  = data == repl_nl;
 398       t |= data == repl_cr;
 399       t |= data == repl_bs;
 400       t |= data == repl_qm;
 401       found = __builtin_ia32_pmovmskb128 (t);
 402       found &= mask;
 403     }
 404   while (!found);
 405
 406   /* FOUND contains 1 in bits for which we matched a relevant
 407      character.  Conversion to the byte index is trivial.  */
 408   found = __builtin_ctz(found);
 409   return (const uchar *)p + found;
 410 }
 411
 412 #ifdef HAVE_SSE4
 413 /* A version of the fast scanner using SSE 4.2 vectorized string insns.
 414    END is only used to detect a short tail close to the end of a page.  */
 415 static const uchar *
 416 #ifndef __SSE4_2__
 417 __attribute__((__target__("sse4.2")))
 418 #endif
 419 search_line_sse42 (const uchar *s, const uchar *end)
 420 {
 421   typedef char v16qi __attribute__ ((__vector_size__ (16)));
 422   static const v16qi search = { '\n', '\r', '?', '\\' };
 423
 424   uintptr_t si = (uintptr_t)s;
 425   uintptr_t index;
 426
 427   /* Check for unaligned input.  */
 428   if (si & 15)
 429     {
 430       v16qi sv;
 431
 432       if (__builtin_expect (end - s < 16, 0)
 433           && __builtin_expect ((si & 0xfff) > 0xff0, 0))
 434         {
 435           /* There are fewer than 16 bytes left in the buffer, and fewer
 436              than 16 bytes left on the page.  Reading 16 bytes at this
 437              point might generate a spurious page fault.  Defer to the
 438              SSE2 implementation, which already handles alignment.  */
 439           return search_line_sse2 (s, end);
 440         }
 441
 442       /* ??? The builtin doesn't understand that the PCMPESTRI read from
 443          memory need not be aligned.  */
 444       sv = __builtin_ia32_loaddqu ((const char *) s);
 445       index = __builtin_ia32_pcmpestri128 (search, 4, sv, 16, 0);
 446
 447       if (__builtin_expect (index < 16, 0))
 448         goto found;
 449
 450       /* Advance the pointer to an aligned address.  We will re-scan a
 451          few bytes, but we no longer need care for reading past the
 452          end of a page, since we're guaranteed a match.  */
 453       s = (const uchar *)((si + 15) & -16);
 454     }
 455
 456   /* Main loop, processing 16 bytes at a time.  */
 457 #ifdef __GCC_ASM_FLAG_OUTPUTS__
 458   while (1)
 459     {
 460       char f;
 461
 462       /* By using inline assembly instead of the builtin,
 463          we can use the result, as well as the flags set.  */
 464       __asm ("%vpcmpestri\t$0, %2, %3"
 465              : "=c"(index), "=@ccc"(f)
 466              : "m"(*s), "x"(search), "a"(4), "d"(16));
 467       if (f)
 468         break;
 469
 470       s += 16;
 471     }
 472 #else
 473   s -= 16;
 474   /* By doing the whole loop in inline assembly,
 475      we can make proper use of the flags set.  */
 476   __asm (      ".balign 16\n"
 477         "0:     add $16, %1\n"
 478         "       %vpcmpestri\t$0, (%1), %2\n"
 479         "       jnc 0b"
 480         : "=&c"(index), "+r"(s)
 481         : "x"(search), "a"(4), "d"(16));
 482 #endif
 483
 484  found:
 485   return s + index;
 486 }
 487
 488 #else
 489 /* Out-dated assemblers lack sse4 support: use the SSE2 scanner instead.  */
 490 #define search_line_sse42 search_line_sse2
 491 #endif
 492
 493 /* Check the CPU capabilities.  */
 494
 495 #include "../gcc/config/i386/cpuid.h"
 496 /* Common scanner signature, and the scanner init_vectorized_lexer picks.  */
 497 typedef const uchar * (*search_line_fast_type) (const uchar *, const uchar *);
 498 static search_line_fast_type search_line_fast;
 499 /* Point search_line_fast at the best scanner this build and CPU allow.  */
 500 #define HAVE_init_vectorized_lexer 1
 501 static inline void
 502 init_vectorized_lexer (void)
 503 {
 504   unsigned dummy, ecx = 0, edx = 0;
 505   search_line_fast_type impl = search_line_acc_char;
 506   int minimum = 0;
 507   /* MINIMUM: level the target flags guarantee (3 SSE4.2, 2 SSE2, 1 SSE).  */
 508 #if defined(__SSE4_2__)
 509   minimum = 3;
 510 #elif defined(__SSE2__)
 511   minimum = 2;
 512 #elif defined(__SSE__)
 513   minimum = 1;
 514 #endif
 515   /* Use that level directly or consult CPUID for anything better.  */
 516   if (minimum == 3)
 517     impl = search_line_sse42;
 518   else if (__get_cpuid (1, &dummy, &dummy, &ecx, &edx) || minimum == 2)
 519     {
 520       if (minimum == 3 || (ecx & bit_SSE4_2))
 521         impl = search_line_sse42;
 522       else if (minimum == 2 || (edx & bit_SSE2))
 523         impl = search_line_sse2;
 524       else if (minimum == 1 || (edx & bit_SSE))
 525         impl = search_line_mmx;
 526     }
 527   else if (__get_cpuid (0x80000001, &dummy, &dummy, &dummy, &edx))
 528     {
 529       if (minimum == 1
 530           || (edx & (bit_MMXEXT | bit_CMOV)) == (bit_MMXEXT | bit_CMOV))
 531         impl = search_line_mmx;
 532     }
 533
 534   search_line_fast = impl;
 535 }
 536
 537 #elif (GCC_VERSION >= 4005) && defined(_ARCH_PWR8) && defined(__ALTIVEC__)
 538
 539 /* A version of the fast scanner using AltiVec vectorized byte compares
 540    and VSX unaligned loads (when VSX is available).  This is otherwise
 541    the same as the AltiVec version.  */
 542
 543 ATTRIBUTE_NO_SANITIZE_UNDEFINED
 544 static const uchar *
 545 search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 546 {
 547   typedef __attribute__((altivec(vector))) unsigned char vc;
 548
 549   const vc repl_nl = {
 550     '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
 551     '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n'
 552   };
 553   const vc repl_cr = {
 554     '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
 555     '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r'
 556   };
 557   const vc repl_bs = {
 558     '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
 559     '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\'
 560   };
 561   const vc repl_qm = {
 562     '?', '?', '?', '?', '?', '?', '?', '?',
 563     '?', '?', '?', '?', '?', '?', '?', '?',
 564   };
 565   const vc zero = { 0 };
 566
 567   vc data, t;
 568
 569   /* Main loop processing 16 bytes at a time, loaded straight from S.  */
 570   do
 571     {
 572       vc m_nl, m_cr, m_bs, m_qm;
 573
 574       data = __builtin_vec_vsx_ld (0, s);
 575       s += 16;
 576
 577       m_nl = (vc) __builtin_vec_cmpeq(data, repl_nl);
 578       m_cr = (vc) __builtin_vec_cmpeq(data, repl_cr);
 579       m_bs = (vc) __builtin_vec_cmpeq(data, repl_bs);
 580       m_qm = (vc) __builtin_vec_cmpeq(data, repl_qm);
 581       t = (m_nl | m_cr) | (m_bs | m_qm);
 582
 583       /* T now contains 0xff in bytes for which we matched one of the relevant
 584          characters.  We want to exit the loop if any byte in T is non-zero.
 585          Below is the expansion of vec_any_ne(t, zero).  */
 586     }
 587   while (!__builtin_vec_vcmpeq_p(/*__CR6_LT_REV*/3, t, zero));
 588
 589   /* Restore S to point to the 16 bytes we just processed.  */
 590   s -= 16;
 591
 592   {
 593 #define N  (sizeof(vc) / sizeof(long))
 594
 595     union {
 596       vc v;
 597       /* Statically assert that N is 2 or 4.  */
 598       unsigned long l[(N == 2 || N == 4) ? N : -1];
 599     } u;
 600     unsigned long l, i = 0;
 601
 602     u.v = t;
 603
 604     /* Find the first word of T that is non-zero, advancing S past each
 605        all-zero word that is skipped.  */
 605     switch (N)
 606       {
 607       case 4:
 608         l = u.l[i++];
 609         if (l != 0)
 610           break;
 611         s += sizeof(unsigned long);
 612         l = u.l[i++];
 613         if (l != 0)
 614           break;
 615         s += sizeof(unsigned long);
 616         /* FALLTHRU */
 617       case 2:
 618         l = u.l[i++];
 619         if (l != 0)
 620           break;
 621         s += sizeof(unsigned long);
 622         l = u.l[i];
 623       }
 624
 625     /* L now contains 0xff in bytes for which we matched one of the
 626        relevant characters.  We can find the byte index by finding
 627        its bit index and dividing by 8.  */
 628 #ifdef __BIG_ENDIAN__
 629     l = __builtin_clzl(l) >> 3;
 630 #else
 631     l = __builtin_ctzl(l) >> 3;
 632 #endif
 633     return s + l;
 634
 635 #undef N
 636   }
 637 }
 638
 639 #elif (GCC_VERSION >= 4005) && defined(__ALTIVEC__) && defined (__BIG_ENDIAN__)
 640
 641 /* A version of the fast scanner using AltiVec vectorized byte compares.
 642    This cannot be used for little endian because vec_lvsl/lvsr are
 643    deprecated for little endian and the code won't work properly.  */
 644 /* ??? Unfortunately, attribute(target("altivec")) is not yet supported,
 645    so we can't compile this function without -maltivec on the command line
 646    (or implied by some other switch).  */
 647
 648 static const uchar *
 649 search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 650 {
 651   typedef __attribute__((altivec(vector))) unsigned char vc;
 652
 653   const vc repl_nl = {
 654     '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
 655     '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n'
 656   };
 657   const vc repl_cr = {
 658     '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
 659     '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r'
 660   };
 661   const vc repl_bs = {
 662     '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
 663     '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\'
 664   };
 665   const vc repl_qm = {
 666     '?', '?', '?', '?', '?', '?', '?', '?',
 667     '?', '?', '?', '?', '?', '?', '?', '?',
 668   };
 669   const vc ones = {
 670     -1, -1, -1, -1, -1, -1, -1, -1,
 671     -1, -1, -1, -1, -1, -1, -1, -1,
 672   };
 673   const vc zero = { 0 };
 674
 675   vc data, mask, t;
 676
 677   /* Altivec loads automatically mask addresses with -16.  This lets us
 678      issue the first load as early as possible.  */
 679   data = __builtin_vec_ld(0, (const vc *)s);
 680
 681   /* Discard bytes before the beginning of the buffer.  Do this by
 682      beginning with all ones and shifting in zeros according to the
 683      mis-alignment.  The LVSR instruction pulls the exact shift we
 684      want from the address.  */
 685   mask = __builtin_vec_lvsr(0, s);
 686   mask = __builtin_vec_perm(zero, ones, mask);
 687   data &= mask;
 688
 689   /* While altivec loads mask addresses, we still need to align S so
 690      that the offset we compute at the end is correct.  */
 691   s = (const uchar *)((uintptr_t)s & -16);
 692
 693   /* Main loop processing 16 bytes at a time; the first pass enters at START.  */
 694   goto start;
 695   do
 696     {
 697       vc m_nl, m_cr, m_bs, m_qm;
 698
 699       s += 16;
 700       data = __builtin_vec_ld(0, (const vc *)s);
 701
 702     start:
 703       m_nl = (vc) __builtin_vec_cmpeq(data, repl_nl);
 704       m_cr = (vc) __builtin_vec_cmpeq(data, repl_cr);
 705       m_bs = (vc) __builtin_vec_cmpeq(data, repl_bs);
 706       m_qm = (vc) __builtin_vec_cmpeq(data, repl_qm);
 707       t = (m_nl | m_cr) | (m_bs | m_qm);
 708
 709       /* T now contains 0xff in bytes for which we matched one of the relevant
 710          characters.  We want to exit the loop if any byte in T is non-zero.
 711          Below is the expansion of vec_any_ne(t, zero).  */
 712     }
 713   while (!__builtin_vec_vcmpeq_p(/*__CR6_LT_REV*/3, t, zero));
 714
 715   {
 716 #define N  (sizeof(vc) / sizeof(long))
 717
 718     union {
 719       vc v;
 720       /* Statically assert that N is 2 or 4.  */
 721       unsigned long l[(N == 2 || N == 4) ? N : -1];
 722     } u;
 723     unsigned long l, i = 0;
 724
 725     u.v = t;
 726
 727     /* Find the first word of T that is non-zero, advancing S past each
 728        all-zero word that is skipped.  */
 728     switch (N)
 729       {
 730       case 4:
 731         l = u.l[i++];
 732         if (l != 0)
 733           break;
 734         s += sizeof(unsigned long);
 735         l = u.l[i++];
 736         if (l != 0)
 737           break;
 738         s += sizeof(unsigned long);
 739         /* FALLTHROUGH */
 740       case 2:
 741         l = u.l[i++];
 742         if (l != 0)
 743           break;
 744         s += sizeof(unsigned long);
 745         l = u.l[i];
 746       }
 747
 748     /* L now contains 0xff in bytes for which we matched one of the
 749        relevant characters.  We can find the byte index by finding
 750        its bit index and dividing by 8.  */
 751     l = __builtin_clzl(l) >> 3;
 752     return s + l;
 753
 754 #undef N
 755   }
 756 }
 757
 758 #elif defined (__ARM_NEON) && defined (__ARM_64BIT_STATE)
 759 #include "arm_neon.h"
 760
 761 /* This doesn't have to be the exact page size, but no system may use
 762    a size smaller than this.  ARMv8 requires a minimum page size of
 763    4k.  The impact of being conservative here is a small number of
 764    cases will take the slightly slower entry path into the main
 765    loop.  */
 766
 767 #define AARCH64_MIN_PAGE_SIZE 4096
 768 /* A version of the fast scanner using NEON in the ARM 64-bit state.  */
 769 static const uchar *
 770 search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 771 {
 772   const uint8x16_t repl_nl = vdupq_n_u8 ('\n');
 773   const uint8x16_t repl_cr = vdupq_n_u8 ('\r');
 774   const uint8x16_t repl_bs = vdupq_n_u8 ('\\');
 775   const uint8x16_t repl_qm = vdupq_n_u8 ('?');
 776   const uint8x16_t xmask = (uint8x16_t) vdupq_n_u64 (0x8040201008040201ULL);
 777
 778 #ifdef __ARM_BIG_ENDIAN
 779   const int16x8_t shift = {8, 8, 8, 8, 0, 0, 0, 0};
 780 #else
 781   const int16x8_t shift = {0, 0, 0, 0, 8, 8, 8, 8};
 782 #endif
 783
 784   unsigned int found;
 785   const uint8_t *p;
 786   uint8x16_t data;
 787   uint8x16_t t;
 788   uint16x8_t m;
 789   uint8x16_t u, v, w;
 790
 791   /* Align the source pointer.  */
 792   p = (const uint8_t *)((uintptr_t)s & -16);
 793
 794   /* Assuming random string start positions, with a 4k page size we'll take
 795      the slow path about 0.37% of the time.  */
 796   if (__builtin_expect ((AARCH64_MIN_PAGE_SIZE
 797                          - (((uintptr_t) s) & (AARCH64_MIN_PAGE_SIZE - 1)))
 798                         < 16, 0))
 799     {
 800       /* Slow path: the string starts near a possible page boundary, so
 801          load the aligned block and mask off the bytes in front of S.  */
 801       uint32_t misalign, mask;
 802
 803       misalign = (uintptr_t)s & 15;
 804       mask = (-1u << misalign) & 0xffff;
 805       data = vld1q_u8 (p);
 806       t = vceqq_u8 (data, repl_nl);
 807       u = vceqq_u8 (data, repl_cr);
 808       v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
 809       w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
 810       t = vorrq_u8 (v, w);
 811       t = vandq_u8 (t, xmask);
 812       m = vpaddlq_u8 (t);
 813       m = vshlq_u16 (m, shift);
 814       found = vaddvq_u16 (m);
 815       found &= mask;
 816       if (found)
 817         return (const uchar*)p + __builtin_ctz (found);
 818     }
 819   else
 820     {
 821       data = vld1q_u8 ((const uint8_t *) s);
 822       t = vceqq_u8 (data, repl_nl);
 823       u = vceqq_u8 (data, repl_cr);
 824       v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
 825       w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
 826       t = vorrq_u8 (v, w);
 827       if (__builtin_expect (vpaddd_u64 ((uint64x2_t)t) != 0, 0))
 828         goto done;
 829     }
 830
 831   do
 832     {
 833       p += 16;
 834       data = vld1q_u8 (p);
 835       t = vceqq_u8 (data, repl_nl);
 836       u = vceqq_u8 (data, repl_cr);
 837       v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
 838       w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
 839       t = vorrq_u8 (v, w);
 840     } while (!vpaddd_u64 ((uint64x2_t)t));
 841
 842 done:
 843   /* Work out precisely where we need to stop.  If P is still below S the
 844      match came from the unaligned load at S, so index from S instead.  */
 845   t = vandq_u8 (t, xmask);
 846   m = vpaddlq_u8 (t);
 847   m = vshlq_u16 (m, shift);
 848   found = vaddvq_u16 (m);
 849   return (((((uintptr_t) p) < (uintptr_t) s) ? s : (const uchar *)p)
 850           + __builtin_ctz (found));
 851 }
 852
 853 #elif defined (__ARM_NEON)
 854 #include "arm_neon.h"
 855 /* A version of the fast scanner using NEON outside the 64-bit state.  */
 856 static const uchar *
 857 search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 858 {
 859   const uint8x16_t repl_nl = vdupq_n_u8 ('\n');
 860   const uint8x16_t repl_cr = vdupq_n_u8 ('\r');
 861   const uint8x16_t repl_bs = vdupq_n_u8 ('\\');
 862   const uint8x16_t repl_qm = vdupq_n_u8 ('?');
 863   const uint8x16_t xmask = (uint8x16_t) vdupq_n_u64 (0x8040201008040201ULL);
 864
 865   unsigned int misalign, found, mask;
 866   const uint8_t *p;
 867   uint8x16_t data;
 868
 869   /* Align the source pointer down to the enclosing 16-byte block.  */
 870   misalign = (uintptr_t)s & 15;
 871   p = (const uint8_t *)((uintptr_t)s & -16);
 872   data = vld1q_u8 (p);
 873
 874   /* Create a mask for the bytes that are valid within the first
 875      16-byte block.  The idea here is that the AND with the mask
 876      within the loop is "free", since we need some AND or TEST
 877      insn in order to set the flags for the branch anyway.  */
 878   mask = (-1u << misalign) & 0xffff;
 879
 880   /* Main loop, processing 16 bytes at a time; the first pass enters at
 881      START.  */
 881   goto start;
 882
 883   do
 884     {
 885       uint8x8_t l;
 886       uint16x4_t m;
 887       uint32x2_t n;
 888       uint8x16_t t, u, v, w;
 889
 890       p += 16;
 891       data = vld1q_u8 (p);
 892       mask = 0xffff;
 893
 894     start:
 895       t = vceqq_u8 (data, repl_nl);
 896       u = vceqq_u8 (data, repl_cr);
 897       v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
 898       w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
 899       t = vandq_u8 (vorrq_u8 (v, w), xmask);
 900       l = vpadd_u8 (vget_low_u8 (t), vget_high_u8 (t));
 901       m = vpaddl_u8 (l);
 902       n = vpaddl_u16 (m);
 903
 904       found = vget_lane_u32 ((uint32x2_t) vorr_u64 ((uint64x1_t) n,
 905               vshr_n_u64 ((uint64x1_t) n, 24)), 0);
 906       found &= mask;
 907     }
 908   while (!found);
 909
 910   /* FOUND contains 1 in bits for which we matched a relevant
 911      character.  Conversion to the byte index is trivial.  */
 912   found = __builtin_ctz (found);
 913   return (const uchar *)p + found;
 914 }
 915
 916 #else
 917
 918 /* We only have one accelerated alternative.  Use a direct call so that
 919    we encourage inlining.  */
 920
 921 #define search_line_fast  search_line_acc_char
 922
 923 #endif
 924
 925 /* Initialize the lexer if needed.  */
 926
 927 void
 928 _cpp_init_lexer (void)
 929 {
 930 #ifdef HAVE_init_vectorized_lexer
 931   init_vectorized_lexer ();
 932 #endif
 933 }
 934
/* Returns with a logical line that contains no escaped newlines or
   trigraphs.  This is a time-critical inner loop.

   The line starts at PFILE->buffer->next_line.  On return the buffer's
   cur and line_base point at that start, the logical line is
   terminated by a '\n' stored into the buffer, a sentinel line note
   has been added, and next_line points just past the last byte
   consumed.  Unless the buffer is marked from_stage3, a line note is
   also recorded for every trigraph and escaped newline encountered,
   and the line is compacted in place when trigraphs are converted or
   escaped newlines are removed.  */
void
_cpp_clean_line (cpp_reader *pfile)
{
  cpp_buffer *buffer;
  /* S is the read position.  D is where the terminating newline (and,
     on the slow path, each copied character) is stored.  P is a
     scratch pointer used to look backwards from a newline for a
     backslash.  */
  const uchar *s;
  uchar c, *d, *p;

  /* Start a fresh line: discard the previous line's notes and make
     cur and line_base point at the first byte of the new line.  */
  buffer = pfile->buffer;
  buffer->cur_note = buffer->notes_used = 0;
  buffer->cur = buffer->line_base = buffer->next_line;
  buffer->need_line = false;
  s = buffer->next_line;

  if (!buffer->from_stage3)
    {
      /* Most recent backslash seen on the fast path, if any.  */
      const uchar *pbackslash = NULL;

      /* Fast path.  This is the common case of an un-escaped line with
         no trigraphs.  The primary win here is by not writing any
         data back to memory until we have to.  */
      while (1)
        {
          /* Perform an optimized search for \n, \r, \\, ?.  */
          s = search_line_fast (s, buffer->rlimit);

          c = *s;
          if (c == '\\')
            {
              /* Record the location of the backslash and continue.  */
              pbackslash = s++;
            }
          else if (__builtin_expect (c == '?', 0))
            {
              if (__builtin_expect (s[1] == '?', false)
                   && _cpp_trigraph_map[s[2]])
                {
                  /* Have a trigraph.  We may or may not have to convert
                     it.  Add a line note regardless, for -Wtrigraphs.  */
                  add_line_note (buffer, s, s[2]);
                  if (CPP_OPTION (pfile, trigraphs))
                    {
                      /* We do, and that means we have to switch to the
                         slow path.  Overwrite the first '?' with the
                         replacement character and leave S on the last
                         byte of the trigraph, so that the slow path's
                         pre-increment resumes right after it.  */
                      d = (uchar *) s;
                      *d = _cpp_trigraph_map[s[2]];
                      s += 2;
                      goto slow_path;
                    }
                }
              /* Not a trigraph.  Continue on fast-path.  */
              s++;
            }
          else
            break;
        }

      /* This must be \r or \n.  We're either done, or we'll be forced
         to write back to the buffer and continue on the slow path.  */
      d = (uchar *) s;

      if (__builtin_expect (s == buffer->rlimit, false))
        goto done;

      /* DOS line ending? */
      if (__builtin_expect (c == '\r', false) && s[1] == '\n')
        {
          s++;
          if (s == buffer->rlimit)
            goto done;
        }

      /* No backslash anywhere on the line means the newline cannot be
         escaped.  */
      if (__builtin_expect (pbackslash == NULL, true))
        goto done;

      /* Check for escaped newline.  Skip backwards over any
         non-vertical whitespace in front of the newline and see
         whether the character before it is the last backslash.  */
      p = d;
      while (is_nvspace (p[-1]))
        p--;
      if (p - 1 != pbackslash)
        goto done;

      /* Have an escaped newline; process it and proceed to
         the slow path.  The note type is ' ' when whitespace separated
         the backslash from the newline and '\\' otherwise.  D is set
         so that the next store lands on the backslash, and next_line
         temporarily remembers that position as the lower bound for
         later backward scans.  */
      add_line_note (buffer, p - 1, p != d ? ' ' : '\\');
      d = p - 2;
      buffer->next_line = p - 1;

    slow_path:
      /* Slow path: copy the remainder of the line down over the
         removed characters, one byte at a time.  */
      while (1)
        {
          c = *++s;
          *++d = c;

          if (c == '\n' || c == '\r')
            {
              /* Handle DOS line endings.  */
              if (c == '\r' && s != buffer->rlimit && s[1] == '\n')
                s++;
              if (s == buffer->rlimit)
                break;

              /* Escaped?  Scan back over non-vertical whitespace, but
                 never past the point recorded in next_line.  */
              p = d;
              while (p != buffer->next_line && is_nvspace (p[-1]))
                p--;
              if (p == buffer->next_line || p[-1] != '\\')
                break;

              /* Another escaped newline: note it and rewind D so the
                 following characters overwrite the backslash.  */
              add_line_note (buffer, p - 1, p != d ? ' ': '\\');
              d = p - 2;
              buffer->next_line = p - 1;
            }
          else if (c == '?' && s[1] == '?' && _cpp_trigraph_map[s[2]])
            {
              /* Add a note regardless, for the benefit of -Wtrigraphs.  */
              add_line_note (buffer, d, s[2]);
              if (CPP_OPTION (pfile, trigraphs))
                {
                  /* Replace the '?' just copied with the mapped
                     character and skip the rest of the trigraph.  */
                  *d = _cpp_trigraph_map[s[2]];
                  s += 2;
                }
            }
        }
    }
  else
    {
      /* from_stage3 buffers get no trigraph or escaped-newline
         processing; just locate the end of the line.  */
      while (*s != '\n' && *s != '\r')
        s++;
      d = (uchar *) s;

      /* Handle DOS line endings.  */
      if (*s == '\r' && s + 1 != buffer->rlimit && s[1] == '\n')
        s++;
    }

 done:
  /* Terminate the logical line at D; S is the last byte consumed.  */
  *d = '\n';
  /* A sentinel note that should never be processed.  */
  add_line_note (buffer, d + 1, '\n');
  buffer->next_line = s + 1;
}
1078
1079 template <bool lexing_raw_string>
1080 static bool get_fresh_line_impl (cpp_reader *pfile);
1081
1082 /* Return true if the trigraph indicated by NOTE should be warned
1083    about in a comment.  */
1084 static bool
1085 warn_in_comment (cpp_reader *pfile, _cpp_line_note *note)
1086 {
1087   const uchar *p;
1088
1089   /* Within comments we don't warn about trigraphs, unless the
1090      trigraph forms an escaped newline, as that may change
1091      behavior.  */
1092   if (note->type != '/')
1093     return false;
1094
1095   /* If -trigraphs, then this was an escaped newline iff the next note
1096      is coincident.  */
1097   if (CPP_OPTION (pfile, trigraphs))
1098     return note[1].pos == note->pos;
1099
1100   /* Otherwise, see if this forms an escaped newline.  */
1101   p = note->pos + 3;
1102   while (is_nvspace (*p))
1103     p++;
1104
1105   /* There might have been escaped newlines between the trigraph and the
1106      newline we found.  Hence the position test.  */
1107   return (*p == '\n' && p < note[1].pos);
1108 }
1109
1110 /* Process the notes created by add_line_note as far as the current
1111    location.  */
1112 void
1113 _cpp_process_line_notes (cpp_reader *pfile, int in_comment)
1114 {
1115   cpp_buffer *buffer = pfile->buffer;
1116
1117   for (;;)
1118     {
1119       _cpp_line_note *note = &buffer->notes[buffer->cur_note];
1120       unsigned int col;
1121
1122       if (note->pos > buffer->cur)
1123         break;
1124
1125       buffer->cur_note++;
1126       col = CPP_BUF_COLUMN (buffer, note->pos + 1);
1127
1128       if (note->type == '\\' || note->type == ' ')
1129         {
1130           if (note->type == ' ' && !in_comment)
1131             cpp_error_with_line (pfile, CPP_DL_WARNING, pfile->line_table->highest_line, col,
1132                                  "backslash and newline separated by space");
1133
1134           if (buffer->next_line > buffer->rlimit)
1135             {
1136               cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line, col,
1137                                    "backslash-newline at end of file");
1138               /* Prevent "no newline at end of file" warning.  */
1139               buffer->next_line = buffer->rlimit;
1140             }
1141
1142           buffer->line_base = note->pos;
1143           CPP_INCREMENT_LINE (pfile, 0);
1144         }
1145       else if (_cpp_trigraph_map[note->type])
1146         {
1147           if (CPP_OPTION (pfile, warn_trigraphs)
1148               && (!in_comment || warn_in_comment (pfile, note)))
1149             {
1150               if (CPP_OPTION (pfile, trigraphs))
1151                 cpp_warning_with_line (pfile, CPP_W_TRIGRAPHS,
1152                                        pfile->line_table->highest_line, col,
1153                                        "trigraph ??%c converted to %c",
1154                                        note->type,
1155                                        (int) _cpp_trigraph_map[note->type]);
1156               else
1157                 {
1158                   cpp_warning_with_line
1159                     (pfile, CPP_W_TRIGRAPHS,
1160                      pfile->line_table->highest_line, col,
1161                      "trigraph ??%c ignored, use -trigraphs to enable",
1162                      note->type);
1163                 }
1164             }
1165         }
1166       else if (note->type == 0)
1167         /* Already processed in lex_raw_string.  */;
1168       else
1169         abort ();
1170     }
1171 }
1172
1173 namespace bidi {
1174   enum class kind {
1175     NONE, LRE, RLE, LRO, RLO, LRI, RLI, FSI, PDF, PDI, LTR, RTL
1176   };
1177
1178   /* All the UTF-8 encodings of bidi characters start with E2.  */
1179   constexpr uchar utf8_start = 0xe2;
1180
1181   struct context
1182   {
1183     context () {}
1184     context (location_t loc, kind k, bool pdf, bool ucn)
1185     : m_loc (loc), m_kind (k), m_pdf (pdf), m_ucn (ucn)
1186     {
1187     }
1188
1189     kind get_pop_kind () const
1190     {
1191       return m_pdf ? kind::PDF : kind::PDI;
1192     }
1193     bool ucn_p () const
1194     {
1195       return m_ucn;
1196     }
1197
1198     location_t m_loc;
1199     kind m_kind;
1200     unsigned m_pdf : 1;
1201     unsigned m_ucn : 1;
1202   };
1203
1204   /* A vector holding currently open bidi contexts.  We use a char for
1205      each context, its LSB is 1 if it represents a PDF context, 0 if it
1206      represents a PDI context.  The next bit is 1 if this context was open
1207      by a bidi character written as a UCN, and 0 when it was UTF-8.  */
1208   semi_embedded_vec <context, 16> vec;
1209
1210   /* Close the whole comment/identifier/string literal/character constant
1211      context.  */
1212   void on_close ()
1213   {
1214     vec.truncate (0);
1215   }
1216
1217   /* Pop the last element in the vector.  */
1218   void pop ()
1219   {
1220     unsigned int len = vec.count ();
1221     gcc_checking_assert (len > 0);
1222     vec.truncate (len - 1);
1223   }
1224
1225   /* Return the pop kind of the context of the Ith element.  */
1226   kind pop_kind_at (unsigned int i)
1227   {
1228     return vec[i].get_pop_kind ();
1229   }
1230
1231   /* Return the pop kind of the context that is currently opened.  */
1232   kind current_ctx ()
1233   {
1234     unsigned int len = vec.count ();
1235     if (len == 0)
1236       return kind::NONE;
1237     return vec[len - 1].get_pop_kind ();
1238   }
1239
1240   /* Return true if the current context comes from a UCN origin, that is,
1241      the bidi char which started this bidi context was written as a UCN.  */
1242   bool current_ctx_ucn_p ()
1243   {
1244     unsigned int len = vec.count ();
1245     gcc_checking_assert (len > 0);
1246     return vec[len - 1].m_ucn;
1247   }
1248
1249   location_t current_ctx_loc ()
1250   {
1251     unsigned int len = vec.count ();
1252     gcc_checking_assert (len > 0);
1253     return vec[len - 1].m_loc;
1254   }
1255
1256   /* We've read a bidi char, update the current vector as necessary.
1257      LOC is only valid when K is not kind::NONE.  */
1258   void on_char (kind k, bool ucn_p, location_t loc)
1259   {
1260     switch (k)
1261       {
1262       case kind::LRE:
1263       case kind::RLE:
1264       case kind::LRO:
1265       case kind::RLO:
1266         vec.push (context (loc, k, true, ucn_p));
1267         break;
1268       case kind::LRI:
1269       case kind::RLI:
1270       case kind::FSI:
1271         vec.push (context (loc, k, false, ucn_p));
1272         break;
1273       /* PDF terminates the scope of the last LRE, RLE, LRO, or RLO
1274          whose scope has not yet been terminated.  */
1275       case kind::PDF:
1276         if (current_ctx () == kind::PDF)
1277           pop ();
1278         break;
1279       /* PDI terminates the scope of the last LRI, RLI, or FSI whose
1280          scope has not yet been terminated, as well as the scopes of
1281          any subsequent LREs, RLEs, LROs, or RLOs whose scopes have not
1282          yet been terminated.  */
1283       case kind::PDI:
1284         for (int i = vec.count () - 1; i >= 0; --i)
1285           if (pop_kind_at (i) == kind::PDI)
1286             {
1287               vec.truncate (i);
1288               break;
1289             }
1290         break;
1291       case kind::LTR:
1292       case kind::RTL:
1293         /* These aren't popped by a PDF/PDI.  */
1294         break;
1295       ATTR_LIKELY case kind::NONE:
1296         break;
1297       default:
1298         abort ();
1299       }
1300   }
1301
1302   /* Return a descriptive string for K.  */
1303   const char *to_str (kind k)
1304   {
1305     switch (k)
1306       {
1307       case kind::LRE:
1308         return "U+202A (LEFT-TO-RIGHT EMBEDDING)";
1309       case kind::RLE:
1310         return "U+202B (RIGHT-TO-LEFT EMBEDDING)";
1311       case kind::LRO:
1312         return "U+202D (LEFT-TO-RIGHT OVERRIDE)";
1313       case kind::RLO:
1314         return "U+202E (RIGHT-TO-LEFT OVERRIDE)";
1315       case kind::LRI:
1316         return "U+2066 (LEFT-TO-RIGHT ISOLATE)";
1317       case kind::RLI:
1318         return "U+2067 (RIGHT-TO-LEFT ISOLATE)";
1319       case kind::FSI:
1320         return "U+2068 (FIRST STRONG ISOLATE)";
1321       case kind::PDF:
1322         return "U+202C (POP DIRECTIONAL FORMATTING)";
1323       case kind::PDI:
1324         return "U+2069 (POP DIRECTIONAL ISOLATE)";
1325       case kind::LTR:
1326         return "U+200E (LEFT-TO-RIGHT MARK)";
1327       case kind::RTL:
1328         return "U+200F (RIGHT-TO-LEFT MARK)";
1329       default:
1330         abort ();
1331       }
1332   }
1333 }
1334
1335 /* Get location_t for the range of bytes [START, START + NUM_BYTES)
1336    within the current line in FILE, with the caret at START.  */
1337
1338 static location_t
1339 get_location_for_byte_range_in_cur_line (cpp_reader *pfile,
1340                                          const unsigned char *const start,
1341                                          size_t num_bytes)
1342 {
1343   gcc_checking_assert (num_bytes > 0);
1344
1345   /* CPP_BUF_COLUMN and linemap_position_for_column both refer
1346      to offsets in bytes, but CPP_BUF_COLUMN is 0-based,
1347      whereas linemap_position_for_column is 1-based.  */
1348
1349   /* Get 0-based offsets within the line.  */
1350   size_t start_offset = CPP_BUF_COLUMN (pfile->buffer, start);
1351   size_t end_offset = start_offset + num_bytes - 1;
1352
1353   /* Now convert to location_t, where "columns" are 1-based byte offsets.  */
1354   location_t start_loc = linemap_position_for_column (pfile->line_table,
1355                                                       start_offset + 1);
1356   location_t end_loc = linemap_position_for_column (pfile->line_table,
1357                                                      end_offset + 1);
1358
1359   if (start_loc == end_loc)
1360     return start_loc;
1361
1362   source_range src_range;
1363   src_range.m_start = start_loc;
1364   src_range.m_finish = end_loc;
1365   location_t combined_loc = COMBINE_LOCATION_DATA (pfile->line_table,
1366                                                    start_loc,
1367                                                    src_range,
1368                                                    NULL,
1369                                                    0);
1370   return combined_loc;
1371 }
1372
1373 /* Parse a sequence of 3 bytes starting with P and return its bidi code.  */
1374
1375 static bidi::kind
1376 get_bidi_utf8_1 (const unsigned char *const p)
1377 {
1378   gcc_checking_assert (p[0] == bidi::utf8_start);
1379
1380   if (p[1] == 0x80)
1381     switch (p[2])
1382       {
1383       case 0xaa:
1384         return bidi::kind::LRE;
1385       case 0xab:
1386         return bidi::kind::RLE;
1387       case 0xac:
1388         return bidi::kind::PDF;
1389       case 0xad:
1390         return bidi::kind::LRO;
1391       case 0xae:
1392         return bidi::kind::RLO;
1393       case 0x8e:
1394         return bidi::kind::LTR;
1395       case 0x8f:
1396         return bidi::kind::RTL;
1397       default:
1398         break;
1399       }
1400   else if (p[1] == 0x81)
1401     switch (p[2])
1402       {
1403       case 0xa6:
1404         return bidi::kind::LRI;
1405       case 0xa7:
1406         return bidi::kind::RLI;
1407       case 0xa8:
1408         return bidi::kind::FSI;
1409       case 0xa9:
1410         return bidi::kind::PDI;
1411       default:
1412         break;
1413       }
1414
1415   return bidi::kind::NONE;
1416 }
1417
1418 /* Parse a sequence of 3 bytes starting with P and return its bidi code.
1419    If the kind is not NONE, write the location to *OUT.*/
1420
1421 static bidi::kind
1422 get_bidi_utf8 (cpp_reader *pfile, const unsigned char *const p, location_t *out)
1423 {
1424   bidi::kind result = get_bidi_utf8_1 (p);
1425   if (result != bidi::kind::NONE)
1426     {
1427       /* We have a sequence of 3 bytes starting at P.  */
1428       *out = get_location_for_byte_range_in_cur_line (pfile, p, 3);
1429     }
1430   return result;
1431 }
1432
1433 /* Parse a UCN where P points just past \u or \U and return its bidi code.  */
1434
1435 static bidi::kind
1436 get_bidi_ucn_1 (const unsigned char *p, bool is_U, const unsigned char **end)
1437 {
1438   /* 6.4.3 Universal Character Names
1439       \u hex-quad
1440       \U hex-quad hex-quad
1441       \u { simple-hexadecimal-digit-sequence }
1442      where \unnnn means \U0000nnnn.  */
1443
1444   *end = p + 4;
1445   if (is_U)
1446     {
1447       if (p[0] != '0' || p[1] != '0' || p[2] != '0' || p[3] != '0')
1448         return bidi::kind::NONE;
1449       /* Skip 4B so we can treat \u and \U the same below.  */
1450       p += 4;
1451       *end += 4;
1452     }
1453   else if (p[0] == '{')
1454     {
1455       p++;
1456       while (*p == '0')
1457         p++;
1458       if (p[0] != '2'
1459           || p[1] != '0'
1460           || !ISXDIGIT (p[2])
1461           || !ISXDIGIT (p[3])
1462           || p[4] != '}')
1463         return bidi::kind::NONE;
1464       *end = p + 5;
1465     }
1466
1467   /* All code points we are looking for start with 20xx.  */
1468   if (p[0] != '2' || p[1] != '0')
1469     return bidi::kind::NONE;
1470   else if (p[2] == '2')
1471     switch (p[3])
1472       {
1473       case 'a':
1474       case 'A':
1475         return bidi::kind::LRE;
1476       case 'b':
1477       case 'B':
1478         return bidi::kind::RLE;
1479       case 'c':
1480       case 'C':
1481         return bidi::kind::PDF;
1482       case 'd':
1483       case 'D':
1484         return bidi::kind::LRO;
1485       case 'e':
1486       case 'E':
1487         return bidi::kind::RLO;
1488       default:
1489         break;
1490       }
1491   else if (p[2] == '6')
1492     switch (p[3])
1493       {
1494       case '6':
1495         return bidi::kind::LRI;
1496       case '7':
1497         return bidi::kind::RLI;
1498       case '8':
1499         return bidi::kind::FSI;
1500       case '9':
1501         return bidi::kind::PDI;
1502       default:
1503         break;
1504       }
1505   else if (p[2] == '0')
1506     switch (p[3])
1507       {
1508       case 'e':
1509       case 'E':
1510         return bidi::kind::LTR;
1511       case 'f':
1512       case 'F':
1513         return bidi::kind::RTL;
1514       default:
1515         break;
1516       }
1517
1518   return bidi::kind::NONE;
1519 }
1520
1521 /* Parse a UCN where P points just past \u or \U and return its bidi code.
1522    If the kind is not NONE, write the location to *OUT.  */
1523
1524 static bidi::kind
1525 get_bidi_ucn (cpp_reader *pfile, const unsigned char *p, bool is_U,
1526               location_t *out)
1527 {
1528   const unsigned char *end;
1529   bidi::kind result = get_bidi_ucn_1 (p, is_U, &end);
1530   if (result != bidi::kind::NONE)
1531     {
1532       const unsigned char *start = p - 2;
1533       size_t num_bytes = end - start;
1534       *out = get_location_for_byte_range_in_cur_line (pfile, start, num_bytes);
1535     }
1536   return result;
1537 }
1538
1539 /* Parse a named universal character escape where P points just past \N and
1540    return its bidi code.  If the kind is not NONE, write the location to
1541    *OUT.  */
1542
1543 static bidi::kind
1544 get_bidi_named (cpp_reader *pfile, const unsigned char *p, location_t *out)
1545 {
1546   bidi::kind result = bidi::kind::NONE;
1547   if (*p != '{')
1548     return bidi::kind::NONE;
1549   if (strncmp ((const char *) (p + 1), "LEFT-TO-RIGHT ", 14) == 0)
1550     {
1551       if (strncmp ((const char *) (p + 15), "MARK}", 5) == 0)
1552         result = bidi::kind::LTR;
1553       else if (strncmp ((const char *) (p + 15), "EMBEDDING}", 10) == 0)
1554         result = bidi::kind::LRE;
1555       else if (strncmp ((const char *) (p + 15), "OVERRIDE}", 9) == 0)
1556         result = bidi::kind::LRO;
1557       else if (strncmp ((const char *) (p + 15), "ISOLATE}", 8) == 0)
1558         result = bidi::kind::LRI;
1559     }
1560   else if (strncmp ((const char *) (p + 1), "RIGHT-TO-LEFT ", 14) == 0)
1561     {
1562       if (strncmp ((const char *) (p + 15), "MARK}", 5) == 0)
1563         result = bidi::kind::RTL;
1564       else if (strncmp ((const char *) (p + 15), "EMBEDDING}", 10) == 0)
1565         result = bidi::kind::RLE;
1566       else if (strncmp ((const char *) (p + 15), "OVERRIDE}", 9) == 0)
1567         result = bidi::kind::RLO;
1568       else if (strncmp ((const char *) (p + 15), "ISOLATE}", 8) == 0)
1569         result = bidi::kind::RLI;
1570     }
1571   else if (strncmp ((const char *) (p + 1), "POP DIRECTIONAL ", 16) == 0)
1572     {
1573       if (strncmp ((const char *) (p + 16), "FORMATTING}", 11) == 0)
1574         result = bidi::kind::PDF;
1575       else if (strncmp ((const char *) (p + 16), "ISOLATE}", 8) == 0)
1576         result = bidi::kind::PDI;
1577     }
1578   else if (strncmp ((const char *) (p + 1), "FIRST STRONG ISOLATE}", 21) == 0)
1579     result = bidi::kind::FSI;
1580   if (result != bidi::kind::NONE)
1581     *out = get_location_for_byte_range_in_cur_line (pfile, p - 2,
1582                                                     (strchr ((const char *)
1583                                                              (p + 1), '}')
1584                                                      - (const char *) p)
1585                                                     + 3);
1586   return result;
1587 }
1588
1589 /* Subclass of rich_location for reporting on unpaired UTF-8
1590    bidirectional control character(s).
1591    Escape the source lines on output, and show all unclosed
1592    bidi context, labelling everything.  */
1593
1594 class unpaired_bidi_rich_location : public rich_location
1595 {
1596  public:
1597   class custom_range_label : public range_label
1598   {
1599    public:
1600      label_text get_text (unsigned range_idx) const final override
1601      {
1602        /* range 0 is the primary location; each subsequent range i + 1
1603           is for bidi::vec[i].  */
1604        if (range_idx > 0)
1605          {
1606            const bidi::context &ctxt (bidi::vec[range_idx - 1]);
1607            return label_text::borrow (bidi::to_str (ctxt.m_kind));
1608          }
1609        else
1610          return label_text::borrow (_("end of bidirectional context"));
1611      }
1612   };
1613
1614   unpaired_bidi_rich_location (cpp_reader *pfile, location_t loc)
1615   : rich_location (pfile->line_table, loc, &m_custom_label)
1616   {
1617     set_escape_on_output (true);
1618     for (unsigned i = 0; i < bidi::vec.count (); i++)
1619       add_range (bidi::vec[i].m_loc,
1620                  SHOW_RANGE_WITHOUT_CARET,
1621                  &m_custom_label);
1622   }
1623
1624  private:
1625    custom_range_label m_custom_label;
1626 };
1627
1628 /* We're closing a bidi context, that is, we've encountered a newline,
1629    are closing a C-style comment, or are at the end of a string literal,
1630    character constant, or identifier.  Warn if this context was not
1631    properly terminated by a PDI or PDF.  P points to the last character
1632    in this context.  */
1633
1634 static void
1635 maybe_warn_bidi_on_close (cpp_reader *pfile, const uchar *p)
1636 {
1637   const auto warn_bidi = CPP_OPTION (pfile, cpp_warn_bidirectional);
1638   if (bidi::vec.count () > 0
1639       && (warn_bidi & bidirectional_unpaired
1640           && (!bidi::current_ctx_ucn_p ()
1641               || (warn_bidi & bidirectional_ucn))))
1642     {
1643       const location_t loc
1644         = linemap_position_for_column (pfile->line_table,
1645                                        CPP_BUF_COLUMN (pfile->buffer, p));
1646       unpaired_bidi_rich_location rich_loc (pfile, loc);
1647       /* cpp_callbacks doesn't yet have a way to handle singular vs plural
1648          forms of a diagnostic, so fake it for now.  */
1649       if (bidi::vec.count () > 1)
1650         cpp_warning_at (pfile, CPP_W_BIDIRECTIONAL, &rich_loc,
1651                         "unpaired UTF-8 bidirectional control characters "
1652                         "detected");
1653       else
1654         cpp_warning_at (pfile, CPP_W_BIDIRECTIONAL, &rich_loc,
1655                         "unpaired UTF-8 bidirectional control character "
1656                         "detected");
1657     }
1658   /* We're done with this context.  */
1659   bidi::on_close ();
1660 }
1661
1662 /* We're at the beginning or in the middle of an identifier/comment/string
1663    literal/character constant.  Warn if we've encountered a bidi character.
1664    KIND says which bidi control character it was; UCN_P is true iff this bidi
1665    control character was written as a UCN.  LOC is the location of the
1666    character, but is only valid if KIND != bidi::kind::NONE.  */
1667
1668 static void
1669 maybe_warn_bidi_on_char (cpp_reader *pfile, bidi::kind kind,
1670                          bool ucn_p, location_t loc)
1671 {
1672   if (__builtin_expect (kind == bidi::kind::NONE, 1))
1673     return;
1674
1675   const auto warn_bidi = CPP_OPTION (pfile, cpp_warn_bidirectional);
1676
1677   if (warn_bidi & (bidirectional_unpaired|bidirectional_any))
1678     {
1679       rich_location rich_loc (pfile->line_table, loc);
1680       rich_loc.set_escape_on_output (true);
1681
1682       /* It seems excessive to warn about a PDI/PDF that is closing
1683          an opened context because we've already warned about the
1684          opening character.  Except warn when we have a UCN x UTF-8
1685          mismatch, if UCN checking is enabled.  */
1686       if (kind == bidi::current_ctx ())
1687         {
1688           if (warn_bidi == (bidirectional_unpaired|bidirectional_ucn)
1689               && bidi::current_ctx_ucn_p () != ucn_p)
1690             {
1691               rich_loc.add_range (bidi::current_ctx_loc ());
1692               cpp_warning_at (pfile, CPP_W_BIDIRECTIONAL, &rich_loc,
1693                               "UTF-8 vs UCN mismatch when closing "
1694                               "a context by \"%s\"", bidi::to_str (kind));
1695             }
1696         }
1697       else if (warn_bidi & bidirectional_any
1698                && (!ucn_p || (warn_bidi & bidirectional_ucn)))
1699         {
1700           if (kind == bidi::kind::PDF || kind == bidi::kind::PDI)
1701             cpp_warning_at (pfile, CPP_W_BIDIRECTIONAL, &rich_loc,
1702                             "\"%s\" is closing an unopened context",
1703                             bidi::to_str (kind));
1704           else
1705             cpp_warning_at (pfile, CPP_W_BIDIRECTIONAL, &rich_loc,
1706                             "found problematic Unicode character \"%s\"",
1707                             bidi::to_str (kind));
1708         }
1709     }
1710   /* We're done with this context.  */
1711   bidi::on_char (kind, ucn_p, loc);
1712 }
1713
/* Byte-value thresholds used by the UTF-8 diagnostics below: a byte B
   is accepted as a continuation byte when
   utf8_continuation <= B < utf8_signifier, and only bytes
   >= utf8_signifier are considered as the start of a multibyte
   sequence.  */
static const cppchar_t utf8_continuation = 0x80;
static const cppchar_t utf8_signifier = 0xC0;
1716
1717 /* Emit -Winvalid-utf8 warning on invalid UTF-8 character starting
1718    at PFILE->buffer->cur.  Return a pointer after the diagnosed
1719    invalid character.  */
1720
1721 static const uchar *
1722 _cpp_warn_invalid_utf8 (cpp_reader *pfile)
1723 {
1724   cpp_buffer *buffer = pfile->buffer;
1725   const uchar *cur = buffer->cur;
1726   bool pedantic = (CPP_PEDANTIC (pfile)
1727                    && CPP_OPTION (pfile, cpp_warn_invalid_utf8) == 2);
1728
1729   if (cur[0] < utf8_signifier
1730       || cur[1] < utf8_continuation || cur[1] >= utf8_signifier)
1731     {
1732       if (pedantic)
1733         cpp_error_with_line (pfile, CPP_DL_PEDWARN,
1734                              pfile->line_table->highest_line,
1735                              CPP_BUF_COL (buffer),
1736                              "invalid UTF-8 character <%x>",
1737                              cur[0]);
1738       else
1739         cpp_warning_with_line (pfile, CPP_W_INVALID_UTF8,
1740                                pfile->line_table->highest_line,
1741                                CPP_BUF_COL (buffer),
1742                                "invalid UTF-8 character <%x>",
1743                                cur[0]);
1744       return cur + 1;
1745     }
1746   else if (cur[2] < utf8_continuation || cur[2] >= utf8_signifier)
1747     {
1748       if (pedantic)
1749         cpp_error_with_line (pfile, CPP_DL_PEDWARN,
1750                              pfile->line_table->highest_line,
1751                              CPP_BUF_COL (buffer),
1752                              "invalid UTF-8 character <%x><%x>",
1753                              cur[0], cur[1]);
1754       else
1755         cpp_warning_with_line (pfile, CPP_W_INVALID_UTF8,
1756                                pfile->line_table->highest_line,
1757                                CPP_BUF_COL (buffer),
1758                                "invalid UTF-8 character <%x><%x>",
1759                                cur[0], cur[1]);
1760       return cur + 2;
1761     }
1762   else if (cur[3] < utf8_continuation || cur[3] >= utf8_signifier)
1763     {
1764       if (pedantic)
1765         cpp_error_with_line (pfile, CPP_DL_PEDWARN,
1766                              pfile->line_table->highest_line,
1767                              CPP_BUF_COL (buffer),
1768                              "invalid UTF-8 character <%x><%x><%x>",
1769                              cur[0], cur[1], cur[2]);
1770       else
1771         cpp_warning_with_line (pfile, CPP_W_INVALID_UTF8,
1772                                pfile->line_table->highest_line,
1773                                CPP_BUF_COL (buffer),
1774                                "invalid UTF-8 character <%x><%x><%x>",
1775                                cur[0], cur[1], cur[2]);
1776       return cur + 3;
1777     }
1778   else
1779     {
1780       if (pedantic)
1781         cpp_error_with_line (pfile, CPP_DL_PEDWARN,
1782                              pfile->line_table->highest_line,
1783                              CPP_BUF_COL (buffer),
1784                              "invalid UTF-8 character <%x><%x><%x><%x>",
1785                              cur[0], cur[1], cur[2], cur[3]);
1786       else
1787         cpp_warning_with_line (pfile, CPP_W_INVALID_UTF8,
1788                                pfile->line_table->highest_line,
1789                                CPP_BUF_COL (buffer),
1790                                "invalid UTF-8 character <%x><%x><%x><%x>",
1791                                cur[0], cur[1], cur[2], cur[3]);
1792       return cur + 4;
1793     }
1794 }
1795
1796 /* Helper function of *skip_*_comment and lex*_string.  For C,
1797    character at CUR[-1] with MSB set handle -Wbidi-chars* and
1798    -Winvalid-utf8 diagnostics and return pointer to first character
1799    that should be processed next.  */
1800
1801 static inline const uchar *
1802 _cpp_handle_multibyte_utf8 (cpp_reader *pfile, uchar c,
1803                             const uchar *cur, bool warn_bidi_p,
1804                             bool warn_invalid_utf8_p)
1805 {
1806   /* If this is a beginning of a UTF-8 encoding, it might be
1807      a bidirectional control character.  */
1808   if (c == bidi::utf8_start && warn_bidi_p)
1809     {
1810       location_t loc;
1811       bidi::kind kind = get_bidi_utf8 (pfile, cur - 1, &loc);
1812       maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/false, loc);
1813     }
1814   if (!warn_invalid_utf8_p)
1815     return cur;
1816   if (c >= utf8_signifier)
1817     {
1818       cppchar_t s;
1819       const uchar *pstr = cur - 1;
1820       if (_cpp_valid_utf8 (pfile, &pstr, pfile->buffer->rlimit, 0, NULL, &s)
1821           && s <= UCS_LIMIT)
1822         return pstr;
1823     }
1824   pfile->buffer->cur = cur - 1;
1825   return _cpp_warn_invalid_utf8 (pfile);
1826 }
1827
1828 /* Skip a C-style block comment.  We find the end of the comment by
1829    seeing if an asterisk is before every '/' we encounter.  Returns
1830    nonzero if comment terminated by EOF, zero otherwise.
1831
1832    Buffer->cur points to the initial asterisk of the comment.  */
1833 bool
1834 _cpp_skip_block_comment (cpp_reader *pfile)
1835 {
1836   cpp_buffer *buffer = pfile->buffer;
1837   const uchar *cur = buffer->cur;
1838   uchar c;
1839   const bool warn_bidi_p = pfile->warn_bidi_p ();
1840   const bool warn_invalid_utf8_p = CPP_OPTION (pfile, cpp_warn_invalid_utf8);
1841   const bool warn_bidi_or_invalid_utf8_p = warn_bidi_p | warn_invalid_utf8_p;
1842
1843   cur++;
1844   if (*cur == '/')
1845     cur++;
1846
1847   for (;;)
1848     {
1849       /* People like decorating comments with '*', so check for '/'
1850          instead for efficiency.  */
1851       c = *cur++;
1852
1853       if (c == '/')
1854         {
1855           if (cur[-2] == '*')
1856             {
1857               if (warn_bidi_p)
1858                 maybe_warn_bidi_on_close (pfile, cur);
1859               break;
1860             }
1861
1862           /* Warn about potential nested comments, but not if the '/'
1863              comes immediately before the true comment delimiter.
1864              Don't bother to get it right across escaped newlines.  */
1865           if (CPP_OPTION (pfile, warn_comments)
1866               && cur[0] == '*' && cur[1] != '/')
1867             {
1868               buffer->cur = cur;
1869               cpp_warning_with_line (pfile, CPP_W_COMMENTS,
1870                                      pfile->line_table->highest_line,
1871                                      CPP_BUF_COL (buffer),
1872                                      "\"/*\" within comment");
1873             }
1874         }
1875       else if (c == '\n')
1876         {
1877           unsigned int cols;
1878           buffer->cur = cur - 1;
1879           if (warn_bidi_p)
1880             maybe_warn_bidi_on_close (pfile, cur);
1881           _cpp_process_line_notes (pfile, true);
1882           if (buffer->next_line >= buffer->rlimit)
1883             return true;
1884           _cpp_clean_line (pfile);
1885
1886           cols = buffer->next_line - buffer->line_base;
1887           CPP_INCREMENT_LINE (pfile, cols);
1888
1889           cur = buffer->cur;
1890         }
1891       else if (__builtin_expect (c >= utf8_continuation, 0)
1892                && warn_bidi_or_invalid_utf8_p)
1893         cur = _cpp_handle_multibyte_utf8 (pfile, c, cur, warn_bidi_p,
1894                                           warn_invalid_utf8_p);
1895     }
1896
1897   buffer->cur = cur;
1898   _cpp_process_line_notes (pfile, true);
1899   return false;
1900 }
1901
1902 /* Skip a C++ line comment, leaving buffer->cur pointing to the
1903    terminating newline.  Handles escaped newlines.  Returns nonzero
1904    if a multiline comment.  */
1905 static int
1906 skip_line_comment (cpp_reader *pfile)
1907 {
1908   cpp_buffer *buffer = pfile->buffer;
1909   location_t orig_line = pfile->line_table->highest_line;
1910   const bool warn_bidi_p = pfile->warn_bidi_p ();
1911   const bool warn_invalid_utf8_p = CPP_OPTION (pfile, cpp_warn_invalid_utf8);
1912   const bool warn_bidi_or_invalid_utf8_p = warn_bidi_p | warn_invalid_utf8_p;
1913
1914   if (!warn_bidi_or_invalid_utf8_p)
1915     while (*buffer->cur != '\n')
1916       buffer->cur++;
1917   else if (!warn_invalid_utf8_p)
1918     {
1919       while (*buffer->cur != '\n'
1920              && *buffer->cur != bidi::utf8_start)
1921         buffer->cur++;
1922       if (__builtin_expect (*buffer->cur == bidi::utf8_start, 0))
1923         {
1924           while (*buffer->cur != '\n')
1925             {
1926               if (__builtin_expect (*buffer->cur == bidi::utf8_start, 0))
1927                 {
1928                   location_t loc;
1929                   bidi::kind kind = get_bidi_utf8 (pfile, buffer->cur, &loc);
1930                   maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/false, loc);
1931                 }
1932               buffer->cur++;
1933             }
1934           maybe_warn_bidi_on_close (pfile, buffer->cur);
1935         }
1936     }
1937   else
1938     {
1939       while (*buffer->cur != '\n')
1940         {
1941           if (*buffer->cur < utf8_continuation)
1942             {
1943               buffer->cur++;
1944               continue;
1945             }
1946           buffer->cur
1947             = _cpp_handle_multibyte_utf8 (pfile, *buffer->cur, buffer->cur + 1,
1948                                           warn_bidi_p, warn_invalid_utf8_p);
1949         }
1950       if (warn_bidi_p)
1951         maybe_warn_bidi_on_close (pfile, buffer->cur);
1952     }
1953
1954   _cpp_process_line_notes (pfile, true);
1955   return orig_line != pfile->line_table->highest_line;
1956 }
1957
1958 /* Skips whitespace, saving the next non-whitespace character.  */
1959 static void
1960 skip_whitespace (cpp_reader *pfile, cppchar_t c)
1961 {
1962   cpp_buffer *buffer = pfile->buffer;
1963   bool saw_NUL = false;
1964
1965   do
1966     {
1967       /* Horizontal space always OK.  */
1968       if (c == ' ' || c == '\t')
1969         ;
1970       /* Just \f \v or \0 left.  */
1971       else if (c == '\0')
1972         saw_NUL = true;
1973       else if (pfile->state.in_directive && CPP_PEDANTIC (pfile))
1974         cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line,
1975                              CPP_BUF_COL (buffer),
1976                              "%s in preprocessing directive",
1977                              c == '\f' ? "form feed" : "vertical tab");
1978
1979       c = *buffer->cur++;
1980     }
1981   /* We only want non-vertical space, i.e. ' ' \t \f \v \0.  */
1982   while (is_nvspace (c));
1983
1984   if (saw_NUL)
1985     {
1986       encoding_rich_location rich_loc (pfile);
1987       cpp_error_at (pfile, CPP_DL_WARNING, &rich_loc,
1988                     "null character(s) ignored");
1989     }
1990
1991   buffer->cur--;
1992 }
1993
1994 /* See if the characters of a number token are valid in a name (no
1995    '.', '+' or '-').  */
1996 static int
1997 name_p (cpp_reader *pfile, const cpp_string *string)
1998 {
1999   unsigned int i;
2000
2001   for (i = 0; i < string->len; i++)
2002     if (!is_idchar (string->text[i]))
2003       return 0;
2004
2005   return 1;
2006 }
2007
2008 /* After parsing an identifier or other sequence, produce a warning about
2009    sequences not in NFC/NFKC.  */
2010 static void
2011 warn_about_normalization (cpp_reader *pfile,
2012                           const cpp_token *token,
2013                           const struct normalize_state *s,
2014                           bool identifier)
2015 {
2016   if (CPP_OPTION (pfile, warn_normalize) < NORMALIZE_STATE_RESULT (s)
2017       && !pfile->state.skipping)
2018     {
2019       location_t loc = token->src_loc;
2020
2021       /* If possible, create a location range for the token.  */
2022       if (loc >= RESERVED_LOCATION_COUNT
2023           && token->type != CPP_EOF
2024           /* There must be no line notes to process.  */
2025           && (!(pfile->buffer->cur
2026                 >= pfile->buffer->notes[pfile->buffer->cur_note].pos
2027                 && !pfile->overlaid_buffer)))
2028         {
2029           source_range tok_range;
2030           tok_range.m_start = loc;
2031           tok_range.m_finish
2032             = linemap_position_for_column (pfile->line_table,
2033                                            CPP_BUF_COLUMN (pfile->buffer,
2034                                                            pfile->buffer->cur));
2035           loc = COMBINE_LOCATION_DATA (pfile->line_table,
2036                                        loc, tok_range, NULL, 0);
2037         }
2038
2039       encoding_rich_location rich_loc (pfile, loc);
2040
2041       /* Make sure that the token is printed using UCNs, even
2042          if we'd otherwise happily print UTF-8.  */
2043       unsigned char *buf = XNEWVEC (unsigned char, cpp_token_len (token));
2044       size_t sz;
2045
2046       sz = cpp_spell_token (pfile, token, buf, false) - buf;
2047       if (NORMALIZE_STATE_RESULT (s) == normalized_C)
2048         cpp_warning_at (pfile, CPP_W_NORMALIZE, &rich_loc,
2049                         "`%.*s' is not in NFKC", (int) sz, buf);
2050       else if (identifier && CPP_OPTION (pfile, xid_identifiers))
2051         cpp_pedwarning_at (pfile, CPP_W_NORMALIZE, &rich_loc,
2052                                   "`%.*s' is not in NFC", (int) sz, buf);
2053       else
2054         cpp_warning_at (pfile, CPP_W_NORMALIZE, &rich_loc,
2055                         "`%.*s' is not in NFC", (int) sz, buf);
2056       free (buf);
2057     }
2058 }
2059
2060 /* Returns TRUE if the byte sequence starting at buffer->cur is a valid
2061    extended character in an identifier.  If FIRST is TRUE, then the character
2062    must be valid at the beginning of an identifier as well.  If the return
2063    value is TRUE, then pfile->buffer->cur has been moved to point to the next
2064    byte after the extended character.  */
2065
2066 static bool
2067 forms_identifier_p (cpp_reader *pfile, int first,
2068                     struct normalize_state *state)
2069 {
2070   cpp_buffer *buffer = pfile->buffer;
2071   const bool warn_bidi_p = pfile->warn_bidi_p ();
2072
2073   if (*buffer->cur == '$')
2074     {
2075       if (!CPP_OPTION (pfile, dollars_in_ident))
2076         return false;
2077
2078       buffer->cur++;
2079       if (CPP_OPTION (pfile, warn_dollars) && !pfile->state.skipping)
2080         {
2081           CPP_OPTION (pfile, warn_dollars) = 0;
2082           cpp_error (pfile, CPP_DL_PEDWARN, "'$' in identifier or number");
2083         }
2084
2085       return true;
2086     }
2087
2088   /* Is this a syntactically valid UCN or a valid UTF-8 char?  */
2089   if (CPP_OPTION (pfile, extended_identifiers))
2090     {
2091       cppchar_t s;
2092       if (*buffer->cur >= utf8_signifier)
2093         {
2094           if (__builtin_expect (*buffer->cur == bidi::utf8_start, 0)
2095               && warn_bidi_p)
2096             {
2097               location_t loc;
2098               bidi::kind kind = get_bidi_utf8 (pfile, buffer->cur, &loc);
2099               maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/false, loc);
2100             }
2101           if (_cpp_valid_utf8 (pfile, &buffer->cur, buffer->rlimit, 1 + !first,
2102                                state, &s))
2103             return true;
2104         }
2105       else if (*buffer->cur == '\\'
2106                && (buffer->cur[1] == 'u'
2107                    || buffer->cur[1] == 'U'
2108                    || buffer->cur[1] == 'N'))
2109         {
2110           buffer->cur += 2;
2111           if (warn_bidi_p)
2112             {
2113               location_t loc;
2114               bidi::kind kind;
2115               if (buffer->cur[-1] == 'N')
2116                 kind = get_bidi_named (pfile, buffer->cur, &loc);
2117               else
2118                 kind = get_bidi_ucn (pfile, buffer->cur,
2119                                      buffer->cur[-1] == 'U', &loc);
2120               maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/true, loc);
2121             }
2122           if (_cpp_valid_ucn (pfile, &buffer->cur, buffer->rlimit, 1 + !first,
2123                               state, &s, NULL, NULL))
2124             return true;
2125           buffer->cur -= 2;
2126         }
2127     }
2128
2129   return false;
2130 }
2131
2132 /* Helper function to issue error about improper __VA_OPT__ use.  */
2133 static void
2134 maybe_va_opt_error (cpp_reader *pfile)
2135 {
2136   if (CPP_PEDANTIC (pfile) && !CPP_OPTION (pfile, va_opt))
2137     {
2138       /* __VA_OPT__ should not be accepted at all, but allow it in
2139          system headers.  */
2140       if (!_cpp_in_system_header (pfile))
2141         {
2142           if (CPP_OPTION (pfile, cplusplus))
2143             cpp_error (pfile, CPP_DL_PEDWARN,
2144                        "__VA_OPT__ is not available until C++20");
2145           else
2146             cpp_error (pfile, CPP_DL_PEDWARN,
2147                        "__VA_OPT__ is not available until C2X");
2148         }
2149     }
2150   else if (!pfile->state.va_args_ok)
2151     {
2152       /* __VA_OPT__ should only appear in the replacement list of a
2153          variadic macro.  */
2154       cpp_error (pfile, CPP_DL_PEDWARN,
2155                  "__VA_OPT__ can only appear in the expansion"
2156                  " of a C++20 variadic macro");
2157     }
2158 }
2159
2160 /* Helper function to perform diagnostics that are needed (rarely)
2161    when an identifier is lexed.  */
2162 static void
2163 identifier_diagnostics_on_lex (cpp_reader *pfile, cpp_hashnode *node)
2164 {
2165   if (__builtin_expect (!(node->flags & NODE_DIAGNOSTIC)
2166                         || pfile->state.skipping, 1))
2167     return;
2168
2169   /* It is allowed to poison the same identifier twice.  */
2170   if ((node->flags & NODE_POISONED) && !pfile->state.poisoned_ok)
2171     cpp_error (pfile, CPP_DL_ERROR, "attempt to use poisoned \"%s\"",
2172                NODE_NAME (node));
2173
2174   /* Constraint 6.10.3.5: __VA_ARGS__ should only appear in the
2175      replacement list of a variadic macro.  */
2176   if (node == pfile->spec_nodes.n__VA_ARGS__
2177       && !pfile->state.va_args_ok)
2178     {
2179       if (CPP_OPTION (pfile, cplusplus))
2180         cpp_error (pfile, CPP_DL_PEDWARN,
2181                    "__VA_ARGS__ can only appear in the expansion"
2182                    " of a C++11 variadic macro");
2183       else
2184         cpp_error (pfile, CPP_DL_PEDWARN,
2185                    "__VA_ARGS__ can only appear in the expansion"
2186                    " of a C99 variadic macro");
2187     }
2188
2189   /* __VA_OPT__ should only appear in the replacement list of a
2190      variadic macro.  */
2191   if (node == pfile->spec_nodes.n__VA_OPT__)
2192     maybe_va_opt_error (pfile);
2193
2194   /* For -Wc++-compat, warn about use of C++ named operators.  */
2195   if (node->flags & NODE_WARN_OPERATOR)
2196     cpp_warning (pfile, CPP_W_CXX_OPERATOR_NAMES,
2197                  "identifier \"%s\" is a special operator name in C++",
2198                  NODE_NAME (node));
2199 }
2200
2201 /* Helper function to get the cpp_hashnode of the identifier BASE.  */
2202 static cpp_hashnode *
2203 lex_identifier_intern (cpp_reader *pfile, const uchar *base)
2204 {
2205   cpp_hashnode *result;
2206   const uchar *cur;
2207   unsigned int len;
2208   unsigned int hash = HT_HASHSTEP (0, *base);
2209
2210   cur = base + 1;
2211   while (ISIDNUM (*cur))
2212     {
2213       hash = HT_HASHSTEP (hash, *cur);
2214       cur++;
2215     }
2216   len = cur - base;
2217   hash = HT_HASHFINISH (hash, len);
2218   result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
2219                                               base, len, hash, HT_ALLOC));
2220   identifier_diagnostics_on_lex (pfile, result);
2221   return result;
2222 }
2223
2224 /* Get the cpp_hashnode of an identifier specified by NAME in
2225    the current cpp_reader object.  If none is found, NULL is returned.  */
2226 cpp_hashnode *
2227 _cpp_lex_identifier (cpp_reader *pfile, const char *name)
2228 {
2229   cpp_hashnode *result;
2230   result = lex_identifier_intern (pfile, (uchar *) name);
2231   return result;
2232 }
2233
2234 /* Lex an identifier starting at BASE.  BUFFER->CUR is expected to point
2235    one past the first character at BASE, which may be a (possibly multi-byte)
2236    character if STARTS_UCN is true.  */
2237 static cpp_hashnode *
2238 lex_identifier (cpp_reader *pfile, const uchar *base, bool starts_ucn,
2239                 struct normalize_state *nst, cpp_hashnode **spelling)
2240 {
2241   cpp_hashnode *result;
2242   const uchar *cur;
2243   unsigned int len;
2244   unsigned int hash = HT_HASHSTEP (0, *base);
2245   const bool warn_bidi_p = pfile->warn_bidi_p ();
2246
2247   cur = pfile->buffer->cur;
2248   if (! starts_ucn)
2249     {
2250       while (ISIDNUM (*cur))
2251         {
2252           hash = HT_HASHSTEP (hash, *cur);
2253           cur++;
2254         }
2255       NORMALIZE_STATE_UPDATE_IDNUM (nst, *(cur - 1));
2256     }
2257   pfile->buffer->cur = cur;
2258   if (starts_ucn || forms_identifier_p (pfile, false, nst))
2259     {
2260       /* Slower version for identifiers containing UCNs
2261          or extended chars (including $).  */
2262       do {
2263         while (ISIDNUM (*pfile->buffer->cur))
2264           {
2265             NORMALIZE_STATE_UPDATE_IDNUM (nst, *pfile->buffer->cur);
2266             pfile->buffer->cur++;
2267           }
2268       } while (forms_identifier_p (pfile, false, nst));
2269       if (warn_bidi_p)
2270         maybe_warn_bidi_on_close (pfile, pfile->buffer->cur);
2271       result = _cpp_interpret_identifier (pfile, base,
2272                                           pfile->buffer->cur - base);
2273       *spelling = cpp_lookup (pfile, base, pfile->buffer->cur - base);
2274     }
2275   else
2276     {
2277       len = cur - base;
2278       hash = HT_HASHFINISH (hash, len);
2279
2280       result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
2281                                                   base, len, hash, HT_ALLOC));
2282       *spelling = result;
2283     }
2284
2285   return result;
2286 }
2287
2288 /* Struct to hold the return value of the scan_cur_identifier () helper
2289    function below.  */
2290
2291 struct scan_id_result
2292 {
2293   cpp_hashnode *node;
2294   normalize_state nst;
2295
2296   scan_id_result ()
2297     : node (nullptr)
2298   {
2299     nst = INITIAL_NORMALIZE_STATE;
2300   }
2301
2302   explicit operator bool () const { return node; }
2303 };
2304
2305 /* Helper function to scan an entire identifier beginning at
2306    pfile->buffer->cur, and possibly containing extended characters (UCNs
2307    and/or UTF-8).  Returns the cpp_hashnode for the identifier on success, or
2308    else nullptr, as well as a normalize_state so that normalization warnings
2309    may be issued once the token lexing is complete.  */
2310
2311 static scan_id_result
2312 scan_cur_identifier (cpp_reader *pfile)
2313 {
2314   const auto buffer = pfile->buffer;
2315   const auto begin = buffer->cur;
2316   scan_id_result result;
2317   if (ISIDST (*buffer->cur))
2318     {
2319       ++buffer->cur;
2320       cpp_hashnode *ignore;
2321       result.node = lex_identifier (pfile, begin, false, &result.nst, &ignore);
2322     }
2323   else if (forms_identifier_p (pfile, true, &result.nst))
2324     {
2325       /* buffer->cur has been moved already by the call
2326          to forms_identifier_p.  */
2327       cpp_hashnode *ignore;
2328       result.node = lex_identifier (pfile, begin, true, &result.nst, &ignore);
2329     }
2330   return result;
2331 }
2332
2333 /* Lex a number to NUMBER starting at BUFFER->CUR - 1.  */
2334 static void
2335 lex_number (cpp_reader *pfile, cpp_string *number,
2336             struct normalize_state *nst)
2337 {
2338   const uchar *cur;
2339   const uchar *base;
2340   uchar *dest;
2341
2342   base = pfile->buffer->cur - 1;
2343   do
2344     {
2345       const uchar *adj_digit_sep = NULL;
2346       cur = pfile->buffer->cur;
2347
2348       /* N.B. ISIDNUM does not include $.  */
2349       while (ISIDNUM (*cur)
2350              || (*cur == '.' && !DIGIT_SEP (cur[-1]))
2351              || DIGIT_SEP (*cur)
2352              || (VALID_SIGN (*cur, cur[-1]) && !DIGIT_SEP (cur[-2])))
2353         {
2354           NORMALIZE_STATE_UPDATE_IDNUM (nst, *cur);
2355           /* Adjacent digit separators do not form part of the pp-number syntax.
2356              However, they can safely be diagnosed here as an error, since '' is
2357              not a valid preprocessing token.  */
2358           if (DIGIT_SEP (*cur) && DIGIT_SEP (cur[-1]) && !adj_digit_sep)
2359             adj_digit_sep = cur;
2360           cur++;
2361         }
2362       /* A number can't end with a digit separator.  */
2363       while (cur > pfile->buffer->cur && DIGIT_SEP (cur[-1]))
2364         --cur;
2365       if (adj_digit_sep && adj_digit_sep < cur)
2366         cpp_error (pfile, CPP_DL_ERROR, "adjacent digit separators");
2367
2368       pfile->buffer->cur = cur;
2369     }
2370   while (forms_identifier_p (pfile, false, nst));
2371
2372   number->len = cur - base;
2373   dest = _cpp_unaligned_alloc (pfile, number->len + 1);
2374   memcpy (dest, base, number->len);
2375   dest[number->len] = '\0';
2376   number->text = dest;
2377 }
2378
2379 /* Create a token of type TYPE with a literal spelling.  */
2380 static void
2381 create_literal (cpp_reader *pfile, cpp_token *token, const uchar *base,
2382                 unsigned int len, enum cpp_ttype type)
2383 {
2384   token->type = type;
2385   token->val.str.len = len;
2386   token->val.str.text = cpp_alloc_token_string (pfile, base, len);
2387 }
2388
2389 /* Like create_literal(), but construct it from two separate strings
2390    which are concatenated.  LEN2 may be 0 if no second string is
2391    required.  */
2392 static void
2393 create_literal2 (cpp_reader *pfile, cpp_token *token, const uchar *base1,
2394                  unsigned int len1, const uchar *base2, unsigned int len2,
2395                  enum cpp_ttype type)
2396 {
2397   token->type = type;
2398   token->val.str.len = len1 + len2;
2399   uchar *const dest = _cpp_unaligned_alloc (pfile, len1 + len2 + 1);
2400   memcpy (dest, base1, len1);
2401   if (len2)
2402     memcpy (dest+len1, base2, len2);
2403   dest[len1 + len2] = 0;
2404   token->val.str.text = dest;
2405 }
2406
2407 const uchar *
2408 cpp_alloc_token_string (cpp_reader *pfile,
2409                         const unsigned char *ptr, unsigned len)
2410 {
2411   uchar *dest = _cpp_unaligned_alloc (pfile, len + 1);
2412
2413   dest[len] = 0;
2414   memcpy (dest, ptr, len);
2415   return dest;
2416 }
2417
2418 /* A pair of raw buffer pointers.  The currently open one is [1], the
2419    first one is [0].  Used for string literal lexing.  */
2420 struct lit_accum {
2421   _cpp_buff *first;
2422   _cpp_buff *last;
2423   const uchar *rpos;
2424   size_t accum;
2425
2426   lit_accum ()
2427     : first (NULL), last (NULL), rpos (0), accum (0)
2428   {
2429   }
2430
2431   void append (cpp_reader *, const uchar *, size_t);
2432
2433   void read_begin (cpp_reader *);
2434   bool reading_p () const
2435   {
2436     return rpos != NULL;
2437   }
2438   char read_char ()
2439   {
2440     char c = *rpos++;
2441     if (rpos == BUFF_FRONT (last))
2442       rpos = NULL;
2443     return c;
2444   }
2445
2446   void create_literal2 (cpp_reader *pfile, cpp_token *token,
2447                         const uchar *base1, unsigned int len1,
2448                         const uchar *base2, unsigned int len2,
2449                         enum cpp_ttype type);
2450 };
2451
2452 /* Subroutine of lex_raw_string: Append LEN chars from BASE to the buffer
2453    sequence from *FIRST_BUFF_P to LAST_BUFF_P.  */
2454
2455 void
2456 lit_accum::append (cpp_reader *pfile, const uchar *base, size_t len)
2457 {
2458   if (!last)
2459     /* Starting.  */
2460     first = last = _cpp_get_buff (pfile, len);
2461   else if (len > BUFF_ROOM (last))
2462     {
2463       /* There is insufficient room in the buffer.  Copy what we can,
2464          and then either extend or create a new one.  */
2465       size_t room = BUFF_ROOM (last);
2466       memcpy (BUFF_FRONT (last), base, room);
2467       BUFF_FRONT (last) += room;
2468       base += room;
2469       len -= room;
2470       accum += room;
2471
2472       gcc_checking_assert (!rpos);
2473
2474       last = _cpp_append_extend_buff (pfile, last, len);
2475     }
2476
2477   memcpy (BUFF_FRONT (last), base, len);
2478   BUFF_FRONT (last) += len;
2479   accum += len;
2480 }
2481
2482 void
2483 lit_accum::read_begin (cpp_reader *pfile)
2484 {
2485   /* We never accumulate more than 4 chars to read.  */
2486   if (BUFF_ROOM (last) < 4)
2487
2488     last = _cpp_append_extend_buff (pfile, last, 4);
2489   rpos = BUFF_FRONT (last);
2490 }
2491
2492 /* Helper function to check if a string format macro, say from inttypes.h, is
2493    placed touching a string literal, in which case it could be parsed as a C++11
2494    user-defined string literal thus breaking the program.  Return TRUE if the
2495    UDL should be ignored for now and preserved for potential macro
2496    expansion.  */
2497
2498 static bool
2499 maybe_ignore_udl_macro_suffix (cpp_reader *pfile, location_t src_loc,
2500                                const uchar *suffix_begin, cpp_hashnode *node)
2501 {
2502   /* User-defined literals outside of namespace std must start with a single
2503      underscore, so assume anything of that form really is a UDL suffix.
2504      We don't need to worry about UDLs defined inside namespace std because
2505      their names are reserved, so cannot be used as macro names in valid
2506      programs.  */
2507   if ((suffix_begin[0] == '_' && suffix_begin[1] != '_')
2508       || !cpp_macro_p (node))
2509     return false;
2510
2511   /* Maybe raise a warning here; caller should arrange not to consume
2512      the tokens.  */
2513   if (CPP_OPTION (pfile, warn_literal_suffix) && !pfile->state.skipping)
2514     cpp_warning_with_line (pfile, CPP_W_LITERAL_SUFFIX, src_loc, 0,
2515                            "invalid suffix on literal; C++11 requires a space "
2516                            "between literal and string macro");
2517   return true;
2518 }
2519
2520 /* Like create_literal2(), but also prepend all the accumulated data from
2521    the lit_accum struct.  */
2522 void
2523 lit_accum::create_literal2 (cpp_reader *pfile, cpp_token *token,
2524                             const uchar *base1, unsigned int len1,
2525                             const uchar *base2, unsigned int len2,
2526                             enum cpp_ttype type)
2527 {
2528   const unsigned int tot_len = accum + len1 + len2;
2529   uchar *dest = _cpp_unaligned_alloc (pfile, tot_len + 1);
2530   token->type = type;
2531   token->val.str.len = tot_len;
2532   token->val.str.text = dest;
2533   for (_cpp_buff *buf = first; buf; buf = buf->next)
2534     {
2535       size_t len = BUFF_FRONT (buf) - buf->base;
2536       memcpy (dest, buf->base, len);
2537       dest += len;
2538     }
2539   memcpy (dest, base1, len1);
2540   dest += len1;
2541   if (len2)
2542     memcpy (dest, base2, len2);
2543   dest += len2;
2544   *dest = '\0';
2545 }
2546
2547 /* Lexes a raw string.  The stored string contains the spelling,
2548    including double quotes, delimiter string, '(' and ')', any leading
2549    'L', 'u', 'U' or 'u8' and 'R' modifier.  The created token contains
2550    the type of the literal, or CPP_OTHER if it was not properly
2551    terminated.
2552
2553    BASE is the start of the token.  Updates pfile->buffer->cur to just
2554    after the lexed string.
2555
2556    The spelling is NUL-terminated, but it is not guaranteed that this
2557    is the first NUL since embedded NULs are preserved.  */
2558
2559 static void
2560 lex_raw_string (cpp_reader *pfile, cpp_token *token, const uchar *base)
2561 {
2562   const uchar *pos = base;
2563   const bool warn_bidi_p = pfile->warn_bidi_p ();
2564   const bool warn_invalid_utf8_p = CPP_OPTION (pfile, cpp_warn_invalid_utf8);
2565   const bool warn_bidi_or_invalid_utf8_p = warn_bidi_p | warn_invalid_utf8_p;
2566
2567   /* 'tis a pity this information isn't passed down from the lexer's
2568      initial categorization of the token.  */
2569   enum cpp_ttype type = CPP_STRING;
2570
2571   if (*pos == 'L')
2572     {
2573       type = CPP_WSTRING;
2574       pos++;
2575     }
2576   else if (*pos == 'U')
2577     {
2578       type = CPP_STRING32;
2579       pos++;
2580     }
2581   else if (*pos == 'u')
2582     {
2583       if (pos[1] == '8')
2584         {
2585           type = CPP_UTF8STRING;
2586           pos++;
2587         }
2588       else
2589         type = CPP_STRING16;
2590       pos++;
2591     }
2592
2593   gcc_checking_assert (pos[0] == 'R' && pos[1] == '"');
2594   pos += 2;
2595
2596   _cpp_line_note *note = &pfile->buffer->notes[pfile->buffer->cur_note];
2597
2598   /* Skip notes before the ".  */
2599   while (note->pos < pos)
2600     ++note;
2601
2602   lit_accum accum;
2603
2604   uchar prefix[17];
2605   unsigned prefix_len = 0;
2606   enum Phase
2607   {
2608    PHASE_PREFIX = -2,
2609    PHASE_NONE = -1,
2610    PHASE_SUFFIX = 0
2611   } phase = PHASE_PREFIX;
2612
2613   for (;;)
2614     {
2615       gcc_checking_assert (note->pos >= pos);
2616
2617       /* Undo any escaped newlines and trigraphs.  */
2618       if (!accum.reading_p () && note->pos == pos)
2619         switch (note->type)
2620           {
2621           case '\\':
2622           case ' ':
2623             /* Restore backslash followed by newline.  */
2624             accum.append (pfile, base, pos - base);
2625             base = pos;
2626             accum.read_begin (pfile);
2627             accum.append (pfile, UC"\\", 1);
2628
2629           after_backslash:
2630             if (note->type == ' ')
2631               /* GNU backslash whitespace newline extension.  FIXME
2632                  could be any sequence of non-vertical space.  When we
2633                  can properly restore any such sequence, we should
2634                  mark this note as handled so _cpp_process_line_notes
2635                  doesn't warn.  */
2636               accum.append (pfile, UC" ", 1);
2637
2638             accum.append (pfile, UC"\n", 1);
2639             note++;
2640             break;
2641
2642           case '\n':
2643             /* This can happen for ??/<NEWLINE> when trigraphs are not
2644                being interpretted.  */
2645             gcc_checking_assert (!CPP_OPTION (pfile, trigraphs));
2646             note->type = 0;
2647             note++;
2648             break;
2649
2650           default:
2651             gcc_checking_assert (_cpp_trigraph_map[note->type]);
2652
2653             /* Don't warn about this trigraph in
2654                _cpp_process_line_notes, since trigraphs show up as
2655                trigraphs in raw strings.  */
2656             uchar type = note->type;
2657             note->type = 0;
2658
2659             if (CPP_OPTION (pfile, trigraphs))
2660               {
2661                 accum.append (pfile, base, pos - base);
2662                 base = pos;
2663                 accum.read_begin (pfile);
2664                 accum.append (pfile, UC"??", 2);
2665                 accum.append (pfile, &type, 1);
2666
2667                 /* ??/ followed by newline gets two line notes, one for
2668                    the trigraph and one for the backslash/newline.  */
2669                 if (type == '/' && note[1].pos == pos)
2670                   {
2671                     note++;
2672                     gcc_assert (note->type == '\\' || note->type == ' ');
2673                     goto after_backslash;
2674                   }
2675                 /* Skip the replacement character.  */
2676                 base = ++pos;
2677               }
2678
2679             note++;
2680             break;
2681           }
2682
2683       /* Now get a char to process.  Either from an expanded note, or
2684          from the line buffer.  */
2685       bool read_note = accum.reading_p ();
2686       char c = read_note ? accum.read_char () : *pos++;
2687
2688       if (phase == PHASE_PREFIX)
2689         {
2690           if (c == '(')
2691             {
2692               /* Done.  */
2693               phase = PHASE_NONE;
2694               prefix[prefix_len++] = '"';
2695             }
2696           else if (prefix_len < 16
2697                    /* Prefix chars are any of the basic character set,
2698                       [lex.charset] except for '
2699                       ()\\\t\v\f\n'. Optimized for a contiguous
2700                       alphabet.  */
2701                    /* Unlike a switch, this collapses down to one or
2702                       two shift and bitmask operations on an ASCII
2703                       system, with an outlier or two.   */
2704                    && (('Z' - 'A' == 25
2705                         ? ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z'))
2706                         : ISIDST (c))
2707                        || (c >= '0' && c <= '9')
2708                        || c == '_' || c == '{' || c == '}'
2709                        || c == '[' || c == ']' || c == '#'
2710                        || c == '<' || c == '>' || c == '%'
2711                        || c == ':' || c == ';' || c == '.' || c == '?'
2712                        || c == '*' || c == '+' || c == '-' || c == '/'
2713                        || c == '^' || c == '&' || c == '|' || c == '~'
2714                        || c == '!' || c == '=' || c == ','
2715                        || c == '"' || c == '\''))
2716             prefix[prefix_len++] = c;
2717           else
2718             {
2719               /* Something is wrong.  */
2720               int col = CPP_BUF_COLUMN (pfile->buffer, pos) + read_note;
2721               if (prefix_len == 16)
2722                 cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc,
2723                                      col, "raw string delimiter longer "
2724                                      "than 16 characters");
2725               else if (c == '\n')
2726                 cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc,
2727                                      col, "invalid new-line in raw "
2728                                      "string delimiter");
2729               else
2730                 cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc,
2731                                      col, "invalid character '%c' in "
2732                                      "raw string delimiter", c);
2733               type = CPP_OTHER;
2734               phase = PHASE_NONE;
2735               /* Continue until we get a close quote, that's probably
2736                  the best failure mode.  */
2737               prefix_len = 0;
2738             }
2739           if (c != '\n')
2740             continue;
2741         }
2742
2743       if (phase != PHASE_NONE)
2744         {
2745           if (prefix[phase] != c)
2746             phase = PHASE_NONE;
2747           else if (unsigned (phase + 1) == prefix_len)
2748             break;
2749           else
2750             {
2751               phase = Phase (phase + 1);
2752               continue;
2753             }
2754         }
2755
2756       if (!prefix_len && c == '"')
2757         /* Failure mode lexing.  */
2758         goto out;
2759       else if (prefix_len && c == ')')
2760         phase = PHASE_SUFFIX;
2761       else if (!read_note && c == '\n')
2762         {
2763           pos--;
2764           pfile->buffer->cur = pos;
2765           if ((pfile->state.in_directive || pfile->state.parsing_args)
2766               && pfile->buffer->next_line >= pfile->buffer->rlimit)
2767             {
2768               cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc, 0,
2769                                    "unterminated raw string");
2770               type = CPP_OTHER;
2771               goto out;
2772             }
2773
2774           accum.append (pfile, base, pos - base + 1);
2775           _cpp_process_line_notes (pfile, false);
2776
2777           if (pfile->buffer->next_line < pfile->buffer->rlimit)
2778             CPP_INCREMENT_LINE (pfile, 0);
2779           pfile->buffer->need_line = true;
2780
2781           if (!get_fresh_line_impl<true> (pfile))
2782             {
2783               /* We ran out of file and failed to get a line.  */
2784               location_t src_loc = token->src_loc;
2785               token->type = CPP_EOF;
2786               /* Tell the compiler the line number of the EOF token.  */
2787               token->src_loc = pfile->line_table->highest_line;
2788               token->flags = BOL;
2789               if (accum.first)
2790                 _cpp_release_buff (pfile, accum.first);
2791               cpp_error_with_line (pfile, CPP_DL_ERROR, src_loc, 0,
2792                                    "unterminated raw string");
2793
2794               /* Now pop the buffer that get_fresh_line_impl() did not.  Popping
2795                  is not safe if processing a directive, however this cannot
2796                  happen as we already checked above that a line would be
2797                  available, and get_fresh_line_impl() can't fail in this
2798                  case.  */
2799               gcc_assert (!pfile->state.in_directive);
2800               _cpp_pop_buffer (pfile);
2801
2802               return;
2803             }
2804
2805           pos = base = pfile->buffer->cur;
2806           note = &pfile->buffer->notes[pfile->buffer->cur_note];
2807         }
2808       else if (__builtin_expect ((unsigned char) c >= utf8_continuation, 0)
2809                && warn_bidi_or_invalid_utf8_p)
2810         pos = _cpp_handle_multibyte_utf8 (pfile, c, pos, warn_bidi_p,
2811                                           warn_invalid_utf8_p);
2812     }
2813
2814   if (warn_bidi_p)
2815     maybe_warn_bidi_on_close (pfile, pos);
2816
2817   if (CPP_OPTION (pfile, user_literals))
2818     {
2819       const uchar *const suffix_begin = pos;
2820       pfile->buffer->cur = pos;
2821
2822       if (const auto sr = scan_cur_identifier (pfile))
2823         {
2824           if (maybe_ignore_udl_macro_suffix (pfile, token->src_loc,
2825                                              suffix_begin, sr.node))
2826               pfile->buffer->cur = suffix_begin;
2827           else
2828             {
2829               type = cpp_userdef_string_add_type (type);
2830               accum.create_literal2 (pfile, token, base, suffix_begin - base,
2831                                      NODE_NAME (sr.node), NODE_LEN (sr.node),
2832                                      type);
2833               if (accum.first)
2834                 _cpp_release_buff (pfile, accum.first);
2835               warn_about_normalization (pfile, token, &sr.nst, true);
2836               return;
2837             }
2838         }
2839     }
2840
2841  out:
2842   pfile->buffer->cur = pos;
2843   if (!accum.accum)
2844     create_literal (pfile, token, base, pos - base, type);
2845   else
2846     {
2847       accum.create_literal2 (pfile, token, base, pos - base, nullptr, 0, type);
2848       _cpp_release_buff (pfile, accum.first);
2849     }
2850 }
2851
2852 /* Lexes a string, character constant, or angle-bracketed header file
2853    name.  The stored string contains the spelling, including opening
2854    quote and any leading 'L', 'u', 'U' or 'u8' and optional
2855    'R' modifier.  The function itself returns nothing: the literal's
2856    type (CPP_OTHER if it was not properly terminated) is handed to
2857    create_literal or create_literal2 together with TOKEN, while an
        unterminated header name sets TOKEN's type to CPP_LESS and returns
        early so that it can be relexed as normal tokens.
2858

        BASE points at the first character of the literal, i.e. at the
        encoding prefix when there is one.  A literal whose prefix is
        followed by 'R' is passed on to lex_raw_string.

2859    The spelling is NUL-terminated, but it is not guaranteed that this
2860    is the first NUL since embedded NULs are preserved.  */
2861 static void
2862 lex_string (cpp_reader *pfile, cpp_token *token, const uchar *base)
2863 {
2864   bool saw_NUL = false;
2865   const uchar *cur;
2866   cppchar_t terminator;
2867   enum cpp_ttype type;
2868
       /* Step over any encoding prefix (L, U, u or u8).  Afterwards
          TERMINATOR holds the first character that follows the prefix and
          CUR points just past that character.  */
2869   cur = base;
2870   terminator = *cur++;
2871   if (terminator == 'L' || terminator == 'U')
2872     terminator = *cur++;
2873   else if (terminator == 'u')
2874     {
2875       terminator = *cur++;
2876       if (terminator == '8')
2877         terminator = *cur++;
2878     }
2879   if (terminator == 'R')
2880     {
2881       lex_raw_string (pfile, token, base);
2882       return;
2883     }
       /* The prefix at BASE selects the flavor of string or character
          literal.  Anything that is not a quote is taken to open a header
          name, which is closed by '>'.  */
2884   if (terminator == '"')
2885     type = (*base == 'L' ? CPP_WSTRING :
2886             *base == 'U' ? CPP_STRING32 :
2887             *base == 'u' ? (base[1] == '8' ? CPP_UTF8STRING : CPP_STRING16)
2888                          : CPP_STRING);
2889   else if (terminator == '\'')
2890     type = (*base == 'L' ? CPP_WCHAR :
2891             *base == 'U' ? CPP_CHAR32 :
2892             *base == 'u' ? (base[1] == '8' ? CPP_UTF8CHAR : CPP_CHAR16)
2893                          : CPP_CHAR);
2894   else
2895     terminator = '>', type = CPP_HEADER_NAME;
2896
2897   const bool warn_bidi_p = pfile->warn_bidi_p ();
2898   const bool warn_invalid_utf8_p = CPP_OPTION (pfile, cpp_warn_invalid_utf8);
2899   const bool warn_bidi_or_invalid_utf8_p = warn_bidi_p | warn_invalid_utf8_p;
       /* Scan the body one character at a time until the terminator or
          the end of the line is reached.  */
2900   for (;;)
2901     {
2902       cppchar_t c = *cur++;
2903
2904       /* In #include-style directives, terminators are not escapable.  */
2905       if (c == '\\' && !pfile->state.angled_headers && *cur != '\n')
2906         {
               /* With bidi warnings enabled, inspect the \u, \U and \N
                  escapes before skipping them.  */
2907           if ((cur[0] == 'u' || cur[0] == 'U' || cur[0] == 'N') && warn_bidi_p)
2908             {
2909               location_t loc;
2910               bidi::kind kind;
2911               if (cur[0] == 'N')
2912                 kind = get_bidi_named (pfile, cur + 1, &loc);
2913               else
2914                 kind = get_bidi_ucn (pfile, cur + 1, cur[0] == 'U', &loc);
2915               maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/true, loc);
2916             }
               /* Step over the escaped character so that it cannot be
                  mistaken for the terminator.  */
2917           cur++;
2918         }
2919       else if (c == terminator)
2920         {
2921           if (warn_bidi_p)
2922             maybe_warn_bidi_on_close (pfile, cur - 1);
2923           break;
2924         }
2925       else if (c == '\n')
2926         {
               /* Leave the newline itself unconsumed.  */
2927           cur--;
2928           /* Unmatched quotes always yield undefined behavior, but
2929              greedy lexing means that what appears to be an unterminated
2930              header name may actually be a legitimate sequence of tokens.  */
2931           if (terminator == '>')
2932             {
2933               token->type = CPP_LESS;
2934               return;
2935             }
2936           type = CPP_OTHER;
2937           break;
2938         }
2939       else if (c == '\0')
2940         saw_NUL = true;
2941       else if (__builtin_expect (c >= utf8_continuation, 0)
2942                && warn_bidi_or_invalid_utf8_p)
2943         cur = _cpp_handle_multibyte_utf8 (pfile, c, cur, warn_bidi_p,
2944                                           warn_invalid_utf8_p);
2945     }
2946
2947   if (saw_NUL && !pfile->state.skipping)
2948     cpp_error (pfile, CPP_DL_WARNING,
2949                "null character(s) preserved in literal");
2950
       /* An unterminated literal is diagnosed unless the language is
          CLK_ASM.  */
2951   if (type == CPP_OTHER && CPP_OPTION (pfile, lang) != CLK_ASM)
2952     cpp_error (pfile, CPP_DL_PEDWARN, "missing terminating %c character",
2953                (int) terminator);
2954
       /* Record where the literal ends.  NOTE(review): scan_cur_identifier
          appears to read from, and advance, pfile->buffer->cur, judging
          by the resets to SUFFIX_BEGIN below -- confirm against its
          definition.  */
2955   pfile->buffer->cur = cur;
2956   const uchar *const suffix_begin = cur;
2957
2958   if (CPP_OPTION (pfile, user_literals))
2959     {
2960       if (const auto sr = scan_cur_identifier (pfile))
2961         {
               /* If the identifier is to be ignored as a suffix, put the
                  buffer position back and fall through to create a plain
                  literal.  */
2962           if (maybe_ignore_udl_macro_suffix (pfile, token->src_loc,
2963                                              suffix_begin, sr.node))
2964             pfile->buffer->cur = suffix_begin;
2965           else
2966             {
2967               /* Grab user defined literal suffix.  */
                   /* Both conversions are applied in turn; presumably each
                      leaves a type of the other category unchanged --
                      TODO confirm.  */
2968               type = cpp_userdef_char_add_type (type);
2969               type = cpp_userdef_string_add_type (type);
2970               create_literal2 (pfile, token, base, suffix_begin - base,
2971                                NODE_NAME (sr.node), NODE_LEN (sr.node), type);
2972               warn_about_normalization (pfile, token, &sr.nst, true);
2973               return;
2974             }
2975         }
2976     }
2977   else if (CPP_OPTION (pfile, cpp_warn_cxx11_compat)
2978            && !pfile->state.skipping)
2979     {
2980       const auto sr = scan_cur_identifier (pfile);
2981       /* Maybe raise a warning, but do not consume the tokens.  */
2982       pfile->buffer->cur = suffix_begin;
2983       if (sr && cpp_macro_p (sr.node))
2984         cpp_warning_with_line (pfile, CPP_W_CXX11_COMPAT,
2985                                token->src_loc, 0, "C++11 requires a space "
2986                                "between string literal and macro");
2987     }
2988
       /* No suffix was attached: the literal spans BASE up to CUR.  */
2989   create_literal (pfile, token, base, cur - base, type);
2990 }
2991
2992 /* Return the comment table. The client may not make any assumption
2993    about the ordering of the table.  */
2994 cpp_comment_table *
2995 cpp_get_comments (cpp_reader *pfile)
2996 {
2997   return &pfile->comments;
2998 }
2999
3000 /* Append a comment to the end of the comment table. */
3001 static void
3002 store_comment (cpp_reader *pfile, cpp_token *token)
3003 {
3004   int len;
3005
3006   if (pfile->comments.allocated == 0)
3007     {
3008       pfile->comments.allocated = 256;
3009       pfile->comments.entries = (cpp_comment *) xmalloc
3010         (pfile->comments.allocated * sizeof (cpp_comment));
3011     }
3012
3013   if (pfile->comments.count == pfile->comments.allocated)
3014     {
3015       pfile->comments.allocated *= 2;
3016       pfile->comments.entries = (cpp_comment *) xrealloc
3017         (pfile->comments.entries,
3018          pfile->comments.allocated * sizeof (cpp_comment));
3019     }
3020
3021   len = token->val.str.len;
3022
3023   /* Copy comment. Note, token may not be NULL terminated. */
3024   pfile->comments.entries[pfile->comments.count].comment =
3025     (char *) xmalloc (sizeof (char) * (len + 1));
3026   memcpy (pfile->comments.entries[pfile->comments.count].comment,
3027           token->val.str.text, len);
3028   pfile->comments.entries[pfile->comments.count].comment[len] = '\0';
3029
3030   /* Set source location. */
3031   pfile->comments.entries[pfile->comments.count].sloc = token->src_loc;
3032
3033   /* Increment the count of entries in the comment table. */
3034   pfile->comments.count++;
3035 }
3036
3037 /* The stored comment includes the comment start and any terminator.  */
3038 static void
3039 save_comment (cpp_reader *pfile, cpp_token *token, const unsigned char *from,
3040               cppchar_t type)
3041 {
3042   unsigned char *buffer;
3043   unsigned int len, clen, i;
3044
3045   len = pfile->buffer->cur - from + 1; /* + 1 for the initial '/'.  */
3046
3047   /* C++ comments probably (not definitely) have moved past a new
3048      line, which we don't want to save in the comment.  */
3049   if (is_vspace (pfile->buffer->cur[-1]))
3050     len--;
3051
3052   /* If we are currently in a directive or in argument parsing, then
3053      we need to store all C++ comments as C comments internally, and
3054      so we need to allocate a little extra space in that case.
3055
3056      Note that the only time we encounter a directive here is
3057      when we are saving comments in a "#define".  */
3058   clen = ((pfile->state.in_directive || pfile->state.parsing_args)
3059           && type == '/') ? len + 2 : len;
3060
3061   buffer = _cpp_unaligned_alloc (pfile, clen);
3062
3063   token->type = CPP_COMMENT;
3064   token->val.str.len = clen;
3065   token->val.str.text = buffer;
3066
3067   buffer[0] = '/';
3068   memcpy (buffer + 1, from, len - 1);
3069
3070   /* Finish conversion to a C comment, if necessary.  */
3071   if ((pfile->state.in_directive || pfile->state.parsing_args) && type == '/')
3072     {
3073       buffer[1] = '*';
3074       buffer[clen - 2] = '*';
3075       buffer[clen - 1] = '/';
3076       /* As there can be in a C++ comments illegal sequences for C comments
3077          we need to filter them out.  */
3078       for (i = 2; i < (clen - 2); i++)
3079         if (buffer[i] == '/' && (buffer[i - 1] == '*' || buffer[i + 1] == '*'))
3080           buffer[i] = '|';
3081     }
3082
3083   /* Finally store this comment for use by clients of libcpp. */
3084   store_comment (pfile, token);
3085 }
3086
3087 /* Returns true if comment at COMMENT_START is a recognized FALLTHROUGH
3088    comment.

        COMMENT_START points at the character that follows the comment's
        initial '/': a '*' there selects the C block comment rules, any
        other character the C++ line comment rules.  The text examined is
        bounded by pfile->buffer->cur.

        What is recognized depends on PFILE's cpp_warn_implicit_fallthrough
        option:
          1     every comment is recognized;
          2     the comment must contain, in any letter case, "fall" or
                "falls", then any run of blanks, tabs and dashes, then
                "thru" or "through";
          3, 4  the whole comment must be "-fallthrough", "@fallthrough@"
                or "lint -fallthrough" (the latter optionally followed by
                blanks and tabs); failing that, level 4 accepts only
                FALLTHRU or FALLTHROUGH surrounded by blanks and tabs,
                while level 3 accepts the three longer patterns quoted
                near the end of this function;
          any other value recognizes nothing.  */
3089
3090 static bool
3091 fallthrough_comment_p (cpp_reader *pfile, const unsigned char *comment_start)
3092 {
       /* FROM is the scan position; it starts at the first character of
          the comment's contents.  */
3093   const unsigned char *from = comment_start + 1;
3094
3095   switch (CPP_OPTION (pfile, cpp_warn_implicit_fallthrough))
3096     {
3097       /* For both -Wimplicit-fallthrough=0 and -Wimplicit-fallthrough=5 we
3098          don't recognize any comments.  The latter only checks attributes,
3099          the former doesn't warn.  */
3100     case 0:
3101     default:
3102       return false;
3103       /* -Wimplicit-fallthrough=1 considers any comment, no matter what
3104          content it has.  */
3105     case 1:
3106       return true;
3107     case 2:
3108       /* -Wimplicit-fallthrough=2 looks for (case insensitive)
3109          .*falls?[ \t-]*thr(u|ough).* regex.  */
           /* Each position that still leaves room for "fallthru" before
              pfile->buffer->cur is tried as the start of a match.  FROM
              is not rewound after a partial match fails; the search
              resumes after the characters already examined.  */
3110       for (; (size_t) (pfile->buffer->cur - from) >= sizeof "fallthru" - 1;
3111            from++)
3112         {
3113           /* Is there anything like strpbrk with upper boundary, or
3114              memchr looking for 2 characters rather than just one?  */
3115           if (from[0] != 'f' && from[0] != 'F')
3116             continue;
3117           if (from[1] != 'a' && from[1] != 'A')
3118             continue;
3119           if (from[2] != 'l' && from[2] != 'L')
3120             continue;
3121           if (from[3] != 'l' && from[3] != 'L')
3122             continue;
3123           from += sizeof "fall" - 1;
               /* Optional 's', then any mix of blanks, tabs and dashes.  */
3124           if (from[0] == 's' || from[0] == 'S')
3125             from++;
3126           while (*from == ' ' || *from == '\t' || *from == '-')
3127             from++;
3128           if (from[0] != 't' && from[0] != 'T')
3129             continue;
3130           if (from[1] != 'h' && from[1] != 'H')
3131             continue;
3132           if (from[2] != 'r' && from[2] != 'R')
3133             continue;
               /* "thru" completes a match; otherwise "through" is
                  required.  */
3134           if (from[3] == 'u' || from[3] == 'U')
3135             return true;
3136           if (from[3] != 'o' && from[3] != 'O')
3137             continue;
3138           if (from[4] != 'u' && from[4] != 'U')
3139             continue;
3140           if (from[5] != 'g' && from[5] != 'G')
3141             continue;
3142           if (from[6] != 'h' && from[6] != 'H')
3143             continue;
3144           return true;
3145         }
3146       return false;
           /* Levels 3 and 4 share the whole-comment matching below.  */
3147     case 3:
3148     case 4:
3149       break;
3150     }
3151
3152   /* Whole comment contents:
3153      -fallthrough
3154      @fallthrough@
3155    */
3156   if (*from == '-' || *from == '@')
3157     {
3158       size_t len = sizeof "fallthrough" - 1;
3159       if ((size_t) (pfile->buffer->cur - from - 1) < len)
3160         return false;
3161       if (memcmp (from + 1, "fallthrough", len))
3162         return false;
           /* The '@' form must be closed by a second '@', which is then
              counted as part of the matched text.  */
3163       if (*from == '@')
3164         {
3165           if (from[len + 1] != '@')
3166             return false;
3167           len++;
3168         }
3169       from += 1 + len;
3170     }
3171   /* Whole comment contents (regex):
3172      lint -fallthrough[ \t]*
3173    */
3174   else if (*from == 'l')
3175     {
3176       size_t len = sizeof "int -fallthrough" - 1;
3177       if ((size_t) (pfile->buffer->cur - from - 1) < len)
3178         return false;
3179       if (memcmp (from + 1, "int -fallthrough", len))
3180         return false;
3181       from += 1 + len;
3182       while (*from == ' ' || *from == '\t')
3183         from++;
3184     }
3185   /* Whole comment contents (regex):
3186      [ \t]*FALLTHR(U|OUGH)[ \t]*
3187    */
3188   else if (CPP_OPTION (pfile, cpp_warn_implicit_fallthrough) == 4)
3189     {
3190       while (*from == ' ' || *from == '\t')
3191         from++;
3192       if ((size_t) (pfile->buffer->cur - from)  < sizeof "FALLTHRU" - 1)
3193         return false;
3194       if (memcmp (from, "FALLTHR", sizeof "FALLTHR" - 1))
3195         return false;
3196       from += sizeof "FALLTHR" - 1;
3197       if (*from == 'U')
3198         from++;
3199       else if ((size_t) (pfile->buffer->cur - from)  < sizeof "OUGH" - 1)
3200         return false;
3201       else if (memcmp (from, "OUGH", sizeof "OUGH" - 1))
3202         return false;
3203       else
3204         from += sizeof "OUGH" - 1;
3205       while (*from == ' ' || *from == '\t')
3206         from++;
3207     }
3208   /* Whole comment contents (regex):
3209      [ \t.!]*(ELSE,? |INTENTIONAL(LY)? )?FALL(S | |-)?THR(OUGH|U)[ \t.!]*(-[^\n\r]*)?
3210      [ \t.!]*(Else,? |Intentional(ly)? )?Fall((s | |-)[Tt]|t)hr(ough|u)[ \t.!]*(-[^\n\r]*)?
3211      [ \t.!]*([Ee]lse,? |[Ii]ntentional(ly)? )?fall(s | |-)?thr(ough|u)[ \t.!]*(-[^\n\r]*)?
3212    */
3213   else
3214     {
3215       while (*from == ' ' || *from == '\t' || *from == '.' || *from == '!')
3216         from++;
           /* F holds the first letter of the word currently being matched.
              ALL_UPPER becomes true once an entirely upper-case word has
              been seen.  Together they enforce the capitalization rules
              of the three patterns above.  */
3217       unsigned char f = *from;
3218       bool all_upper = false;
           /* Optional leading "else" with an optional comma.  */
3219       if (f == 'E' || f == 'e')
3220         {
3221           if ((size_t) (pfile->buffer->cur - from)
3222               < sizeof "else fallthru" - 1)
3223             return false;
3224           if (f == 'E' && memcmp (from + 1, "LSE", sizeof "LSE" - 1) == 0)
3225             all_upper = true;
3226           else if (memcmp (from + 1, "lse", sizeof "lse" - 1))
3227             return false;
3228           from += sizeof "else" - 1;
3229           if (*from == ',')
3230             from++;
3231           if (*from != ' ')
3232             return false;
3233           from++;
3234           if (all_upper && *from == 'f')
3235             return false;
3236           if (f == 'e' && *from == 'F')
3237             return false;
3238           f = *from;
3239         }
           /* Optional leading "intentional" or "intentionally".  */
3240       else if (f == 'I' || f == 'i')
3241         {
3242           if ((size_t) (pfile->buffer->cur - from)
3243               < sizeof "intentional fallthru" - 1)
3244             return false;
3245           if (f == 'I' && memcmp (from + 1, "NTENTIONAL",
3246                                   sizeof "NTENTIONAL" - 1) == 0)
3247             all_upper = true;
3248           else if (memcmp (from + 1, "ntentional",
3249                            sizeof "ntentional" - 1))
3250             return false;
3251           from += sizeof "intentional" - 1;
3252           if (*from == ' ')
3253             {
3254               from++;
3255               if (all_upper && *from == 'f')
3256                 return false;
3257             }
3258           else if (all_upper)
3259             {
                   /* The comparison includes the following 'F', but only
                      "LY " is consumed.  */
3260               if (memcmp (from, "LY F", sizeof "LY F" - 1))
3261                 return false;
3262               from += sizeof "LY " - 1;
3263             }
3264           else
3265             {
3266               if (memcmp (from, "ly ", sizeof "ly " - 1))
3267                 return false;
3268               from += sizeof "ly " - 1;
3269             }
3270           if (f == 'i' && *from == 'F')
3271             return false;
3272           f = *from;
3273         }
           /* The mandatory "fall", in a case consistent with what came
              before.  */
3274       if (f != 'F' && f != 'f')
3275         return false;
3276       if ((size_t) (pfile->buffer->cur - from) < sizeof "fallthru" - 1)
3277         return false;
3278       if (f == 'F' && memcmp (from + 1, "ALL", sizeof "ALL" - 1) == 0)
3279         all_upper = true;
3280       else if (all_upper)
3281         return false;
3282       else if (memcmp (from + 1, "all", sizeof "all" - 1))
3283         return false;
3284       from += sizeof "fall" - 1;
           /* Optional separator: "s ", a blank or a dash.  */
3285       if (*from == (all_upper ? 'S' : 's') && from[1] == ' ')
3286         from += 2;
3287       else if (*from == ' ' || *from == '-')
3288         from++;
3289       else if (*from != (all_upper ? 'T' : 't'))
3290         return false;
           /* Next must come 't', or 'T' when "fall" started with a capital
              'F'; a lower-case 't' is rejected in the all-upper-case
              form.  */
3291       if ((f == 'f' || *from != 'T') && (all_upper || *from != 't'))
3292         return false;
3293       if ((size_t) (pfile->buffer->cur - from) < sizeof "thru" - 1)
3294         return false;
3295       if (memcmp (from + 1, all_upper ? "HRU" : "hru", sizeof "hru" - 1))
3296         {
3297           if ((size_t) (pfile->buffer->cur - from) < sizeof "through" - 1)
3298             return false;
3299           if (memcmp (from + 1, all_upper ? "HROUGH" : "hrough",
3300                       sizeof "hrough" - 1))
3301             return false;
3302           from += sizeof "through" - 1;
3303         }
3304       else
3305         from += sizeof "thru" - 1;
3306       while (*from == ' ' || *from == '\t' || *from == '.' || *from == '!')
3307         from++;
           /* A dash introduces free text that runs to the end of the
              comment: up to the closing star-slash of a block comment, or
              up to the end of the line otherwise.  */
3308       if (*from == '-')
3309         {
3310           from++;
3311           if (*comment_start == '*')
3312             {
3313               do
3314                 {
3315                   while (*from && *from != '*'
3316                          && *from != '\n' && *from != '\r')
3317                     from++;
3318                   if (*from != '*' || from[1] == '/')
3319                     break;
3320                   from++;
3321                 }
3322               while (1);
3323             }
3324           else
3325             while (*from && *from != '\n' && *from != '\r')
3326               from++;
3327         }
3328     }
       /* Whichever form matched, FROM must now be exactly at the end of
          the comment for the match to count.  */
3329   /* C block comment.  */
3330   if (*comment_start == '*')
3331     {
3332       if (*from != '*' || from[1] != '/')
3333         return false;
3334     }
3335   /* C++ line comment.  */
3336   else if (*from != '\n')
3337     return false;
3338
3339   return true;
3340 }
3341
3342 /* Allocate COUNT tokens for RUN.  */
3343 void
3344 _cpp_init_tokenrun (tokenrun *run, unsigned int count)
3345 {
3346   run->base = XNEWVEC (cpp_token, count);
3347   run->limit = run->base + count;
3348   run->next = NULL;
3349 }
3350
3351 /* Returns the next tokenrun, or creates one if there is none.  */
3352 static tokenrun *
3353 next_tokenrun (tokenrun *run)
3354 {
3355   if (run->next == NULL)
3356     {
3357       run->next = XNEW (tokenrun);
3358       run->next->prev = run;
3359       _cpp_init_tokenrun (run->next, 250);
3360     }
3361
3362   return run->next;
3363 }
3364
3365 /* Return the number of not yet processed token in a given
3366    context.  */
3367 int
3368 _cpp_remaining_tokens_num_in_context (cpp_context *context)
3369 {
3370   if (context->tokens_kind == TOKENS_KIND_DIRECT)
3371     return (LAST (context).token - FIRST (context).token);
3372   else if (context->tokens_kind == TOKENS_KIND_INDIRECT
3373            || context->tokens_kind == TOKENS_KIND_EXTENDED)
3374     return (LAST (context).ptoken - FIRST (context).ptoken);
3375   else
3376       abort ();
3377 }
3378
3379 /* Returns the token present at index INDEX in a given context.  If
3380    INDEX is zero, the next token to be processed is returned.  */
3381 static const cpp_token*
3382 _cpp_token_from_context_at (cpp_context *context, int index)
3383 {
3384   if (context->tokens_kind == TOKENS_KIND_DIRECT)
3385     return &(FIRST (context).token[index]);
3386   else if (context->tokens_kind == TOKENS_KIND_INDIRECT
3387            || context->tokens_kind == TOKENS_KIND_EXTENDED)
3388     return FIRST (context).ptoken[index];
3389  else
3390    abort ();
3391 }
3392
3393 /* Look ahead in the input stream.  */
3394 const cpp_token *
3395 cpp_peek_token (cpp_reader *pfile, int index)
3396 {
3397   cpp_context *context = pfile->context;
3398   const cpp_token *peektok;
3399   int count;
3400
3401   /* First, scan through any pending cpp_context objects.  */
3402   while (context->prev)
3403     {
3404       ptrdiff_t sz = _cpp_remaining_tokens_num_in_context (context);
3405
3406       if (index < (int) sz)
3407         return _cpp_token_from_context_at (context, index);
3408       index -= (int) sz;
3409       context = context->prev;
3410     }
3411
3412   /* We will have to read some new tokens after all (and do so
3413      without invalidating preceding tokens).  */
3414   count = index;
3415   pfile->keep_tokens++;
3416
3417   /* For peeked tokens temporarily disable line_change reporting,
3418      until the tokens are parsed for real.  */
3419   void (*line_change) (cpp_reader *, const cpp_token *, int)
3420     = pfile->cb.line_change;
3421   pfile->cb.line_change = NULL;
3422
3423   do
3424     {
3425       peektok = _cpp_lex_token (pfile);
3426       if (peektok->type == CPP_EOF)
3427         {
3428           index--;
3429           break;
3430         }
3431       else if (peektok->type == CPP_PRAGMA)
3432         {
3433           /* Don't peek past a pragma.  */
3434           if (peektok == &pfile->directive_result)
3435             /* Save the pragma in the buffer.  */
3436             *pfile->cur_token++ = *peektok;
3437           index--;
3438           break;
3439         }
3440     }
3441   while (index--);
3442
3443   _cpp_backup_tokens_direct (pfile, count - index);
3444   pfile->keep_tokens--;
3445   pfile->cb.line_change = line_change;
3446
3447   return peektok;
3448 }
3449
3450 /* Allocate a single token that is invalidated at the same time as the
3451    rest of the tokens on the line.  Has its line and col set to the
3452    same as the last lexed token, so that diagnostics appear in the
3453    right place.  */
3454 cpp_token *
3455 _cpp_temp_token (cpp_reader *pfile)
3456 {
3457   cpp_token *old, *result;
3458   ptrdiff_t sz = pfile->cur_run->limit - pfile->cur_token;
3459   ptrdiff_t la = (ptrdiff_t) pfile->lookaheads;
3460
3461   old = pfile->cur_token - 1;
3462   /* Any pre-existing lookaheads must not be clobbered.  */
3463   if (la)
3464     {
3465       if (sz <= la)
3466         {
3467           tokenrun *next = next_tokenrun (pfile->cur_run);
3468
3469           if (sz < la)
3470             memmove (next->base + 1, next->base,
3471                      (la - sz) * sizeof (cpp_token));
3472
3473           next->base[0] = pfile->cur_run->limit[-1];
3474         }
3475
3476       if (sz > 1)
3477         memmove (pfile->cur_token + 1, pfile->cur_token,
3478                  MIN (la, sz - 1) * sizeof (cpp_token));
3479     }
3480
3481   if (!sz && pfile->cur_token == pfile->cur_run->limit)
3482     {
3483       pfile->cur_run = next_tokenrun (pfile->cur_run);
3484       pfile->cur_token = pfile->cur_run->base;
3485     }
3486
3487   result = pfile->cur_token++;
3488   result->src_loc = old->src_loc;
3489   return result;
3490 }
3491
3492 /* We're at the beginning of a logical line (so not in
3493   directives-mode) and RESULT is a CPP_NAME with NODE_MODULE set.  See
3494   if we should enter deferred_pragma mode to tokenize the rest of the
3495   line as a module control-line.  */
3496
3497 static void
3498 cpp_maybe_module_directive (cpp_reader *pfile, cpp_token *result)
3499 {
3500   unsigned backup = 0; /* Tokens we peeked.  */
3501   cpp_hashnode *node = result->val.node.node;
3502   cpp_token *peek = result;
3503   cpp_token *keyword = peek;
3504   cpp_hashnode *(&n_modules)[spec_nodes::M_HWM][2] = pfile->spec_nodes.n_modules;
3505   int header_count = 0;
3506
3507   /* Make sure the incoming state is as we expect it.  This way we
3508      can restore it using constants.  */
3509   gcc_checking_assert (!pfile->state.in_deferred_pragma
3510                        && !pfile->state.skipping
3511                        && !pfile->state.parsing_args
3512                        && !pfile->state.angled_headers
3513                        && (pfile->state.save_comments
3514                            == !CPP_OPTION (pfile, discard_comments)));
3515
3516   /* Enter directives mode sufficiently for peeking.  We don't have
3517      to actually set in_directive.  */
3518   pfile->state.in_deferred_pragma = true;
3519
3520   /* These two fields are needed to process tokenization in deferred
3521      pragma mode.  They are not used outside deferred pragma mode or
3522      directives mode.  */
3523   pfile->state.pragma_allow_expansion = true;
3524   pfile->directive_line = result->src_loc;
3525
3526   /* Saving comments is incompatible with directives mode.   */
3527   pfile->state.save_comments = 0;
3528
3529   if (node == n_modules[spec_nodes::M_EXPORT][0])
3530     {
3531       peek = _cpp_lex_direct (pfile);
3532       keyword = peek;
3533       backup++;
3534       if (keyword->type != CPP_NAME)
3535         goto not_module;
3536       node = keyword->val.node.node;
3537       if (!(node->flags & NODE_MODULE))
3538         goto not_module;
3539     }
3540
3541   if (node == n_modules[spec_nodes::M__IMPORT][0])
3542     /* __import  */
3543     header_count = backup + 2 + 16;
3544   else if (node == n_modules[spec_nodes::M_IMPORT][0])
3545     /* import  */
3546     header_count = backup + 2 + (CPP_OPTION (pfile, preprocessed) ? 16 : 0);
3547   else if (node == n_modules[spec_nodes::M_MODULE][0])
3548     ; /* module  */
3549   else
3550     goto not_module;
3551
3552   /* We've seen [export] {module|import|__import}.  Check the next token.  */
3553   if (header_count)
3554     /* After '{,__}import' a header name may appear.  */
3555     pfile->state.angled_headers = true;
3556   peek = _cpp_lex_direct (pfile);
3557   backup++;
3558
3559   /* ... import followed by identifier, ':', '<' or
3560      header-name preprocessing tokens, or module
3561      followed by cpp-identifier, ':' or ';' preprocessing
3562      tokens.  C++ keywords are not yet relevant.  */
3563   if (peek->type == CPP_NAME
3564       || peek->type == CPP_COLON
3565       ||  (header_count
3566            ? (peek->type == CPP_LESS
3567               || (peek->type == CPP_STRING && peek->val.str.text[0] != 'R')
3568               || peek->type == CPP_HEADER_NAME)
3569            : peek->type == CPP_SEMICOLON))
3570     {
3571       pfile->state.pragma_allow_expansion = !CPP_OPTION (pfile, preprocessed);
3572       if (!pfile->state.pragma_allow_expansion)
3573         pfile->state.prevent_expansion++;
3574
3575       if (!header_count && linemap_included_from
3576           (LINEMAPS_LAST_ORDINARY_MAP (pfile->line_table)))
3577         cpp_error_with_line (pfile, CPP_DL_ERROR, keyword->src_loc, 0,
3578                              "module control-line cannot be in included file");
3579
3580       /* The first one or two tokens cannot be macro names.  */
3581       for (int ix = backup; ix--;)
3582         {
3583           cpp_token *tok = ix ? keyword : result;
3584           cpp_hashnode *node = tok->val.node.node;
3585
3586           /* Don't attempt to expand the token.  */
3587           tok->flags |= NO_EXPAND;
3588           if (_cpp_defined_macro_p (node)
3589               && _cpp_maybe_notify_macro_use (pfile, node, tok->src_loc)
3590               && !cpp_fun_like_macro_p (node))
3591             cpp_error_with_line (pfile, CPP_DL_ERROR, tok->src_loc, 0,
3592                                  "module control-line \"%s\" cannot be"
3593                                  " an object-like macro",
3594                                  NODE_NAME (node));
3595         }
3596
3597       /* Map to underbar variants.  */
3598       keyword->val.node.node = n_modules[header_count
3599                                          ? spec_nodes::M_IMPORT
3600                                          : spec_nodes::M_MODULE][1];
3601       if (backup != 1)
3602         result->val.node.node = n_modules[spec_nodes::M_EXPORT][1];
3603
3604       /* Maybe tell the tokenizer we expect a header-name down the
3605          road.  */
3606       pfile->state.directive_file_token = header_count;
3607     }
3608   else
3609     {
3610     not_module:
3611       /* Drop out of directive mode.  */
3612       /* We aaserted save_comments had this value upon entry.  */
3613       pfile->state.save_comments
3614         = !CPP_OPTION (pfile, discard_comments);
3615       pfile->state.in_deferred_pragma = false;
3616       /* Do not let this remain on.  */
3617       pfile->state.angled_headers = false;
3618     }
3619
3620   /* In either case we want to backup the peeked tokens.  */
3621   if (backup)
3622     {
3623       /* If we saw EOL, we should drop it, because this isn't a module
3624          control-line after all.  */
3625       bool eol = peek->type == CPP_PRAGMA_EOL;
3626       if (!eol || backup > 1)
3627         {
3628           /* Put put the peeked tokens back  */
3629           _cpp_backup_tokens_direct (pfile, backup);
3630           /* But if the last one was an EOL, forget it.  */
3631           if (eol)
3632             pfile->lookaheads--;
3633         }
3634     }
3635 }
3636
3637 /* Lex a token into RESULT (external interface).  Takes care of issues
3638    like directive handling, token lookahead, multiple include
3639    optimization and skipping.  */
3640 const cpp_token *
3641 _cpp_lex_token (cpp_reader *pfile)
3642 {
3643   cpp_token *result;
3644
3645   for (;;)
3646     {
3647       if (pfile->cur_token == pfile->cur_run->limit)
3648         {
3649           pfile->cur_run = next_tokenrun (pfile->cur_run);
3650           pfile->cur_token = pfile->cur_run->base;
3651         }
3652       /* We assume that the current token is somewhere in the current
3653          run.  */
3654       if (pfile->cur_token < pfile->cur_run->base
3655           || pfile->cur_token >= pfile->cur_run->limit)
3656         abort ();
3657
3658       if (pfile->lookaheads)
3659         {
3660           pfile->lookaheads--;
3661           result = pfile->cur_token++;
3662         }
3663       else
3664         result = _cpp_lex_direct (pfile);
3665
3666       if (result->flags & BOL)
3667         {
3668           /* Is this a directive.  If _cpp_handle_directive returns
3669              false, it is an assembler #.  */
3670           if (result->type == CPP_HASH
3671               /* 6.10.3 p 11: Directives in a list of macro arguments
3672                  gives undefined behavior.  This implementation
3673                  handles the directive as normal.  */
3674               && pfile->state.parsing_args != 1)
3675             {
3676               if (_cpp_handle_directive (pfile, result->flags & PREV_WHITE))
3677                 {
3678                   if (pfile->directive_result.type == CPP_PADDING)
3679                     continue;
3680                   result = &pfile->directive_result;
3681                 }
3682             }
3683           else if (pfile->state.in_deferred_pragma)
3684             result = &pfile->directive_result;
3685           else if (result->type == CPP_NAME
3686                    && (result->val.node.node->flags & NODE_MODULE)
3687                    && !pfile->state.skipping
3688                    /* Unlike regular directives, we do not deal with
3689                       tokenizing module directives as macro arguments.
3690                       That's not permitted.  */
3691                    && !pfile->state.parsing_args)
3692             {
3693               /* P1857.  Before macro expansion, At start of logical
3694                  line ... */
3695               /* We don't have to consider lookaheads at this point.  */
3696               gcc_checking_assert (!pfile->lookaheads);
3697
3698               cpp_maybe_module_directive (pfile, result);
3699             }
3700
3701           if (pfile->cb.line_change && !pfile->state.skipping)
3702             pfile->cb.line_change (pfile, result, pfile->state.parsing_args);
3703         }
3704
3705       /* We don't skip tokens in directives.  */
3706       if (pfile->state.in_directive || pfile->state.in_deferred_pragma)
3707         break;
3708
3709       /* Outside a directive, invalidate controlling macros.  At file
3710          EOF, _cpp_lex_direct takes care of popping the buffer, so we never
3711          get here and MI optimization works.  */
3712       pfile->mi_valid = false;
3713
3714       if (!pfile->state.skipping || result->type == CPP_EOF)
3715         break;
3716     }
3717
3718   return result;
3719 }
3720
3721 /* Returns true if a fresh line has been loaded.  */
3722 template <bool lexing_raw_string>
3723 static bool
3724 get_fresh_line_impl (cpp_reader *pfile)
3725 {
3726   /* We can't get a new line until we leave the current directive, unless we
3727      are lexing a raw string, in which case it will be OK as long as we don't
3728      pop the current buffer.  */
3729   if (!lexing_raw_string && pfile->state.in_directive)
3730     return false;
3731
3732   for (;;)
3733     {
3734       cpp_buffer *buffer = pfile->buffer;
3735
3736       if (!buffer->need_line)
3737         return true;
3738
3739       if (buffer->next_line < buffer->rlimit)
3740         {
3741           _cpp_clean_line (pfile);
3742           return true;
3743         }
3744
3745       /* We can't change buffers until we leave the current directive.  */
3746       if (lexing_raw_string && pfile->state.in_directive)
3747         return false;
3748
3749       /* First, get out of parsing arguments state.  */
3750       if (pfile->state.parsing_args)
3751         return false;
3752
3753       /* End of buffer.  Non-empty files should end in a newline.  */
3754       if (buffer->buf != buffer->rlimit
3755           && buffer->next_line > buffer->rlimit
3756           && !buffer->from_stage3)
3757         {
3758           /* Clip to buffer size.  */
3759           buffer->next_line = buffer->rlimit;
3760         }
3761
3762       if (buffer->prev && !buffer->return_at_eof)
3763         _cpp_pop_buffer (pfile);
3764       else
3765         {
3766           /* End of translation.  Do not pop the buffer yet. Increment
3767              line number so that the EOF token is on a line of its own
3768              (_cpp_lex_direct doesn't increment in that case, because
3769              it's hard for it to distinguish this special case). */
3770           CPP_INCREMENT_LINE (pfile, 0);
3771           return false;
3772         }
3773     }
3774 }
3775
3776 bool
3777 _cpp_get_fresh_line (cpp_reader *pfile)
3778 {
3779   return get_fresh_line_impl<false> (pfile);
3780 }
3781
3782
/* If the next character in BUFFER is CHAR, consume it and give RESULT
   the type THEN_TYPE; otherwise leave the character alone and give
   RESULT the type ELSE_TYPE.  Relies on `buffer' and `result' being in
   scope at the point of use.  */
#define IF_NEXT_IS(CHAR, THEN_TYPE, ELSE_TYPE)          \
  do                                                    \
    {                                                   \
      if (*buffer->cur == (CHAR))                       \
        {                                               \
          buffer->cur++;                                \
          result->type = (THEN_TYPE);                   \
        }                                               \
      else                                              \
        result->type = (ELSE_TYPE);                     \
    }                                                   \
  while (0)
3791
3792 /* Lex a token into pfile->cur_token, which is also incremented, to
3793    get diagnostics pointing to the correct location.
3794
3795    Does not handle issues such as token lookahead, multiple-include
3796    optimization, directives, skipping etc.  This function is only
3797    suitable for use by _cpp_lex_token, and in special cases like
3798    lex_expansion_token which doesn't care for any of these issues.
3799
3800    When meeting a newline, returns CPP_EOF if parsing a directive,
3801    otherwise returns to the start of the token buffer if permissible.
3802    Returns the location of the lexed token.  */
3803 cpp_token *
3804 _cpp_lex_direct (cpp_reader *pfile)
3805 {
3806   cppchar_t c;
3807   cpp_buffer *buffer;
3808   const unsigned char *comment_start;
3809   bool fallthrough_comment = false;
3810   cpp_token *result = pfile->cur_token++;
3811
3812  fresh_line:
3813   result->flags = 0;
3814   buffer = pfile->buffer;
3815   if (buffer->need_line)
3816     {
3817       if (pfile->state.in_deferred_pragma)
3818         {
3819           /* This can happen in cases like:
3820              #define loop(x) whatever
3821              #pragma omp loop
3822              where when trying to expand loop we need to peek
3823              next token after loop, but aren't still in_deferred_pragma
3824              mode but are in in_directive mode, so buffer->need_line
3825              is set, a CPP_EOF is peeked.  */
3826           result->type = CPP_PRAGMA_EOL;
3827           pfile->state.in_deferred_pragma = false;
3828           if (!pfile->state.pragma_allow_expansion)
3829             pfile->state.prevent_expansion--;
3830           return result;
3831         }
3832       if (!_cpp_get_fresh_line (pfile))
3833         {
3834           result->type = CPP_EOF;
3835           /* Not a real EOF in a directive or arg parsing -- we refuse
3836              to advance to the next file now, and will once we're out
3837              of those modes.  */
3838           if (!pfile->state.in_directive && !pfile->state.parsing_args)
3839             {
3840               /* Tell the compiler the line number of the EOF token.  */
3841               result->src_loc = pfile->line_table->highest_line;
3842               result->flags = BOL;
3843               /* Now pop the buffer that _cpp_get_fresh_line did not.  */
3844               _cpp_pop_buffer (pfile);
3845             }
3846           return result;
3847         }
3848       if (buffer != pfile->buffer)
3849         fallthrough_comment = false;
3850       if (!pfile->keep_tokens)
3851         {
3852           pfile->cur_run = &pfile->base_run;
3853           result = pfile->base_run.base;
3854           pfile->cur_token = result + 1;
3855         }
3856       result->flags = BOL;
3857       if (pfile->state.parsing_args == 2)
3858         result->flags |= PREV_WHITE;
3859     }
3860   buffer = pfile->buffer;
3861  update_tokens_line:
3862   result->src_loc = pfile->line_table->highest_line;
3863
3864  skipped_white:
3865   if (buffer->cur >= buffer->notes[buffer->cur_note].pos
3866       && !pfile->overlaid_buffer)
3867     {
3868       _cpp_process_line_notes (pfile, false);
3869       result->src_loc = pfile->line_table->highest_line;
3870     }
3871   c = *buffer->cur++;
3872
3873   if (pfile->forced_token_location)
3874     result->src_loc = pfile->forced_token_location;
3875   else
3876     result->src_loc = linemap_position_for_column (pfile->line_table,
3877                                           CPP_BUF_COLUMN (buffer, buffer->cur));
3878
3879   switch (c)
3880     {
3881     case ' ': case '\t': case '\f': case '\v': case '\0':
3882       result->flags |= PREV_WHITE;
3883       skip_whitespace (pfile, c);
3884       goto skipped_white;
3885
3886     case '\n':
3887       /* Increment the line, unless this is the last line ...  */
3888       if (buffer->cur < buffer->rlimit
3889           /* ... or this is a #include, (where _cpp_stack_file needs to
3890              unwind by one line) ...  */
3891           || (pfile->state.in_directive > 1
3892               /* ... except traditional-cpp increments this elsewhere.  */
3893               && !CPP_OPTION (pfile, traditional)))
3894         CPP_INCREMENT_LINE (pfile, 0);
3895       buffer->need_line = true;
3896       if (pfile->state.in_deferred_pragma)
3897         {
3898           /* Produce the PRAGMA_EOL on this line.  File reading
3899              ensures there is always a \n at end of the buffer, thus
3900              in a deferred pragma we always see CPP_PRAGMA_EOL before
3901              any CPP_EOF.  */
3902           result->type = CPP_PRAGMA_EOL;
3903           result->flags &= ~PREV_WHITE;
3904           pfile->state.in_deferred_pragma = false;
3905           if (!pfile->state.pragma_allow_expansion)
3906             pfile->state.prevent_expansion--;
3907           return result;
3908         }
3909       goto fresh_line;
3910
3911     case '0': case '1': case '2': case '3': case '4':
3912     case '5': case '6': case '7': case '8': case '9':
3913       {
3914         struct normalize_state nst = INITIAL_NORMALIZE_STATE;
3915         result->type = CPP_NUMBER;
3916         lex_number (pfile, &result->val.str, &nst);
3917         warn_about_normalization (pfile, result, &nst, false);
3918         break;
3919       }
3920
3921     case 'L':
3922     case 'u':
3923     case 'U':
3924     case 'R':
3925       /* 'L', 'u', 'U', 'u8' or 'R' may introduce wide characters,
3926          wide strings or raw strings.  */
3927       if (c == 'L' || CPP_OPTION (pfile, rliterals)
3928           || (c != 'R' && CPP_OPTION (pfile, uliterals)))
3929         {
3930           if ((*buffer->cur == '\'' && c != 'R')
3931               || *buffer->cur == '"'
3932               || (*buffer->cur == 'R'
3933                   && c != 'R'
3934                   && buffer->cur[1] == '"'
3935                   && CPP_OPTION (pfile, rliterals))
3936               || (*buffer->cur == '8'
3937                   && c == 'u'
3938                   && ((buffer->cur[1] == '"' || (buffer->cur[1] == '\''
3939                                 && CPP_OPTION (pfile, utf8_char_literals)))
3940                       || (buffer->cur[1] == 'R' && buffer->cur[2] == '"'
3941                           && CPP_OPTION (pfile, rliterals)))))
3942             {
3943               lex_string (pfile, result, buffer->cur - 1);
3944               break;
3945             }
3946         }
3947       /* Fall through.  */
3948
3949     case '_':
3950     case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
3951     case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
3952     case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
3953     case 's': case 't':           case 'v': case 'w': case 'x':
3954     case 'y': case 'z':
3955     case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
3956     case 'G': case 'H': case 'I': case 'J': case 'K':
3957     case 'M': case 'N': case 'O': case 'P': case 'Q':
3958     case 'S': case 'T':           case 'V': case 'W': case 'X':
3959     case 'Y': case 'Z':
3960       result->type = CPP_NAME;
3961       {
3962         struct normalize_state nst = INITIAL_NORMALIZE_STATE;
3963         const auto node = lex_identifier (pfile, buffer->cur - 1, false, &nst,
3964                                           &result->val.node.spelling);
3965         result->val.node.node = node;
3966         identifier_diagnostics_on_lex (pfile, node);
3967         warn_about_normalization (pfile, result, &nst, true);
3968       }
3969
3970       /* Convert named operators to their proper types.  */
3971       if (result->val.node.node->flags & NODE_OPERATOR)
3972         {
3973           result->flags |= NAMED_OP;
3974           result->type = (enum cpp_ttype) result->val.node.node->directive_index;
3975         }
3976
3977       /* Signal FALLTHROUGH comment followed by another token.  */
3978       if (fallthrough_comment)
3979         result->flags |= PREV_FALLTHROUGH;
3980       break;
3981
3982     case '\'':
3983     case '"':
3984       lex_string (pfile, result, buffer->cur - 1);
3985       break;
3986
3987     case '/':
3988       /* A potential block or line comment.  */
3989       comment_start = buffer->cur;
3990       c = *buffer->cur;
3991
3992       if (c == '*')
3993         {
3994           if (_cpp_skip_block_comment (pfile))
3995             cpp_error (pfile, CPP_DL_ERROR, "unterminated comment");
3996         }
3997       else if (c == '/' && ! CPP_OPTION (pfile, traditional))
3998         {
3999           /* Don't warn for system headers.  */
4000           if (_cpp_in_system_header (pfile))
4001             ;
4002           /* Warn about comments if pedantically GNUC89, and not
4003              in system headers.  */
4004           else if (CPP_OPTION (pfile, lang) == CLK_GNUC89
4005                    && CPP_PEDANTIC (pfile)
4006                    && ! buffer->warned_cplusplus_comments)
4007             {
4008               if (cpp_error (pfile, CPP_DL_PEDWARN,
4009                              "C++ style comments are not allowed in ISO C90"))
4010                 cpp_error (pfile, CPP_DL_NOTE,
4011                            "(this will be reported only once per input file)");
4012               buffer->warned_cplusplus_comments = 1;
4013             }
4014           /* Or if specifically desired via -Wc90-c99-compat.  */
4015           else if (CPP_OPTION (pfile, cpp_warn_c90_c99_compat) > 0
4016                    && ! CPP_OPTION (pfile, cplusplus)
4017                    && ! buffer->warned_cplusplus_comments)
4018             {
4019               if (cpp_error (pfile, CPP_DL_WARNING,
4020                              "C++ style comments are incompatible with C90"))
4021                 cpp_error (pfile, CPP_DL_NOTE,
4022                            "(this will be reported only once per input file)");
4023               buffer->warned_cplusplus_comments = 1;
4024             }
4025           /* In C89/C94, C++ style comments are forbidden.  */
4026           else if ((CPP_OPTION (pfile, lang) == CLK_STDC89
4027                     || CPP_OPTION (pfile, lang) == CLK_STDC94))
4028             {
4029               /* But don't be confused about valid code such as
4030                  - // immediately followed by *,
4031                  - // in a preprocessing directive,
4032                  - // in an #if 0 block.  */
4033               if (buffer->cur[1] == '*'
4034                   || pfile->state.in_directive
4035                   || pfile->state.skipping)
4036                 {
4037                   result->type = CPP_DIV;
4038                   break;
4039                 }
4040               else if (! buffer->warned_cplusplus_comments)
4041                 {
4042                   if (cpp_error (pfile, CPP_DL_ERROR,
4043                                  "C++ style comments are not allowed in "
4044                                  "ISO C90"))
4045                     cpp_error (pfile, CPP_DL_NOTE,
4046                                "(this will be reported only once per input "
4047                                "file)");
4048                   buffer->warned_cplusplus_comments = 1;
4049                 }
4050             }
4051           if (skip_line_comment (pfile) && CPP_OPTION (pfile, warn_comments))
4052             cpp_warning (pfile, CPP_W_COMMENTS, "multi-line comment");
4053         }
4054       else if (c == '=')
4055         {
4056           buffer->cur++;
4057           result->type = CPP_DIV_EQ;
4058           break;
4059         }
4060       else
4061         {
4062           result->type = CPP_DIV;
4063           break;
4064         }
4065
4066       if (fallthrough_comment_p (pfile, comment_start))
4067         fallthrough_comment = true;
4068
4069       if (pfile->cb.comment)
4070         {
4071           size_t len = pfile->buffer->cur - comment_start;
4072           pfile->cb.comment (pfile, result->src_loc, comment_start - 1,
4073                              len + 1);
4074         }
4075
4076       if (!pfile->state.save_comments)
4077         {
4078           result->flags |= PREV_WHITE;
4079           goto update_tokens_line;
4080         }
4081
4082       if (fallthrough_comment)
4083         result->flags |= PREV_FALLTHROUGH;
4084
4085       /* Save the comment as a token in its own right.  */
4086       save_comment (pfile, result, comment_start, c);
4087       break;
4088
4089     case '<':
4090       if (pfile->state.angled_headers)
4091         {
4092           lex_string (pfile, result, buffer->cur - 1);
4093           if (result->type != CPP_LESS)
4094             break;
4095         }
4096
4097       result->type = CPP_LESS;
4098       if (*buffer->cur == '=')
4099         {
4100           buffer->cur++, result->type = CPP_LESS_EQ;
4101           if (*buffer->cur == '>'
4102               && CPP_OPTION (pfile, cplusplus)
4103               && CPP_OPTION (pfile, lang) >= CLK_GNUCXX20)
4104             buffer->cur++, result->type = CPP_SPACESHIP;
4105         }
4106       else if (*buffer->cur == '<')
4107         {
4108           buffer->cur++;
4109           IF_NEXT_IS ('=', CPP_LSHIFT_EQ, CPP_LSHIFT);
4110         }
4111       else if (CPP_OPTION (pfile, digraphs))
4112         {
4113           if (*buffer->cur == ':')
4114             {
4115               /* C++11 [2.5/3 lex.pptoken], "Otherwise, if the next
4116                  three characters are <:: and the subsequent character
4117                  is neither : nor >, the < is treated as a preprocessor
4118                  token by itself".  */
4119               if (CPP_OPTION (pfile, cplusplus)
4120                   && CPP_OPTION (pfile, lang) != CLK_CXX98
4121                   && CPP_OPTION (pfile, lang) != CLK_GNUCXX
4122                   && buffer->cur[1] == ':'
4123                   && buffer->cur[2] != ':' && buffer->cur[2] != '>')
4124                 break;
4125
4126               buffer->cur++;
4127               result->flags |= DIGRAPH;
4128               result->type = CPP_OPEN_SQUARE;
4129             }
4130           else if (*buffer->cur == '%')
4131             {
4132               buffer->cur++;
4133               result->flags |= DIGRAPH;
4134               result->type = CPP_OPEN_BRACE;
4135             }
4136         }
4137       break;
4138
4139     case '>':
4140       result->type = CPP_GREATER;
4141       if (*buffer->cur == '=')
4142         buffer->cur++, result->type = CPP_GREATER_EQ;
4143       else if (*buffer->cur == '>')
4144         {
4145           buffer->cur++;
4146           IF_NEXT_IS ('=', CPP_RSHIFT_EQ, CPP_RSHIFT);
4147         }
4148       break;
4149
4150     case '%':
4151       result->type = CPP_MOD;
4152       if (*buffer->cur == '=')
4153         buffer->cur++, result->type = CPP_MOD_EQ;
4154       else if (CPP_OPTION (pfile, digraphs))
4155         {
4156           if (*buffer->cur == ':')
4157             {
4158               buffer->cur++;
4159               result->flags |= DIGRAPH;
4160               result->type = CPP_HASH;
4161               if (*buffer->cur == '%' && buffer->cur[1] == ':')
4162                 buffer->cur += 2, result->type = CPP_PASTE, result->val.token_no = 0;
4163             }
4164           else if (*buffer->cur == '>')
4165             {
4166               buffer->cur++;
4167               result->flags |= DIGRAPH;
4168               result->type = CPP_CLOSE_BRACE;
4169             }
4170         }
4171       break;
4172
4173     case '.':
4174       result->type = CPP_DOT;
4175       if (ISDIGIT (*buffer->cur))
4176         {
4177           struct normalize_state nst = INITIAL_NORMALIZE_STATE;
4178           result->type = CPP_NUMBER;
4179           lex_number (pfile, &result->val.str, &nst);
4180           warn_about_normalization (pfile, result, &nst, false);
4181         }
4182       else if (*buffer->cur == '.' && buffer->cur[1] == '.')
4183         buffer->cur += 2, result->type = CPP_ELLIPSIS;
4184       else if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
4185         buffer->cur++, result->type = CPP_DOT_STAR;
4186       break;
4187
4188     case '+':
4189       result->type = CPP_PLUS;
4190       if (*buffer->cur == '+')
4191         buffer->cur++, result->type = CPP_PLUS_PLUS;
4192       else if (*buffer->cur == '=')
4193         buffer->cur++, result->type = CPP_PLUS_EQ;
4194       break;
4195
4196     case '-':
4197       result->type = CPP_MINUS;
4198       if (*buffer->cur == '>')
4199         {
4200           buffer->cur++;
4201           result->type = CPP_DEREF;
4202           if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
4203             buffer->cur++, result->type = CPP_DEREF_STAR;
4204         }
4205       else if (*buffer->cur == '-')
4206         buffer->cur++, result->type = CPP_MINUS_MINUS;
4207       else if (*buffer->cur == '=')
4208         buffer->cur++, result->type = CPP_MINUS_EQ;
4209       break;
4210
4211     case '&':
4212       result->type = CPP_AND;
4213       if (*buffer->cur == '&')
4214         buffer->cur++, result->type = CPP_AND_AND;
4215       else if (*buffer->cur == '=')
4216         buffer->cur++, result->type = CPP_AND_EQ;
4217       break;
4218
4219     case '|':
4220       result->type = CPP_OR;
4221       if (*buffer->cur == '|')
4222         buffer->cur++, result->type = CPP_OR_OR;
4223       else if (*buffer->cur == '=')
4224         buffer->cur++, result->type = CPP_OR_EQ;
4225       break;
4226
4227     case ':':
4228       result->type = CPP_COLON;
4229       if (*buffer->cur == ':' && CPP_OPTION (pfile, scope))
4230         buffer->cur++, result->type = CPP_SCOPE;
4231       else if (*buffer->cur == '>' && CPP_OPTION (pfile, digraphs))
4232         {
4233           buffer->cur++;
4234           result->flags |= DIGRAPH;
4235           result->type = CPP_CLOSE_SQUARE;
4236         }
4237       break;
4238
4239     case '*': IF_NEXT_IS ('=', CPP_MULT_EQ, CPP_MULT); break;
4240     case '=': IF_NEXT_IS ('=', CPP_EQ_EQ, CPP_EQ); break;
4241     case '!': IF_NEXT_IS ('=', CPP_NOT_EQ, CPP_NOT); break;
4242     case '^': IF_NEXT_IS ('=', CPP_XOR_EQ, CPP_XOR); break;
4243     case '#': IF_NEXT_IS ('#', CPP_PASTE, CPP_HASH); result->val.token_no = 0; break;
4244
4245     case '?': result->type = CPP_QUERY; break;
4246     case '~': result->type = CPP_COMPL; break;
4247     case ',': result->type = CPP_COMMA; break;
4248     case '(': result->type = CPP_OPEN_PAREN; break;
4249     case ')': result->type = CPP_CLOSE_PAREN; break;
4250     case '[': result->type = CPP_OPEN_SQUARE; break;
4251     case ']': result->type = CPP_CLOSE_SQUARE; break;
4252     case '{': result->type = CPP_OPEN_BRACE; break;
4253     case '}': result->type = CPP_CLOSE_BRACE; break;
4254     case ';': result->type = CPP_SEMICOLON; break;
4255
4256       /* @ is a punctuator in Objective-C.  */
4257     case '@': result->type = CPP_ATSIGN; break;
4258
4259     default:
4260       {
4261         const uchar *base = --buffer->cur;
4262         static int no_warn_cnt;
4263
4264         /* Check for an extended identifier ($ or UCN or UTF-8).  */
4265         struct normalize_state nst = INITIAL_NORMALIZE_STATE;
4266         if (forms_identifier_p (pfile, true, &nst))
4267           {
4268             result->type = CPP_NAME;
4269             const auto node = lex_identifier (pfile, base, true, &nst,
4270                                               &result->val.node.spelling);
4271             result->val.node.node = node;
4272             identifier_diagnostics_on_lex (pfile, node);
4273             warn_about_normalization (pfile, result, &nst, true);
4274             break;
4275           }
4276
4277         /* Otherwise this will form a CPP_OTHER token.  Parse valid UTF-8 as a
4278            single token.  */
4279         buffer->cur++;
4280         if (c >= utf8_signifier)
4281           {
4282             const uchar *pstr = base;
4283             cppchar_t s;
4284             if (_cpp_valid_utf8 (pfile, &pstr, buffer->rlimit, 0, NULL, &s))
4285               {
4286                 if (s > UCS_LIMIT && CPP_OPTION (pfile, cpp_warn_invalid_utf8))
4287                   {
4288                     buffer->cur = base;
4289                     _cpp_warn_invalid_utf8 (pfile);
4290                   }
4291                 buffer->cur = pstr;
4292               }
4293             else if (CPP_OPTION (pfile, cpp_warn_invalid_utf8))
4294               {
4295                 buffer->cur = base;
4296                 const uchar *end = _cpp_warn_invalid_utf8 (pfile);
4297                 buffer->cur = base + 1;
4298                 no_warn_cnt = end - buffer->cur;
4299               }
4300           }
4301         else if (c >= utf8_continuation
4302                  && CPP_OPTION (pfile, cpp_warn_invalid_utf8))
4303           {
4304             if (no_warn_cnt)
4305               --no_warn_cnt;
4306             else
4307               {
4308                 buffer->cur = base;
4309                 _cpp_warn_invalid_utf8 (pfile);
4310                 buffer->cur = base + 1;
4311               }
4312           }
4313         create_literal (pfile, result, base, buffer->cur - base, CPP_OTHER);
4314         break;
4315       }
4316
4317     }
4318
4319   /* Potentially convert the location of the token to a range.  */
4320   if (result->src_loc >= RESERVED_LOCATION_COUNT
4321       && result->type != CPP_EOF)
4322     {
4323       /* Ensure that any line notes are processed, so that we have the
4324          correct physical line/column for the end-point of the token even
4325          when a logical line is split via one or more backslashes.  */
4326       if (buffer->cur >= buffer->notes[buffer->cur_note].pos
4327           && !pfile->overlaid_buffer)
4328         _cpp_process_line_notes (pfile, false);
4329
4330       source_range tok_range;
4331       tok_range.m_start = result->src_loc;
4332       tok_range.m_finish
4333         = linemap_position_for_column (pfile->line_table,
4334                                        CPP_BUF_COLUMN (buffer, buffer->cur));
4335
4336       result->src_loc = COMBINE_LOCATION_DATA (pfile->line_table,
4337                                                result->src_loc,
4338                                                tok_range, NULL, 0);
4339     }
4340
4341   return result;
4342 }
4343
4344 /* An upper bound on the number of bytes needed to spell TOKEN.
4345    Does not include preceding whitespace.  */
4346 unsigned int
4347 cpp_token_len (const cpp_token *token)
4348 {
4349   unsigned int len;
4350
4351   switch (TOKEN_SPELL (token))
4352     {
4353     default:            len = 6;                                break;
4354     case SPELL_LITERAL: len = token->val.str.len;               break;
4355     case SPELL_IDENT:   len = NODE_LEN (token->val.node.node) * 10;     break;
4356     }
4357
4358   return len;
4359 }
4360
/* Decode the single UTF-8 sequence that starts at NAME and write the
   corresponding \UXXXXXXXX escape to BUFFER.  Exactly 10 bytes are
   stored in BUFFER; no terminating NUL is added.  Returns the number
   of bytes consumed from NAME.  Calls abort if a continuation byte
   is not of the form 10xxxxxx.  */

static size_t
utf8_to_ucn (unsigned char *buffer, const unsigned char *name)
{
  static const char hex_digits[] = "0123456789abcdef";
  int seq_len = 0;
  unsigned long code_point;

  /* The count of leading one bits in the first byte is the length of
     the whole sequence.  */
  unsigned lead = *name;
  while (lead & 0x80)
    {
      seq_len++;
      lead <<= 1;
    }

  /* Start with the payload bits of the lead byte...  */
  code_point = *name & (0x7F >> seq_len);

  /* ...then append six payload bits from each continuation byte.  */
  for (int idx = 1; idx < seq_len; idx++)
    {
      const unsigned char cont = *++name;

      code_point = (code_point << 6) | (cont & 0x3F);

      /* Ill-formed UTF-8.  */
      if ((cont & ~0x3F) != 0x80)
        abort ();
    }

  buffer[0] = '\\';
  buffer[1] = 'U';
  /* Eight lower-case hex digits, most significant nibble first.  */
  for (int pos = 0; pos < 8; pos++)
    buffer[2 + pos] = hex_digits[(code_point >> (4 * (7 - pos))) & 0xF];

  return seq_len;
}
4394
4395 /* Given a token TYPE corresponding to a digraph, return a pointer to
4396    the spelling of the digraph.  */
4397 static const unsigned char *
4398 cpp_digraph2name (enum cpp_ttype type)
4399 {
4400   return digraph_spellings[(int) type - (int) CPP_FIRST_DIGRAPH];
4401 }
4402
4403 /* Write the spelling of an identifier IDENT, using UCNs, to BUFFER.
4404    The buffer must already contain enough space to hold the
4405    token's spelling.  Returns a pointer to the character after the
4406    last character written.  */
4407 unsigned char *
4408 _cpp_spell_ident_ucns (unsigned char *buffer, cpp_hashnode *ident)
4409 {
4410   size_t i;
4411   const unsigned char *name = NODE_NAME (ident);
4412
4413   for (i = 0; i < NODE_LEN (ident); i++)
4414     if (name[i] & ~0x7F)
4415       {
4416         i += utf8_to_ucn (buffer, name + i) - 1;
4417         buffer += 10;
4418       }
4419     else
4420       *buffer++ = name[i];
4421
4422   return buffer;
4423 }
4424
4425 /* Write the spelling of a token TOKEN to BUFFER.  The buffer must
4426    already contain enough space to hold the token's spelling.
4427    Returns a pointer to the character after the last character written.
4428    FORSTRING is true if this is to be the spelling after translation
4429    phase 1 (with the original spelling of extended identifiers), false
4430    if extended identifiers should always be written using UCNs (there is
4431    no option for always writing them in the internal UTF-8 form).
4432    FIXME: Would be nice if we didn't need the PFILE argument.  */
4433 unsigned char *
4434 cpp_spell_token (cpp_reader *pfile, const cpp_token *token,
4435                  unsigned char *buffer, bool forstring)
4436 {
4437   switch (TOKEN_SPELL (token))
4438     {
4439     case SPELL_OPERATOR:
4440       {
4441         const unsigned char *spelling;
4442         unsigned char c;
4443
4444         if (token->flags & DIGRAPH)
4445           spelling = cpp_digraph2name (token->type);
4446         else if (token->flags & NAMED_OP)
4447           goto spell_ident;
4448         else
4449           spelling = TOKEN_NAME (token);
4450
4451         while ((c = *spelling++) != '\0')
4452           *buffer++ = c;
4453       }
4454       break;
4455
4456     spell_ident:
4457     case SPELL_IDENT:
4458       if (forstring)
4459         {
4460           memcpy (buffer, NODE_NAME (token->val.node.spelling),
4461                   NODE_LEN (token->val.node.spelling));
4462           buffer += NODE_LEN (token->val.node.spelling);
4463         }
4464       else
4465         buffer = _cpp_spell_ident_ucns (buffer, token->val.node.node);
4466       break;
4467
4468     case SPELL_LITERAL:
4469       memcpy (buffer, token->val.str.text, token->val.str.len);
4470       buffer += token->val.str.len;
4471       break;
4472
4473     case SPELL_NONE:
4474       cpp_error (pfile, CPP_DL_ICE,
4475                  "unspellable token %s", TOKEN_NAME (token));
4476       break;
4477     }
4478
4479   return buffer;
4480 }
4481
4482 /* Returns TOKEN spelt as a null-terminated string.  The string is
4483    freed when the reader is destroyed.  Useful for diagnostics.  */
4484 unsigned char *
4485 cpp_token_as_text (cpp_reader *pfile, const cpp_token *token)
4486 {
4487   unsigned int len = cpp_token_len (token) + 1;
4488   unsigned char *start = _cpp_unaligned_alloc (pfile, len), *end;
4489
4490   end = cpp_spell_token (pfile, token, start, false);
4491   end[0] = '\0';
4492
4493   return start;
4494 }
4495
4496 /* Returns a pointer to a string which spells the token defined by
4497    TYPE and FLAGS.  Used by C front ends, which really should move to
4498    using cpp_token_as_text.  */
4499 const char *
4500 cpp_type2name (enum cpp_ttype type, unsigned char flags)
4501 {
4502   if (flags & DIGRAPH)
4503     return (const char *) cpp_digraph2name (type);
4504   else if (flags & NAMED_OP)
4505     return cpp_named_operator2name (type);
4506
4507   return (const char *) token_spellings[type].name;
4508 }
4509
4510 /* Writes the spelling of token to FP, without any preceding space.
4511    Separated from cpp_spell_token for efficiency - to avoid stdio
4512    double-buffering.  */
4513 void
4514 cpp_output_token (const cpp_token *token, FILE *fp)
4515 {
4516   switch (TOKEN_SPELL (token))
4517     {
4518     case SPELL_OPERATOR:
4519       {
4520         const unsigned char *spelling;
4521         int c;
4522
4523         if (token->flags & DIGRAPH)
4524           spelling = cpp_digraph2name (token->type);
4525         else if (token->flags & NAMED_OP)
4526           goto spell_ident;
4527         else
4528           spelling = TOKEN_NAME (token);
4529
4530         c = *spelling;
4531         do
4532           putc (c, fp);
4533         while ((c = *++spelling) != '\0');
4534       }
4535       break;
4536
4537     spell_ident:
4538     case SPELL_IDENT:
4539       {
4540         size_t i;
4541         const unsigned char * name = NODE_NAME (token->val.node.node);
4542
4543         for (i = 0; i < NODE_LEN (token->val.node.node); i++)
4544           if (name[i] & ~0x7F)
4545             {
4546               unsigned char buffer[10];
4547               i += utf8_to_ucn (buffer, name + i) - 1;
4548               fwrite (buffer, 1, 10, fp);
4549             }
4550           else
4551             fputc (NODE_NAME (token->val.node.node)[i], fp);
4552       }
4553       break;
4554
4555     case SPELL_LITERAL:
4556       if (token->type == CPP_HEADER_NAME)
4557         fputc ('"', fp);
4558       fwrite (token->val.str.text, 1, token->val.str.len, fp);
4559       if (token->type == CPP_HEADER_NAME)
4560         fputc ('"', fp);
4561       break;
4562
4563     case SPELL_NONE:
4564       /* An error, most probably.  */
4565       break;
4566     }
4567 }
4568
4569 /* Compare two tokens.  */
4570 int
4571 _cpp_equiv_tokens (const cpp_token *a, const cpp_token *b)
4572 {
4573   if (a->type == b->type && a->flags == b->flags)
4574     switch (TOKEN_SPELL (a))
4575       {
4576       default:                  /* Keep compiler happy.  */
4577       case SPELL_OPERATOR:
4578         /* token_no is used to track where multiple consecutive ##
4579            tokens were originally located.  */
4580         return (a->type != CPP_PASTE || a->val.token_no == b->val.token_no);
4581       case SPELL_NONE:
4582         return (a->type != CPP_MACRO_ARG
4583                 || (a->val.macro_arg.arg_no == b->val.macro_arg.arg_no
4584                     && a->val.macro_arg.spelling == b->val.macro_arg.spelling));
4585       case SPELL_IDENT:
4586         return (a->val.node.node == b->val.node.node
4587                 && a->val.node.spelling == b->val.node.spelling);
4588       case SPELL_LITERAL:
4589         return (a->val.str.len == b->val.str.len
4590                 && !memcmp (a->val.str.text, b->val.str.text,
4591                             a->val.str.len));
4592       }
4593
4594   return 0;
4595 }
4596
4597 /* Returns nonzero if a space should be inserted to avoid an
4598    accidental token paste for output.  For simplicity, it is
4599    conservative, and occasionally advises a space where one is not
4600    needed, e.g. "." and ".2".  */
4601 int
4602 cpp_avoid_paste (cpp_reader *pfile, const cpp_token *token1,
4603                  const cpp_token *token2)
4604 {
4605   enum cpp_ttype a = token1->type, b = token2->type;
4606   cppchar_t c;
4607
4608   if (token1->flags & NAMED_OP)
4609     a = CPP_NAME;
4610   if (token2->flags & NAMED_OP)
4611     b = CPP_NAME;
4612
4613   c = EOF;
4614   if (token2->flags & DIGRAPH)
4615     c = digraph_spellings[(int) b - (int) CPP_FIRST_DIGRAPH][0];
4616   else if (token_spellings[b].category == SPELL_OPERATOR)
4617     c = token_spellings[b].name[0];
4618
4619   /* Quickly get everything that can paste with an '='.  */
4620   if ((int) a <= (int) CPP_LAST_EQ && c == '=')
4621     return 1;
4622
4623   switch (a)
4624     {
4625     case CPP_GREATER:   return c == '>';
4626     case CPP_LESS:      return c == '<' || c == '%' || c == ':';
4627     case CPP_PLUS:      return c == '+';
4628     case CPP_MINUS:     return c == '-' || c == '>';
4629     case CPP_DIV:       return c == '/' || c == '*'; /* Comments.  */
4630     case CPP_MOD:       return c == ':' || c == '>';
4631     case CPP_AND:       return c == '&';
4632     case CPP_OR:        return c == '|';
4633     case CPP_COLON:     return c == ':' || c == '>';
4634     case CPP_DEREF:     return c == '*';
4635     case CPP_DOT:       return c == '.' || c == '%' || b == CPP_NUMBER;
4636     case CPP_HASH:      return c == '#' || c == '%'; /* Digraph form.  */
4637     case CPP_PRAGMA:
4638     case CPP_NAME:      return ((b == CPP_NUMBER
4639                                  && name_p (pfile, &token2->val.str))
4640                                 || b == CPP_NAME
4641                                 || b == CPP_CHAR || b == CPP_STRING); /* L */
4642     case CPP_NUMBER:    return (b == CPP_NUMBER || b == CPP_NAME
4643                                 || b == CPP_CHAR
4644                                 || c == '.' || c == '+' || c == '-');
4645                                       /* UCNs */
4646     case CPP_OTHER:     return ((token1->val.str.text[0] == '\\'
4647                                  && b == CPP_NAME)
4648                                 || (CPP_OPTION (pfile, objc)
4649                                     && token1->val.str.text[0] == '@'
4650                                     && (b == CPP_NAME || b == CPP_STRING)));
4651     case CPP_LESS_EQ:   return c == '>';
4652     case CPP_STRING:
4653     case CPP_WSTRING:
4654     case CPP_UTF8STRING:
4655     case CPP_STRING16:
4656     case CPP_STRING32:  return (CPP_OPTION (pfile, user_literals)
4657                                 && (b == CPP_NAME
4658                                     || (TOKEN_SPELL (token2) == SPELL_LITERAL
4659                                         && ISIDST (token2->val.str.text[0]))));
4660
4661     default:            break;
4662     }
4663
4664   return 0;
4665 }
4666
4667 /* Output all the remaining tokens on the current line, and a newline
4668    character, to FP.  Leading whitespace is removed.  If there are
4669    macros, special token padding is not performed.  */
4670 void
4671 cpp_output_line (cpp_reader *pfile, FILE *fp)
4672 {
4673   const cpp_token *token;
4674
4675   token = cpp_get_token (pfile);
4676   while (token->type != CPP_EOF)
4677     {
4678       cpp_output_token (token, fp);
4679       token = cpp_get_token (pfile);
4680       if (token->flags & PREV_WHITE)
4681         putc (' ', fp);
4682     }
4683
4684   putc ('\n', fp);
4685 }
4686
4687 /* Return a string representation of all the remaining tokens on the
4688    current line.  The result is allocated using xmalloc and must be
4689    freed by the caller.  */
4690 unsigned char *
4691 cpp_output_line_to_string (cpp_reader *pfile, const unsigned char *dir_name)
4692 {
4693   const cpp_token *token;
4694   unsigned int out = dir_name ? ustrlen (dir_name) : 0;
4695   unsigned int alloced = 120 + out;
4696   unsigned char *result = (unsigned char *) xmalloc (alloced);
4697
4698   /* If DIR_NAME is empty, there are no initial contents.  */
4699   if (dir_name)
4700     {
4701       sprintf ((char *) result, "#%s ", dir_name);
4702       out += 2;
4703     }
4704
4705   token = cpp_get_token (pfile);
4706   while (token->type != CPP_EOF)
4707     {
4708       unsigned char *last;
4709       /* Include room for a possible space and the terminating nul.  */
4710       unsigned int len = cpp_token_len (token) + 2;
4711
4712       if (out + len > alloced)
4713         {
4714           alloced *= 2;
4715           if (out + len > alloced)
4716             alloced = out + len;
4717           result = (unsigned char *) xrealloc (result, alloced);
4718         }
4719
4720       last = cpp_spell_token (pfile, token, &result[out], 0);
4721       out = last - result;
4722
4723       token = cpp_get_token (pfile);
4724       if (token->flags & PREV_WHITE)
4725         result[out++] = ' ';
4726     }
4727
4728   result[out] = '\0';
4729   return result;
4730 }
4731
/* Memory buffers.  Changing these three constants can have a dramatic
   effect on performance.  The values here are reasonable defaults,
   but might be tuned.  If you adjust them, be sure to test across a
   range of uses of cpplib, including heavy nested function-like macro
   expansion.  Also check the change in peak memory usage (NJAMD is a
   good tool for this).

   MIN_BUFF_SIZE is the smallest payload new_buff will allocate.
   BUFF_SIZE_UPPER_BOUND (MIN_SIZE) is the largest free buffer that
   _cpp_get_buff will hand out for a request of MIN_SIZE bytes.
   EXTENDED_BUFF_SIZE (BUFF, MIN_EXTRA) is the size requested when
   extending BUFF: MIN_EXTRA plus twice the bytes between BUFF's cur
   and limit.  Every macro argument is parenthesized so that any
   expression may be passed safely.  */
#define MIN_BUFF_SIZE 8000
#define BUFF_SIZE_UPPER_BOUND(MIN_SIZE) (MIN_BUFF_SIZE + (MIN_SIZE) * 3 / 2)
#define EXTENDED_BUFF_SIZE(BUFF, MIN_EXTRA) \
        ((MIN_EXTRA) + ((BUFF)->limit - (BUFF)->cur) * 2)

#if MIN_BUFF_SIZE > BUFF_SIZE_UPPER_BOUND (0)
  #error BUFF_SIZE_UPPER_BOUND must be at least as large as MIN_BUFF_SIZE!
#endif
4746
4747 /* Create a new allocation buffer.  Place the control block at the end
4748    of the buffer, so that buffer overflows will cause immediate chaos.  */
4749 static _cpp_buff *
4750 new_buff (size_t len)
4751 {
4752   _cpp_buff *result;
4753   unsigned char *base;
4754
4755   if (len < MIN_BUFF_SIZE)
4756     len = MIN_BUFF_SIZE;
4757   len = CPP_ALIGN (len);
4758
4759 #ifdef ENABLE_VALGRIND_ANNOTATIONS
4760   /* Valgrind warns about uses of interior pointers, so put _cpp_buff
4761      struct first.  */
4762   size_t slen = CPP_ALIGN2 (sizeof (_cpp_buff), 2 * DEFAULT_ALIGNMENT);
4763   base = XNEWVEC (unsigned char, len + slen);
4764   result = (_cpp_buff *) base;
4765   base += slen;
4766 #else
4767   base = XNEWVEC (unsigned char, len + sizeof (_cpp_buff));
4768   result = (_cpp_buff *) (base + len);
4769 #endif
4770   result->base = base;
4771   result->cur = base;
4772   result->limit = base + len;
4773   result->next = NULL;
4774   return result;
4775 }
4776
4777 /* Place a chain of unwanted allocation buffers on the free list.  */
4778 void
4779 _cpp_release_buff (cpp_reader *pfile, _cpp_buff *buff)
4780 {
4781   _cpp_buff *end = buff;
4782
4783   while (end->next)
4784     end = end->next;
4785   end->next = pfile->free_buffs;
4786   pfile->free_buffs = buff;
4787 }
4788
4789 /* Return a free buffer of size at least MIN_SIZE.  */
4790 _cpp_buff *
4791 _cpp_get_buff (cpp_reader *pfile, size_t min_size)
4792 {
4793   _cpp_buff *result, **p;
4794
4795   for (p = &pfile->free_buffs;; p = &(*p)->next)
4796     {
4797       size_t size;
4798
4799       if (*p == NULL)
4800         return new_buff (min_size);
4801       result = *p;
4802       size = result->limit - result->base;
4803       /* Return a buffer that's big enough, but don't waste one that's
4804          way too big.  */
4805       if (size >= min_size && size <= BUFF_SIZE_UPPER_BOUND (min_size))
4806         break;
4807     }
4808
4809   *p = result->next;
4810   result->next = NULL;
4811   result->cur = result->base;
4812   return result;
4813 }
4814
4815 /* Creates a new buffer with enough space to hold the uncommitted
4816    remaining bytes of BUFF, and at least MIN_EXTRA more bytes.  Copies
4817    the excess bytes to the new buffer.  Chains the new buffer after
4818    BUFF, and returns the new buffer.  */
4819 _cpp_buff *
4820 _cpp_append_extend_buff (cpp_reader *pfile, _cpp_buff *buff, size_t min_extra)
4821 {
4822   size_t size = EXTENDED_BUFF_SIZE (buff, min_extra);
4823   _cpp_buff *new_buff = _cpp_get_buff (pfile, size);
4824
4825   buff->next = new_buff;
4826   memcpy (new_buff->base, buff->cur, BUFF_ROOM (buff));
4827   return new_buff;
4828 }
4829
4830 /* Creates a new buffer with enough space to hold the uncommitted
4831    remaining bytes of the buffer pointed to by BUFF, and at least
4832    MIN_EXTRA more bytes.  Copies the excess bytes to the new buffer.
4833    Chains the new buffer before the buffer pointed to by BUFF, and
4834    updates the pointer to point to the new buffer.  */
4835 void
4836 _cpp_extend_buff (cpp_reader *pfile, _cpp_buff **pbuff, size_t min_extra)
4837 {
4838   _cpp_buff *new_buff, *old_buff = *pbuff;
4839   size_t size = EXTENDED_BUFF_SIZE (old_buff, min_extra);
4840
4841   new_buff = _cpp_get_buff (pfile, size);
4842   memcpy (new_buff->base, old_buff->cur, BUFF_ROOM (old_buff));
4843   new_buff->next = old_buff;
4844   *pbuff = new_buff;
4845 }
4846
4847 /* Free a chain of buffers starting at BUFF.  */
4848 void
4849 _cpp_free_buff (_cpp_buff *buff)
4850 {
4851   _cpp_buff *next;
4852
4853   for (; buff; buff = next)
4854     {
4855       next = buff->next;
4856 #ifdef ENABLE_VALGRIND_ANNOTATIONS
4857       free (buff);
4858 #else
4859       free (buff->base);
4860 #endif
4861     }
4862 }
4863
4864 /* Allocate permanent, unaligned storage of length LEN.  */
4865 unsigned char *
4866 _cpp_unaligned_alloc (cpp_reader *pfile, size_t len)
4867 {
4868   _cpp_buff *buff = pfile->u_buff;
4869   unsigned char *result = buff->cur;
4870
4871   if (len > (size_t) (buff->limit - result))
4872     {
4873       buff = _cpp_get_buff (pfile, len);
4874       buff->next = pfile->u_buff;
4875       pfile->u_buff = buff;
4876       result = buff->cur;
4877     }
4878
4879   buff->cur = result + len;
4880   return result;
4881 }
4882
4883 /* Allocate permanent, unaligned storage of length LEN from a_buff.
4884    That buffer is used for growing allocations when saving macro
4885    replacement lists in a #define, and when parsing an answer to an
4886    assertion in #assert, #unassert or #if (and therefore possibly
4887    whilst expanding macros).  It therefore must not be used by any
4888    code that they might call: specifically the lexer and the guts of
4889    the macro expander.
4890
4891    All existing other uses clearly fit this restriction: storing
4892    registered pragmas during initialization.  */
4893 unsigned char *
4894 _cpp_aligned_alloc (cpp_reader *pfile, size_t len)
4895 {
4896   _cpp_buff *buff = pfile->a_buff;
4897   unsigned char *result = buff->cur;
4898
4899   if (len > (size_t) (buff->limit - result))
4900     {
4901       buff = _cpp_get_buff (pfile, len);
4902       buff->next = pfile->a_buff;
4903       pfile->a_buff = buff;
4904       result = buff->cur;
4905     }
4906
4907   buff->cur = result + len;
4908   return result;
4909 }
4910
4911 /* Commit or allocate storage from a buffer.  */
4912
4913 void *
4914 _cpp_commit_buff (cpp_reader *pfile, size_t size)
4915 {
4916   void *ptr = BUFF_FRONT (pfile->a_buff);
4917
4918   if (pfile->hash_table->alloc_subobject)
4919     {
4920       void *copy = pfile->hash_table->alloc_subobject (size);
4921       memcpy (copy, ptr, size);
4922       ptr = copy;
4923     }
4924   else
4925     BUFF_FRONT (pfile->a_buff) += size;
4926
4927   return ptr;
4928 }
4929
4930 /* Say which field of TOK is in use.  */
4931
4932 enum cpp_token_fld_kind
4933 cpp_token_val_index (const cpp_token *tok)
4934 {
4935   switch (TOKEN_SPELL (tok))
4936     {
4937     case SPELL_IDENT:
4938       return CPP_TOKEN_FLD_NODE;
4939     case SPELL_LITERAL:
4940       return CPP_TOKEN_FLD_STR;
4941     case SPELL_OPERATOR:
4942       /* Operands which were originally spelled as ident keep around
4943          the node for the exact spelling.  */
4944       if (tok->flags & NAMED_OP)
4945         return CPP_TOKEN_FLD_NODE;
4946       else if (tok->type == CPP_PASTE)
4947         return CPP_TOKEN_FLD_TOKEN_NO;
4948       else
4949         return CPP_TOKEN_FLD_NONE;
4950     case SPELL_NONE:
4951       if (tok->type == CPP_MACRO_ARG)
4952         return CPP_TOKEN_FLD_ARG_NO;
4953       else if (tok->type == CPP_PADDING)
4954         return CPP_TOKEN_FLD_SOURCE;
4955       else if (tok->type == CPP_PRAGMA)
4956         return CPP_TOKEN_FLD_PRAGMA;
4957       /* fall through */
4958     default:
4959       return CPP_TOKEN_FLD_NONE;
4960     }
4961 }
4962
4963 /* All tokens lexed in R after calling this function will be forced to
4964    have their location_t to be P, until
4965    cpp_stop_forcing_token_locations is called for R.  */
4966
4967 void
4968 cpp_force_token_locations (cpp_reader *r, location_t loc)
4969 {
4970   r->forced_token_location = loc;
4971 }
4972
4973 /* Go back to assigning locations naturally for lexed tokens.  */
4974
4975 void
4976 cpp_stop_forcing_token_locations (cpp_reader *r)
4977 {
4978   r->forced_token_location = 0;
4979 }
4980
/* PEEK points at a backslash.  If that backslash escapes an end of
   line ("\n", "\r" or "\r\n"), return a pointer to the first
   character after it, repeating for as long as that character starts
   another escaped end of line.  If the backslash does not escape an
   end of line, or skipping it would reach LIMIT, PEEK is returned
   unchanged.  */

static const unsigned char *
do_peek_backslash (const unsigned char *peek, const unsigned char *limit)
{
  for (;;)
    {
      /* Number of bytes making up the end of line after the
         backslash.  */
      int eol_len;

      if (__builtin_expect (peek[1] == '\n', true))
        eol_len = 1;
      else if (__builtin_expect (peek[1] == '\r', false))
        eol_len = (peek[2] == '\n') ? 2 : 1;
      else
        return peek;

      const unsigned char *after = peek + 1 + eol_len;
      if (!__builtin_expect (after < limit, true))
        return peek;

      peek = after;

      /* The user might be perverse: go round again if another
         backslash follows immediately.  */
      if (*peek != '\\')
        return peek;
    }
}
5010
5011 static const unsigned char *
5012 do_peek_next (const unsigned char *peek, const unsigned char *limit)
5013 {
5014   if (__builtin_expect (*peek == '\\', false))
5015     peek = do_peek_backslash (peek, limit);
5016   return peek;
5017 }
5018
/* Step back one character from PEEK, looking through escaped ends of
   line.  BOUND is the lowest address that may be examined.  Returns
   NULL if PEEK is already at BOUND.  Otherwise returns a pointer to
   the preceding character; when that character is an end of line
   ("\n", "\r" or "\r\n") immediately preceded by a backslash, the
   search continues from the backslash, so the result is the character
   before the continuation (or NULL if the backslash is at BOUND).  */
static const unsigned char *
do_peek_prev (const unsigned char *peek, const unsigned char *bound)
{
  if (peek == bound)
    return NULL;

  unsigned char c = *--peek;

  /* Only an end-of-line character can be part of a continuation.  Note
     this must test for carriage return ('\r'), not the letter 'r'.  */
  if (__builtin_expect (c == '\n', false)
      || __builtin_expect (c == '\r', false))
    {
      /* Nothing before the end of line to inspect.  */
      if (peek == bound)
        return peek;

      /* Offset from PEEK of the character that would have to be the
         escaping backslash.  */
      int ix = -1;
      if (c == '\n' && peek[ix] == '\r')
        {
          /* "\r\n": the backslash, if any, is one further back.  */
          if (peek + ix == bound)
            return peek;
          ix--;
        }

      if (peek[ix] == '\\')
        return do_peek_prev (peek + ix, bound);

      return peek;
    }

  return peek;
}
5047
5048 /* If PEEK[-1] is identifier MATCH, scan past it and trailing white
5049    space.  Otherwise return NULL.  */
5050
5051 static const unsigned char *
5052 do_peek_ident (const char *match, const unsigned char *peek,
5053                const unsigned char *limit)
5054 {
5055   for (; *++match; peek++)
5056     if (*peek != *match)
5057       {
5058         peek = do_peek_next (peek, limit);
5059         if (*peek != *match)
5060           return NULL;
5061       }
5062
5063   /* Must now not be looking at an identifier char.  */
5064   peek = do_peek_next (peek, limit);
5065   if (ISIDNUM (*peek))
5066     return NULL;
5067
5068   /* Skip control-line whitespace.  */
5069  ws:
5070   while (*peek == ' ' || *peek == '\t')
5071     peek++;
5072   if (__builtin_expect (*peek == '\\', false))
5073     {
5074       peek = do_peek_backslash (peek, limit);
5075       if (*peek != '\\')
5076         goto ws;
5077     }
5078
5079   return peek;
5080 }
5081
5082 /* Are we looking at a module control line starting as PEEK - 1?  */
5083
5084 static bool
5085 do_peek_module (cpp_reader *pfile, unsigned char c,
5086                 const unsigned char *peek, const unsigned char *limit)
5087 {
5088   bool import = false;
5089
5090   if (__builtin_expect (c == 'e', false))
5091     {
5092       if (!((peek[0] == 'x' || peek[0] == '\\')
5093             && (peek = do_peek_ident ("export", peek, limit))))
5094         return false;
5095
5096       /* export, peek for import or module.  No need to peek __import
5097          here.  */
5098       if (peek[0] == 'i')
5099         {
5100           if (!((peek[1] == 'm' || peek[1] == '\\')
5101                 && (peek = do_peek_ident ("import", peek + 1, limit))))
5102             return false;
5103           import = true;
5104         }
5105       else if (peek[0] == 'm')
5106         {
5107           if (!((peek[1] == 'o' || peek[1] == '\\')
5108                 && (peek = do_peek_ident ("module", peek + 1, limit))))
5109             return false;
5110         }
5111       else
5112         return false;
5113     }
5114   else if (__builtin_expect (c == 'i', false))
5115     {
5116       if (!((peek[0] == 'm' || peek[0] == '\\')
5117             && (peek = do_peek_ident ("import", peek, limit))))
5118         return false;
5119       import = true;
5120     }
5121   else if (__builtin_expect (c == '_', false))
5122     {
5123       /* Needed for translated includes.   */
5124       if (!((peek[0] == '_' || peek[0] == '\\')
5125             && (peek = do_peek_ident ("__import", peek, limit))))
5126         return false;
5127       import = true;
5128     }
5129   else if (__builtin_expect (c == 'm', false))
5130     {
5131       if (!((peek[0] == 'o' || peek[0] == '\\')
5132             && (peek = do_peek_ident ("module", peek, limit))))
5133         return false;
5134     }
5135   else
5136     return false;
5137
5138   /* Peek the next character to see if it's good enough.  We'll be at
5139      the first non-whitespace char, including skipping an escaped
5140      newline.  */
5141   /* ... import followed by identifier, ':', '<' or header-name
5142      preprocessing tokens, or module followed by identifier, ':' or
5143      ';' preprocessing tokens.  */
5144   unsigned char p = *peek++;
5145
5146   /* A character literal is ... single quotes, ... optionally preceded
5147      by u8, u, U, or L */
5148   /* A string-literal is a ... double quotes, optionally prefixed by
5149      R, u8, u8R, u, uR, U, UR, L, or LR */
5150   if (p == 'u')
5151     {
5152       peek = do_peek_next (peek, limit);
5153       if (*peek == '8')
5154         {
5155           peek++;
5156           goto peek_u8;
5157         }
5158       goto peek_u;
5159     }
5160   else if (p == 'U' || p == 'L')
5161     {
5162     peek_u8:
5163       peek = do_peek_next (peek, limit);
5164     peek_u:
5165       if (*peek == '\"' || *peek == '\'')
5166         return false;
5167
5168       if (*peek == 'R')
5169         goto peek_R;
5170       /* Identifier. Ok.  */
5171     }
5172   else if (p == 'R')
5173     {
5174     peek_R:
5175       if (CPP_OPTION (pfile, rliterals))
5176         {
5177           peek = do_peek_next (peek, limit);
5178           if (*peek == '\"')
5179             return false;
5180         }
5181       /* Identifier. Ok.  */
5182     }
5183   else if ('Z' - 'A' == 25
5184            ? ((p >= 'A' && p <= 'Z') || (p >= 'a' && p <= 'z') || p == '_')
5185            : ISIDST (p))
5186     {
5187       /* Identifier.  Ok. */
5188     }
5189   else if (p == '<')
5190     {
5191       /* Maybe angle header, ok for import.  Reject
5192          '<=', '<<' digraph:'<:'.  */
5193       if (!import)
5194         return false;
5195       peek = do_peek_next (peek, limit);
5196       if (*peek == '=' || *peek == '<'
5197           || (*peek == ':' && CPP_OPTION (pfile, digraphs)))
5198         return false;
5199     }
5200   else if (p == ';')
5201     {
5202       /* SEMICOLON, ok for module.  */
5203       if (import)
5204         return false;
5205     }
5206   else if (p == '"')
5207     {
5208       /* STRING, ok for import.  */
5209       if (!import)
5210         return false;
5211     }
5212   else if (p == ':')
5213     {
5214       /* Maybe COLON, ok.  Reject '::', digraph:':>'.  */
5215       peek = do_peek_next (peek, limit);
5216       if (*peek == ':' || (*peek == '>' && CPP_OPTION (pfile, digraphs)))
5217         return false;
5218     }
5219   else
5220     /* FIXME: Detect a unicode character, excluding those not
5221        permitted as the initial character. [lex.name]/1.  I presume
5222        we need to check the \[uU] spellings, and directly using
5223        Unicode in say UTF8 form?  Or perhaps we do the phase-1
5224        conversion of UTF8 to universal-character-names?  */
5225     return false;
5226
5227   return true;
5228 }
5229
5230 /* Directives-only scanning.  Somewhat more relaxed than correct
5231    parsing -- some ill-formed programs will not be rejected.

        For each buffer on PFILE's buffer stack, scan the text from
        buffer->next_line up to buffer->rlimit, tracking line breaks,
        comments, and (possibly raw) string and character literals, and
        report to CB, always passing PFILE and the opaque DATA pointer:

          CPP_DO_print     line count, start pointer and length of a run
                           of scanned text (not issued while
                           pfile->state.skipping is set);
          CPP_DO_location  pfile->line_table->highest_line, after a
                           directive or module control line has been
                           handled and more of the buffer remains;
          CPP_DO_token     a token and its spelling location, for each
                           token lexed from a module control line.

        A '#' at the beginning of a line is handed to
        _cpp_handle_directive.  When the module_directives option is set,
        a line accepted by do_peek_module is lexed with
        cpp_get_token_with_location.  After either, scanning restarts from
        the current pfile->buffer.  A buffer is popped once it has been
        scanned to its end; the function returns when no buffer remains.  */
5232
5233 void
5234 cpp_directive_only_process (cpp_reader *pfile,
5235                             void *data,
5236                             void (*cb) (cpp_reader *, CPP_DO_task, void *, ...))
5237 {
5238   bool module_p = CPP_OPTION (pfile, module_directives);
5239
5240   do
5241     {
5242     restart:
5243       /* Buffer initialization, but no line cleaning. */
5244       cpp_buffer *buffer = pfile->buffer;
5245       buffer->cur_note = buffer->notes_used = 0;
5246       buffer->cur = buffer->line_base = buffer->next_line;
5247       buffer->need_line = false;
5248       /* Files always end in a newline or carriage return.  We rely on this for
5249          character peeking safety.  */
5250       gcc_assert (buffer->rlimit[0] == '\n' || buffer->rlimit[0] == '\r');
5251
           /* BASE is the start of the text not yet reported via CPP_DO_print;
              LINE_COUNT is the number of line breaks seen since BASE, and
              LINE_START is the first character of the current line.  */
5252       const unsigned char *base = buffer->cur;
5253       unsigned line_count = 0;
5254       const unsigned char *line_start = base;
5255
           /* BOL: only whitespace has been seen so far on this line.
              RAW: the literal about to be scanned is a raw string.  */
5256       bool bol = true;
5257       bool raw = false;
5258
           /* Low water mark: backward peeking (do_peek_prev) stops here.  It
              is advanced past each comment and literal once scanned.  */
5259       const unsigned char *lwm = base;
5260       for (const unsigned char *pos = base, *limit = buffer->rlimit;
5261            pos < limit;)
5262         {
5263           unsigned char c = *pos++;
5264           /* This matches the switch in _cpp_lex_direct.  */
5265           switch (c)
5266             {
5267             case ' ': case '\t': case '\f': case '\v':
5268               /* Whitespace, do nothing.  */
5269               break;
5270
5271             case '\r': /* MAC line ending, or Windows \r\n  */
5272               if (*pos == '\n')
5273                 pos++;
5274               /* FALLTHROUGH */
5275
5276             case '\n':
5277               bol = true;
5278
5279             next_line:
5280               CPP_INCREMENT_LINE (pfile, 0);
5281               line_count++;
5282               line_start = pos;
5283               break;
5284
5285             case '\\':
5286               /* <backslash><newline> is removed, and doesn't undo any
5287                  preceding escape or whatnot.  */
5288               if (*pos == '\n')
5289                 {
5290                   pos++;
5291                   goto next_line;
5292                 }
5293               else if (*pos == '\r')
5294                 {
5295                   if (pos[1] == '\n')
5296                     pos++;
5297                   pos++;
5298                   goto next_line;
5299                 }
5300               goto dflt;
5301
5302             case '#':
5303               if (bol)
5304                 {
5305                   /* Line directive.  */
                       /* Report the text scanned before the '#', if any.  */
5306                   if (pos - 1 > base && !pfile->state.skipping)
5307                     cb (pfile, CPP_DO_print, data,
5308                         line_count, base, pos - 1 - base);
5309
5310                   /* Prep things for directive handling. */
5311                   buffer->next_line = pos;
5312                   buffer->need_line = true;
5313                   bool ok = _cpp_get_fresh_line (pfile);
5314                   gcc_checking_assert (ok);
5315
5316                   /* Ensure proper column numbering for generated
5317                      error messages. */
5318                   buffer->line_base -= pos - line_start;
5319
                       /* The second argument is true when something (here,
                          whitespace) preceded the '#' on its line.  */
5320                   _cpp_handle_directive (pfile, line_start + 1 != pos);
5321
5322                   /* Sanitize the line settings.  Duplicate #include's can
5323                      mess things up. */
5324                   // FIXME: Necessary?
5325                   pfile->line_table->highest_location
5326                     = pfile->line_table->highest_line;
5327
5328                   if (!pfile->state.skipping
5329                       && pfile->buffer->next_line < pfile->buffer->rlimit)
5330                     cb (pfile, CPP_DO_location, data,
5331                         pfile->line_table->highest_line);
5332
5333                   goto restart;
5334                 }
5335               goto dflt;
5336
5337             case '/':
5338               {
                     /* PEEK is the character following the '/', looking past
                        any escaped newline.  */
5339                 const unsigned char *peek = do_peek_next (pos, limit);
5340                 if (!(*peek == '/' || *peek == '*'))
5341                   goto dflt;
5342
5343                 /* Line or block comment  */
5344                 bool is_block = *peek == '*';
5345                 bool star = false;
5346                 bool esc = false;
5347                 location_t sloc
5348                   = linemap_position_for_column (pfile->line_table,
5349                                                  pos - line_start);
5350
5351                 while (pos < limit)
5352                   {
5353                     char c = *pos++;
5354                     switch (c)
5355                       {
5356                       case '\\':
5357                         esc = true;
5358                         break;
5359
5360                       case '\r':
5361                         if (*pos == '\n')
5362                           pos++;
5363                         /* FALLTHROUGH  */
5364
5365                       case '\n':
5366                         {
5367                           CPP_INCREMENT_LINE (pfile, 0);
5368                           line_count++;
5369                           line_start = pos;
                               /* An unescaped newline ends a line comment.  */
5370                           if (!esc && !is_block)
5371                             {
5372                               bol = true;
5373                               goto done_comment;
5374                             }
5375                         }
5376                         if (!esc)
5377                           star = false;
5378                         esc = false;
5379                         break;
5380
5381                       case '*':
                             /* PEEK is the '*' that opened a block comment.
                                It must not also count as the star of the
                                closing delimiter, otherwise slash-star-slash
                                would be taken as a complete comment.  POS is
                                already one past the character just read.  */
5382                         if (pos - 1 > peek)
5383                           star = is_block;
5384                         esc = false;
5385                         break;
5386
5387                       case '/':
5388                         if (star)
5389                           goto done_comment;
5390                         /* FALLTHROUGH  */
5391
5392                       default:
5393                         star = false;
5394                         esc = false;
5395                         break;
5396                       }
5397                   }
                     /* Reached only when the buffer ran out inside the
                        comment.  */
5398                 if (pos < limit || is_block)
5399                   cpp_error_with_line (pfile, CPP_DL_ERROR, sloc, 0,
5400                                        "unterminated comment");
5401               done_comment:
5402                 lwm = pos;
5403                 break;
5404               }
5405
5406             case '\'':
5407               if (!CPP_OPTION (pfile, digit_separators))
5408                 goto delimited_string;
5409
5410               /* Possibly a number punctuator.  */
5411               if (!ISIDNUM (*do_peek_next (pos, limit)))
5412                 goto delimited_string;
5413
5414               goto quote_peek;
5415
5416             case '\"':
5417               if (!CPP_OPTION (pfile, rliterals))
5418                 goto delimited_string;
5419
5420             quote_peek:
5421               {
5422                 /* For ' see if it's a number punctuator
5423                    \.?<digit>(<digit>|<identifier-nondigit>
5424                    |'<digit>|'<nondigit>|[eEpP]<sign>|\.)* */
5425                 /* For " see if it's a raw string
5426                    {U,L,u,u8}R.  This includes CPP_NUMBER detection,
5427                    because that could be 0e+R.  */
                     /* Walk backwards from the quote, no further than LWM.  */
5428                 const unsigned char *peek = pos - 1;
5429                 bool quote_first = c == '"';
5430                 bool quote_eight = false;
5431                 bool maybe_number_start = false;
5432                 bool want_number = false;
5433
5434                 while ((peek = do_peek_prev (peek, lwm)))
5435                   {
5436                     unsigned char p = *peek;
5437                     if (quote_first)
5438                       {
5439                         if (!raw)
5440                           {
5441                             if (p != 'R')
5442                               break;
5443                             raw = true;
5444                             continue;
5445                           }
5446
5447                         quote_first = false;
5448                         if (p == 'L' || p == 'U' || p == 'u')
5449                           ;
5450                         else if (p == '8')
5451                           quote_eight = true;
5452                         else
5453                           goto second_raw;
5454                       }
5455                     else if (quote_eight)
5456                       {
5457                         if (p != 'u')
5458                           {
5459                             raw = false;
5460                             break;
5461                           }
5462                         quote_eight = false;
5463                       }
5464                     else if (c == '"')
5465                       {
5466                       second_raw:;
5467                         if (!want_number && ISIDNUM (p))
5468                           {
5469                             raw = false;
5470                             break;
5471                           }
5472                       }
5473
5474                     if (ISDIGIT (p))
5475                       maybe_number_start = true;
5476                     else if (p == '.')
5477                       want_number = true;
5478                     else if (ISIDNUM (p))
5479                       maybe_number_start = false;
5480                     else if (p == '+' || p == '-')
5481                       {
5482                         if (const unsigned char *peek_prev
5483                             = do_peek_prev (peek, lwm))
5484                           {
5485                             p = *peek_prev;
5486                             if (p == 'e' || p == 'E'
5487                                 || p == 'p' || p == 'P')
5488                               {
5489                                 want_number = true;
5490                                 maybe_number_start = false;
5491                               }
5492                             else
5493                               break;
5494                           }
5495                         else
5496                           break;
5497                       }
5498                     else if (p == '\'' || p == '\"')
5499                       {
5500                         /* If this is lwm, this must be the end of a
5501                            previous string.  So this is a trailing
5502                            literal type, (a) if those are allowed,
5503                              and (b) maybe_start is false.  Otherwise
5504                              this must be a CPP_NUMBER because we've
5505                              met another ', and we'd have checked that
5506                              in its own right.  */
5507                         if (peek == lwm && CPP_OPTION (pfile, uliterals))
5508                           {
5509                             if  (!maybe_number_start && !want_number)
5510                               /* Must be a literal type.  */
5511                               raw = false;
5512                           }
5513                         else if (p == '\''
5514                                  && CPP_OPTION (pfile, digit_separators))
5515                           maybe_number_start = true;
5516                         break;
5517                       }
5518                     else if (c == '\'')
5519                       break;
5520                     else if (!quote_first && !quote_eight)
5521                       break;
5522                   }
5523
5524                 if (maybe_number_start)
5525                   {
5526                     if (c == '\'')
5527                       /* A CPP NUMBER.  */
5528                       goto dflt;
5529                     raw = false;
5530                   }
5531
5532                 goto delimited_string;
5533               }
5534
5535             delimited_string:
5536               {
5537                 /* (Possibly raw) string or char literal.  */
                     /* END is the quote character that closes the literal.  */
5538                 unsigned char end = c;
5539                 int delim_len = -1;
5540                 const unsigned char *delim = NULL;
5541                 location_t sloc = linemap_position_for_column (pfile->line_table,
5542                                                                pos - line_start);
5543                 int esc = 0;
5544
5545                 if (raw)
5546                   {
5547                     /* There can be no line breaks in the delimiter.  */
5548                     delim = pos;
5549                     for (delim_len = 0; (c = *pos++) != '('; delim_len++)
5550                       {
                             /* On a bad delimiter, rewind to just after the
                                quote and scan as an ordinary literal.  */
5551                         if (delim_len == 16)
5552                           {
5553                             cpp_error_with_line (pfile, CPP_DL_ERROR,
5554                                                  sloc, 0,
5555                                                  "raw string delimiter"
5556                                                  " longer than %d"
5557                                                  " characters",
5558                                                  delim_len);
5559                             raw = false;
5560                             pos = delim;
5561                             break;
5562                           }
5563                         if (strchr (") \\\t\v\f\n", c))
5564                           {
5565                             cpp_error_with_line (pfile, CPP_DL_ERROR,
5566                                                  sloc, 0,
5567                                                  "invalid character '%c'"
5568                                                  " in raw string"
5569                                                  " delimiter", c);
5570                             raw = false;
5571                             pos = delim;
5572                             break;
5573                           }
5574                         if (pos >= limit)
5575                           goto bad_string;
5576                       }
5577                   }
5578
5579                 while (pos < limit)
5580                   {
5581                     char c = *pos++;
5582                     switch (c)
5583                       {
5584                       case '\\':
5585                         if (!raw)
5586                           esc++;
5587                         break;
5588
5589                       case '\r':
5590                         if (*pos == '\n')
5591                           pos++;
5592                         /* FALLTHROUGH  */
5593
5594                       case '\n':
5595                         {
5596                           CPP_INCREMENT_LINE (pfile, 0);
5597                           line_count++;
5598                           line_start = pos;
5599                         }
5600                         if (esc)
5601                           esc--;
5602                         break;
5603
5604                       case ')':
                             /* The closing quote is at POS[DELIM_LEN]; only
                                it needs to lie before LIMIT, so a raw string
                                ending on the buffer's last character is still
                                recognized as terminated.  */
5605                         if (raw
5606                             && pos + delim_len < limit
5607                             && pos[delim_len] == end
5608                             && !memcmp (delim, pos, delim_len))
5609                           {
5610                             pos += delim_len + 1;
5611                             raw = false;
5612                             goto done_string;
5613                           }
5614                         break;
5615
5616                       default:
                             /* An odd number of pending backslashes escapes
                                the quote.  */
5617                         if (!raw && !(esc & 1) && c == end)
5618                           goto done_string;
5619                         esc = 0;
5620                         break;
5621                       }
5622                   }
5623               bad_string:
5624                 cpp_error_with_line (pfile, CPP_DL_ERROR, sloc, 0,
5625                                      "unterminated literal");
5626
5627               done_string:
5628                 raw = false;
5629                 lwm = pos - 1;
5630               }
5631               goto dflt;
5632
                 /* Characters for which do_peek_module is consulted.  */
5633             case '_':
5634             case 'e':
5635             case 'i':
5636             case 'm':
5637               if (bol && module_p && !pfile->state.skipping
5638                   && do_peek_module (pfile, c, pos, limit))
5639                 {
5640                   /* We've seen the start of a module control line.
5641                      Start up the tokenizer.  */
5642                   pos--; /* Backup over the first character.  */
5643
5644                   /* Backup over whitespace to start of line.  */
5645                   while (pos > line_start
5646                          && (pos[-1] == ' ' || pos[-1] == '\t'))
5647                     pos--;
5648
5649                   if (pos > base)
5650                     cb (pfile, CPP_DO_print, data, line_count, base, pos - base);
5651
5652                   /* Prep things for directive handling. */
5653                   buffer->next_line = pos;
5654                   buffer->need_line = true;
5655
5656                   /* Now get tokens until the PRAGMA_EOL.  */
5657                   do
5658                     {
5659                       location_t spelling;
5660                       const cpp_token *tok
5661                         = cpp_get_token_with_location (pfile, &spelling);
5662
5663                       gcc_assert (pfile->state.in_deferred_pragma
5664                                   || tok->type == CPP_PRAGMA_EOL);
5665                       cb (pfile, CPP_DO_token, data, tok, spelling);
5666                     }
5667                   while (pfile->state.in_deferred_pragma);
5668
5669                   if (pfile->buffer->next_line < pfile->buffer->rlimit)
5670                     cb (pfile, CPP_DO_location, data,
5671                         pfile->line_table->highest_line);
5672
5673                   pfile->mi_valid = false;
5674                   goto restart;
5675                 }
5676               goto dflt;
5677
5678             default:
5679             dflt:
5680               bol = false;
5681               pfile->mi_valid = false;
5682               break;
5683             }
5684         }
5685
           /* End of buffer: report whatever text is still pending.  */
5686       if (buffer->rlimit > base && !pfile->state.skipping)
5687         {
5688           const unsigned char *limit = buffer->rlimit;
5689           /* If the file was not newline terminated, add rlimit, which is
5690              guaranteed to point to a newline, to the end of our range.  */
5691           if (limit[-1] != '\n')
5692             {
5693               limit++;
5694               CPP_INCREMENT_LINE (pfile, 0);
5695               line_count++;
5696             }
5697           cb (pfile, CPP_DO_print, data, line_count, base, limit - base);
5698         }
5699
5700       _cpp_pop_buffer (pfile);
5701     }
5702   while (pfile->buffer);
5703 }