libcpp/lex.cc

   1 /* CPP Library - lexical analysis.
   2    Copyright (C) 2000-2022 Free Software Foundation, Inc.
   3    Contributed by Per Bothner, 1994-95.
   4    Based on CCCP program by Paul Rubin, June 1986
   5    Adapted to ANSI C, Richard Stallman, Jan 1987
   6    Broken out to separate file, Zack Weinberg, Mar 2000
   7
   8 This program is free software; you can redistribute it and/or modify it
   9 under the terms of the GNU General Public License as published by the
  10 Free Software Foundation; either version 3, or (at your option) any
  11 later version.
  12
  13 This program is distributed in the hope that it will be useful,
  14 but WITHOUT ANY WARRANTY; without even the implied warranty of
  15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  16 GNU General Public License for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with this program; see the file COPYING3.  If not see
  20 <http://www.gnu.org/licenses/>.  */
  21
  22 #include "config.h"
  23 #include "system.h"
  24 #include "cpplib.h"
  25 #include "internal.h"
  26
  27 enum spell_type   /* Spelling category recorded for every token type in
                          the token_spellings table.  */
  28 {
  29   SPELL_OPERATOR = 0,   /* Category given to every OP() table entry.  */
  30   SPELL_IDENT,          /* This and the following categories are selected
                                by name from the second argument of TK().  */
  31   SPELL_LITERAL,
  32   SPELL_NONE
  33 };
  34
  35 struct token_spelling   /* One row of the token_spellings table.  */
  36 {
  37   enum spell_type category;   /* Spelling category of the token type.  */
  38   const unsigned char *name;  /* OP() rows: the spelling string given in
                                      the table.  TK() rows: the first macro
                                      argument turned into a string.  */
  39 };
  40
  41 static const unsigned char *const digraph_spellings[] =
     /* Spellings of the six digraphs.  NOTE(review): the element order is
        presumably relied upon by the code that indexes this array, which is
        not visible here; confirm before reordering.  */
  42 { UC"%:", UC"%:%:", UC"<:", UC":>", UC"<%", UC"%>" };
  43
  44 #define OP(e, s) { SPELL_OPERATOR, UC s  },
  45 #define TK(e, s) { SPELL_ ## s,    UC #e },
     /* OP and TK are defined only for the duration of the table below.
        OP(e, s) yields a SPELL_OPERATOR row whose name is the string S;
        TK(e, s) yields a row of category SPELL_<S> whose name is E turned
        into a string.  Expanding TTYPE_TABLE with these definitions builds
        the table, which the TOKEN_SPELL and TOKEN_NAME macros index by a
        token's type.  */
  46 static const struct token_spelling token_spellings[N_TTYPES] = { TTYPE_TABLE };
  47 #undef OP
  48 #undef TK
  49
  50 #define TOKEN_SPELL(token) (token_spellings[(token)->type].category)
  51 #define TOKEN_NAME(token) (token_spellings[(token)->type].name)
     /* Both macros above take a pointer to a token and read the
        token_spellings row selected by the token's type field:
        TOKEN_SPELL gives the spelling category, TOKEN_NAME the name
        string.  */
  52
  53 /* ISO 10646 defines the UCS codespace as the range 0-0x10FFFF inclusive.
        UCS_LIMIT is the upper end of that range.  */
  54 #define UCS_LIMIT 0x10FFFF
  55
  56 static void add_line_note (cpp_buffer *, const uchar *, unsigned int);
     /* The declaration above and those below are forward declarations of
        helpers with internal linkage, so that they can be used before
        their definitions.  */
  57 static int skip_line_comment (cpp_reader *);
  58 static void skip_whitespace (cpp_reader *, cppchar_t);
  59 static void lex_string (cpp_reader *, cpp_token *, const uchar *);
  60 static void save_comment (cpp_reader *, cpp_token *, const uchar *, cppchar_t);
  61 static void store_comment (cpp_reader *, cpp_token *);
  62 static void create_literal (cpp_reader *, cpp_token *, const uchar *,
  63                             unsigned int, enum cpp_ttype);
  64 static bool warn_in_comment (cpp_reader *, _cpp_line_note *);
  65 static int name_p (cpp_reader *, const cpp_string *);
  66 static tokenrun *next_tokenrun (tokenrun *);
  67
  68 static _cpp_buff *new_buff (size_t);
  69
  70
  71 /* Utility routine:
  72
  73    Compares, the token TOKEN to the NUL-terminated string STRING.
  74    TOKEN must be a CPP_NAME.  Returns 1 for equal, 0 for unequal.  */
  75 int
  76 cpp_ideq (const cpp_token *token, const char *string)
  77 {
  78   if (token->type != CPP_NAME)
  79     return 0;
  80
  81   return !ustrcmp (NODE_NAME (token->val.node.node), (const uchar *) string);
  82 }
  83
  84 /* Record a note TYPE at byte POS into the current cleaned logical
  85    line.  */
  86 static void
  87 add_line_note (cpp_buffer *buffer, const uchar *pos, unsigned int type)
  88 {
  89   if (buffer->notes_used == buffer->notes_cap)
  90     {
  91       buffer->notes_cap = buffer->notes_cap * 2 + 200;
  92       buffer->notes = XRESIZEVEC (_cpp_line_note, buffer->notes,
  93                                   buffer->notes_cap);
  94     }
  95
  96   buffer->notes[buffer->notes_used].pos = pos;
  97   buffer->notes[buffer->notes_used].type = type;
  98   buffer->notes_used++;
  99 }
 100
 101 \f
 102 /* Fast path to find line special characters using optimized character
 103    scanning algorithms.  Anything complicated falls back to the slow
 104    path below.  Since this loop is very hot it's worth doing these kinds
 105    of optimizations.
 106
 107    One of the paths through the ifdefs should provide
 108
 109      const uchar *search_line_fast (const uchar *s, const uchar *end);
 110
 111    Between S and END, search for \n, \r, \\, ?.  Return a pointer to
 112    the found character.
 113
 114    Note that the last character of the buffer is *always* a newline,
 115    as forced by _cpp_convert_input.  This fact can be used to avoid
 116    explicitly looking for the end of the buffer.  */
 117
 118 /* Configure gives us an ifdef test.  Give the macro a value of 0 when it
        is not defined, so that it can be tested in ordinary `if' and `#if'
        conditions below.  */
 119 #ifndef WORDS_BIGENDIAN
 120 #define WORDS_BIGENDIAN 0
 121 #endif
 122
 123 /* We'd like the largest integer that fits into a register.  There's nothing
 124    in <stdint.h> that gives us that.  For most hosts this is unsigned long,
 125    but MS decided on an LLP64 model.  Thankfully when building with GCC we
 126    can get the "real" word size.  */
 127 #ifdef __GNUC__
 128 typedef unsigned int word_type __attribute__((__mode__(__word__)));
 129 #else
 130 typedef unsigned long word_type;
 131 #endif
 132
 133 /* The code below is only expecting sizes 4 or 8.
 134    Die at compile-time if this expectation is violated.  The array bound
        evaluates to 1 when the size is acceptable and to -1, which is an
        invalid bound, otherwise.  */
 135 typedef char check_word_type_size
 136   [(sizeof(word_type) == 8 || sizeof(word_type) == 4) * 2 - 1];
 137
 138 /* Return X with the first N bytes forced to values that won't match one
 139    of the interesting characters.  Note that NUL is not interesting.  */
 140
 141 static inline word_type
 142 acc_char_mask_misalign (word_type val, unsigned int n)
 143 {
 144   word_type mask = -1;
 145   if (WORDS_BIGENDIAN)
 146     mask >>= n * 8;
 147   else
 148     mask <<= n * 8;
 149   return val & mask;
 150 }
 151
 152 /* Return X replicated to all byte positions within WORD_TYPE.  */
 153
 154 static inline word_type
 155 acc_char_replicate (uchar x)
 156 {
 157   word_type ret;
 158
 159   ret = (x << 24) | (x << 16) | (x << 8) | x;
 160   if (sizeof(word_type) == 8)
 161     ret = (ret << 16 << 16) | ret;
 162   return ret;
 163 }
 164
 165 /* Return non-zero if some byte of VAL is (probably) C.  */
 166
 167 static inline word_type
 168 acc_char_cmp (word_type val, word_type c)
 169 {
 170 #if defined(__GNUC__) && defined(__alpha__)
 171   /* We can get exact results using a compare-bytes instruction.
 172      Get (val == c) via (0 >= (val ^ c)).  */
 173   return __builtin_alpha_cmpbge (0, val ^ c);
 174 #else
 175   word_type magic = 0x7efefefeU;
 176   if (sizeof(word_type) == 8)
 177     magic = (magic << 16 << 16) | 0xfefefefeU;
 178   magic |= 1;
 179
 180   val ^= c;
 181   return ((val + magic) ^ ~val) & ~magic;
 182 #endif
 183 }
 184
 185 /* Given the result of acc_char_cmp is non-zero, return the index of
 186    the found character.  If this was a false positive, return -1.  */
 187
 188 static inline int
 189 acc_char_index (word_type cmp ATTRIBUTE_UNUSED,
 190                 word_type val ATTRIBUTE_UNUSED)
 191 {
 192 #if defined(__GNUC__) && defined(__alpha__) && !WORDS_BIGENDIAN
 193   /* The cmpbge instruction sets *bits* of the result corresponding to
 194      matches in the bytes with no false positives.  */
 195   return __builtin_ctzl (cmp);
 196 #else
 197   unsigned int i;
 198
 199   /* ??? It would be nice to force unrolling here,
 200      and have all of these constants folded.  */
 201   for (i = 0; i < sizeof(word_type); ++i)
 202     {
 203       uchar c;
 204       if (WORDS_BIGENDIAN)
 205         c = (val >> (sizeof(word_type) - i - 1) * 8) & 0xff;
 206       else
 207         c = (val >> i * 8) & 0xff;
 208
 209       if (c == '\n' || c == '\r' || c == '\\' || c == '?')
 210         return i;
 211     }
 212
 213   return -1;
 214 #endif
 215 }
 216
 217 /* A version of the fast scanner using bit fiddling techniques.
 218
 219    For 32-bit words, one would normally perform 16 comparisons and
 220    16 branches.  With this algorithm one performs 24 arithmetic
 221    operations and one branch.  Whether this is faster with a 32-bit
 222    word size is going to be somewhat system dependent.
 223
 224    For 64-bit words, we eliminate twice the number of comparisons
 225    and branches without increasing the number of arithmetic operations.
 226    It's almost certainly going to be a win with 64-bit word size.  */
 227
 228 static const uchar * search_line_acc_char (const uchar *, const uchar *)
 229   ATTRIBUTE_UNUSED;
 230
 231 static const uchar *
 232 search_line_acc_char (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 233 {
 234   const word_type repl_nl = acc_char_replicate ('\n');
 235   const word_type repl_cr = acc_char_replicate ('\r');
 236   const word_type repl_bs = acc_char_replicate ('\\');
 237   const word_type repl_qm = acc_char_replicate ('?');
 238
 239   unsigned int misalign;
 240   const word_type *p;
 241   word_type val, t;
 242
 243   /* Align the buffer.  Mask out any bytes from before the beginning.  */
 244   p = (word_type *)((uintptr_t)s & -sizeof(word_type));
 245   val = *p;
 246   misalign = (uintptr_t)s & (sizeof(word_type) - 1);
 247   if (misalign)
 248     val = acc_char_mask_misalign (val, misalign);
 249
 250   /* Main loop.  */
 251   while (1)
 252     {
 253       t  = acc_char_cmp (val, repl_nl);
 254       t |= acc_char_cmp (val, repl_cr);
 255       t |= acc_char_cmp (val, repl_bs);
 256       t |= acc_char_cmp (val, repl_qm);
 257
 258       if (__builtin_expect (t != 0, 0))
 259         {
 260           int i = acc_char_index (t, val);
 261           if (i >= 0)
 262             return (const uchar *)p + i;
 263         }
 264
 265       val = *++p;
 266     }
 267 }
 268
 269 /* Disable on Solaris 2/x86 until the following problem can be properly
 270    autoconfed:
 271
 272    The Solaris 10+ assembler tags objects with the instruction set
 273    extensions used, so SSE4.2 executables cannot run on machines that
 274    don't support that extension.  */
 275
 276 #if (GCC_VERSION >= 4005) && (__GNUC__ >= 5 || !defined(__PIC__)) && (defined(__i386__) || defined(__x86_64__)) && !(defined(__sun__) && defined(__svr4__))
 277
 278 /* Replicated character data to be shared between implementations.
 279    Recall that outside of a context with vector support we can't
 280    define compatible vector types, therefore these are all defined
 281    in terms of raw characters.

        Row 0 holds '\n', row 1 '\r', row 2 '\\' and row 3 '?'.  The MMX
        scanner reads the first 8 bytes of a row as one vector and the SSE2
        scanner reads all 16.  */
 282 static const char repl_chars[4][16] __attribute__((aligned(16))) = {
 283   { '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
 284     '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n' },
 285   { '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
 286     '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r' },
 287   { '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
 288     '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\' },
 289   { '?', '?', '?', '?', '?', '?', '?', '?',
 290     '?', '?', '?', '?', '?', '?', '?', '?' },
 291 };
 292
 293 /* A version of the fast scanner using MMX vectorized byte compare insns.
 294
 295    This uses the PMOVMSKB instruction which was introduced with "MMX2",
 296    which was packaged into SSE1; it is also present in the AMD MMX
 297    extension.  Mark the function as using "sse" so that we emit a real
 298    "emms" instruction, rather than the 3dNOW "femms" instruction.

        Returns a pointer to the first '\n', '\r', '\\' or '?' found at or
        after S.  END is unused: the loop has no bounds check and ends only
        when a match is found.  */
 299
 300 static const uchar *
 301 #ifndef __SSE__
 302 __attribute__((__target__("sse")))
 303 #endif
 304 search_line_mmx (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 305 {
 306   typedef char v8qi __attribute__ ((__vector_size__ (8)));
 307   typedef int __m64 __attribute__ ((__vector_size__ (8), __may_alias__));
 308
       /* Only the first 8 bytes of each 16-byte repl_chars row are used.  */
 309   const v8qi repl_nl = *(const v8qi *)repl_chars[0];
 310   const v8qi repl_cr = *(const v8qi *)repl_chars[1];
 311   const v8qi repl_bs = *(const v8qi *)repl_chars[2];
 312   const v8qi repl_qm = *(const v8qi *)repl_chars[3];
 313
 314   unsigned int misalign, found, mask;
 315   const v8qi *p;
 316   v8qi data, t, c;
 317
 318   /* Align the source pointer.  While MMX doesn't generate unaligned data
 319      faults, this allows us to safely scan to the end of the buffer without
 320      reading beyond the end of the last page.  */
 321   misalign = (uintptr_t)s & 7;
 322   p = (const v8qi *)((uintptr_t)s & -8);
 323   data = *p;
 324
 325   /* Create a mask for the bytes that are valid within the first
 326      8-byte block, i.e. the bytes at or after S.  The Idea here is
 327      that the AND with the mask within the loop is "free", since we
         need some AND or TEST
 328      insn in order to set the flags for the branch anyway.  */
 329   mask = -1u << misalign;
 330
 331   /* Main loop processing 8 bytes at a time.  The first block is already
          loaded and has its own mask, so enter the loop body past the load.  */
 332   goto start;
 333   do
 334     {
 335       data = *++p;
 336       mask = -1;   /* Every byte of a later block is valid.  */
 337
 338     start:
           /* T accumulates the byte-wise matches against all four
              characters.  */
 339       t = __builtin_ia32_pcmpeqb(data, repl_nl);
 340       c = __builtin_ia32_pcmpeqb(data, repl_cr);
 341       t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
 342       c = __builtin_ia32_pcmpeqb(data, repl_bs);
 343       t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
 344       c = __builtin_ia32_pcmpeqb(data, repl_qm);
 345       t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
 346       found = __builtin_ia32_pmovmskb (t);
 347       found &= mask;
 348     }
 349   while (!found);
 350
 351   __builtin_ia32_emms ();
 352
 353   /* FOUND contains 1 in bits for which we matched a relevant
 354      character.  Conversion to the byte index is trivial.  */
 355   found = __builtin_ctz(found);
 356   return (const uchar *)p + found;
 357 }
 358
 359 /* A version of the fast scanner using SSE2 vectorized byte compare insns.  */
 360
 361 static const uchar *
 362 #ifndef __SSE2__
 363 __attribute__((__target__("sse2")))
 364 #endif
 365 search_line_sse2 (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 366 {
 367   typedef char v16qi __attribute__ ((__vector_size__ (16)));
 368
 369   const v16qi repl_nl = *(const v16qi *)repl_chars[0];
 370   const v16qi repl_cr = *(const v16qi *)repl_chars[1];
 371   const v16qi repl_bs = *(const v16qi *)repl_chars[2];
 372   const v16qi repl_qm = *(const v16qi *)repl_chars[3];
 373
 374   unsigned int misalign, found, mask;
 375   const v16qi *p;
 376   v16qi data, t;
 377
 378   /* Align the source pointer.  */
 379   misalign = (uintptr_t)s & 15;
 380   p = (const v16qi *)((uintptr_t)s & -16);
 381   data = *p;
 382
 383   /* Create a mask for the bytes that are valid within the first
 384      16-byte block.  The Idea here is that the AND with the mask
 385      within the loop is "free", since we need some AND or TEST
 386      insn in order to set the flags for the branch anyway.  */
 387   mask = -1u << misalign;
 388
 389   /* Main loop processing 16 bytes at a time.  */
 390   goto start;
 391   do
 392     {
 393       data = *++p;
 394       mask = -1;
 395
 396     start:
 397       t  = data == repl_nl;
 398       t |= data == repl_cr;
 399       t |= data == repl_bs;
 400       t |= data == repl_qm;
 401       found = __builtin_ia32_pmovmskb128 (t);
 402       found &= mask;
 403     }
 404   while (!found);
 405
 406   /* FOUND contains 1 in bits for which we matched a relevant
 407      character.  Conversion to the byte index is trivial.  */
 408   found = __builtin_ctz(found);
 409   return (const uchar *)p + found;
 410 }
 411
 412 #ifdef HAVE_SSE4
 413 /* A version of the fast scanner using SSE 4.2 vectorized string insns.

        Returns a pointer to the first '\n', '\r', '?' or '\\' found at or
        after S.  END is used only to decide whether an unaligned 16-byte
        read at S is safe; the main loop has no bounds check and ends only
        when a match is found.  */
 414
 415 static const uchar *
 416 #ifndef __SSE4_2__
 417 __attribute__((__target__("sse4.2")))
 418 #endif
 419 search_line_sse42 (const uchar *s, const uchar *end)
 420 {
 421   typedef char v16qi __attribute__ ((__vector_size__ (16)));
       /* The four characters searched for; the PCMPESTRI operands below
          pass 4 as the length of this set.  */
 422   static const v16qi search = { '\n', '\r', '?', '\\' };
 423
 424   uintptr_t si = (uintptr_t)s;
 425   uintptr_t index;
 426
 427   /* Check for unaligned input.  */
 428   if (si & 15)
 429     {
 430       v16qi sv;
 431
 432       if (__builtin_expect (end - s < 16, 0)
 433           && __builtin_expect ((si & 0xfff) > 0xff0, 0))
 434         {
 435           /* There are less than 16 bytes left in the buffer, and less
 436              than 16 bytes left on the page.  Reading 16 bytes at this
 437              point might generate a spurious page fault.  Defer to the
 438              SSE2 implementation, which already handles alignment.  */
 439           return search_line_sse2 (s, end);
 440         }
 441
 442       /* ??? The builtin doesn't understand that the PCMPESTRI read from
 443          memory need not be aligned.  */
 444       sv = __builtin_ia32_loaddqu ((const char *) s);
 445       index = __builtin_ia32_pcmpestri128 (search, 4, sv, 16, 0);
 446
           /* An index below 16 is treated as a match within these 16 bytes.  */
 447       if (__builtin_expect (index < 16, 0))
 448         goto found;
 449
 450       /* Advance the pointer to an aligned address.  We will re-scan a
 451          few bytes, but we no longer need care for reading past the
 452          end of a page, since we're guaranteed a match.  */
 453       s = (const uchar *)((si + 15) & -16);
 454     }
 455
 456   /* Main loop, processing 16 bytes at a time.  */
 457 #ifdef __GCC_ASM_FLAG_OUTPUTS__
 458   while (1)
 459     {
 460       char f;   /* Receives the carry flag left by the instruction.  */
 461
 462       /* By using inline assembly instead of the builtin,
 463          we can use the result, as well as the flags set.  */
 464       __asm ("%vpcmpestri\t$0, %2, %3"
 465              : "=c"(index), "=@ccc"(f)
 466              : "m"(*s), "x"(search), "a"(4), "d"(16));
           /* The loop ends once the carry flag is set.  */
 467       if (f)
 468         break;
 469
 470       s += 16;
 471     }
 472 #else
       /* The assembly loop advances S before each comparison, so start one
          block early.  */
 473   s -= 16;
 474   /* By doing the whole loop in inline assembly,
 475      we can make proper use of the flags set.  */
 476   __asm (      ".balign 16\n"
 477         "0:     add $16, %1\n"
 478         "       %vpcmpestri\t$0, (%1), %2\n"
 479         "       jnc 0b"
 480         : "=&c"(index), "+r"(s)
 481         : "x"(search), "a"(4), "d"(16));
 482 #endif
 483
 484  found:
       /* INDEX is the offset of the match within the 16 bytes at S.  */
 485   return s + index;
 486 }
 487
 488 #else
 489 /* Work around out-dated assemblers without sse4 support.  Without
        HAVE_SSE4 the SSE4.2 name simply refers to the SSE2 scanner.  */
 490 #define search_line_sse42 search_line_sse2
 491 #endif
 492
 493 /* Check the CPU capabilities.  */
 494
 495 #include "../gcc/config/i386/cpuid.h"
 496
     /* Signature shared by all of the line scanners.  */
 497 typedef const uchar * (*search_line_fast_type) (const uchar *, const uchar *);
     /* The scanner in use; assigned by init_vectorized_lexer.  */
 498 static search_line_fast_type search_line_fast;
 499
     /* Tells _cpp_init_lexer that init_vectorized_lexer exists and must be
        called.  */
 500 #define HAVE_init_vectorized_lexer 1
 501 static inline void
 502 init_vectorized_lexer (void)
 503 {
 504   unsigned dummy, ecx = 0, edx = 0;
 505   search_line_fast_type impl = search_line_acc_char;
 506   int minimum = 0;
 507
 508 #if defined(__SSE4_2__)
 509   minimum = 3;
 510 #elif defined(__SSE2__)
 511   minimum = 2;
 512 #elif defined(__SSE__)
 513   minimum = 1;
 514 #endif
 515
 516   if (minimum == 3)
 517     impl = search_line_sse42;
 518   else if (__get_cpuid (1, &dummy, &dummy, &ecx, &edx) || minimum == 2)
 519     {
 520       if (minimum == 3 || (ecx & bit_SSE4_2))
 521         impl = search_line_sse42;
 522       else if (minimum == 2 || (edx & bit_SSE2))
 523         impl = search_line_sse2;
 524       else if (minimum == 1 || (edx & bit_SSE))
 525         impl = search_line_mmx;
 526     }
 527   else if (__get_cpuid (0x80000001, &dummy, &dummy, &dummy, &edx))
 528     {
 529       if (minimum == 1
 530           || (edx & (bit_MMXEXT | bit_CMOV)) == (bit_MMXEXT | bit_CMOV))
 531         impl = search_line_mmx;
 532     }
 533
 534   search_line_fast = impl;
 535 }
 536
 537 #elif (GCC_VERSION >= 4005) && defined(_ARCH_PWR8) && defined(__ALTIVEC__)
 538
 539 /* A version of the fast scanner using AltiVec vectorized byte compares
 540    and VSX unaligned loads (when VSX is available).  This is otherwise
 541    the same as the AltiVec version.

        Returns a pointer to the first '\n', '\r', '\\' or '?' found at or
        after S.  END is unused: the loop has no bounds check and ends only
        when a match is found.  */
 542
 543 ATTRIBUTE_NO_SANITIZE_UNDEFINED
 544 static const uchar *
 545 search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 546 {
 547   typedef __attribute__((altivec(vector))) unsigned char vc;
 548
 549   const vc repl_nl = {
 550     '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
 551     '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n'
 552   };
 553   const vc repl_cr = {
 554     '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
 555     '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r'
 556   };
 557   const vc repl_bs = {
 558     '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
 559     '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\'
 560   };
 561   const vc repl_qm = {
 562     '?', '?', '?', '?', '?', '?', '?', '?',
 563     '?', '?', '?', '?', '?', '?', '?', '?',
 564   };
 565   const vc zero = { 0 };
 566
 567   vc data, t;
 568
 569   /* Main loop processing 16 bytes at a time.  The load is issued at S
          itself; S is not rounded down to an aligned address first.  */
 570   do
 571     {
 572       vc m_nl, m_cr, m_bs, m_qm;
 573
 574       data = __builtin_vec_vsx_ld (0, s);
 575       s += 16;
 576
 577       m_nl = (vc) __builtin_vec_cmpeq(data, repl_nl);
 578       m_cr = (vc) __builtin_vec_cmpeq(data, repl_cr);
 579       m_bs = (vc) __builtin_vec_cmpeq(data, repl_bs);
 580       m_qm = (vc) __builtin_vec_cmpeq(data, repl_qm);
 581       t = (m_nl | m_cr) | (m_bs | m_qm);
 582
 583       /* T now contains 0xff in bytes for which we matched one of the relevant
 584          characters.  We want to exit the loop if any byte in T is non-zero.
 585          Below is the expansion of vec_any_ne(t, zero).  */
 586     }
 587   while (!__builtin_vec_vcmpeq_p(/*__CR6_LT_REV*/3, t, zero));
 588
 589   /* Restore s to point to the 16 bytes we just processed.  */
 590   s -= 16;
 591
 592   {
       /* Number of longs that make up one vector.  */
 593 #define N  (sizeof(vc) / sizeof(long))
 594
 595     union {
 596       vc v;
 597       /* Statically assert that N is 2 or 4.  */
 598       unsigned long l[(N == 2 || N == 4) ? N : -1];
 599     } u;
 600     unsigned long l, i = 0;
 601
 602     u.v = t;
 603
 604     /* Find the first word of T that is non-zero.  S is advanced past
            every all-zero word that is skipped.  */
 605     switch (N)
 606       {
 607       case 4:
 608         l = u.l[i++];
 609         if (l != 0)
 610           break;
 611         s += sizeof(unsigned long);
 612         l = u.l[i++];
 613         if (l != 0)
 614           break;
 615         s += sizeof(unsigned long);
 616         /* FALLTHRU */
 617       case 2:
 618         l = u.l[i++];
 619         if (l != 0)
 620           break;
 621         s += sizeof(unsigned long);
 622         l = u.l[i];
 623       }
 624
 625     /* L now contains 0xff in bytes for which we matched one of the
 626        relevant characters.  We can find the byte index by finding
 627        its bit index and dividing by 8.  The first byte in memory is at
            the most significant end on big-endian, hence the leading-zero
            count there and the trailing-zero count otherwise.  */
 628 #ifdef __BIG_ENDIAN__
 629     l = __builtin_clzl(l) >> 3;
 630 #else
 631     l = __builtin_ctzl(l) >> 3;
 632 #endif
 633     return s + l;
 634
 635 #undef N
 636   }
 637 }
 638
 639 #elif (GCC_VERSION >= 4005) && defined(__ALTIVEC__) && defined (__BIG_ENDIAN__)
 640
 641 /* A version of the fast scanner using AltiVec vectorized byte compares.
 642    This cannot be used for little endian because vec_lvsl/lvsr are
 643    deprecated for little endian and the code won't work properly.

        Returns a pointer to the first '\n', '\r', '\\' or '?' found at or
        after S.  END is unused: the loop has no bounds check and ends only
        when a match is found.  */
 644 /* ??? Unfortunately, attribute(target("altivec")) is not yet supported,
 645    so we can't compile this function without -maltivec on the command line
 646    (or implied by some other switch).  */
 647
 648 static const uchar *
 649 search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 650 {
 651   typedef __attribute__((altivec(vector))) unsigned char vc;
 652
 653   const vc repl_nl = {
 654     '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
 655     '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n'
 656   };
 657   const vc repl_cr = {
 658     '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
 659     '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r'
 660   };
 661   const vc repl_bs = {
 662     '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
 663     '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\'
 664   };
 665   const vc repl_qm = {
 666     '?', '?', '?', '?', '?', '?', '?', '?',
 667     '?', '?', '?', '?', '?', '?', '?', '?',
 668   };
 669   const vc ones = {
 670     -1, -1, -1, -1, -1, -1, -1, -1,
 671     -1, -1, -1, -1, -1, -1, -1, -1,
 672   };
 673   const vc zero = { 0 };
 674
 675   vc data, mask, t;
 676
 677   /* Altivec loads automatically mask addresses with -16.  This lets us
 678      issue the first load as early as possible.  */
 679   data = __builtin_vec_ld(0, (const vc *)s);
 680
 681   /* Discard bytes before the beginning of the buffer.  Do this by
 682      beginning with all ones and shifting in zeros according to the
 683      mis-alignment.  The LVSR instruction pulls the exact shift we
 684      want from the address.  */
 685   mask = __builtin_vec_lvsr(0, s);
 686   mask = __builtin_vec_perm(zero, ones, mask);
 687   data &= mask;
 688
 689   /* While altivec loads mask addresses, we still need to align S so
 690      that the offset we compute at the end is correct.  */
 691   s = (const uchar *)((uintptr_t)s & -16);
 692
 693   /* Main loop processing 16 bytes at a time.  The first block is already
          loaded and masked, so enter the loop body past the load.  */
 694   goto start;
 695   do
 696     {
 697       vc m_nl, m_cr, m_bs, m_qm;
 698
 699       s += 16;
 700       data = __builtin_vec_ld(0, (const vc *)s);
 701
 702     start:
 703       m_nl = (vc) __builtin_vec_cmpeq(data, repl_nl);
 704       m_cr = (vc) __builtin_vec_cmpeq(data, repl_cr);
 705       m_bs = (vc) __builtin_vec_cmpeq(data, repl_bs);
 706       m_qm = (vc) __builtin_vec_cmpeq(data, repl_qm);
 707       t = (m_nl | m_cr) | (m_bs | m_qm);
 708
 709       /* T now contains 0xff in bytes for which we matched one of the relevant
 710          characters.  We want to exit the loop if any byte in T is non-zero.
 711          Below is the expansion of vec_any_ne(t, zero).  */
 712     }
 713   while (!__builtin_vec_vcmpeq_p(/*__CR6_LT_REV*/3, t, zero));
 714
 715   {
       /* Number of longs that make up one vector.  */
 716 #define N  (sizeof(vc) / sizeof(long))
 717
 718     union {
 719       vc v;
 720       /* Statically assert that N is 2 or 4.  */
 721       unsigned long l[(N == 2 || N == 4) ? N : -1];
 722     } u;
 723     unsigned long l, i = 0;
 724
 725     u.v = t;
 726
 727     /* Find the first word of T that is non-zero.  S is advanced past
            every all-zero word that is skipped.  */
 728     switch (N)
 729       {
 730       case 4:
 731         l = u.l[i++];
 732         if (l != 0)
 733           break;
 734         s += sizeof(unsigned long);
 735         l = u.l[i++];
 736         if (l != 0)
 737           break;
 738         s += sizeof(unsigned long);
 739         /* FALLTHROUGH */
 740       case 2:
 741         l = u.l[i++];
 742         if (l != 0)
 743           break;
 744         s += sizeof(unsigned long);
 745         l = u.l[i];
 746       }
 747
 748     /* L now contains 0xff in bytes for which we matched one of the
 749        relevant characters.  We can find the byte index by finding
 750        its bit index and dividing by 8.  This variant is built for
            big-endian only, so the leading-zero count is used.  */
 751     l = __builtin_clzl(l) >> 3;
 752     return s + l;
 753
 754 #undef N
 755   }
 756 }
 757
 758 #elif defined (__ARM_NEON) && defined (__ARM_64BIT_STATE)
 759 #include "arm_neon.h"
 760
 761 /* This doesn't have to be the exact page size, but no system may use
 762    a size smaller than this.  ARMv8 requires a minimum page size of
 763    4k.  The impact of being conservative here is a small number of
 764    cases will take the slightly slower entry path into the main
 765    loop.  */
 766
 767 #define AARCH64_MIN_PAGE_SIZE 4096
 768
     /* A version of the fast scanner using AArch64 NEON intrinsics.

        Returns a pointer to the first '\n', '\r', '\\' or '?' found at or
        after S.  END is unused: the loop has no bounds check and ends only
        when a match is found.  */
 769 static const uchar *
 770 search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 771 {
 772   const uint8x16_t repl_nl = vdupq_n_u8 ('\n');
 773   const uint8x16_t repl_cr = vdupq_n_u8 ('\r');
 774   const uint8x16_t repl_bs = vdupq_n_u8 ('\\');
 775   const uint8x16_t repl_qm = vdupq_n_u8 ('?');
       /* One distinct bit per byte lane within each 8-byte half; ANDed with
          the compare result before the lanes are summed into a bit mask.  */
 776   const uint8x16_t xmask = (uint8x16_t) vdupq_n_u64 (0x8040201008040201ULL);
 777
 778 #ifdef __ARM_BIG_ENDIAN
 779   const int16x8_t shift = {8, 8, 8, 8, 0, 0, 0, 0};
 780 #else
 781   const int16x8_t shift = {0, 0, 0, 0, 8, 8, 8, 8};
 782 #endif
 783
 784   unsigned int found;
 785   const uint8_t *p;
 786   uint8x16_t data;
 787   uint8x16_t t;
 788   uint16x8_t m;
 789   uint8x16_t u, v, w;
 790
 791   /* Align the source pointer.  */
 792   p = (const uint8_t *)((uintptr_t)s & -16);
 793
 794   /* Assuming random string start positions, with a 4k page size we'll take
 795      the slow path about 0.37% of the time.  */
 796   if (__builtin_expect ((AARCH64_MIN_PAGE_SIZE
 797                          - (((uintptr_t) s) & (AARCH64_MIN_PAGE_SIZE - 1)))
 798                         < 16, 0))
 799     {
 800       /* Slow path: the string starts near a possible page boundary.
              Load the aligned block at P and ignore matches before S.  */
 801       uint32_t misalign, mask;
 802
 803       misalign = (uintptr_t)s & 15;
 804       mask = (-1u << misalign) & 0xffff;
 805       data = vld1q_u8 (p);
 806       t = vceqq_u8 (data, repl_nl);
 807       u = vceqq_u8 (data, repl_cr);
 808       v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
 809       w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
 810       t = vorrq_u8 (v, w);
 811       t = vandq_u8 (t, xmask);
 812       m = vpaddlq_u8 (t);
 813       m = vshlq_u16 (m, shift);
 814       found = vaddvq_u16 (m);
 815       found &= mask;
 816       if (found)
 817         return (const uchar*)p + __builtin_ctz (found);
 818     }
 819   else
 820     {
           /* Fast path: load 16 bytes directly from S, aligned or not.  */
 821       data = vld1q_u8 ((const uint8_t *) s);
 822       t = vceqq_u8 (data, repl_nl);
 823       u = vceqq_u8 (data, repl_cr);
 824       v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
 825       w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
 826       t = vorrq_u8 (v, w);
 827       if (__builtin_expect (vpaddd_u64 ((uint64x2_t)t) != 0, 0))
 828         goto done;
 829     }
 830
       /* No match so far: continue with aligned 16-byte blocks, starting at
          the block after P.  */
 831   do
 832     {
 833       p += 16;
 834       data = vld1q_u8 (p);
 835       t = vceqq_u8 (data, repl_nl);
 836       u = vceqq_u8 (data, repl_cr);
 837       v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
 838       w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
 839       t = vorrq_u8 (v, w);
 840     } while (!vpaddd_u64 ((uint64x2_t)t));
 841
 842 done:
 843   /* Now that we've found the terminating substring, work out precisely where
 844      we need to stop.  P is below S only when the match came from the
          unaligned load at S, in which case the offset is relative to S.  */
 845   t = vandq_u8 (t, xmask);
 846   m = vpaddlq_u8 (t);
 847   m = vshlq_u16 (m, shift);
 848   found = vaddvq_u16 (m);
 849   return (((((uintptr_t) p) < (uintptr_t) s) ? s : (const uchar *)p)
 850           + __builtin_ctz (found));
 851 }
 852
 853 #elif defined (__ARM_NEON)
 854 #include "arm_neon.h"
 855
     /* A version of the fast scanner using ARM NEON intrinsics.

        Returns a pointer to the first '\n', '\r', '\\' or '?' found at or
        after S.  END is unused: the loop has no bounds check and ends only
        when a match is found.  */
 856 static const uchar *
 857 search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 858 {
 859   const uint8x16_t repl_nl = vdupq_n_u8 ('\n');
 860   const uint8x16_t repl_cr = vdupq_n_u8 ('\r');
 861   const uint8x16_t repl_bs = vdupq_n_u8 ('\\');
 862   const uint8x16_t repl_qm = vdupq_n_u8 ('?');
       /* One distinct bit per byte lane within each 8-byte half; ANDed with
          the compare result before the lanes are summed into a bit mask.  */
 863   const uint8x16_t xmask = (uint8x16_t) vdupq_n_u64 (0x8040201008040201ULL);
 864
 865   unsigned int misalign, found, mask;
 866   const uint8_t *p;
 867   uint8x16_t data;
 868
 869   /* Align the source pointer.  */
 870   misalign = (uintptr_t)s & 15;
 871   p = (const uint8_t *)((uintptr_t)s & -16);
 872   data = vld1q_u8 (p);
 873
 874   /* Create a mask for the bytes that are valid within the first
 875      16-byte block.  The Idea here is that the AND with the mask
 876      within the loop is "free", since we need some AND or TEST
 877      insn in order to set the flags for the branch anyway.  */
 878   mask = (-1u << misalign) & 0xffff;
 879
 880   /* Main loop, processing 16 bytes at a time.  The first block is already
          loaded and has its own mask, so enter the loop body past the load.  */
 881   goto start;
 882
 883   do
 884     {
 885       uint8x8_t l;
 886       uint16x4_t m;
 887       uint32x2_t n;
 888       uint8x16_t t, u, v, w;
 889
 890       p += 16;
 891       data = vld1q_u8 (p);
 892       mask = 0xffff;   /* Every byte of a later block is valid.  */
 893
 894     start:
 895       t = vceqq_u8 (data, repl_nl);
 896       u = vceqq_u8 (data, repl_cr);
 897       v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
 898       w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
 899       t = vandq_u8 (vorrq_u8 (v, w), xmask);
           /* Fold the per-lane bits together with pairwise additions.  */
 900       l = vpadd_u8 (vget_low_u8 (t), vget_high_u8 (t));
 901       m = vpaddl_u8 (l);
 902       n = vpaddl_u16 (m);
 903
 904       found = vget_lane_u32 ((uint32x2_t) vorr_u64 ((uint64x1_t) n,
 905               vshr_n_u64 ((uint64x1_t) n, 24)), 0);
 906       found &= mask;
 907     }
 908   while (!found);
 909
 910   /* FOUND contains 1 in bits for which we matched a relevant
 911      character.  Conversion to the byte index is trivial.  */
 912   found = __builtin_ctz (found);
 913   return (const uchar *)p + found;
 914 }
 915
 916 #else
 917
/* Fallback branch of the preprocessor chain above: we only have one
   accelerated alternative.  Make search_line_fast a plain alias for
   search_line_acc_char, i.e. use a direct call so that we encourage
   inlining.  */

#define search_line_fast  search_line_acc_char
 922
 923 #endif
 924
/* Initialize the lexer if needed.  When HAVE_init_vectorized_lexer is
   defined this calls init_vectorized_lexer; otherwise the function
   does nothing.  */

void
_cpp_init_lexer (void)
{
#ifdef HAVE_init_vectorized_lexer
  init_vectorized_lexer ();
#endif
}
 934
/* Returns with a logical line that contains no escaped newlines or
   trigraphs.  This is a time-critical inner loop.

   On entry the line starts at PFILE->buffer->next_line.  The buffer's
   note list is reset and cur and line_base are set to that start.  On
   exit the logical line is terminated by a '\n' stored into the
   buffer, a sentinel note is queued just past it, and next_line points
   one past the last source character consumed.  Escaped newlines and
   trigraphs that are found are recorded with add_line_note.  */
void
_cpp_clean_line (cpp_reader *pfile)
{
  cpp_buffer *buffer;
  /* S is the read position in the source text.  D is the write position
     (the two differ once the slow path starts compacting the line).
     P is a scratch pointer used when scanning backwards for a
     backslash.  */
  const uchar *s;
  uchar c, *d, *p;

  buffer = pfile->buffer;
  buffer->cur_note = buffer->notes_used = 0;
  buffer->cur = buffer->line_base = buffer->next_line;
  buffer->need_line = false;
  s = buffer->next_line;

  if (!buffer->from_stage3)
    {
      /* Most recent backslash seen on the fast path, or NULL if none.  */
      const uchar *pbackslash = NULL;

      /* Fast path.  This is the common case of an un-escaped line with
         no trigraphs.  The primary win here is by not writing any
         data back to memory until we have to.  */
      while (1)
        {
          /* Perform an optimized search for \n, \r, \\, ?.  */
          s = search_line_fast (s, buffer->rlimit);

          c = *s;
          if (c == '\\')
            {
              /* Record the location of the backslash and continue.  */
              pbackslash = s++;
            }
          else if (__builtin_expect (c == '?', 0))
            {
              if (__builtin_expect (s[1] == '?', false)
                   && _cpp_trigraph_map[s[2]])
                {
                  /* Have a trigraph.  We may or may not have to convert
                     it.  Add a line note regardless, for -Wtrigraphs.  */
                  add_line_note (buffer, s, s[2]);
                  if (CPP_OPTION (pfile, trigraphs))
                    {
                      /* We do, and that means we have to switch to the
                         slow path.  Overwrite the first '?' with the
                         replacement character and step S to the last
                         byte of the trigraph; the slow path
                         pre-increments both S and D.  */
                      d = (uchar *) s;
                      *d = _cpp_trigraph_map[s[2]];
                      s += 2;
                      goto slow_path;
                    }
                }
              /* Not a trigraph.  Continue on fast-path.  */
              s++;
            }
          else
            break;
        }

      /* This must be \r or \n.  We're either done, or we'll be forced
         to write back to the buffer and continue on the slow path.  */
      d = (uchar *) s;

      if (__builtin_expect (s == buffer->rlimit, false))
        goto done;

      /* DOS line ending? */
      if (__builtin_expect (c == '\r', false) && s[1] == '\n')
        {
          s++;
          if (s == buffer->rlimit)
            goto done;
        }

      /* Without a backslash on the line the newline cannot be escaped.  */
      if (__builtin_expect (pbackslash == NULL, true))
        goto done;

      /* Check for escaped newline: skip backwards over horizontal
         whitespace and see whether we land on the recorded backslash.  */
      p = d;
      while (is_nvspace (p[-1]))
        p--;
      if (p - 1 != pbackslash)
        goto done;

      /* Have an escaped newline; process it and proceed to
         the slow path.  The note type is ' ' when whitespace separated
         the backslash from the newline and '\\' otherwise.  D is moved
         to just before the backslash so that the next store overwrites
         it.  */
      add_line_note (buffer, p - 1, p != d ? ' ' : '\\');
      d = p - 2;
      buffer->next_line = p - 1;

    slow_path:
      /* Slow path: copy bytes from S to D one at a time, dropping
         escaped newlines and (with -trigraphs) replacing trigraphs.  */
      while (1)
        {
          c = *++s;
          *++d = c;

          if (c == '\n' || c == '\r')
            {
              /* Handle DOS line endings.  */
              if (c == '\r' && s != buffer->rlimit && s[1] == '\n')
                s++;
              if (s == buffer->rlimit)
                break;

              /* Escaped?  Scan back over whitespace, but not past
                 buffer->next_line, which here marks the position of the
                 previously removed escaped newline.  */
              p = d;
              while (p != buffer->next_line && is_nvspace (p[-1]))
                p--;
              if (p == buffer->next_line || p[-1] != '\\')
                break;

              add_line_note (buffer, p - 1, p != d ? ' ': '\\');
              d = p - 2;
              buffer->next_line = p - 1;
            }
          else if (c == '?' && s[1] == '?' && _cpp_trigraph_map[s[2]])
            {
              /* Add a note regardless, for the benefit of -Wtrigraphs.  */
              add_line_note (buffer, d, s[2]);
              if (CPP_OPTION (pfile, trigraphs))
                {
                  *d = _cpp_trigraph_map[s[2]];
                  s += 2;
                }
            }
        }
    }
  else
    {
      /* from_stage3 is set: only locate the end of the line; no
         trigraph or escaped-newline processing is done.  */
      while (*s != '\n' && *s != '\r')
        s++;
      d = (uchar *) s;

      /* Handle DOS line endings.  */
      if (*s == '\r' && s + 1 != buffer->rlimit && s[1] == '\n')
        s++;
    }

 done:
  /* Terminate the logical line at the write position.  */
  *d = '\n';
  /* A sentinel note that should never be processed.  */
  add_line_note (buffer, d + 1, '\n');
  buffer->next_line = s + 1;
}
1078
1079 /* Return true if the trigraph indicated by NOTE should be warned
1080    about in a comment.  */
1081 static bool
1082 warn_in_comment (cpp_reader *pfile, _cpp_line_note *note)
1083 {
1084   const uchar *p;
1085
1086   /* Within comments we don't warn about trigraphs, unless the
1087      trigraph forms an escaped newline, as that may change
1088      behavior.  */
1089   if (note->type != '/')
1090     return false;
1091
1092   /* If -trigraphs, then this was an escaped newline iff the next note
1093      is coincident.  */
1094   if (CPP_OPTION (pfile, trigraphs))
1095     return note[1].pos == note->pos;
1096
1097   /* Otherwise, see if this forms an escaped newline.  */
1098   p = note->pos + 3;
1099   while (is_nvspace (*p))
1100     p++;
1101
1102   /* There might have been escaped newlines between the trigraph and the
1103      newline we found.  Hence the position test.  */
1104   return (*p == '\n' && p < note[1].pos);
1105 }
1106
1107 /* Process the notes created by add_line_note as far as the current
1108    location.  */
1109 void
1110 _cpp_process_line_notes (cpp_reader *pfile, int in_comment)
1111 {
1112   cpp_buffer *buffer = pfile->buffer;
1113
1114   for (;;)
1115     {
1116       _cpp_line_note *note = &buffer->notes[buffer->cur_note];
1117       unsigned int col;
1118
1119       if (note->pos > buffer->cur)
1120         break;
1121
1122       buffer->cur_note++;
1123       col = CPP_BUF_COLUMN (buffer, note->pos + 1);
1124
1125       if (note->type == '\\' || note->type == ' ')
1126         {
1127           if (note->type == ' ' && !in_comment)
1128             cpp_error_with_line (pfile, CPP_DL_WARNING, pfile->line_table->highest_line, col,
1129                                  "backslash and newline separated by space");
1130
1131           if (buffer->next_line > buffer->rlimit)
1132             {
1133               cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line, col,
1134                                    "backslash-newline at end of file");
1135               /* Prevent "no newline at end of file" warning.  */
1136               buffer->next_line = buffer->rlimit;
1137             }
1138
1139           buffer->line_base = note->pos;
1140           CPP_INCREMENT_LINE (pfile, 0);
1141         }
1142       else if (_cpp_trigraph_map[note->type])
1143         {
1144           if (CPP_OPTION (pfile, warn_trigraphs)
1145               && (!in_comment || warn_in_comment (pfile, note)))
1146             {
1147               if (CPP_OPTION (pfile, trigraphs))
1148                 cpp_warning_with_line (pfile, CPP_W_TRIGRAPHS,
1149                                        pfile->line_table->highest_line, col,
1150                                        "trigraph ??%c converted to %c",
1151                                        note->type,
1152                                        (int) _cpp_trigraph_map[note->type]);
1153               else
1154                 {
1155                   cpp_warning_with_line
1156                     (pfile, CPP_W_TRIGRAPHS,
1157                      pfile->line_table->highest_line, col,
1158                      "trigraph ??%c ignored, use -trigraphs to enable",
1159                      note->type);
1160                 }
1161             }
1162         }
1163       else if (note->type == 0)
1164         /* Already processed in lex_raw_string.  */;
1165       else
1166         abort ();
1167     }
1168 }
1169
/* State and helpers for tracking bidirectional control characters
   within a single comment, identifier, string literal or character
   constant.  */
namespace bidi {
  /* The bidi control characters we recognize; see to_str below for the
     code point each one stands for.  NONE means "not a bidi control
     character".  */
  enum class kind {
    NONE, LRE, RLE, LRO, RLO, LRI, RLI, FSI, PDF, PDI, LTR, RTL
  };

  /* All the UTF-8 encodings of bidi characters start with E2.  */
  constexpr uchar utf8_start = 0xe2;

  /* One open bidi context: where it was opened, by which character, how
     it is closed, and how the opening character was spelled.  */
  struct context
  {
    /* Leaves every member uninitialized.  */
    context () {}
    context (location_t loc, kind k, bool pdf, bool ucn)
    : m_loc (loc), m_kind (k), m_pdf (pdf), m_ucn (ucn)
    {
    }

    /* Return the kind of character that closes this context.  */
    kind get_pop_kind () const
    {
      return m_pdf ? kind::PDF : kind::PDI;
    }
    /* Return true if the opening character was written as a UCN.  */
    bool ucn_p () const
    {
      return m_ucn;
    }

    /* Location of the character that opened the context.  */
    location_t m_loc;
    /* The character that opened the context.  */
    kind m_kind;
    /* 1 if the context is closed by PDF, 0 if it is closed by PDI.  */
    unsigned m_pdf : 1;
    /* 1 if the context was opened by a bidi character written as a UCN,
       0 otherwise.  */
    unsigned m_ucn : 1;
  };

  /* A vector holding currently open bidi contexts, innermost last.
     Each element is a context object recording the opening location
     and kind, whether the context is closed by PDF or by PDI, and
     whether it was opened by a bidi character written as a UCN.  */
  semi_embedded_vec <context, 16> vec;

  /* Close the whole comment/identifier/string literal/character constant
     context.  */
  void on_close ()
  {
    vec.truncate (0);
  }

  /* Pop the last element in the vector.  The vector must not be
     empty.  */
  void pop ()
  {
    unsigned int len = vec.count ();
    gcc_checking_assert (len > 0);
    vec.truncate (len - 1);
  }

  /* Return the pop kind of the context of the Ith element.  */
  kind pop_kind_at (unsigned int i)
  {
    return vec[i].get_pop_kind ();
  }

  /* Return the pop kind of the context that is currently opened, or
     kind::NONE if no context is open.  */
  kind current_ctx ()
  {
    unsigned int len = vec.count ();
    if (len == 0)
      return kind::NONE;
    return vec[len - 1].get_pop_kind ();
  }

  /* Return true if the current context comes from a UCN origin, that is,
     the bidi char which started this bidi context was written as a UCN.
     The vector must not be empty.  */
  bool current_ctx_ucn_p ()
  {
    unsigned int len = vec.count ();
    gcc_checking_assert (len > 0);
    return vec[len - 1].m_ucn;
  }

  /* Return the location at which the current context was opened.  The
     vector must not be empty.  */
  location_t current_ctx_loc ()
  {
    unsigned int len = vec.count ();
    gcc_checking_assert (len > 0);
    return vec[len - 1].m_loc;
  }

  /* We've read a bidi char, update the current vector as necessary.
     LOC is only valid when K is not kind::NONE.  */
  void on_char (kind k, bool ucn_p, location_t loc)
  {
    switch (k)
      {
      /* Embeddings and overrides open a context closed by PDF.  */
      case kind::LRE:
      case kind::RLE:
      case kind::LRO:
      case kind::RLO:
        vec.push (context (loc, k, true, ucn_p));
        break;
      /* Isolates open a context closed by PDI.  */
      case kind::LRI:
      case kind::RLI:
      case kind::FSI:
        vec.push (context (loc, k, false, ucn_p));
        break;
      /* PDF terminates the scope of the last LRE, RLE, LRO, or RLO
         whose scope has not yet been terminated.  */
      case kind::PDF:
        if (current_ctx () == kind::PDF)
          pop ();
        break;
      /* PDI terminates the scope of the last LRI, RLI, or FSI whose
         scope has not yet been terminated, as well as the scopes of
         any subsequent LREs, RLEs, LROs, or RLOs whose scopes have not
         yet been terminated.  */
      case kind::PDI:
        /* Search from the innermost context outwards; truncating at I
           drops the isolate and everything opened after it.  */
        for (int i = vec.count () - 1; i >= 0; --i)
          if (pop_kind_at (i) == kind::PDI)
            {
              vec.truncate (i);
              break;
            }
        break;
      case kind::LTR:
      case kind::RTL:
        /* These aren't popped by a PDF/PDI.  */
        break;
      ATTR_LIKELY case kind::NONE:
        break;
      default:
        abort ();
      }
  }

  /* Return a descriptive string for K.  K must not be kind::NONE.  */
  const char *to_str (kind k)
  {
    switch (k)
      {
      case kind::LRE:
        return "U+202A (LEFT-TO-RIGHT EMBEDDING)";
      case kind::RLE:
        return "U+202B (RIGHT-TO-LEFT EMBEDDING)";
      case kind::LRO:
        return "U+202D (LEFT-TO-RIGHT OVERRIDE)";
      case kind::RLO:
        return "U+202E (RIGHT-TO-LEFT OVERRIDE)";
      case kind::LRI:
        return "U+2066 (LEFT-TO-RIGHT ISOLATE)";
      case kind::RLI:
        return "U+2067 (RIGHT-TO-LEFT ISOLATE)";
      case kind::FSI:
        return "U+2068 (FIRST STRONG ISOLATE)";
      case kind::PDF:
        return "U+202C (POP DIRECTIONAL FORMATTING)";
      case kind::PDI:
        return "U+2069 (POP DIRECTIONAL ISOLATE)";
      case kind::LTR:
        return "U+200E (LEFT-TO-RIGHT MARK)";
      case kind::RTL:
        return "U+200F (RIGHT-TO-LEFT MARK)";
      default:
        abort ();
      }
  }
}
1331
1332 /* Get location_t for the range of bytes [START, START + NUM_BYTES)
1333    within the current line in FILE, with the caret at START.  */
1334
1335 static location_t
1336 get_location_for_byte_range_in_cur_line (cpp_reader *pfile,
1337                                          const unsigned char *const start,
1338                                          size_t num_bytes)
1339 {
1340   gcc_checking_assert (num_bytes > 0);
1341
1342   /* CPP_BUF_COLUMN and linemap_position_for_column both refer
1343      to offsets in bytes, but CPP_BUF_COLUMN is 0-based,
1344      whereas linemap_position_for_column is 1-based.  */
1345
1346   /* Get 0-based offsets within the line.  */
1347   size_t start_offset = CPP_BUF_COLUMN (pfile->buffer, start);
1348   size_t end_offset = start_offset + num_bytes - 1;
1349
1350   /* Now convert to location_t, where "columns" are 1-based byte offsets.  */
1351   location_t start_loc = linemap_position_for_column (pfile->line_table,
1352                                                       start_offset + 1);
1353   location_t end_loc = linemap_position_for_column (pfile->line_table,
1354                                                      end_offset + 1);
1355
1356   if (start_loc == end_loc)
1357     return start_loc;
1358
1359   source_range src_range;
1360   src_range.m_start = start_loc;
1361   src_range.m_finish = end_loc;
1362   location_t combined_loc = COMBINE_LOCATION_DATA (pfile->line_table,
1363                                                    start_loc,
1364                                                    src_range,
1365                                                    NULL,
1366                                                    0);
1367   return combined_loc;
1368 }
1369
1370 /* Parse a sequence of 3 bytes starting with P and return its bidi code.  */
1371
1372 static bidi::kind
1373 get_bidi_utf8_1 (const unsigned char *const p)
1374 {
1375   gcc_checking_assert (p[0] == bidi::utf8_start);
1376
1377   if (p[1] == 0x80)
1378     switch (p[2])
1379       {
1380       case 0xaa:
1381         return bidi::kind::LRE;
1382       case 0xab:
1383         return bidi::kind::RLE;
1384       case 0xac:
1385         return bidi::kind::PDF;
1386       case 0xad:
1387         return bidi::kind::LRO;
1388       case 0xae:
1389         return bidi::kind::RLO;
1390       case 0x8e:
1391         return bidi::kind::LTR;
1392       case 0x8f:
1393         return bidi::kind::RTL;
1394       default:
1395         break;
1396       }
1397   else if (p[1] == 0x81)
1398     switch (p[2])
1399       {
1400       case 0xa6:
1401         return bidi::kind::LRI;
1402       case 0xa7:
1403         return bidi::kind::RLI;
1404       case 0xa8:
1405         return bidi::kind::FSI;
1406       case 0xa9:
1407         return bidi::kind::PDI;
1408       default:
1409         break;
1410       }
1411
1412   return bidi::kind::NONE;
1413 }
1414
1415 /* Parse a sequence of 3 bytes starting with P and return its bidi code.
1416    If the kind is not NONE, write the location to *OUT.*/
1417
1418 static bidi::kind
1419 get_bidi_utf8 (cpp_reader *pfile, const unsigned char *const p, location_t *out)
1420 {
1421   bidi::kind result = get_bidi_utf8_1 (p);
1422   if (result != bidi::kind::NONE)
1423     {
1424       /* We have a sequence of 3 bytes starting at P.  */
1425       *out = get_location_for_byte_range_in_cur_line (pfile, p, 3);
1426     }
1427   return result;
1428 }
1429
1430 /* Parse a UCN where P points just past \u or \U and return its bidi code.  */
1431
1432 static bidi::kind
1433 get_bidi_ucn_1 (const unsigned char *p, bool is_U, const unsigned char **end)
1434 {
1435   /* 6.4.3 Universal Character Names
1436       \u hex-quad
1437       \U hex-quad hex-quad
1438       \u { simple-hexadecimal-digit-sequence }
1439      where \unnnn means \U0000nnnn.  */
1440
1441   *end = p + 4;
1442   if (is_U)
1443     {
1444       if (p[0] != '0' || p[1] != '0' || p[2] != '0' || p[3] != '0')
1445         return bidi::kind::NONE;
1446       /* Skip 4B so we can treat \u and \U the same below.  */
1447       p += 4;
1448       *end += 4;
1449     }
1450   else if (p[0] == '{')
1451     {
1452       p++;
1453       while (*p == '0')
1454         p++;
1455       if (p[0] != '2'
1456           || p[1] != '0'
1457           || !ISXDIGIT (p[2])
1458           || !ISXDIGIT (p[3])
1459           || p[4] != '}')
1460         return bidi::kind::NONE;
1461       *end = p + 5;
1462     }
1463
1464   /* All code points we are looking for start with 20xx.  */
1465   if (p[0] != '2' || p[1] != '0')
1466     return bidi::kind::NONE;
1467   else if (p[2] == '2')
1468     switch (p[3])
1469       {
1470       case 'a':
1471       case 'A':
1472         return bidi::kind::LRE;
1473       case 'b':
1474       case 'B':
1475         return bidi::kind::RLE;
1476       case 'c':
1477       case 'C':
1478         return bidi::kind::PDF;
1479       case 'd':
1480       case 'D':
1481         return bidi::kind::LRO;
1482       case 'e':
1483       case 'E':
1484         return bidi::kind::RLO;
1485       default:
1486         break;
1487       }
1488   else if (p[2] == '6')
1489     switch (p[3])
1490       {
1491       case '6':
1492         return bidi::kind::LRI;
1493       case '7':
1494         return bidi::kind::RLI;
1495       case '8':
1496         return bidi::kind::FSI;
1497       case '9':
1498         return bidi::kind::PDI;
1499       default:
1500         break;
1501       }
1502   else if (p[2] == '0')
1503     switch (p[3])
1504       {
1505       case 'e':
1506       case 'E':
1507         return bidi::kind::LTR;
1508       case 'f':
1509       case 'F':
1510         return bidi::kind::RTL;
1511       default:
1512         break;
1513       }
1514
1515   return bidi::kind::NONE;
1516 }
1517
1518 /* Parse a UCN where P points just past \u or \U and return its bidi code.
1519    If the kind is not NONE, write the location to *OUT.  */
1520
1521 static bidi::kind
1522 get_bidi_ucn (cpp_reader *pfile, const unsigned char *p, bool is_U,
1523               location_t *out)
1524 {
1525   const unsigned char *end;
1526   bidi::kind result = get_bidi_ucn_1 (p, is_U, &end);
1527   if (result != bidi::kind::NONE)
1528     {
1529       const unsigned char *start = p - 2;
1530       size_t num_bytes = end - start;
1531       *out = get_location_for_byte_range_in_cur_line (pfile, start, num_bytes);
1532     }
1533   return result;
1534 }
1535
1536 /* Parse a named universal character escape where P points just past \N and
1537    return its bidi code.  If the kind is not NONE, write the location to
1538    *OUT.  */
1539
1540 static bidi::kind
1541 get_bidi_named (cpp_reader *pfile, const unsigned char *p, location_t *out)
1542 {
1543   bidi::kind result = bidi::kind::NONE;
1544   if (*p != '{')
1545     return bidi::kind::NONE;
1546   if (strncmp ((const char *) (p + 1), "LEFT-TO-RIGHT ", 14) == 0)
1547     {
1548       if (strncmp ((const char *) (p + 15), "MARK}", 5) == 0)
1549         result = bidi::kind::LTR;
1550       else if (strncmp ((const char *) (p + 15), "EMBEDDING}", 10) == 0)
1551         result = bidi::kind::LRE;
1552       else if (strncmp ((const char *) (p + 15), "OVERRIDE}", 9) == 0)
1553         result = bidi::kind::LRO;
1554       else if (strncmp ((const char *) (p + 15), "ISOLATE}", 8) == 0)
1555         result = bidi::kind::LRI;
1556     }
1557   else if (strncmp ((const char *) (p + 1), "RIGHT-TO-LEFT ", 14) == 0)
1558     {
1559       if (strncmp ((const char *) (p + 15), "MARK}", 5) == 0)
1560         result = bidi::kind::RTL;
1561       else if (strncmp ((const char *) (p + 15), "EMBEDDING}", 10) == 0)
1562         result = bidi::kind::RLE;
1563       else if (strncmp ((const char *) (p + 15), "OVERRIDE}", 9) == 0)
1564         result = bidi::kind::RLO;
1565       else if (strncmp ((const char *) (p + 15), "ISOLATE}", 8) == 0)
1566         result = bidi::kind::RLI;
1567     }
1568   else if (strncmp ((const char *) (p + 1), "POP DIRECTIONAL ", 16) == 0)
1569     {
1570       if (strncmp ((const char *) (p + 16), "FORMATTING}", 11) == 0)
1571         result = bidi::kind::PDF;
1572       else if (strncmp ((const char *) (p + 16), "ISOLATE}", 8) == 0)
1573         result = bidi::kind::PDI;
1574     }
1575   else if (strncmp ((const char *) (p + 1), "FIRST STRONG ISOLATE}", 21) == 0)
1576     result = bidi::kind::FSI;
1577   if (result != bidi::kind::NONE)
1578     *out = get_location_for_byte_range_in_cur_line (pfile, p - 2,
1579                                                     (strchr ((const char *)
1580                                                              (p + 1), '}')
1581                                                      - (const char *) p)
1582                                                     + 3);
1583   return result;
1584 }
1585
1586 /* Subclass of rich_location for reporting on unpaired UTF-8
1587    bidirectional control character(s).
1588    Escape the source lines on output, and show all unclosed
1589    bidi context, labelling everything.  */
1590
1591 class unpaired_bidi_rich_location : public rich_location
1592 {
1593  public:
1594   class custom_range_label : public range_label
1595   {
1596    public:
1597      label_text get_text (unsigned range_idx) const final override
1598      {
1599        /* range 0 is the primary location; each subsequent range i + 1
1600           is for bidi::vec[i].  */
1601        if (range_idx > 0)
1602          {
1603            const bidi::context &ctxt (bidi::vec[range_idx - 1]);
1604            return label_text::borrow (bidi::to_str (ctxt.m_kind));
1605          }
1606        else
1607          return label_text::borrow (_("end of bidirectional context"));
1608      }
1609   };
1610
1611   unpaired_bidi_rich_location (cpp_reader *pfile, location_t loc)
1612   : rich_location (pfile->line_table, loc, &m_custom_label)
1613   {
1614     set_escape_on_output (true);
1615     for (unsigned i = 0; i < bidi::vec.count (); i++)
1616       add_range (bidi::vec[i].m_loc,
1617                  SHOW_RANGE_WITHOUT_CARET,
1618                  &m_custom_label);
1619   }
1620
1621  private:
1622    custom_range_label m_custom_label;
1623 };
1624
1625 /* We're closing a bidi context, that is, we've encountered a newline,
1626    are closing a C-style comment, or are at the end of a string literal,
1627    character constant, or identifier.  Warn if this context was not
1628    properly terminated by a PDI or PDF.  P points to the last character
1629    in this context.  */
1630
1631 static void
1632 maybe_warn_bidi_on_close (cpp_reader *pfile, const uchar *p)
1633 {
1634   const auto warn_bidi = CPP_OPTION (pfile, cpp_warn_bidirectional);
1635   if (bidi::vec.count () > 0
1636       && (warn_bidi & bidirectional_unpaired
1637           && (!bidi::current_ctx_ucn_p ()
1638               || (warn_bidi & bidirectional_ucn))))
1639     {
1640       const location_t loc
1641         = linemap_position_for_column (pfile->line_table,
1642                                        CPP_BUF_COLUMN (pfile->buffer, p));
1643       unpaired_bidi_rich_location rich_loc (pfile, loc);
1644       /* cpp_callbacks doesn't yet have a way to handle singular vs plural
1645          forms of a diagnostic, so fake it for now.  */
1646       if (bidi::vec.count () > 1)
1647         cpp_warning_at (pfile, CPP_W_BIDIRECTIONAL, &rich_loc,
1648                         "unpaired UTF-8 bidirectional control characters "
1649                         "detected");
1650       else
1651         cpp_warning_at (pfile, CPP_W_BIDIRECTIONAL, &rich_loc,
1652                         "unpaired UTF-8 bidirectional control character "
1653                         "detected");
1654     }
1655   /* We're done with this context.  */
1656   bidi::on_close ();
1657 }
1658
1659 /* We're at the beginning or in the middle of an identifier/comment/string
1660    literal/character constant.  Warn if we've encountered a bidi character.
1661    KIND says which bidi control character it was; UCN_P is true iff this bidi
1662    control character was written as a UCN.  LOC is the location of the
1663    character, but is only valid if KIND != bidi::kind::NONE.  */
1664
1665 static void
1666 maybe_warn_bidi_on_char (cpp_reader *pfile, bidi::kind kind,
1667                          bool ucn_p, location_t loc)
1668 {
1669   if (__builtin_expect (kind == bidi::kind::NONE, 1))
1670     return;
1671
1672   const auto warn_bidi = CPP_OPTION (pfile, cpp_warn_bidirectional);
1673
1674   if (warn_bidi & (bidirectional_unpaired|bidirectional_any))
1675     {
1676       rich_location rich_loc (pfile->line_table, loc);
1677       rich_loc.set_escape_on_output (true);
1678
1679       /* It seems excessive to warn about a PDI/PDF that is closing
1680          an opened context because we've already warned about the
1681          opening character.  Except warn when we have a UCN x UTF-8
1682          mismatch, if UCN checking is enabled.  */
1683       if (kind == bidi::current_ctx ())
1684         {
1685           if (warn_bidi == (bidirectional_unpaired|bidirectional_ucn)
1686               && bidi::current_ctx_ucn_p () != ucn_p)
1687             {
1688               rich_loc.add_range (bidi::current_ctx_loc ());
1689               cpp_warning_at (pfile, CPP_W_BIDIRECTIONAL, &rich_loc,
1690                               "UTF-8 vs UCN mismatch when closing "
1691                               "a context by \"%s\"", bidi::to_str (kind));
1692             }
1693         }
1694       else if (warn_bidi & bidirectional_any
1695                && (!ucn_p || (warn_bidi & bidirectional_ucn)))
1696         {
1697           if (kind == bidi::kind::PDF || kind == bidi::kind::PDI)
1698             cpp_warning_at (pfile, CPP_W_BIDIRECTIONAL, &rich_loc,
1699                             "\"%s\" is closing an unopened context",
1700                             bidi::to_str (kind));
1701           else
1702             cpp_warning_at (pfile, CPP_W_BIDIRECTIONAL, &rich_loc,
1703                             "found problematic Unicode character \"%s\"",
1704                             bidi::to_str (kind));
1705         }
1706     }
1707   /* We're done with this context.  */
1708   bidi::on_char (kind, ucn_p, loc);
1709 }
1710
/* Byte-value bounds used by the UTF-8 checks below: a byte B is
   accepted as a continuation byte when
   utf8_continuation <= B < utf8_signifier, and only a byte
   >= utf8_signifier is treated as the start of a multibyte
   sequence.  */
static const cppchar_t utf8_continuation = 0x80;
static const cppchar_t utf8_signifier = 0xC0;
1713
1714 /* Emit -Winvalid-utf8 warning on invalid UTF-8 character starting
1715    at PFILE->buffer->cur.  Return a pointer after the diagnosed
1716    invalid character.  */
1717
1718 static const uchar *
1719 _cpp_warn_invalid_utf8 (cpp_reader *pfile)
1720 {
1721   cpp_buffer *buffer = pfile->buffer;
1722   const uchar *cur = buffer->cur;
1723   bool pedantic = (CPP_PEDANTIC (pfile)
1724                    && CPP_OPTION (pfile, cpp_warn_invalid_utf8) == 2);
1725
1726   if (cur[0] < utf8_signifier
1727       || cur[1] < utf8_continuation || cur[1] >= utf8_signifier)
1728     {
1729       if (pedantic)
1730         cpp_error_with_line (pfile, CPP_DL_PEDWARN,
1731                              pfile->line_table->highest_line,
1732                              CPP_BUF_COL (buffer),
1733                              "invalid UTF-8 character <%x>",
1734                              cur[0]);
1735       else
1736         cpp_warning_with_line (pfile, CPP_W_INVALID_UTF8,
1737                                pfile->line_table->highest_line,
1738                                CPP_BUF_COL (buffer),
1739                                "invalid UTF-8 character <%x>",
1740                                cur[0]);
1741       return cur + 1;
1742     }
1743   else if (cur[2] < utf8_continuation || cur[2] >= utf8_signifier)
1744     {
1745       if (pedantic)
1746         cpp_error_with_line (pfile, CPP_DL_PEDWARN,
1747                              pfile->line_table->highest_line,
1748                              CPP_BUF_COL (buffer),
1749                              "invalid UTF-8 character <%x><%x>",
1750                              cur[0], cur[1]);
1751       else
1752         cpp_warning_with_line (pfile, CPP_W_INVALID_UTF8,
1753                                pfile->line_table->highest_line,
1754                                CPP_BUF_COL (buffer),
1755                                "invalid UTF-8 character <%x><%x>",
1756                                cur[0], cur[1]);
1757       return cur + 2;
1758     }
1759   else if (cur[3] < utf8_continuation || cur[3] >= utf8_signifier)
1760     {
1761       if (pedantic)
1762         cpp_error_with_line (pfile, CPP_DL_PEDWARN,
1763                              pfile->line_table->highest_line,
1764                              CPP_BUF_COL (buffer),
1765                              "invalid UTF-8 character <%x><%x><%x>",
1766                              cur[0], cur[1], cur[2]);
1767       else
1768         cpp_warning_with_line (pfile, CPP_W_INVALID_UTF8,
1769                                pfile->line_table->highest_line,
1770                                CPP_BUF_COL (buffer),
1771                                "invalid UTF-8 character <%x><%x><%x>",
1772                                cur[0], cur[1], cur[2]);
1773       return cur + 3;
1774     }
1775   else
1776     {
1777       if (pedantic)
1778         cpp_error_with_line (pfile, CPP_DL_PEDWARN,
1779                              pfile->line_table->highest_line,
1780                              CPP_BUF_COL (buffer),
1781                              "invalid UTF-8 character <%x><%x><%x><%x>",
1782                              cur[0], cur[1], cur[2], cur[3]);
1783       else
1784         cpp_warning_with_line (pfile, CPP_W_INVALID_UTF8,
1785                                pfile->line_table->highest_line,
1786                                CPP_BUF_COL (buffer),
1787                                "invalid UTF-8 character <%x><%x><%x><%x>",
1788                                cur[0], cur[1], cur[2], cur[3]);
1789       return cur + 4;
1790     }
1791 }
1792
1793 /* Helper function of *skip_*_comment and lex*_string.  For C,
1794    character at CUR[-1] with MSB set handle -Wbidi-chars* and
1795    -Winvalid-utf8 diagnostics and return pointer to first character
1796    that should be processed next.  */
1797
1798 static inline const uchar *
1799 _cpp_handle_multibyte_utf8 (cpp_reader *pfile, uchar c,
1800                             const uchar *cur, bool warn_bidi_p,
1801                             bool warn_invalid_utf8_p)
1802 {
1803   /* If this is a beginning of a UTF-8 encoding, it might be
1804      a bidirectional control character.  */
1805   if (c == bidi::utf8_start && warn_bidi_p)
1806     {
1807       location_t loc;
1808       bidi::kind kind = get_bidi_utf8 (pfile, cur - 1, &loc);
1809       maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/false, loc);
1810     }
1811   if (!warn_invalid_utf8_p)
1812     return cur;
1813   if (c >= utf8_signifier)
1814     {
1815       cppchar_t s;
1816       const uchar *pstr = cur - 1;
1817       if (_cpp_valid_utf8 (pfile, &pstr, pfile->buffer->rlimit, 0, NULL, &s)
1818           && s <= UCS_LIMIT)
1819         return pstr;
1820     }
1821   pfile->buffer->cur = cur - 1;
1822   return _cpp_warn_invalid_utf8 (pfile);
1823 }
1824
1825 /* Skip a C-style block comment.  We find the end of the comment by
1826    seeing if an asterisk is before every '/' we encounter.  Returns
1827    nonzero if comment terminated by EOF, zero otherwise.
1828
1829    Buffer->cur points to the initial asterisk of the comment.  */
1830 bool
1831 _cpp_skip_block_comment (cpp_reader *pfile)
1832 {
1833   cpp_buffer *buffer = pfile->buffer;
1834   const uchar *cur = buffer->cur;
1835   uchar c;
1836   const bool warn_bidi_p = pfile->warn_bidi_p ();
1837   const bool warn_invalid_utf8_p = CPP_OPTION (pfile, cpp_warn_invalid_utf8);
1838   const bool warn_bidi_or_invalid_utf8_p = warn_bidi_p | warn_invalid_utf8_p;
1839
1840   cur++;
1841   if (*cur == '/')
1842     cur++;
1843
1844   for (;;)
1845     {
1846       /* People like decorating comments with '*', so check for '/'
1847          instead for efficiency.  */
1848       c = *cur++;
1849
1850       if (c == '/')
1851         {
1852           if (cur[-2] == '*')
1853             {
1854               if (warn_bidi_p)
1855                 maybe_warn_bidi_on_close (pfile, cur);
1856               break;
1857             }
1858
1859           /* Warn about potential nested comments, but not if the '/'
1860              comes immediately before the true comment delimiter.
1861              Don't bother to get it right across escaped newlines.  */
1862           if (CPP_OPTION (pfile, warn_comments)
1863               && cur[0] == '*' && cur[1] != '/')
1864             {
1865               buffer->cur = cur;
1866               cpp_warning_with_line (pfile, CPP_W_COMMENTS,
1867                                      pfile->line_table->highest_line,
1868                                      CPP_BUF_COL (buffer),
1869                                      "\"/*\" within comment");
1870             }
1871         }
1872       else if (c == '\n')
1873         {
1874           unsigned int cols;
1875           buffer->cur = cur - 1;
1876           if (warn_bidi_p)
1877             maybe_warn_bidi_on_close (pfile, cur);
1878           _cpp_process_line_notes (pfile, true);
1879           if (buffer->next_line >= buffer->rlimit)
1880             return true;
1881           _cpp_clean_line (pfile);
1882
1883           cols = buffer->next_line - buffer->line_base;
1884           CPP_INCREMENT_LINE (pfile, cols);
1885
1886           cur = buffer->cur;
1887         }
1888       else if (__builtin_expect (c >= utf8_continuation, 0)
1889                && warn_bidi_or_invalid_utf8_p)
1890         cur = _cpp_handle_multibyte_utf8 (pfile, c, cur, warn_bidi_p,
1891                                           warn_invalid_utf8_p);
1892     }
1893
1894   buffer->cur = cur;
1895   _cpp_process_line_notes (pfile, true);
1896   return false;
1897 }
1898
1899 /* Skip a C++ line comment, leaving buffer->cur pointing to the
1900    terminating newline.  Handles escaped newlines.  Returns nonzero
1901    if a multiline comment.  */
1902 static int
1903 skip_line_comment (cpp_reader *pfile)
1904 {
1905   cpp_buffer *buffer = pfile->buffer;
1906   location_t orig_line = pfile->line_table->highest_line;
1907   const bool warn_bidi_p = pfile->warn_bidi_p ();
1908   const bool warn_invalid_utf8_p = CPP_OPTION (pfile, cpp_warn_invalid_utf8);
1909   const bool warn_bidi_or_invalid_utf8_p = warn_bidi_p | warn_invalid_utf8_p;
1910
1911   if (!warn_bidi_or_invalid_utf8_p)
1912     while (*buffer->cur != '\n')
1913       buffer->cur++;
1914   else if (!warn_invalid_utf8_p)
1915     {
1916       while (*buffer->cur != '\n'
1917              && *buffer->cur != bidi::utf8_start)
1918         buffer->cur++;
1919       if (__builtin_expect (*buffer->cur == bidi::utf8_start, 0))
1920         {
1921           while (*buffer->cur != '\n')
1922             {
1923               if (__builtin_expect (*buffer->cur == bidi::utf8_start, 0))
1924                 {
1925                   location_t loc;
1926                   bidi::kind kind = get_bidi_utf8 (pfile, buffer->cur, &loc);
1927                   maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/false, loc);
1928                 }
1929               buffer->cur++;
1930             }
1931           maybe_warn_bidi_on_close (pfile, buffer->cur);
1932         }
1933     }
1934   else
1935     {
1936       while (*buffer->cur != '\n')
1937         {
1938           if (*buffer->cur < utf8_continuation)
1939             {
1940               buffer->cur++;
1941               continue;
1942             }
1943           buffer->cur
1944             = _cpp_handle_multibyte_utf8 (pfile, *buffer->cur, buffer->cur + 1,
1945                                           warn_bidi_p, warn_invalid_utf8_p);
1946         }
1947       if (warn_bidi_p)
1948         maybe_warn_bidi_on_close (pfile, buffer->cur);
1949     }
1950
1951   _cpp_process_line_notes (pfile, true);
1952   return orig_line != pfile->line_table->highest_line;
1953 }
1954
1955 /* Skips whitespace, saving the next non-whitespace character.  */
1956 static void
1957 skip_whitespace (cpp_reader *pfile, cppchar_t c)
1958 {
1959   cpp_buffer *buffer = pfile->buffer;
1960   bool saw_NUL = false;
1961
1962   do
1963     {
1964       /* Horizontal space always OK.  */
1965       if (c == ' ' || c == '\t')
1966         ;
1967       /* Just \f \v or \0 left.  */
1968       else if (c == '\0')
1969         saw_NUL = true;
1970       else if (pfile->state.in_directive && CPP_PEDANTIC (pfile))
1971         cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line,
1972                              CPP_BUF_COL (buffer),
1973                              "%s in preprocessing directive",
1974                              c == '\f' ? "form feed" : "vertical tab");
1975
1976       c = *buffer->cur++;
1977     }
1978   /* We only want non-vertical space, i.e. ' ' \t \f \v \0.  */
1979   while (is_nvspace (c));
1980
1981   if (saw_NUL)
1982     {
1983       encoding_rich_location rich_loc (pfile);
1984       cpp_error_at (pfile, CPP_DL_WARNING, &rich_loc,
1985                     "null character(s) ignored");
1986     }
1987
1988   buffer->cur--;
1989 }
1990
1991 /* See if the characters of a number token are valid in a name (no
1992    '.', '+' or '-').  */
1993 static int
1994 name_p (cpp_reader *pfile, const cpp_string *string)
1995 {
1996   unsigned int i;
1997
1998   for (i = 0; i < string->len; i++)
1999     if (!is_idchar (string->text[i]))
2000       return 0;
2001
2002   return 1;
2003 }
2004
2005 /* After parsing an identifier or other sequence, produce a warning about
2006    sequences not in NFC/NFKC.  */
2007 static void
2008 warn_about_normalization (cpp_reader *pfile,
2009                           const cpp_token *token,
2010                           const struct normalize_state *s,
2011                           bool identifier)
2012 {
2013   if (CPP_OPTION (pfile, warn_normalize) < NORMALIZE_STATE_RESULT (s)
2014       && !pfile->state.skipping)
2015     {
2016       location_t loc = token->src_loc;
2017
2018       /* If possible, create a location range for the token.  */
2019       if (loc >= RESERVED_LOCATION_COUNT
2020           && token->type != CPP_EOF
2021           /* There must be no line notes to process.  */
2022           && (!(pfile->buffer->cur
2023                 >= pfile->buffer->notes[pfile->buffer->cur_note].pos
2024                 && !pfile->overlaid_buffer)))
2025         {
2026           source_range tok_range;
2027           tok_range.m_start = loc;
2028           tok_range.m_finish
2029             = linemap_position_for_column (pfile->line_table,
2030                                            CPP_BUF_COLUMN (pfile->buffer,
2031                                                            pfile->buffer->cur));
2032           loc = COMBINE_LOCATION_DATA (pfile->line_table,
2033                                        loc, tok_range, NULL, 0);
2034         }
2035
2036       encoding_rich_location rich_loc (pfile, loc);
2037
2038       /* Make sure that the token is printed using UCNs, even
2039          if we'd otherwise happily print UTF-8.  */
2040       unsigned char *buf = XNEWVEC (unsigned char, cpp_token_len (token));
2041       size_t sz;
2042
2043       sz = cpp_spell_token (pfile, token, buf, false) - buf;
2044       if (NORMALIZE_STATE_RESULT (s) == normalized_C)
2045         cpp_warning_at (pfile, CPP_W_NORMALIZE, &rich_loc,
2046                         "`%.*s' is not in NFKC", (int) sz, buf);
2047       else if (identifier && CPP_OPTION (pfile, xid_identifiers))
2048         cpp_pedwarning_at (pfile, CPP_W_NORMALIZE, &rich_loc,
2049                                   "`%.*s' is not in NFC", (int) sz, buf);
2050       else
2051         cpp_warning_at (pfile, CPP_W_NORMALIZE, &rich_loc,
2052                         "`%.*s' is not in NFC", (int) sz, buf);
2053       free (buf);
2054     }
2055 }
2056
2057 /* Returns TRUE if the sequence starting at buffer->cur is valid in
2058    an identifier.  FIRST is TRUE if this starts an identifier.  */
2059
2060 static bool
2061 forms_identifier_p (cpp_reader *pfile, int first,
2062                     struct normalize_state *state)
2063 {
2064   cpp_buffer *buffer = pfile->buffer;
2065   const bool warn_bidi_p = pfile->warn_bidi_p ();
2066
2067   if (*buffer->cur == '$')
2068     {
2069       if (!CPP_OPTION (pfile, dollars_in_ident))
2070         return false;
2071
2072       buffer->cur++;
2073       if (CPP_OPTION (pfile, warn_dollars) && !pfile->state.skipping)
2074         {
2075           CPP_OPTION (pfile, warn_dollars) = 0;
2076           cpp_error (pfile, CPP_DL_PEDWARN, "'$' in identifier or number");
2077         }
2078
2079       return true;
2080     }
2081
2082   /* Is this a syntactically valid UCN or a valid UTF-8 char?  */
2083   if (CPP_OPTION (pfile, extended_identifiers))
2084     {
2085       cppchar_t s;
2086       if (*buffer->cur >= utf8_signifier)
2087         {
2088           if (__builtin_expect (*buffer->cur == bidi::utf8_start, 0)
2089               && warn_bidi_p)
2090             {
2091               location_t loc;
2092               bidi::kind kind = get_bidi_utf8 (pfile, buffer->cur, &loc);
2093               maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/false, loc);
2094             }
2095           if (_cpp_valid_utf8 (pfile, &buffer->cur, buffer->rlimit, 1 + !first,
2096                                state, &s))
2097             return true;
2098         }
2099       else if (*buffer->cur == '\\'
2100                && (buffer->cur[1] == 'u'
2101                    || buffer->cur[1] == 'U'
2102                    || buffer->cur[1] == 'N'))
2103         {
2104           buffer->cur += 2;
2105           if (warn_bidi_p)
2106             {
2107               location_t loc;
2108               bidi::kind kind;
2109               if (buffer->cur[-1] == 'N')
2110                 kind = get_bidi_named (pfile, buffer->cur, &loc);
2111               else
2112                 kind = get_bidi_ucn (pfile, buffer->cur,
2113                                      buffer->cur[-1] == 'U', &loc);
2114               maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/true, loc);
2115             }
2116           if (_cpp_valid_ucn (pfile, &buffer->cur, buffer->rlimit, 1 + !first,
2117                               state, &s, NULL, NULL))
2118             return true;
2119           buffer->cur -= 2;
2120         }
2121     }
2122
2123   return false;
2124 }
2125
2126 /* Helper function to issue error about improper __VA_OPT__ use.  */
2127 static void
2128 maybe_va_opt_error (cpp_reader *pfile)
2129 {
2130   if (CPP_PEDANTIC (pfile) && !CPP_OPTION (pfile, va_opt))
2131     {
2132       /* __VA_OPT__ should not be accepted at all, but allow it in
2133          system headers.  */
2134       if (!_cpp_in_system_header (pfile))
2135         cpp_error (pfile, CPP_DL_PEDWARN,
2136                    "__VA_OPT__ is not available until C++20");
2137     }
2138   else if (!pfile->state.va_args_ok)
2139     {
2140       /* __VA_OPT__ should only appear in the replacement list of a
2141          variadic macro.  */
2142       cpp_error (pfile, CPP_DL_PEDWARN,
2143                  "__VA_OPT__ can only appear in the expansion"
2144                  " of a C++20 variadic macro");
2145     }
2146 }
2147
2148 /* Helper function to get the cpp_hashnode of the identifier BASE.  */
2149 static cpp_hashnode *
2150 lex_identifier_intern (cpp_reader *pfile, const uchar *base)
2151 {
2152   cpp_hashnode *result;
2153   const uchar *cur;
2154   unsigned int len;
2155   unsigned int hash = HT_HASHSTEP (0, *base);
2156
2157   cur = base + 1;
2158   while (ISIDNUM (*cur))
2159     {
2160       hash = HT_HASHSTEP (hash, *cur);
2161       cur++;
2162     }
2163   len = cur - base;
2164   hash = HT_HASHFINISH (hash, len);
2165   result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
2166                                               base, len, hash, HT_ALLOC));
2167
2168   /* Rarely, identifiers require diagnostics when lexed.  */
2169   if (__builtin_expect ((result->flags & NODE_DIAGNOSTIC)
2170                         && !pfile->state.skipping, 0))
2171     {
2172       /* It is allowed to poison the same identifier twice.  */
2173       if ((result->flags & NODE_POISONED) && !pfile->state.poisoned_ok)
2174         cpp_error (pfile, CPP_DL_ERROR, "attempt to use poisoned \"%s\"",
2175                    NODE_NAME (result));
2176
2177       /* Constraint 6.10.3.5: __VA_ARGS__ should only appear in the
2178          replacement list of a variadic macro.  */
2179       if (result == pfile->spec_nodes.n__VA_ARGS__
2180           && !pfile->state.va_args_ok)
2181         {
2182           if (CPP_OPTION (pfile, cplusplus))
2183             cpp_error (pfile, CPP_DL_PEDWARN,
2184                        "__VA_ARGS__ can only appear in the expansion"
2185                        " of a C++11 variadic macro");
2186           else
2187             cpp_error (pfile, CPP_DL_PEDWARN,
2188                        "__VA_ARGS__ can only appear in the expansion"
2189                        " of a C99 variadic macro");
2190         }
2191
2192       if (result == pfile->spec_nodes.n__VA_OPT__)
2193         maybe_va_opt_error (pfile);
2194
2195       /* For -Wc++-compat, warn about use of C++ named operators.  */
2196       if (result->flags & NODE_WARN_OPERATOR)
2197         cpp_warning (pfile, CPP_W_CXX_OPERATOR_NAMES,
2198                      "identifier \"%s\" is a special operator name in C++",
2199                      NODE_NAME (result));
2200     }
2201
2202   return result;
2203 }
2204
2205 /* Get the cpp_hashnode of an identifier specified by NAME in
2206    the current cpp_reader object.  If none is found, NULL is returned.  */
2207 cpp_hashnode *
2208 _cpp_lex_identifier (cpp_reader *pfile, const char *name)
2209 {
2210   cpp_hashnode *result;
2211   result = lex_identifier_intern (pfile, (uchar *) name);
2212   return result;
2213 }
2214
2215 /* Lex an identifier starting at BUFFER->CUR - 1.  */
2216 static cpp_hashnode *
2217 lex_identifier (cpp_reader *pfile, const uchar *base, bool starts_ucn,
2218                 struct normalize_state *nst, cpp_hashnode **spelling)
2219 {
2220   cpp_hashnode *result;
2221   const uchar *cur;
2222   unsigned int len;
2223   unsigned int hash = HT_HASHSTEP (0, *base);
2224   const bool warn_bidi_p = pfile->warn_bidi_p ();
2225
2226   cur = pfile->buffer->cur;
2227   if (! starts_ucn)
2228     {
2229       while (ISIDNUM (*cur))
2230         {
2231           hash = HT_HASHSTEP (hash, *cur);
2232           cur++;
2233         }
2234       NORMALIZE_STATE_UPDATE_IDNUM (nst, *(cur - 1));
2235     }
2236   pfile->buffer->cur = cur;
2237   if (starts_ucn || forms_identifier_p (pfile, false, nst))
2238     {
2239       /* Slower version for identifiers containing UCNs
2240          or extended chars (including $).  */
2241       do {
2242         while (ISIDNUM (*pfile->buffer->cur))
2243           {
2244             NORMALIZE_STATE_UPDATE_IDNUM (nst, *pfile->buffer->cur);
2245             pfile->buffer->cur++;
2246           }
2247       } while (forms_identifier_p (pfile, false, nst));
2248       if (warn_bidi_p)
2249         maybe_warn_bidi_on_close (pfile, pfile->buffer->cur);
2250       result = _cpp_interpret_identifier (pfile, base,
2251                                           pfile->buffer->cur - base);
2252       *spelling = cpp_lookup (pfile, base, pfile->buffer->cur - base);
2253     }
2254   else
2255     {
2256       len = cur - base;
2257       hash = HT_HASHFINISH (hash, len);
2258
2259       result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
2260                                                   base, len, hash, HT_ALLOC));
2261       *spelling = result;
2262     }
2263
2264   /* Rarely, identifiers require diagnostics when lexed.  */
2265   if (__builtin_expect ((result->flags & NODE_DIAGNOSTIC)
2266                         && !pfile->state.skipping, 0))
2267     {
2268       /* It is allowed to poison the same identifier twice.  */
2269       if ((result->flags & NODE_POISONED) && !pfile->state.poisoned_ok)
2270         cpp_error (pfile, CPP_DL_ERROR, "attempt to use poisoned \"%s\"",
2271                    NODE_NAME (result));
2272
2273       /* Constraint 6.10.3.5: __VA_ARGS__ should only appear in the
2274          replacement list of a variadic macro.  */
2275       if (result == pfile->spec_nodes.n__VA_ARGS__
2276           && !pfile->state.va_args_ok)
2277         {
2278           if (CPP_OPTION (pfile, cplusplus))
2279             cpp_error (pfile, CPP_DL_PEDWARN,
2280                        "__VA_ARGS__ can only appear in the expansion"
2281                        " of a C++11 variadic macro");
2282           else
2283             cpp_error (pfile, CPP_DL_PEDWARN,
2284                        "__VA_ARGS__ can only appear in the expansion"
2285                        " of a C99 variadic macro");
2286         }
2287
2288       /* __VA_OPT__ should only appear in the replacement list of a
2289          variadic macro.  */
2290       if (result == pfile->spec_nodes.n__VA_OPT__)
2291         maybe_va_opt_error (pfile);
2292
2293       /* For -Wc++-compat, warn about use of C++ named operators.  */
2294       if (result->flags & NODE_WARN_OPERATOR)
2295         cpp_warning (pfile, CPP_W_CXX_OPERATOR_NAMES,
2296                      "identifier \"%s\" is a special operator name in C++",
2297                      NODE_NAME (result));
2298     }
2299
2300   return result;
2301 }
2302
2303 /* Lex a number to NUMBER starting at BUFFER->CUR - 1.  */
2304 static void
2305 lex_number (cpp_reader *pfile, cpp_string *number,
2306             struct normalize_state *nst)
2307 {
2308   const uchar *cur;
2309   const uchar *base;
2310   uchar *dest;
2311
2312   base = pfile->buffer->cur - 1;
2313   do
2314     {
2315       const uchar *adj_digit_sep = NULL;
2316       cur = pfile->buffer->cur;
2317
2318       /* N.B. ISIDNUM does not include $.  */
2319       while (ISIDNUM (*cur)
2320              || (*cur == '.' && !DIGIT_SEP (cur[-1]))
2321              || DIGIT_SEP (*cur)
2322              || (VALID_SIGN (*cur, cur[-1]) && !DIGIT_SEP (cur[-2])))
2323         {
2324           NORMALIZE_STATE_UPDATE_IDNUM (nst, *cur);
2325           /* Adjacent digit separators do not form part of the pp-number syntax.
2326              However, they can safely be diagnosed here as an error, since '' is
2327              not a valid preprocessing token.  */
2328           if (DIGIT_SEP (*cur) && DIGIT_SEP (cur[-1]) && !adj_digit_sep)
2329             adj_digit_sep = cur;
2330           cur++;
2331         }
2332       /* A number can't end with a digit separator.  */
2333       while (cur > pfile->buffer->cur && DIGIT_SEP (cur[-1]))
2334         --cur;
2335       if (adj_digit_sep && adj_digit_sep < cur)
2336         cpp_error (pfile, CPP_DL_ERROR, "adjacent digit separators");
2337
2338       pfile->buffer->cur = cur;
2339     }
2340   while (forms_identifier_p (pfile, false, nst));
2341
2342   number->len = cur - base;
2343   dest = _cpp_unaligned_alloc (pfile, number->len + 1);
2344   memcpy (dest, base, number->len);
2345   dest[number->len] = '\0';
2346   number->text = dest;
2347 }
2348
2349 /* Create a token of type TYPE with a literal spelling.  */
2350 static void
2351 create_literal (cpp_reader *pfile, cpp_token *token, const uchar *base,
2352                 unsigned int len, enum cpp_ttype type)
2353 {
2354   token->type = type;
2355   token->val.str.len = len;
2356   token->val.str.text = cpp_alloc_token_string (pfile, base, len);
2357 }
2358
2359 const uchar *
2360 cpp_alloc_token_string (cpp_reader *pfile,
2361                         const unsigned char *ptr, unsigned len)
2362 {
2363   uchar *dest = _cpp_unaligned_alloc (pfile, len + 1);
2364
2365   dest[len] = 0;
2366   memcpy (dest, ptr, len);
2367   return dest;
2368 }
2369
2370 /* A pair of raw buffer pointers.  The currently open one is [1], the
2371    first one is [0].  Used for string literal lexing.  */
2372 struct lit_accum {
2373   _cpp_buff *first;
2374   _cpp_buff *last;
2375   const uchar *rpos;
2376   size_t accum;
2377
2378   lit_accum ()
2379     : first (NULL), last (NULL), rpos (0), accum (0)
2380   {
2381   }
2382
2383   void append (cpp_reader *, const uchar *, size_t);
2384
2385   void read_begin (cpp_reader *);
2386   bool reading_p () const
2387   {
2388     return rpos != NULL;
2389   }
2390   char read_char ()
2391   {
2392     char c = *rpos++;
2393     if (rpos == BUFF_FRONT (last))
2394       rpos = NULL;
2395     return c;
2396   }
2397 };
2398
2399 /* Subroutine of lex_raw_string: Append LEN chars from BASE to the buffer
2400    sequence from *FIRST_BUFF_P to LAST_BUFF_P.  */
2401
2402 void
2403 lit_accum::append (cpp_reader *pfile, const uchar *base, size_t len)
2404 {
2405   if (!last)
2406     /* Starting.  */
2407     first = last = _cpp_get_buff (pfile, len);
2408   else if (len > BUFF_ROOM (last))
2409     {
2410       /* There is insufficient room in the buffer.  Copy what we can,
2411          and then either extend or create a new one.  */
2412       size_t room = BUFF_ROOM (last);
2413       memcpy (BUFF_FRONT (last), base, room);
2414       BUFF_FRONT (last) += room;
2415       base += room;
2416       len -= room;
2417       accum += room;
2418
2419       gcc_checking_assert (!rpos);
2420
2421       last = _cpp_append_extend_buff (pfile, last, len);
2422     }
2423
2424   memcpy (BUFF_FRONT (last), base, len);
2425   BUFF_FRONT (last) += len;
2426   accum += len;
2427 }
2428
2429 void
2430 lit_accum::read_begin (cpp_reader *pfile)
2431 {
2432   /* We never accumulate more than 4 chars to read.  */
2433   if (BUFF_ROOM (last) < 4)
2434
2435     last = _cpp_append_extend_buff (pfile, last, 4);
2436   rpos = BUFF_FRONT (last);
2437 }
2438
2439 /* Returns true if a macro has been defined.
2440    This might not work if compile with -save-temps,
2441    or preprocess separately from compilation.  */
2442
2443 static bool
2444 is_macro(cpp_reader *pfile, const uchar *base)
2445 {
2446   const uchar *cur = base;
2447   if (! ISIDST (*cur))
2448     return false;
2449   unsigned int hash = HT_HASHSTEP (0, *cur);
2450   ++cur;
2451   while (ISIDNUM (*cur))
2452     {
2453       hash = HT_HASHSTEP (hash, *cur);
2454       ++cur;
2455     }
2456   hash = HT_HASHFINISH (hash, cur - base);
2457
2458   cpp_hashnode *result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
2459                                         base, cur - base, hash, HT_NO_INSERT));
2460
2461   return result && cpp_macro_p (result);
2462 }
2463
2464 /* Returns true if a literal suffix does not have the expected form
2465    and is defined as a macro.  */
2466
2467 static bool
2468 is_macro_not_literal_suffix(cpp_reader *pfile, const uchar *base)
2469 {
2470   /* User-defined literals outside of namespace std must start with a single
2471      underscore, so assume anything of that form really is a UDL suffix.
2472      We don't need to worry about UDLs defined inside namespace std because
2473      their names are reserved, so cannot be used as macro names in valid
2474      programs.  */
2475   if (base[0] == '_' && base[1] != '_')
2476     return false;
2477   return is_macro (pfile, base);
2478 }
2479
2480 /* Lexes a raw string.  The stored string contains the spelling,
2481    including double quotes, delimiter string, '(' and ')', any leading
2482    'L', 'u', 'U' or 'u8' and 'R' modifier.  The created token contains
2483    the type of the literal, or CPP_OTHER if it was not properly
2484    terminated.
2485
2486    BASE is the start of the token.  Updates pfile->buffer->cur to just
2487    after the lexed string.
2488
2489    The spelling is NUL-terminated, but it is not guaranteed that this
2490    is the first NUL since embedded NULs are preserved.  */
2491
2492 static void
2493 lex_raw_string (cpp_reader *pfile, cpp_token *token, const uchar *base)
2494 {
2495   const uchar *pos = base;
2496   const bool warn_bidi_p = pfile->warn_bidi_p ();
2497   const bool warn_invalid_utf8_p = CPP_OPTION (pfile, cpp_warn_invalid_utf8);
2498   const bool warn_bidi_or_invalid_utf8_p = warn_bidi_p | warn_invalid_utf8_p;
2499
2500   /* 'tis a pity this information isn't passed down from the lexer's
2501      initial categorization of the token.  */
2502   enum cpp_ttype type = CPP_STRING;
2503
2504   if (*pos == 'L')
2505     {
2506       type = CPP_WSTRING;
2507       pos++;
2508     }
2509   else if (*pos == 'U')
2510     {
2511       type = CPP_STRING32;
2512       pos++;
2513     }
2514   else if (*pos == 'u')
2515     {
2516       if (pos[1] == '8')
2517         {
2518           type = CPP_UTF8STRING;
2519           pos++;
2520         }
2521       else
2522         type = CPP_STRING16;
2523       pos++;
2524     }
2525
2526   gcc_checking_assert (pos[0] == 'R' && pos[1] == '"');
2527   pos += 2;
2528
2529   _cpp_line_note *note = &pfile->buffer->notes[pfile->buffer->cur_note];
2530
2531   /* Skip notes before the ".  */
2532   while (note->pos < pos)
2533     ++note;
2534
2535   lit_accum accum;
2536
2537   uchar prefix[17];
2538   unsigned prefix_len = 0;
2539   enum Phase
2540   {
2541    PHASE_PREFIX = -2,
2542    PHASE_NONE = -1,
2543    PHASE_SUFFIX = 0
2544   } phase = PHASE_PREFIX;
2545
2546   for (;;)
2547     {
2548       gcc_checking_assert (note->pos >= pos);
2549
2550       /* Undo any escaped newlines and trigraphs.  */
2551       if (!accum.reading_p () && note->pos == pos)
2552         switch (note->type)
2553           {
2554           case '\\':
2555           case ' ':
2556             /* Restore backslash followed by newline.  */
2557             accum.append (pfile, base, pos - base);
2558             base = pos;
2559             accum.read_begin (pfile);
2560             accum.append (pfile, UC"\\", 1);
2561
2562           after_backslash:
2563             if (note->type == ' ')
2564               /* GNU backslash whitespace newline extension.  FIXME
2565                  could be any sequence of non-vertical space.  When we
2566                  can properly restore any such sequence, we should
2567                  mark this note as handled so _cpp_process_line_notes
2568                  doesn't warn.  */
2569               accum.append (pfile, UC" ", 1);
2570
2571             accum.append (pfile, UC"\n", 1);
2572             note++;
2573             break;
2574
2575           case '\n':
2576             /* This can happen for ??/<NEWLINE> when trigraphs are not
2577                being interpretted.  */
2578             gcc_checking_assert (!CPP_OPTION (pfile, trigraphs));
2579             note->type = 0;
2580             note++;
2581             break;
2582
2583           default:
2584             gcc_checking_assert (_cpp_trigraph_map[note->type]);
2585
2586             /* Don't warn about this trigraph in
2587                _cpp_process_line_notes, since trigraphs show up as
2588                trigraphs in raw strings.  */
2589             uchar type = note->type;
2590             note->type = 0;
2591
2592             if (CPP_OPTION (pfile, trigraphs))
2593               {
2594                 accum.append (pfile, base, pos - base);
2595                 base = pos;
2596                 accum.read_begin (pfile);
2597                 accum.append (pfile, UC"??", 2);
2598                 accum.append (pfile, &type, 1);
2599
2600                 /* ??/ followed by newline gets two line notes, one for
2601                    the trigraph and one for the backslash/newline.  */
2602                 if (type == '/' && note[1].pos == pos)
2603                   {
2604                     note++;
2605                     gcc_assert (note->type == '\\' || note->type == ' ');
2606                     goto after_backslash;
2607                   }
2608                 /* Skip the replacement character.  */
2609                 base = ++pos;
2610               }
2611
2612             note++;
2613             break;
2614           }
2615
2616       /* Now get a char to process.  Either from an expanded note, or
2617          from the line buffer.  */
2618       bool read_note = accum.reading_p ();
2619       char c = read_note ? accum.read_char () : *pos++;
2620
2621       if (phase == PHASE_PREFIX)
2622         {
2623           if (c == '(')
2624             {
2625               /* Done.  */
2626               phase = PHASE_NONE;
2627               prefix[prefix_len++] = '"';
2628             }
2629           else if (prefix_len < 16
2630                    /* Prefix chars are any of the basic character set,
2631                       [lex.charset] except for '
2632                       ()\\\t\v\f\n'. Optimized for a contiguous
2633                       alphabet.  */
2634                    /* Unlike a switch, this collapses down to one or
2635                       two shift and bitmask operations on an ASCII
2636                       system, with an outlier or two.   */
2637                    && (('Z' - 'A' == 25
2638                         ? ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z'))
2639                         : ISIDST (c))
2640                        || (c >= '0' && c <= '9')
2641                        || c == '_' || c == '{' || c == '}'
2642                        || c == '[' || c == ']' || c == '#'
2643                        || c == '<' || c == '>' || c == '%'
2644                        || c == ':' || c == ';' || c == '.' || c == '?'
2645                        || c == '*' || c == '+' || c == '-' || c == '/'
2646                        || c == '^' || c == '&' || c == '|' || c == '~'
2647                        || c == '!' || c == '=' || c == ','
2648                        || c == '"' || c == '\''))
2649             prefix[prefix_len++] = c;
2650           else
2651             {
2652               /* Something is wrong.  */
2653               int col = CPP_BUF_COLUMN (pfile->buffer, pos) + read_note;
2654               if (prefix_len == 16)
2655                 cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc,
2656                                      col, "raw string delimiter longer "
2657                                      "than 16 characters");
2658               else if (c == '\n')
2659                 cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc,
2660                                      col, "invalid new-line in raw "
2661                                      "string delimiter");
2662               else
2663                 cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc,
2664                                      col, "invalid character '%c' in "
2665                                      "raw string delimiter", c);
2666               type = CPP_OTHER;
2667               phase = PHASE_NONE;
2668               /* Continue until we get a close quote, that's probably
2669                  the best failure mode.  */
2670               prefix_len = 0;
2671             }
2672           if (c != '\n')
2673             continue;
2674         }
2675
2676       if (phase != PHASE_NONE)
2677         {
2678           if (prefix[phase] != c)
2679             phase = PHASE_NONE;
2680           else if (unsigned (phase + 1) == prefix_len)
2681             break;
2682           else
2683             {
2684               phase = Phase (phase + 1);
2685               continue;
2686             }
2687         }
2688
2689       if (!prefix_len && c == '"')
2690         /* Failure mode lexing.  */
2691         goto out;
2692       else if (prefix_len && c == ')')
2693         phase = PHASE_SUFFIX;
2694       else if (!read_note && c == '\n')
2695         {
2696           pos--;
2697           pfile->buffer->cur = pos;
2698           if (pfile->state.in_directive
2699               || (pfile->state.parsing_args
2700                   && pfile->buffer->next_line >= pfile->buffer->rlimit))
2701             {
2702               cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc, 0,
2703                                    "unterminated raw string");
2704               type = CPP_OTHER;
2705               goto out;
2706             }
2707
2708           accum.append (pfile, base, pos - base + 1);
2709           _cpp_process_line_notes (pfile, false);
2710
2711           if (pfile->buffer->next_line < pfile->buffer->rlimit)
2712             CPP_INCREMENT_LINE (pfile, 0);
2713           pfile->buffer->need_line = true;
2714
2715           if (!_cpp_get_fresh_line (pfile))
2716             {
2717               /* We ran out of file and failed to get a line.  */
2718               location_t src_loc = token->src_loc;
2719               token->type = CPP_EOF;
2720               /* Tell the compiler the line number of the EOF token.  */
2721               token->src_loc = pfile->line_table->highest_line;
2722               token->flags = BOL;
2723               if (accum.first)
2724                 _cpp_release_buff (pfile, accum.first);
2725               cpp_error_with_line (pfile, CPP_DL_ERROR, src_loc, 0,
2726                                    "unterminated raw string");
2727               /* Now pop the buffer that _cpp_get_fresh_line did not.  */
2728               _cpp_pop_buffer (pfile);
2729               return;
2730             }
2731
2732           pos = base = pfile->buffer->cur;
2733           note = &pfile->buffer->notes[pfile->buffer->cur_note];
2734         }
2735       else if (__builtin_expect ((unsigned char) c >= utf8_continuation, 0)
2736                && warn_bidi_or_invalid_utf8_p)
2737         pos = _cpp_handle_multibyte_utf8 (pfile, c, pos, warn_bidi_p,
2738                                           warn_invalid_utf8_p);
2739     }
2740
2741   if (warn_bidi_p)
2742     maybe_warn_bidi_on_close (pfile, pos);
2743
2744   if (CPP_OPTION (pfile, user_literals))
2745     {
2746       /* If a string format macro, say from inttypes.h, is placed touching
2747          a string literal it could be parsed as a C++11 user-defined string
2748          literal thus breaking the program.  */
2749       if (is_macro_not_literal_suffix (pfile, pos))
2750         {
2751           /* Raise a warning, but do not consume subsequent tokens.  */
2752           if (CPP_OPTION (pfile, warn_literal_suffix) && !pfile->state.skipping)
2753             cpp_warning_with_line (pfile, CPP_W_LITERAL_SUFFIX,
2754                                    token->src_loc, 0,
2755                                    "invalid suffix on literal; C++11 requires "
2756                                    "a space between literal and string macro");
2757         }
2758       /* Grab user defined literal suffix.  */
2759       else if (ISIDST (*pos))
2760         {
2761           type = cpp_userdef_string_add_type (type);
2762           ++pos;
2763
2764           while (ISIDNUM (*pos))
2765             ++pos;
2766         }
2767     }
2768
2769  out:
2770   pfile->buffer->cur = pos;
2771   if (!accum.accum)
2772     create_literal (pfile, token, base, pos - base, type);
2773   else
2774     {
2775       size_t extra_len = pos - base;
2776       uchar *dest = _cpp_unaligned_alloc (pfile, accum.accum + extra_len + 1);
2777
2778       token->type = type;
2779       token->val.str.len = accum.accum + extra_len;
2780       token->val.str.text = dest;
2781       for (_cpp_buff *buf = accum.first; buf; buf = buf->next)
2782         {
2783           size_t len = BUFF_FRONT (buf) - buf->base;
2784           memcpy (dest, buf->base, len);
2785           dest += len;
2786         }
2787       _cpp_release_buff (pfile, accum.first);
2788       memcpy (dest, base, extra_len);
2789       dest[extra_len] = '\0';
2790     }
2791 }
2792
2793 /* Lexes a string, character constant, or angle-bracketed header file
2794    name.  The stored string contains the spelling, including opening
2795    quote and any leading 'L', 'u', 'U' or 'u8' and optional
2796    'R' modifier.  It returns the type of the literal, or CPP_OTHER
2797    if it was not properly terminated, or CPP_LESS for an unterminated
2798    header name which must be relexed as normal tokens.
2799
2800    The spelling is NUL-terminated, but it is not guaranteed that this
2801    is the first NUL since embedded NULs are preserved.  */
2802 static void
2803 lex_string (cpp_reader *pfile, cpp_token *token, const uchar *base)
2804 {
2805   bool saw_NUL = false;
2806   const uchar *cur;
2807   cppchar_t terminator;
2808   enum cpp_ttype type;
2809
2810   cur = base;
2811   terminator = *cur++;
2812   if (terminator == 'L' || terminator == 'U')
2813     terminator = *cur++;
2814   else if (terminator == 'u')
2815     {
2816       terminator = *cur++;
2817       if (terminator == '8')
2818         terminator = *cur++;
2819     }
2820   if (terminator == 'R')
2821     {
2822       lex_raw_string (pfile, token, base);
2823       return;
2824     }
2825   if (terminator == '"')
2826     type = (*base == 'L' ? CPP_WSTRING :
2827             *base == 'U' ? CPP_STRING32 :
2828             *base == 'u' ? (base[1] == '8' ? CPP_UTF8STRING : CPP_STRING16)
2829                          : CPP_STRING);
2830   else if (terminator == '\'')
2831     type = (*base == 'L' ? CPP_WCHAR :
2832             *base == 'U' ? CPP_CHAR32 :
2833             *base == 'u' ? (base[1] == '8' ? CPP_UTF8CHAR : CPP_CHAR16)
2834                          : CPP_CHAR);
2835   else
2836     terminator = '>', type = CPP_HEADER_NAME;
2837
2838   const bool warn_bidi_p = pfile->warn_bidi_p ();
2839   const bool warn_invalid_utf8_p = CPP_OPTION (pfile, cpp_warn_invalid_utf8);
2840   const bool warn_bidi_or_invalid_utf8_p = warn_bidi_p | warn_invalid_utf8_p;
2841   for (;;)
2842     {
2843       cppchar_t c = *cur++;
2844
2845       /* In #include-style directives, terminators are not escapable.  */
2846       if (c == '\\' && !pfile->state.angled_headers && *cur != '\n')
2847         {
2848           if ((cur[0] == 'u' || cur[0] == 'U' || cur[0] == 'N') && warn_bidi_p)
2849             {
2850               location_t loc;
2851               bidi::kind kind;
2852               if (cur[0] == 'N')
2853                 kind = get_bidi_named (pfile, cur + 1, &loc);
2854               else
2855                 kind = get_bidi_ucn (pfile, cur + 1, cur[0] == 'U', &loc);
2856               maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/true, loc);
2857             }
2858           cur++;
2859         }
2860       else if (c == terminator)
2861         {
2862           if (warn_bidi_p)
2863             maybe_warn_bidi_on_close (pfile, cur - 1);
2864           break;
2865         }
2866       else if (c == '\n')
2867         {
2868           cur--;
2869           /* Unmatched quotes always yield undefined behavior, but
2870              greedy lexing means that what appears to be an unterminated
2871              header name may actually be a legitimate sequence of tokens.  */
2872           if (terminator == '>')
2873             {
2874               token->type = CPP_LESS;
2875               return;
2876             }
2877           type = CPP_OTHER;
2878           break;
2879         }
2880       else if (c == '\0')
2881         saw_NUL = true;
2882       else if (__builtin_expect (c >= utf8_continuation, 0)
2883                && warn_bidi_or_invalid_utf8_p)
2884         cur = _cpp_handle_multibyte_utf8 (pfile, c, cur, warn_bidi_p,
2885                                           warn_invalid_utf8_p);
2886     }
2887
2888   if (saw_NUL && !pfile->state.skipping)
2889     cpp_error (pfile, CPP_DL_WARNING,
2890                "null character(s) preserved in literal");
2891
2892   if (type == CPP_OTHER && CPP_OPTION (pfile, lang) != CLK_ASM)
2893     cpp_error (pfile, CPP_DL_PEDWARN, "missing terminating %c character",
2894                (int) terminator);
2895
2896   if (CPP_OPTION (pfile, user_literals))
2897     {
2898       /* If a string format macro, say from inttypes.h, is placed touching
2899          a string literal it could be parsed as a C++11 user-defined string
2900          literal thus breaking the program.  */
2901       if (is_macro_not_literal_suffix (pfile, cur))
2902         {
2903           /* Raise a warning, but do not consume subsequent tokens.  */
2904           if (CPP_OPTION (pfile, warn_literal_suffix) && !pfile->state.skipping)
2905             cpp_warning_with_line (pfile, CPP_W_LITERAL_SUFFIX,
2906                                    token->src_loc, 0,
2907                                    "invalid suffix on literal; C++11 requires "
2908                                    "a space between literal and string macro");
2909         }
2910       /* Grab user defined literal suffix.  */
2911       else if (ISIDST (*cur))
2912         {
2913           type = cpp_userdef_char_add_type (type);
2914           type = cpp_userdef_string_add_type (type);
2915           ++cur;
2916
2917           while (ISIDNUM (*cur))
2918             ++cur;
2919         }
2920     }
2921   else if (CPP_OPTION (pfile, cpp_warn_cxx11_compat)
2922            && is_macro (pfile, cur)
2923            && !pfile->state.skipping)
2924     cpp_warning_with_line (pfile, CPP_W_CXX11_COMPAT,
2925                            token->src_loc, 0, "C++11 requires a space "
2926                            "between string literal and macro");
2927
2928   pfile->buffer->cur = cur;
2929   create_literal (pfile, token, base, cur - base, type);
2930 }
2931
2932 /* Return the comment table. The client may not make any assumption
2933    about the ordering of the table.  */
2934 cpp_comment_table *
2935 cpp_get_comments (cpp_reader *pfile)
2936 {
2937   return &pfile->comments;
2938 }
2939
2940 /* Append a comment to the end of the comment table. */
2941 static void
2942 store_comment (cpp_reader *pfile, cpp_token *token)
2943 {
2944   int len;
2945
2946   if (pfile->comments.allocated == 0)
2947     {
2948       pfile->comments.allocated = 256;
2949       pfile->comments.entries = (cpp_comment *) xmalloc
2950         (pfile->comments.allocated * sizeof (cpp_comment));
2951     }
2952
2953   if (pfile->comments.count == pfile->comments.allocated)
2954     {
2955       pfile->comments.allocated *= 2;
2956       pfile->comments.entries = (cpp_comment *) xrealloc
2957         (pfile->comments.entries,
2958          pfile->comments.allocated * sizeof (cpp_comment));
2959     }
2960
2961   len = token->val.str.len;
2962
2963   /* Copy comment. Note, token may not be NULL terminated. */
2964   pfile->comments.entries[pfile->comments.count].comment =
2965     (char *) xmalloc (sizeof (char) * (len + 1));
2966   memcpy (pfile->comments.entries[pfile->comments.count].comment,
2967           token->val.str.text, len);
2968   pfile->comments.entries[pfile->comments.count].comment[len] = '\0';
2969
2970   /* Set source location. */
2971   pfile->comments.entries[pfile->comments.count].sloc = token->src_loc;
2972
2973   /* Increment the count of entries in the comment table. */
2974   pfile->comments.count++;
2975 }
2976
2977 /* The stored comment includes the comment start and any terminator.  */
2978 static void
2979 save_comment (cpp_reader *pfile, cpp_token *token, const unsigned char *from,
2980               cppchar_t type)
2981 {
2982   unsigned char *buffer;
2983   unsigned int len, clen, i;
2984
2985   len = pfile->buffer->cur - from + 1; /* + 1 for the initial '/'.  */
2986
2987   /* C++ comments probably (not definitely) have moved past a new
2988      line, which we don't want to save in the comment.  */
2989   if (is_vspace (pfile->buffer->cur[-1]))
2990     len--;
2991
2992   /* If we are currently in a directive or in argument parsing, then
2993      we need to store all C++ comments as C comments internally, and
2994      so we need to allocate a little extra space in that case.
2995
2996      Note that the only time we encounter a directive here is
2997      when we are saving comments in a "#define".  */
2998   clen = ((pfile->state.in_directive || pfile->state.parsing_args)
2999           && type == '/') ? len + 2 : len;
3000
3001   buffer = _cpp_unaligned_alloc (pfile, clen);
3002
3003   token->type = CPP_COMMENT;
3004   token->val.str.len = clen;
3005   token->val.str.text = buffer;
3006
3007   buffer[0] = '/';
3008   memcpy (buffer + 1, from, len - 1);
3009
3010   /* Finish conversion to a C comment, if necessary.  */
3011   if ((pfile->state.in_directive || pfile->state.parsing_args) && type == '/')
3012     {
3013       buffer[1] = '*';
3014       buffer[clen - 2] = '*';
3015       buffer[clen - 1] = '/';
3016       /* As there can be in a C++ comments illegal sequences for C comments
3017          we need to filter them out.  */
3018       for (i = 2; i < (clen - 2); i++)
3019         if (buffer[i] == '/' && (buffer[i - 1] == '*' || buffer[i + 1] == '*'))
3020           buffer[i] = '|';
3021     }
3022
3023   /* Finally store this comment for use by clients of libcpp. */
3024   store_comment (pfile, token);
3025 }
3026
/* Returns true if comment at COMMENT_START is a recognized FALLTHROUGH
   comment.

   COMMENT_START is compared against '*' to tell a C block comment from
   a C++ line comment, and matching starts at COMMENT_START + 1.  All
   length checks are made against PFILE->buffer->cur (presumably the
   position just past the comment; confirm against the caller).

   What is accepted depends on the cpp_warn_implicit_fallthrough
   option:
     0 and any value not listed below: nothing is recognized.
     1: every comment is recognized.
     2: the comment only has to contain a match of the case-insensitive
        regex quoted in the code.
     3 and 4: the whole comment must match one of the forms listed in
        the comments below; the loosest set of forms is only tried when
        the option is not 4.
   For 3 and 4 the match must be followed directly by the end of the
   comment: the closing star-slash for a block comment, a newline for a
   line comment.  */

static bool
fallthrough_comment_p (cpp_reader *pfile, const unsigned char *comment_start)
{
  /* Cursor over the comment text; starts one past COMMENT_START.  */
  const unsigned char *from = comment_start + 1;

  switch (CPP_OPTION (pfile, cpp_warn_implicit_fallthrough))
    {
      /* For both -Wimplicit-fallthrough=0 and -Wimplicit-fallthrough=5 we
         don't recognize any comments.  The latter only checks attributes,
         the former doesn't warn.  */
    case 0:
    default:
      return false;
      /* -Wimplicit-fallthrough=1 considers any comment, no matter what
         content it has.  */
    case 1:
      return true;
    case 2:
      /* -Wimplicit-fallthrough=2 looks for (case insensitive)
         .*falls?[ \t-]*thr(u|ough).* regex.  */
      /* Keep trying start positions while at least strlen ("fallthru")
         bytes remain before buffer->cur.  */
      for (; (size_t) (pfile->buffer->cur - from) >= sizeof "fallthru" - 1;
           from++)
        {
          /* Is there anything like strpbrk with upper boundary, or
             memchr looking for 2 characters rather than just one?  */
          if (from[0] != 'f' && from[0] != 'F')
            continue;
          if (from[1] != 'a' && from[1] != 'A')
            continue;
          if (from[2] != 'l' && from[2] != 'L')
            continue;
          if (from[3] != 'l' && from[3] != 'L')
            continue;
          /* "fall" matched.  FROM is advanced in place from here on, so
             a `continue' below resumes the scan after the partial match
             rather than at the byte following its start.  */
          from += sizeof "fall" - 1;
          /* Optional 's', then any run of blanks, tabs and dashes.  */
          if (from[0] == 's' || from[0] == 'S')
            from++;
          while (*from == ' ' || *from == '\t' || *from == '-')
            from++;
          if (from[0] != 't' && from[0] != 'T')
            continue;
          if (from[1] != 'h' && from[1] != 'H')
            continue;
          if (from[2] != 'r' && from[2] != 'R')
            continue;
          /* "thru" completes the match; otherwise require "through".  */
          if (from[3] == 'u' || from[3] == 'U')
            return true;
          if (from[3] != 'o' && from[3] != 'O')
            continue;
          if (from[4] != 'u' && from[4] != 'U')
            continue;
          if (from[5] != 'g' && from[5] != 'G')
            continue;
          if (from[6] != 'h' && from[6] != 'H')
            continue;
          return true;
        }
      return false;
      /* Levels 3 and 4 share the whole-comment matching below.  */
    case 3:
    case 4:
      break;
    }

  /* Whole comment contents:
     -fallthrough
     @fallthrough@
   */
  if (*from == '-' || *from == '@')
    {
      size_t len = sizeof "fallthrough" - 1;
      /* Need LEN bytes after the leading '-' or '@'.  */
      if ((size_t) (pfile->buffer->cur - from - 1) < len)
        return false;
      if (memcmp (from + 1, "fallthrough", len))
        return false;
      if (*from == '@')
        {
          /* The '@' form needs a closing '@' as well; count it in LEN
             so that it is skipped below.  */
          if (from[len + 1] != '@')
            return false;
          len++;
        }
      from += 1 + len;
    }
  /* Whole comment contents (regex):
     lint -fallthrough[ \t]*
   */
  else if (*from == 'l')
    {
      /* The leading 'l' is already known; compare the rest.  */
      size_t len = sizeof "int -fallthrough" - 1;
      if ((size_t) (pfile->buffer->cur - from - 1) < len)
        return false;
      if (memcmp (from + 1, "int -fallthrough", len))
        return false;
      from += 1 + len;
      while (*from == ' ' || *from == '\t')
        from++;
    }
  /* Whole comment contents (regex):
     [ \t]*FALLTHR(U|OUGH)[ \t]*
   */
  else if (CPP_OPTION (pfile, cpp_warn_implicit_fallthrough) == 4)
    {
      while (*from == ' ' || *from == '\t')
        from++;
      if ((size_t) (pfile->buffer->cur - from)  < sizeof "FALLTHRU" - 1)
        return false;
      if (memcmp (from, "FALLTHR", sizeof "FALLTHR" - 1))
        return false;
      from += sizeof "FALLTHR" - 1;
      /* Either "U" or "OUGH" must follow.  */
      if (*from == 'U')
        from++;
      else if ((size_t) (pfile->buffer->cur - from)  < sizeof "OUGH" - 1)
        return false;
      else if (memcmp (from, "OUGH", sizeof "OUGH" - 1))
        return false;
      else
        from += sizeof "OUGH" - 1;
      while (*from == ' ' || *from == '\t')
        from++;
    }
  /* Whole comment contents (regex):
     [ \t.!]*(ELSE,? |INTENTIONAL(LY)? )?FALL(S | |-)?THR(OUGH|U)[ \t.!]*(-[^\n\r]*)?
     [ \t.!]*(Else,? |Intentional(ly)? )?Fall((s | |-)[Tt]|t)hr(ough|u)[ \t.!]*(-[^\n\r]*)?
     [ \t.!]*([Ee]lse,? |[Ii]ntentional(ly)? )?fall(s | |-)?thr(ough|u)[ \t.!]*(-[^\n\r]*)?
   */
  else
    {
      while (*from == ' ' || *from == '\t' || *from == '.' || *from == '!')
        from++;
      /* F holds the first letter of the word currently being matched;
         ALL_UPPER is set once a word was seen in its all-capitals
         spelling.  Together they enforce the case rules of the three
         regexes above.  */
      unsigned char f = *from;
      bool all_upper = false;
      if (f == 'E' || f == 'e')
        {
          /* Optional "else" prefix: "ELSE", "Else" or "else".  */
          if ((size_t) (pfile->buffer->cur - from)
              < sizeof "else fallthru" - 1)
            return false;
          if (f == 'E' && memcmp (from + 1, "LSE", sizeof "LSE" - 1) == 0)
            all_upper = true;
          else if (memcmp (from + 1, "lse", sizeof "lse" - 1))
            return false;
          from += sizeof "else" - 1;
          /* Optional comma, then exactly one mandatory blank.  */
          if (*from == ',')
            from++;
          if (*from != ' ')
            return false;
          from++;
          /* "ELSE fall..." and "else Fall..." are rejected.  */
          if (all_upper && *from == 'f')
            return false;
          if (f == 'e' && *from == 'F')
            return false;
          f = *from;
        }
      else if (f == 'I' || f == 'i')
        {
          /* Optional "intentional" / "intentionally" prefix.  */
          if ((size_t) (pfile->buffer->cur - from)
              < sizeof "intentional fallthru" - 1)
            return false;
          if (f == 'I' && memcmp (from + 1, "NTENTIONAL",
                                  sizeof "NTENTIONAL" - 1) == 0)
            all_upper = true;
          else if (memcmp (from + 1, "ntentional",
                           sizeof "ntentional" - 1))
            return false;
          from += sizeof "intentional" - 1;
          if (*from == ' ')
            {
              from++;
              if (all_upper && *from == 'f')
                return false;
            }
          else if (all_upper)
            {
              /* "INTENTIONALLY " must be followed by a capital 'F';
                 only the "LY " part is skipped, leaving FROM at it.  */
              if (memcmp (from, "LY F", sizeof "LY F" - 1))
                return false;
              from += sizeof "LY " - 1;
            }
          else
            {
              if (memcmp (from, "ly ", sizeof "ly " - 1))
                return false;
              from += sizeof "ly " - 1;
            }
          /* "intentional(ly) Fall..." is rejected.  */
          if (f == 'i' && *from == 'F')
            return false;
          f = *from;
        }
      /* Now the word "fall" itself, in one of its three spellings.  */
      if (f != 'F' && f != 'f')
        return false;
      if ((size_t) (pfile->buffer->cur - from) < sizeof "fallthru" - 1)
        return false;
      if (f == 'F' && memcmp (from + 1, "ALL", sizeof "ALL" - 1) == 0)
        all_upper = true;
      else if (all_upper)
        /* An all-capitals prefix requires "FALL".  */
        return false;
      else if (memcmp (from + 1, "all", sizeof "all" - 1))
        return false;
      from += sizeof "fall" - 1;
      /* Optional separator: "s " (in matching case), a blank or a dash;
         with no separator the next byte must already be the 't'/'T' of
         the second word.  */
      if (*from == (all_upper ? 'S' : 's') && from[1] == ' ')
        from += 2;
      else if (*from == ' ' || *from == '-')
        from++;
      else if (*from != (all_upper ? 'T' : 't'))
        return false;
      /* The second word must start with 'T', accepted only when the
         first word started with a capital 'F', or with 't', accepted
         only when not in the all-capitals form.  */
      if ((f == 'f' || *from != 'T') && (all_upper || *from != 't'))
        return false;
      if ((size_t) (pfile->buffer->cur - from) < sizeof "thru" - 1)
        return false;
      /* Try "thru" first and fall back to "through".  */
      if (memcmp (from + 1, all_upper ? "HRU" : "hru", sizeof "hru" - 1))
        {
          if ((size_t) (pfile->buffer->cur - from) < sizeof "through" - 1)
            return false;
          if (memcmp (from + 1, all_upper ? "HROUGH" : "hrough",
                      sizeof "hrough" - 1))
            return false;
          from += sizeof "through" - 1;
        }
      else
        from += sizeof "thru" - 1;
      while (*from == ' ' || *from == '\t' || *from == '.' || *from == '!')
        from++;
      /* Optional trailing "-..." remark that runs to the end of the
         comment or of the line.  */
      if (*from == '-')
        {
          from++;
          if (*comment_start == '*')
            {
              /* Block comment: stop at NUL, newline, carriage return or
                 at the closing star-slash; a '*' not followed by '/' is
                 skipped over.  */
              do
                {
                  while (*from && *from != '*'
                         && *from != '\n' && *from != '\r')
                    from++;
                  if (*from != '*' || from[1] == '/')
                    break;
                  from++;
                }
              while (1);
            }
          else
            /* Line comment: stop at NUL, newline or carriage return.  */
            while (*from && *from != '\n' && *from != '\r')
              from++;
        }
    }
  /* Whatever form matched, nothing else may follow it in the comment.  */
  /* C block comment.  */
  if (*comment_start == '*')
    {
      if (*from != '*' || from[1] != '/')
        return false;
    }
  /* C++ line comment.  */
  else if (*from != '\n')
    return false;

  return true;
}
3281
3282 /* Allocate COUNT tokens for RUN.  */
3283 void
3284 _cpp_init_tokenrun (tokenrun *run, unsigned int count)
3285 {
3286   run->base = XNEWVEC (cpp_token, count);
3287   run->limit = run->base + count;
3288   run->next = NULL;
3289 }
3290
3291 /* Returns the next tokenrun, or creates one if there is none.  */
3292 static tokenrun *
3293 next_tokenrun (tokenrun *run)
3294 {
3295   if (run->next == NULL)
3296     {
3297       run->next = XNEW (tokenrun);
3298       run->next->prev = run;
3299       _cpp_init_tokenrun (run->next, 250);
3300     }
3301
3302   return run->next;
3303 }
3304
3305 /* Return the number of not yet processed token in a given
3306    context.  */
3307 int
3308 _cpp_remaining_tokens_num_in_context (cpp_context *context)
3309 {
3310   if (context->tokens_kind == TOKENS_KIND_DIRECT)
3311     return (LAST (context).token - FIRST (context).token);
3312   else if (context->tokens_kind == TOKENS_KIND_INDIRECT
3313            || context->tokens_kind == TOKENS_KIND_EXTENDED)
3314     return (LAST (context).ptoken - FIRST (context).ptoken);
3315   else
3316       abort ();
3317 }
3318
3319 /* Returns the token present at index INDEX in a given context.  If
3320    INDEX is zero, the next token to be processed is returned.  */
3321 static const cpp_token*
3322 _cpp_token_from_context_at (cpp_context *context, int index)
3323 {
3324   if (context->tokens_kind == TOKENS_KIND_DIRECT)
3325     return &(FIRST (context).token[index]);
3326   else if (context->tokens_kind == TOKENS_KIND_INDIRECT
3327            || context->tokens_kind == TOKENS_KIND_EXTENDED)
3328     return FIRST (context).ptoken[index];
3329  else
3330    abort ();
3331 }
3332
3333 /* Look ahead in the input stream.  */
3334 const cpp_token *
3335 cpp_peek_token (cpp_reader *pfile, int index)
3336 {
3337   cpp_context *context = pfile->context;
3338   const cpp_token *peektok;
3339   int count;
3340
3341   /* First, scan through any pending cpp_context objects.  */
3342   while (context->prev)
3343     {
3344       ptrdiff_t sz = _cpp_remaining_tokens_num_in_context (context);
3345
3346       if (index < (int) sz)
3347         return _cpp_token_from_context_at (context, index);
3348       index -= (int) sz;
3349       context = context->prev;
3350     }
3351
3352   /* We will have to read some new tokens after all (and do so
3353      without invalidating preceding tokens).  */
3354   count = index;
3355   pfile->keep_tokens++;
3356
3357   /* For peeked tokens temporarily disable line_change reporting,
3358      until the tokens are parsed for real.  */
3359   void (*line_change) (cpp_reader *, const cpp_token *, int)
3360     = pfile->cb.line_change;
3361   pfile->cb.line_change = NULL;
3362
3363   do
3364     {
3365       peektok = _cpp_lex_token (pfile);
3366       if (peektok->type == CPP_EOF)
3367         {
3368           index--;
3369           break;
3370         }
3371       else if (peektok->type == CPP_PRAGMA)
3372         {
3373           /* Don't peek past a pragma.  */
3374           if (peektok == &pfile->directive_result)
3375             /* Save the pragma in the buffer.  */
3376             *pfile->cur_token++ = *peektok;
3377           index--;
3378           break;
3379         }
3380     }
3381   while (index--);
3382
3383   _cpp_backup_tokens_direct (pfile, count - index);
3384   pfile->keep_tokens--;
3385   pfile->cb.line_change = line_change;
3386
3387   return peektok;
3388 }
3389
3390 /* Allocate a single token that is invalidated at the same time as the
3391    rest of the tokens on the line.  Has its line and col set to the
3392    same as the last lexed token, so that diagnostics appear in the
3393    right place.  */
3394 cpp_token *
3395 _cpp_temp_token (cpp_reader *pfile)
3396 {
3397   cpp_token *old, *result;
3398   ptrdiff_t sz = pfile->cur_run->limit - pfile->cur_token;
3399   ptrdiff_t la = (ptrdiff_t) pfile->lookaheads;
3400
3401   old = pfile->cur_token - 1;
3402   /* Any pre-existing lookaheads must not be clobbered.  */
3403   if (la)
3404     {
3405       if (sz <= la)
3406         {
3407           tokenrun *next = next_tokenrun (pfile->cur_run);
3408
3409           if (sz < la)
3410             memmove (next->base + 1, next->base,
3411                      (la - sz) * sizeof (cpp_token));
3412
3413           next->base[0] = pfile->cur_run->limit[-1];
3414         }
3415
3416       if (sz > 1)
3417         memmove (pfile->cur_token + 1, pfile->cur_token,
3418                  MIN (la, sz - 1) * sizeof (cpp_token));
3419     }
3420
3421   if (!sz && pfile->cur_token == pfile->cur_run->limit)
3422     {
3423       pfile->cur_run = next_tokenrun (pfile->cur_run);
3424       pfile->cur_token = pfile->cur_run->base;
3425     }
3426
3427   result = pfile->cur_token++;
3428   result->src_loc = old->src_loc;
3429   return result;
3430 }
3431
3432 /* We're at the beginning of a logical line (so not in
3433   directives-mode) and RESULT is a CPP_NAME with NODE_MODULE set.  See
3434   if we should enter deferred_pragma mode to tokenize the rest of the
3435   line as a module control-line.  */
3436
3437 static void
3438 cpp_maybe_module_directive (cpp_reader *pfile, cpp_token *result)
3439 {
3440   unsigned backup = 0; /* Tokens we peeked.  */
3441   cpp_hashnode *node = result->val.node.node;
3442   cpp_token *peek = result;
3443   cpp_token *keyword = peek;
3444   cpp_hashnode *(&n_modules)[spec_nodes::M_HWM][2] = pfile->spec_nodes.n_modules;
3445   int header_count = 0;
3446
3447   /* Make sure the incoming state is as we expect it.  This way we
3448      can restore it using constants.  */
3449   gcc_checking_assert (!pfile->state.in_deferred_pragma
3450                        && !pfile->state.skipping
3451                        && !pfile->state.parsing_args
3452                        && !pfile->state.angled_headers
3453                        && (pfile->state.save_comments
3454                            == !CPP_OPTION (pfile, discard_comments)));
3455
3456   /* Enter directives mode sufficiently for peeking.  We don't have
3457      to actually set in_directive.  */
3458   pfile->state.in_deferred_pragma = true;
3459
3460   /* These two fields are needed to process tokenization in deferred
3461      pragma mode.  They are not used outside deferred pragma mode or
3462      directives mode.  */
3463   pfile->state.pragma_allow_expansion = true;
3464   pfile->directive_line = result->src_loc;
3465
3466   /* Saving comments is incompatible with directives mode.   */
3467   pfile->state.save_comments = 0;
3468
3469   if (node == n_modules[spec_nodes::M_EXPORT][0])
3470     {
3471       peek = _cpp_lex_direct (pfile);
3472       keyword = peek;
3473       backup++;
3474       if (keyword->type != CPP_NAME)
3475         goto not_module;
3476       node = keyword->val.node.node;
3477       if (!(node->flags & NODE_MODULE))
3478         goto not_module;
3479     }
3480
3481   if (node == n_modules[spec_nodes::M__IMPORT][0])
3482     /* __import  */
3483     header_count = backup + 2 + 16;
3484   else if (node == n_modules[spec_nodes::M_IMPORT][0])
3485     /* import  */
3486     header_count = backup + 2 + (CPP_OPTION (pfile, preprocessed) ? 16 : 0);
3487   else if (node == n_modules[spec_nodes::M_MODULE][0])
3488     ; /* module  */
3489   else
3490     goto not_module;
3491
3492   /* We've seen [export] {module|import|__import}.  Check the next token.  */
3493   if (header_count)
3494     /* After '{,__}import' a header name may appear.  */
3495     pfile->state.angled_headers = true;
3496   peek = _cpp_lex_direct (pfile);
3497   backup++;
3498
3499   /* ... import followed by identifier, ':', '<' or
3500      header-name preprocessing tokens, or module
3501      followed by cpp-identifier, ':' or ';' preprocessing
3502      tokens.  C++ keywords are not yet relevant.  */
3503   if (peek->type == CPP_NAME
3504       || peek->type == CPP_COLON
3505       ||  (header_count
3506            ? (peek->type == CPP_LESS
3507               || (peek->type == CPP_STRING && peek->val.str.text[0] != 'R')
3508               || peek->type == CPP_HEADER_NAME)
3509            : peek->type == CPP_SEMICOLON))
3510     {
3511       pfile->state.pragma_allow_expansion = !CPP_OPTION (pfile, preprocessed);
3512       if (!pfile->state.pragma_allow_expansion)
3513         pfile->state.prevent_expansion++;
3514
3515       if (!header_count && linemap_included_from
3516           (LINEMAPS_LAST_ORDINARY_MAP (pfile->line_table)))
3517         cpp_error_with_line (pfile, CPP_DL_ERROR, keyword->src_loc, 0,
3518                              "module control-line cannot be in included file");
3519
3520       /* The first one or two tokens cannot be macro names.  */
3521       for (int ix = backup; ix--;)
3522         {
3523           cpp_token *tok = ix ? keyword : result;
3524           cpp_hashnode *node = tok->val.node.node;
3525
3526           /* Don't attempt to expand the token.  */
3527           tok->flags |= NO_EXPAND;
3528           if (_cpp_defined_macro_p (node)
3529               && _cpp_maybe_notify_macro_use (pfile, node, tok->src_loc)
3530               && !cpp_fun_like_macro_p (node))
3531             cpp_error_with_line (pfile, CPP_DL_ERROR, tok->src_loc, 0,
3532                                  "module control-line \"%s\" cannot be"
3533                                  " an object-like macro",
3534                                  NODE_NAME (node));
3535         }
3536
3537       /* Map to underbar variants.  */
3538       keyword->val.node.node = n_modules[header_count
3539                                          ? spec_nodes::M_IMPORT
3540                                          : spec_nodes::M_MODULE][1];
3541       if (backup != 1)
3542         result->val.node.node = n_modules[spec_nodes::M_EXPORT][1];
3543
3544       /* Maybe tell the tokenizer we expect a header-name down the
3545          road.  */
3546       pfile->state.directive_file_token = header_count;
3547     }
3548   else
3549     {
3550     not_module:
3551       /* Drop out of directive mode.  */
3552       /* We asserted save_comments had this value upon entry.  */
3553       pfile->state.save_comments
3554         = !CPP_OPTION (pfile, discard_comments);
3555       pfile->state.in_deferred_pragma = false;
3556       /* Do not let this remain on.  */
3557       pfile->state.angled_headers = false;
3558     }
3559
3560   /* In either case we want to backup the peeked tokens.  */
3561   if (backup)
3562     {
3563       /* If we saw EOL, we should drop it, because this isn't a module
3564          control-line after all.  */
3565       bool eol = peek->type == CPP_PRAGMA_EOL;
3566       if (!eol || backup > 1)
3567         {
3568           /* Put the peeked tokens back.  */
3569           _cpp_backup_tokens_direct (pfile, backup);
3570           /* But if the last one was an EOL, forget it.  */
3571           if (eol)
3572             pfile->lookaheads--;
3573         }
3574     }
3575 }
3576
3577 /* External interface for fetching the next token.  Deals with backed
3578    up lookahead tokens, directives and module control-lines at the
3579    start of a line, the multiple include optimization and skipping.  */
3580 const cpp_token *
3581 _cpp_lex_token (cpp_reader *pfile)
3582 {
3583   while (true)
3584     {
3585       cpp_token *tok;
3586
3587       /* Step to the following token run when this one is exhausted.  */
3588       if (pfile->cur_token == pfile->cur_run->limit)
3589         {
3590           pfile->cur_run = next_tokenrun (pfile->cur_run);
3591           pfile->cur_token = pfile->cur_run->base;
3592         }
3593
3594       /* The token cursor is required to lie within the current run.  */
3595       if (!(pfile->cur_token >= pfile->cur_run->base
3596             && pfile->cur_token < pfile->cur_run->limit))
3597         abort ();
3598
3599       /* Backed-up tokens are replayed before new input is lexed.  */
3600       if (!pfile->lookaheads)
3601         tok = _cpp_lex_direct (pfile);
3602       else
3603         {
3604           pfile->lookaheads--;
3605           tok = pfile->cur_token++;
3606         }
3607
3608       if (tok->flags & BOL)
3609         {
3610           if (tok->type == CPP_HASH
3611               /* 6.10.3 p 11: a directive within a list of macro
3612                  arguments is undefined behavior.  It is handled like
3613                  any other one here, unless parsing_args is 1.  */
3614               && pfile->state.parsing_args != 1)
3615             {
3616               /* A false return means an assembler '#'; the token is
3617                  then left as it is.  */
3618               if (_cpp_handle_directive (pfile, tok->flags & PREV_WHITE))
3619                 {
3620                   /* A CPP_PADDING result is not returned; lex again.  */
3621                   if (pfile->directive_result.type == CPP_PADDING)
3622                     continue;
3623                   tok = &pfile->directive_result;
3624                 }
3625             }
3626           else if (pfile->state.in_deferred_pragma)
3627             tok = &pfile->directive_result;
3628           else if (tok->type == CPP_NAME
3629                    && (tok->val.node.node->flags & NODE_MODULE)
3630                    && !pfile->state.skipping
3631                    /* Module directives, unlike regular ones, are not
3632                       permitted among macro arguments, so they are not
3633                       tokenized there.  */
3634                    && !pfile->state.parsing_args)
3635             {
3636               /* P1857: this is decided at the start of a logical line,
3637                  before macro expansion.  No lookaheads can be pending
3638                  at this point.  */
3639               gcc_checking_assert (!pfile->lookaheads);
3640               cpp_maybe_module_directive (pfile, tok);
3641             }
3642
3643           if (pfile->cb.line_change && !pfile->state.skipping)
3644             pfile->cb.line_change (pfile, tok, pfile->state.parsing_args);
3645         }
3646
3647       /* Tokens of a directive or deferred pragma are never skipped.  */
3648       if (pfile->state.in_directive || pfile->state.in_deferred_pragma)
3649         return tok;
3650
3651       /* Outside a directive the controlling macro is invalidated.  At
3652          file EOF _cpp_lex_direct pops the buffer itself, so this is
3653          never reached and the MI optimization still works.  */
3654       pfile->mi_valid = false;
3655
3656       if (!pfile->state.skipping || tok->type == CPP_EOF)
3657         return tok;
3658     }
3659 }
3660
3661 /* Ensure the current buffer has a line ready for lexing, popping
3662    finished buffers on the way.  Returns true if a line is ready and
3663    false if none can be supplied at this point.  */
3664 bool
3665 _cpp_get_fresh_line (cpp_reader *pfile)
3666 {
3667   /* A directive cannot run on into a further line.  */
3668   if (pfile->state.in_directive)
3669     return false;
3670
3671   while (true)
3672     {
3673       cpp_buffer *top = pfile->buffer;
3674
3675       /* No new line is wanted.  */
3676       if (!top->need_line)
3677         return true;
3678
3679       /* Unread text remains here; _cpp_clean_line takes it from there.  */
3680       if (top->next_line < top->rlimit)
3681         {
3682           _cpp_clean_line (pfile);
3683           return true;
3684         }
3685
3686       /* The buffer is used up, but it is not left while macro
3687          arguments are being parsed.  */
3688       if (pfile->state.parsing_args)
3689         return false;
3690
3691       /* When next_line has overshot the end of a non-empty buffer not
3692          marked from_stage3, clip it back to the buffer limit.  */
3693       if (top->next_line > top->rlimit
3694           && top->buf != top->rlimit
3695           && !top->from_stage3)
3696         top->next_line = top->rlimit;
3697
3698       if (!top->prev || top->return_at_eof)
3699         {
3700           /* End of translation: the buffer is kept for now, but the line
3701              number moves on so the EOF token gets a line of its own.  */
3702           CPP_INCREMENT_LINE (pfile, 0);
3703           return false;
3704         }
3705       _cpp_pop_buffer (pfile);
3706     }
3707 }
3708
3709 #define IF_NEXT_IS(CHAR, THEN_TYPE, ELSE_TYPE)          \
3710   do                                                    \
3711     {                                                   \
3712       result->type = ELSE_TYPE;                         \
3713       if (*buffer->cur == CHAR)                         \
3714         buffer->cur++, result->type = THEN_TYPE;        \
3715     }                                                   \
3716   while (0)
3717
3718 /* Lex a token into pfile->cur_token, which is also incremented, to
3719    get diagnostics pointing to the correct location.
3720
3721    Does not handle issues such as token lookahead, multiple-include
3722    optimization, directives, skipping etc.  This function is only
3723    suitable for use by _cpp_lex_token, and in special cases like
3724    lex_expansion_token which doesn't care for any of these issues.
3725
3726    When meeting a newline, returns CPP_PRAGMA_EOL in a deferred pragma
3727    or CPP_EOF if parsing a directive; otherwise restarts at the base of
3728    the token buffer if permissible.  Returns a pointer to the token.  */
3729 cpp_token *
3730 _cpp_lex_direct (cpp_reader *pfile)
3731 {
3732   cppchar_t c;
3733   cpp_buffer *buffer;
3734   const unsigned char *comment_start;
3735   bool fallthrough_comment = false;
3736   cpp_token *result = pfile->cur_token++;
3737
3738  fresh_line: /* Also re-entered after a newline has been consumed.  */
3739   result->flags = 0;
3740   buffer = pfile->buffer;
3741   if (buffer->need_line)
3742     {
3743       if (pfile->state.in_deferred_pragma)
3744         {
3745           /* This can happen in cases like:
3746              #define loop(x) whatever
3747              #pragma omp loop
3748              where when trying to expand loop we need to peek
3749              next token after loop, but aren't still in_deferred_pragma
3750              mode but are in in_directive mode, so buffer->need_line
3751              is set, a CPP_EOF is peeked.  */
3752           result->type = CPP_PRAGMA_EOL;
3753           pfile->state.in_deferred_pragma = false;
3754           if (!pfile->state.pragma_allow_expansion)
3755             pfile->state.prevent_expansion--;
3756           return result;
3757         }
3758       if (!_cpp_get_fresh_line (pfile))
3759         {
3760           result->type = CPP_EOF;
3761           /* Not a real EOF in a directive or arg parsing -- we refuse
3762              to advance to the next file now, and will once we're out
3763              of those modes.  */
3764           if (!pfile->state.in_directive && !pfile->state.parsing_args)
3765             {
3766               /* Tell the compiler the line number of the EOF token.  */
3767               result->src_loc = pfile->line_table->highest_line;
3768               result->flags = BOL;
3769               /* Now pop the buffer that _cpp_get_fresh_line did not.  */
3770               _cpp_pop_buffer (pfile);
3771             }
3772           return result;
3773         }
3774       if (buffer != pfile->buffer)
3775         fallthrough_comment = false;
3776       if (!pfile->keep_tokens)
3777         {
3778           pfile->cur_run = &pfile->base_run;
3779           result = pfile->base_run.base;
3780           pfile->cur_token = result + 1;
3781         }
3782       result->flags = BOL;
3783       if (pfile->state.parsing_args == 2)
3784         result->flags |= PREV_WHITE;
3785     }
3786   buffer = pfile->buffer;
3787  update_tokens_line: /* Re-entered after a discarded comment.  */
3788   result->src_loc = pfile->line_table->highest_line;
3789
3790  skipped_white: /* Re-entered after skipping whitespace.  */
3791   if (buffer->cur >= buffer->notes[buffer->cur_note].pos
3792       && !pfile->overlaid_buffer)
3793     {
3794       _cpp_process_line_notes (pfile, false);
3795       result->src_loc = pfile->line_table->highest_line;
3796     }
3797   c = *buffer->cur++;
3798
3799   if (pfile->forced_token_location)
3800     result->src_loc = pfile->forced_token_location;
3801   else
3802     result->src_loc = linemap_position_for_column (pfile->line_table,
3803                                           CPP_BUF_COLUMN (buffer, buffer->cur));
3804
3805   switch (c)
3806     {
3807     case ' ': case '\t': case '\f': case '\v': case '\0':
3808       result->flags |= PREV_WHITE;
3809       skip_whitespace (pfile, c);
3810       goto skipped_white;
3811
3812     case '\n':
3813       /* Increment the line, unless this is the last line ...  */
3814       if (buffer->cur < buffer->rlimit
3815           /* ... or this is a #include, (where _cpp_stack_file needs to
3816              unwind by one line) ...  */
3817           || (pfile->state.in_directive > 1
3818               /* ... except traditional-cpp increments this elsewhere.  */
3819               && !CPP_OPTION (pfile, traditional)))
3820         CPP_INCREMENT_LINE (pfile, 0);
3821       buffer->need_line = true;
3822       if (pfile->state.in_deferred_pragma)
3823         {
3824           /* Produce the PRAGMA_EOL on this line.  File reading
3825              ensures there is always a \n at end of the buffer, thus
3826              in a deferred pragma we always see CPP_PRAGMA_EOL before
3827              any CPP_EOF.  */
3828           result->type = CPP_PRAGMA_EOL;
3829           result->flags &= ~PREV_WHITE;
3830           pfile->state.in_deferred_pragma = false;
3831           if (!pfile->state.pragma_allow_expansion)
3832             pfile->state.prevent_expansion--;
3833           return result;
3834         }
3835       goto fresh_line;
3836
3837     case '0': case '1': case '2': case '3': case '4':
3838     case '5': case '6': case '7': case '8': case '9':
3839       {
3840         struct normalize_state nst = INITIAL_NORMALIZE_STATE;
3841         result->type = CPP_NUMBER;
3842         lex_number (pfile, &result->val.str, &nst);
3843         warn_about_normalization (pfile, result, &nst, false);
3844         break;
3845       }
3846
3847     case 'L':
3848     case 'u':
3849     case 'U':
3850     case 'R':
3851       /* 'L', 'u', 'U', 'u8' or 'R' may introduce wide characters,
3852          wide strings or raw strings.  */
3853       if (c == 'L' || CPP_OPTION (pfile, rliterals)
3854           || (c != 'R' && CPP_OPTION (pfile, uliterals)))
3855         {
3856           if ((*buffer->cur == '\'' && c != 'R')
3857               || *buffer->cur == '"'
3858               || (*buffer->cur == 'R'
3859                   && c != 'R'
3860                   && buffer->cur[1] == '"'
3861                   && CPP_OPTION (pfile, rliterals))
3862               || (*buffer->cur == '8'
3863                   && c == 'u'
3864                   && ((buffer->cur[1] == '"' || (buffer->cur[1] == '\''
3865                                 && CPP_OPTION (pfile, utf8_char_literals)))
3866                       || (buffer->cur[1] == 'R' && buffer->cur[2] == '"'
3867                           && CPP_OPTION (pfile, rliterals)))))
3868             {
3869               lex_string (pfile, result, buffer->cur - 1);
3870               break;
3871             }
3872         }
3873       /* Fall through.  */
3874
3875     case '_': /* 'L', 'u', 'U' and 'R' arrive here by falling through.  */
3876     case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
3877     case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
3878     case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
3879     case 's': case 't':           case 'v': case 'w': case 'x':
3880     case 'y': case 'z':
3881     case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
3882     case 'G': case 'H': case 'I': case 'J': case 'K':
3883     case 'M': case 'N': case 'O': case 'P': case 'Q':
3884     case 'S': case 'T':           case 'V': case 'W': case 'X':
3885     case 'Y': case 'Z':
3886       result->type = CPP_NAME;
3887       {
3888         struct normalize_state nst = INITIAL_NORMALIZE_STATE;
3889         result->val.node.node = lex_identifier (pfile, buffer->cur - 1, false,
3890                                                 &nst,
3891                                                 &result->val.node.spelling);
3892         warn_about_normalization (pfile, result, &nst, true);
3893       }
3894
3895       /* Convert named operators to their proper types.  */
3896       if (result->val.node.node->flags & NODE_OPERATOR)
3897         {
3898           result->flags |= NAMED_OP;
3899           result->type = (enum cpp_ttype) result->val.node.node->directive_index;
3900         }
3901
3902       /* Signal FALLTHROUGH comment followed by another token.  */
3903       if (fallthrough_comment)
3904         result->flags |= PREV_FALLTHROUGH;
3905       break;
3906
3907     case '\'':
3908     case '"':
3909       lex_string (pfile, result, buffer->cur - 1);
3910       break;
3911
3912     case '/':
3913       /* A potential block or line comment.  */
3914       comment_start = buffer->cur;
3915       c = *buffer->cur; /* Peeked, not consumed.  */
3916
3917       if (c == '*')
3918         {
3919           if (_cpp_skip_block_comment (pfile))
3920             cpp_error (pfile, CPP_DL_ERROR, "unterminated comment");
3921         }
3922       else if (c == '/' && ! CPP_OPTION (pfile, traditional))
3923         {
3924           /* Don't warn for system headers.  */
3925           if (_cpp_in_system_header (pfile))
3926             ;
3927           /* Warn about comments if pedantically GNUC89, and not
3928              in system headers.  */
3929           else if (CPP_OPTION (pfile, lang) == CLK_GNUC89
3930                    && CPP_PEDANTIC (pfile)
3931                    && ! buffer->warned_cplusplus_comments)
3932             {
3933               if (cpp_error (pfile, CPP_DL_PEDWARN,
3934                              "C++ style comments are not allowed in ISO C90"))
3935                 cpp_error (pfile, CPP_DL_NOTE,
3936                            "(this will be reported only once per input file)");
3937               buffer->warned_cplusplus_comments = 1;
3938             }
3939           /* Or if specifically desired via -Wc90-c99-compat.  */
3940           else if (CPP_OPTION (pfile, cpp_warn_c90_c99_compat) > 0
3941                    && ! CPP_OPTION (pfile, cplusplus)
3942                    && ! buffer->warned_cplusplus_comments)
3943             {
3944               if (cpp_error (pfile, CPP_DL_WARNING,
3945                              "C++ style comments are incompatible with C90"))
3946                 cpp_error (pfile, CPP_DL_NOTE,
3947                            "(this will be reported only once per input file)");
3948               buffer->warned_cplusplus_comments = 1;
3949             }
3950           /* In C89/C94, C++ style comments are forbidden.  */
3951           else if ((CPP_OPTION (pfile, lang) == CLK_STDC89
3952                     || CPP_OPTION (pfile, lang) == CLK_STDC94))
3953             {
3954               /* But don't be confused about valid code such as
3955                  - // immediately followed by *,
3956                  - // in a preprocessing directive,
3957                  - // in an #if 0 block.  */
3958               if (buffer->cur[1] == '*'
3959                   || pfile->state.in_directive
3960                   || pfile->state.skipping)
3961                 {
3962                   result->type = CPP_DIV;
3963                   break;
3964                 }
3965               else if (! buffer->warned_cplusplus_comments)
3966                 {
3967                   if (cpp_error (pfile, CPP_DL_ERROR,
3968                                  "C++ style comments are not allowed in "
3969                                  "ISO C90"))
3970                     cpp_error (pfile, CPP_DL_NOTE,
3971                                "(this will be reported only once per input "
3972                                "file)");
3973                   buffer->warned_cplusplus_comments = 1;
3974                 }
3975             }
3976           if (skip_line_comment (pfile) && CPP_OPTION (pfile, warn_comments))
3977             cpp_warning (pfile, CPP_W_COMMENTS, "multi-line comment");
3978         }
3979       else if (c == '=')
3980         {
3981           buffer->cur++;
3982           result->type = CPP_DIV_EQ;
3983           break;
3984         }
3985       else
3986         {
3987           result->type = CPP_DIV;
3988           break;
3989         }
3990
3991       if (fallthrough_comment_p (pfile, comment_start))
3992         fallthrough_comment = true;
3993
3994       if (pfile->cb.comment)
3995         {
3996           size_t len = pfile->buffer->cur - comment_start;
3997           pfile->cb.comment (pfile, result->src_loc, comment_start - 1,
3998                              len + 1);
3999         }
4000
4001       if (!pfile->state.save_comments)
4002         {
4003           result->flags |= PREV_WHITE;
4004           goto update_tokens_line;
4005         }
4006
4007       if (fallthrough_comment)
4008         result->flags |= PREV_FALLTHROUGH;
4009
4010       /* Save the comment as a token in its own right.  */
4011       save_comment (pfile, result, comment_start, c);
4012       break;
4013
4014     case '<':
4015       if (pfile->state.angled_headers)
4016         {
4017           lex_string (pfile, result, buffer->cur - 1);
4018           if (result->type != CPP_LESS)
4019             break;
4020         }
4021
4022       result->type = CPP_LESS;
4023       if (*buffer->cur == '=')
4024         {
4025           buffer->cur++, result->type = CPP_LESS_EQ;
4026           if (*buffer->cur == '>'
4027               && CPP_OPTION (pfile, cplusplus)
4028               && CPP_OPTION (pfile, lang) >= CLK_GNUCXX20)
4029             buffer->cur++, result->type = CPP_SPACESHIP;
4030         }
4031       else if (*buffer->cur == '<')
4032         {
4033           buffer->cur++;
4034           IF_NEXT_IS ('=', CPP_LSHIFT_EQ, CPP_LSHIFT);
4035         }
4036       else if (CPP_OPTION (pfile, digraphs))
4037         {
4038           if (*buffer->cur == ':')
4039             {
4040               /* C++11 [2.5/3 lex.pptoken], "Otherwise, if the next
4041                  three characters are <:: and the subsequent character
4042                  is neither : nor >, the < is treated as a preprocessor
4043                  token by itself".  */
4044               if (CPP_OPTION (pfile, cplusplus)
4045                   && CPP_OPTION (pfile, lang) != CLK_CXX98
4046                   && CPP_OPTION (pfile, lang) != CLK_GNUCXX
4047                   && buffer->cur[1] == ':'
4048                   && buffer->cur[2] != ':' && buffer->cur[2] != '>')
4049                 break;
4050
4051               buffer->cur++;
4052               result->flags |= DIGRAPH;
4053               result->type = CPP_OPEN_SQUARE;
4054             }
4055           else if (*buffer->cur == '%')
4056             {
4057               buffer->cur++;
4058               result->flags |= DIGRAPH;
4059               result->type = CPP_OPEN_BRACE;
4060             }
4061         }
4062       break;
4063
4064     case '>':
4065       result->type = CPP_GREATER;
4066       if (*buffer->cur == '=')
4067         buffer->cur++, result->type = CPP_GREATER_EQ;
4068       else if (*buffer->cur == '>')
4069         {
4070           buffer->cur++;
4071           IF_NEXT_IS ('=', CPP_RSHIFT_EQ, CPP_RSHIFT);
4072         }
4073       break;
4074
4075     case '%':
4076       result->type = CPP_MOD;
4077       if (*buffer->cur == '=')
4078         buffer->cur++, result->type = CPP_MOD_EQ;
4079       else if (CPP_OPTION (pfile, digraphs))
4080         {
4081           if (*buffer->cur == ':')
4082             {
4083               buffer->cur++;
4084               result->flags |= DIGRAPH;
4085               result->type = CPP_HASH;
4086               if (*buffer->cur == '%' && buffer->cur[1] == ':')
4087                 buffer->cur += 2, result->type = CPP_PASTE, result->val.token_no = 0;
4088             }
4089           else if (*buffer->cur == '>')
4090             {
4091               buffer->cur++;
4092               result->flags |= DIGRAPH;
4093               result->type = CPP_CLOSE_BRACE;
4094             }
4095         }
4096       break;
4097
4098     case '.':
4099       result->type = CPP_DOT;
4100       if (ISDIGIT (*buffer->cur))
4101         {
4102           struct normalize_state nst = INITIAL_NORMALIZE_STATE;
4103           result->type = CPP_NUMBER;
4104           lex_number (pfile, &result->val.str, &nst);
4105           warn_about_normalization (pfile, result, &nst, false);
4106         }
4107       else if (*buffer->cur == '.' && buffer->cur[1] == '.')
4108         buffer->cur += 2, result->type = CPP_ELLIPSIS;
4109       else if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
4110         buffer->cur++, result->type = CPP_DOT_STAR;
4111       break;
4112
4113     case '+':
4114       result->type = CPP_PLUS;
4115       if (*buffer->cur == '+')
4116         buffer->cur++, result->type = CPP_PLUS_PLUS;
4117       else if (*buffer->cur == '=')
4118         buffer->cur++, result->type = CPP_PLUS_EQ;
4119       break;
4120
4121     case '-':
4122       result->type = CPP_MINUS;
4123       if (*buffer->cur == '>')
4124         {
4125           buffer->cur++;
4126           result->type = CPP_DEREF;
4127           if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
4128             buffer->cur++, result->type = CPP_DEREF_STAR;
4129         }
4130       else if (*buffer->cur == '-')
4131         buffer->cur++, result->type = CPP_MINUS_MINUS;
4132       else if (*buffer->cur == '=')
4133         buffer->cur++, result->type = CPP_MINUS_EQ;
4134       break;
4135
4136     case '&':
4137       result->type = CPP_AND;
4138       if (*buffer->cur == '&')
4139         buffer->cur++, result->type = CPP_AND_AND;
4140       else if (*buffer->cur == '=')
4141         buffer->cur++, result->type = CPP_AND_EQ;
4142       break;
4143
4144     case '|':
4145       result->type = CPP_OR;
4146       if (*buffer->cur == '|')
4147         buffer->cur++, result->type = CPP_OR_OR;
4148       else if (*buffer->cur == '=')
4149         buffer->cur++, result->type = CPP_OR_EQ;
4150       break;
4151
4152     case ':':
4153       result->type = CPP_COLON;
4154       if (*buffer->cur == ':' && CPP_OPTION (pfile, scope))
4155         buffer->cur++, result->type = CPP_SCOPE;
4156       else if (*buffer->cur == '>' && CPP_OPTION (pfile, digraphs))
4157         {
4158           buffer->cur++;
4159           result->flags |= DIGRAPH;
4160           result->type = CPP_CLOSE_SQUARE;
4161         }
4162       break;
4163
4164     case '*': IF_NEXT_IS ('=', CPP_MULT_EQ, CPP_MULT); break;
4165     case '=': IF_NEXT_IS ('=', CPP_EQ_EQ, CPP_EQ); break;
4166     case '!': IF_NEXT_IS ('=', CPP_NOT_EQ, CPP_NOT); break;
4167     case '^': IF_NEXT_IS ('=', CPP_XOR_EQ, CPP_XOR); break;
4168     case '#': IF_NEXT_IS ('#', CPP_PASTE, CPP_HASH); result->val.token_no = 0; break;
4169
4170     case '?': result->type = CPP_QUERY; break;
4171     case '~': result->type = CPP_COMPL; break;
4172     case ',': result->type = CPP_COMMA; break;
4173     case '(': result->type = CPP_OPEN_PAREN; break;
4174     case ')': result->type = CPP_CLOSE_PAREN; break;
4175     case '[': result->type = CPP_OPEN_SQUARE; break;
4176     case ']': result->type = CPP_CLOSE_SQUARE; break;
4177     case '{': result->type = CPP_OPEN_BRACE; break;
4178     case '}': result->type = CPP_CLOSE_BRACE; break;
4179     case ';': result->type = CPP_SEMICOLON; break;
4180
4181       /* @ is a punctuator in Objective-C.  */
4182     case '@': result->type = CPP_ATSIGN; break;
4183
4184     default:
4185       {
4186         const uchar *base = --buffer->cur;
4187         static int no_warn_cnt; /* Persists across calls: continuation bytes left to pass unwarned.  */
4188
4189         /* Check for an extended identifier ($ or UCN or UTF-8).  */
4190         struct normalize_state nst = INITIAL_NORMALIZE_STATE;
4191         if (forms_identifier_p (pfile, true, &nst))
4192           {
4193             result->type = CPP_NAME;
4194             result->val.node.node = lex_identifier (pfile, base, true, &nst,
4195                                                     &result->val.node.spelling);
4196             warn_about_normalization (pfile, result, &nst, true);
4197             break;
4198           }
4199
4200         /* Otherwise this will form a CPP_OTHER token.  Parse valid UTF-8 as a
4201            single token.  */
4202         buffer->cur++;
4203         if (c >= utf8_signifier)
4204           {
4205             const uchar *pstr = base;
4206             cppchar_t s;
4207             if (_cpp_valid_utf8 (pfile, &pstr, buffer->rlimit, 0, NULL, &s))
4208               {
4209                 if (s > UCS_LIMIT && CPP_OPTION (pfile, cpp_warn_invalid_utf8))
4210                   {
4211                     buffer->cur = base;
4212                     _cpp_warn_invalid_utf8 (pfile);
4213                   }
4214                 buffer->cur = pstr;
4215               }
4216             else if (CPP_OPTION (pfile, cpp_warn_invalid_utf8))
4217               {
4218                 buffer->cur = base;
4219                 const uchar *end = _cpp_warn_invalid_utf8 (pfile);
4220                 buffer->cur = base + 1;
4221                 no_warn_cnt = end - buffer->cur;
4222               }
4223           }
4224         else if (c >= utf8_continuation
4225                  && CPP_OPTION (pfile, cpp_warn_invalid_utf8))
4226           {
4227             if (no_warn_cnt)
4228               --no_warn_cnt;
4229             else
4230               {
4231                 buffer->cur = base;
4232                 _cpp_warn_invalid_utf8 (pfile);
4233                 buffer->cur = base + 1;
4234               }
4235           }
4236         create_literal (pfile, result, base, buffer->cur - base, CPP_OTHER);
4237         break;
4238       }
4239
4240     }
4241
4242   /* Potentially convert the location of the token to a range.  */
4243   if (result->src_loc >= RESERVED_LOCATION_COUNT
4244       && result->type != CPP_EOF)
4245     {
4246       /* Ensure that any line notes are processed, so that we have the
4247          correct physical line/column for the end-point of the token even
4248          when a logical line is split via one or more backslashes.  */
4249       if (buffer->cur >= buffer->notes[buffer->cur_note].pos
4250           && !pfile->overlaid_buffer)
4251         _cpp_process_line_notes (pfile, false);
4252
4253       source_range tok_range;
4254       tok_range.m_start = result->src_loc;
4255       tok_range.m_finish
4256         = linemap_position_for_column (pfile->line_table,
4257                                        CPP_BUF_COLUMN (buffer, buffer->cur));
4258
4259       result->src_loc = COMBINE_LOCATION_DATA (pfile->line_table,
4260                                                result->src_loc,
4261                                                tok_range, NULL, 0);
4262     }
4263
4264   return result;
4265 }
4266
4267 /* Return an upper bound on the number of bytes needed to spell TOKEN,
4268    not counting any whitespace in front of it.  */
4269 unsigned int
4270 cpp_token_len (const cpp_token *token)
4271 {
4272   const enum spell_type how = TOKEN_SPELL (token);
4273
4274   /* A literal records the length of its own spelling.  */
4275   if (how == SPELL_LITERAL)
4276     return token->val.str.len;
4277   /* Identifiers get 10 bytes per byte of name (presumably so that any
4278      byte may be respelled as part of a 10-byte \U escape).  */
4279   if (how == SPELL_IDENT)
4280     return NODE_LEN (token->val.node.node) * 10;
4281   return 6; /* SPELL_OPERATOR and SPELL_NONE.  */
4282 }
4283
/* Decode one UTF-8 sequence starting at NAME and store the matching
   \UXXXXXXXX escape (lower-case hex digits) in BUFFER.  Exactly 10
   bytes are written to BUFFER and no terminating NUL is added.  The
   return value is the number of bytes consumed from NAME, which is
   the count of leading one bits in the first byte.  Aborts if a
   continuation byte does not have the form 10xxxxxx.  */

static size_t
utf8_to_ucn (unsigned char *buffer, const unsigned char *name)
{
  static const char hex_digits[] = "0123456789abcdef";
  int seq_len = 0;
  int k;
  int shift;
  unsigned lead;
  unsigned long code_point;

  /* The number of leading one bits in the first byte is the length
     of the whole sequence.  */
  for (lead = *name; (lead & 0x80) != 0; lead <<= 1)
    seq_len++;

  /* Payload bits of the lead byte, then six bits from each
     continuation byte.  */
  code_point = *name & (0x7F >> seq_len);
  for (k = 1; k < seq_len; k++)
    {
      unsigned char cont = *++name;

      /* Ill-formed UTF-8.  */
      if ((cont & ~0x3F) != 0x80)
        abort ();

      code_point = (code_point << 6) | (cont & 0x3F);
    }

  buffer[0] = '\\';
  buffer[1] = 'U';
  buffer += 2;
  for (shift = 28; shift >= 0; shift -= 4)
    *buffer++ = hex_digits[(code_point >> shift) & 0xF];

  return seq_len;
}
4317
4318 /* Given a token TYPE corresponding to a digraph, return a pointer to
4319    the spelling of the digraph.  */
4320 static const unsigned char *
4321 cpp_digraph2name (enum cpp_ttype type)
4322 {
4323   return digraph_spellings[(int) type - (int) CPP_FIRST_DIGRAPH];
4324 }
4325
4326 /* Write the spelling of an identifier IDENT, using UCNs, to BUFFER.
4327    The buffer must already contain the enough space to hold the
4328    token's spelling.  Returns a pointer to the character after the
4329    last character written.  */
4330 unsigned char *
4331 _cpp_spell_ident_ucns (unsigned char *buffer, cpp_hashnode *ident)
4332 {
4333   size_t i;
4334   const unsigned char *name = NODE_NAME (ident);
4335
4336   for (i = 0; i < NODE_LEN (ident); i++)
4337     if (name[i] & ~0x7F)
4338       {
4339         i += utf8_to_ucn (buffer, name + i) - 1;
4340         buffer += 10;
4341       }
4342     else
4343       *buffer++ = name[i];
4344
4345   return buffer;
4346 }
4347
4348 /* Write the spelling of a token TOKEN to BUFFER.  The buffer must
4349    already contain the enough space to hold the token's spelling.
4350    Returns a pointer to the character after the last character written.
4351    FORSTRING is true if this is to be the spelling after translation
4352    phase 1 (with the original spelling of extended identifiers), false
4353    if extended identifiers should always be written using UCNs (there is
4354    no option for always writing them in the internal UTF-8 form).
4355    FIXME: Would be nice if we didn't need the PFILE argument.  */
4356 unsigned char *
4357 cpp_spell_token (cpp_reader *pfile, const cpp_token *token,
4358                  unsigned char *buffer, bool forstring)
4359 {
4360   switch (TOKEN_SPELL (token))
4361     {
4362     case SPELL_OPERATOR:
4363       {
4364         const unsigned char *spelling;
4365         unsigned char c;
4366
4367         if (token->flags & DIGRAPH)
4368           spelling = cpp_digraph2name (token->type);
4369         else if (token->flags & NAMED_OP)
4370           goto spell_ident;
4371         else
4372           spelling = TOKEN_NAME (token);
4373
4374         while ((c = *spelling++) != '\0')
4375           *buffer++ = c;
4376       }
4377       break;
4378
4379     spell_ident:
4380     case SPELL_IDENT:
4381       if (forstring)
4382         {
4383           memcpy (buffer, NODE_NAME (token->val.node.spelling),
4384                   NODE_LEN (token->val.node.spelling));
4385           buffer += NODE_LEN (token->val.node.spelling);
4386         }
4387       else
4388         buffer = _cpp_spell_ident_ucns (buffer, token->val.node.node);
4389       break;
4390
4391     case SPELL_LITERAL:
4392       memcpy (buffer, token->val.str.text, token->val.str.len);
4393       buffer += token->val.str.len;
4394       break;
4395
4396     case SPELL_NONE:
4397       cpp_error (pfile, CPP_DL_ICE,
4398                  "unspellable token %s", TOKEN_NAME (token));
4399       break;
4400     }
4401
4402   return buffer;
4403 }
4404
4405 /* Returns TOKEN spelt as a null-terminated string.  The string is
4406    freed when the reader is destroyed.  Useful for diagnostics.  */
4407 unsigned char *
4408 cpp_token_as_text (cpp_reader *pfile, const cpp_token *token)
4409 {
4410   unsigned int len = cpp_token_len (token) + 1;
4411   unsigned char *start = _cpp_unaligned_alloc (pfile, len), *end;
4412
4413   end = cpp_spell_token (pfile, token, start, false);
4414   end[0] = '\0';
4415
4416   return start;
4417 }
4418
4419 /* Returns a pointer to a string which spells the token defined by
4420    TYPE and FLAGS.  Used by C front ends, which really should move to
4421    using cpp_token_as_text.  */
4422 const char *
4423 cpp_type2name (enum cpp_ttype type, unsigned char flags)
4424 {
4425   if (flags & DIGRAPH)
4426     return (const char *) cpp_digraph2name (type);
4427   else if (flags & NAMED_OP)
4428     return cpp_named_operator2name (type);
4429
4430   return (const char *) token_spellings[type].name;
4431 }
4432
4433 /* Writes the spelling of token to FP, without any preceding space.
4434    Separated from cpp_spell_token for efficiency - to avoid stdio
4435    double-buffering.  */
4436 void
4437 cpp_output_token (const cpp_token *token, FILE *fp)
4438 {
4439   switch (TOKEN_SPELL (token))
4440     {
4441     case SPELL_OPERATOR:
4442       {
4443         const unsigned char *spelling;
4444         int c;
4445
4446         if (token->flags & DIGRAPH)
4447           spelling = cpp_digraph2name (token->type);
4448         else if (token->flags & NAMED_OP)
4449           goto spell_ident;
4450         else
4451           spelling = TOKEN_NAME (token);
4452
4453         c = *spelling;
4454         do
4455           putc (c, fp);
4456         while ((c = *++spelling) != '\0');
4457       }
4458       break;
4459
4460     spell_ident:
4461     case SPELL_IDENT:
4462       {
4463         size_t i;
4464         const unsigned char * name = NODE_NAME (token->val.node.node);
4465
4466         for (i = 0; i < NODE_LEN (token->val.node.node); i++)
4467           if (name[i] & ~0x7F)
4468             {
4469               unsigned char buffer[10];
4470               i += utf8_to_ucn (buffer, name + i) - 1;
4471               fwrite (buffer, 1, 10, fp);
4472             }
4473           else
4474             fputc (NODE_NAME (token->val.node.node)[i], fp);
4475       }
4476       break;
4477
4478     case SPELL_LITERAL:
4479       if (token->type == CPP_HEADER_NAME)
4480         fputc ('"', fp);
4481       fwrite (token->val.str.text, 1, token->val.str.len, fp);
4482       if (token->type == CPP_HEADER_NAME)
4483         fputc ('"', fp);
4484       break;
4485
4486     case SPELL_NONE:
4487       /* An error, most probably.  */
4488       break;
4489     }
4490 }
4491
4492 /* Compare two tokens.  */
4493 int
4494 _cpp_equiv_tokens (const cpp_token *a, const cpp_token *b)
4495 {
4496   if (a->type == b->type && a->flags == b->flags)
4497     switch (TOKEN_SPELL (a))
4498       {
4499       default:                  /* Keep compiler happy.  */
4500       case SPELL_OPERATOR:
4501         /* token_no is used to track where multiple consecutive ##
4502            tokens were originally located.  */
4503         return (a->type != CPP_PASTE || a->val.token_no == b->val.token_no);
4504       case SPELL_NONE:
4505         return (a->type != CPP_MACRO_ARG
4506                 || (a->val.macro_arg.arg_no == b->val.macro_arg.arg_no
4507                     && a->val.macro_arg.spelling == b->val.macro_arg.spelling));
4508       case SPELL_IDENT:
4509         return (a->val.node.node == b->val.node.node
4510                 && a->val.node.spelling == b->val.node.spelling);
4511       case SPELL_LITERAL:
4512         return (a->val.str.len == b->val.str.len
4513                 && !memcmp (a->val.str.text, b->val.str.text,
4514                             a->val.str.len));
4515       }
4516
4517   return 0;
4518 }
4519
4520 /* Returns nonzero if a space should be inserted to avoid an
4521    accidental token paste for output.  For simplicity, it is
4522    conservative, and occasionally advises a space where one is not
4523    needed, e.g. "." and ".2".  */
4524 int
4525 cpp_avoid_paste (cpp_reader *pfile, const cpp_token *token1,
4526                  const cpp_token *token2)
4527 {
4528   enum cpp_ttype a = token1->type, b = token2->type;
4529   cppchar_t c;
4530
4531   if (token1->flags & NAMED_OP)
4532     a = CPP_NAME;
4533   if (token2->flags & NAMED_OP)
4534     b = CPP_NAME;
4535
4536   c = EOF;
4537   if (token2->flags & DIGRAPH)
4538     c = digraph_spellings[(int) b - (int) CPP_FIRST_DIGRAPH][0];
4539   else if (token_spellings[b].category == SPELL_OPERATOR)
4540     c = token_spellings[b].name[0];
4541
4542   /* Quickly get everything that can paste with an '='.  */
4543   if ((int) a <= (int) CPP_LAST_EQ && c == '=')
4544     return 1;
4545
4546   switch (a)
4547     {
4548     case CPP_GREATER:   return c == '>';
4549     case CPP_LESS:      return c == '<' || c == '%' || c == ':';
4550     case CPP_PLUS:      return c == '+';
4551     case CPP_MINUS:     return c == '-' || c == '>';
4552     case CPP_DIV:       return c == '/' || c == '*'; /* Comments.  */
4553     case CPP_MOD:       return c == ':' || c == '>';
4554     case CPP_AND:       return c == '&';
4555     case CPP_OR:        return c == '|';
4556     case CPP_COLON:     return c == ':' || c == '>';
4557     case CPP_DEREF:     return c == '*';
4558     case CPP_DOT:       return c == '.' || c == '%' || b == CPP_NUMBER;
4559     case CPP_HASH:      return c == '#' || c == '%'; /* Digraph form.  */
4560     case CPP_PRAGMA:
4561     case CPP_NAME:      return ((b == CPP_NUMBER
4562                                  && name_p (pfile, &token2->val.str))
4563                                 || b == CPP_NAME
4564                                 || b == CPP_CHAR || b == CPP_STRING); /* L */
4565     case CPP_NUMBER:    return (b == CPP_NUMBER || b == CPP_NAME
4566                                 || b == CPP_CHAR
4567                                 || c == '.' || c == '+' || c == '-');
4568                                       /* UCNs */
4569     case CPP_OTHER:     return ((token1->val.str.text[0] == '\\'
4570                                  && b == CPP_NAME)
4571                                 || (CPP_OPTION (pfile, objc)
4572                                     && token1->val.str.text[0] == '@'
4573                                     && (b == CPP_NAME || b == CPP_STRING)));
4574     case CPP_LESS_EQ:   return c == '>';
4575     case CPP_STRING:
4576     case CPP_WSTRING:
4577     case CPP_UTF8STRING:
4578     case CPP_STRING16:
4579     case CPP_STRING32:  return (CPP_OPTION (pfile, user_literals)
4580                                 && (b == CPP_NAME
4581                                     || (TOKEN_SPELL (token2) == SPELL_LITERAL
4582                                         && ISIDST (token2->val.str.text[0]))));
4583
4584     default:            break;
4585     }
4586
4587   return 0;
4588 }
4589
4590 /* Output all the remaining tokens on the current line, and a newline
4591    character, to FP.  Leading whitespace is removed.  If there are
4592    macros, special token padding is not performed.  */
4593 void
4594 cpp_output_line (cpp_reader *pfile, FILE *fp)
4595 {
4596   const cpp_token *token;
4597
4598   token = cpp_get_token (pfile);
4599   while (token->type != CPP_EOF)
4600     {
4601       cpp_output_token (token, fp);
4602       token = cpp_get_token (pfile);
4603       if (token->flags & PREV_WHITE)
4604         putc (' ', fp);
4605     }
4606
4607   putc ('\n', fp);
4608 }
4609
4610 /* Return a string representation of all the remaining tokens on the
4611    current line.  The result is allocated using xmalloc and must be
4612    freed by the caller.  */
4613 unsigned char *
4614 cpp_output_line_to_string (cpp_reader *pfile, const unsigned char *dir_name)
4615 {
4616   const cpp_token *token;
4617   unsigned int out = dir_name ? ustrlen (dir_name) : 0;
4618   unsigned int alloced = 120 + out;
4619   unsigned char *result = (unsigned char *) xmalloc (alloced);
4620
4621   /* If DIR_NAME is empty, there are no initial contents.  */
4622   if (dir_name)
4623     {
4624       sprintf ((char *) result, "#%s ", dir_name);
4625       out += 2;
4626     }
4627
4628   token = cpp_get_token (pfile);
4629   while (token->type != CPP_EOF)
4630     {
4631       unsigned char *last;
4632       /* Include room for a possible space and the terminating nul.  */
4633       unsigned int len = cpp_token_len (token) + 2;
4634
4635       if (out + len > alloced)
4636         {
4637           alloced *= 2;
4638           if (out + len > alloced)
4639             alloced = out + len;
4640           result = (unsigned char *) xrealloc (result, alloced);
4641         }
4642
4643       last = cpp_spell_token (pfile, token, &result[out], 0);
4644       out = last - result;
4645
4646       token = cpp_get_token (pfile);
4647       if (token->flags & PREV_WHITE)
4648         result[out++] = ' ';
4649     }
4650
4651   result[out] = '\0';
4652   return result;
4653 }
4654
/* Memory buffers.  Changing these three constants can have a dramatic
   effect on performance.  The values here are reasonable defaults,
   but might be tuned.  If you adjust them, be sure to test across a
   range of uses of cpplib, including heavy nested function-like macro
   expansion.  Also check the change in peak memory usage (NJAMD is a
   good tool for this).

   MIN_BUFF_SIZE is the smallest buffer ever allocated.
   BUFF_SIZE_UPPER_BOUND (MIN_SIZE) is the largest free buffer that is
   still accepted for a request of MIN_SIZE bytes.
   EXTENDED_BUFF_SIZE (BUFF, MIN_EXTRA) is the size requested when
   extending BUFF: MIN_EXTRA plus twice the room left in BUFF.  Every
   macro argument is parenthesized so that any expression may be
   passed.  */
#define MIN_BUFF_SIZE 8000
#define BUFF_SIZE_UPPER_BOUND(MIN_SIZE) (MIN_BUFF_SIZE + (MIN_SIZE) * 3 / 2)
#define EXTENDED_BUFF_SIZE(BUFF, MIN_EXTRA) \
        ((MIN_EXTRA) + ((BUFF)->limit - (BUFF)->cur) * 2)

#if MIN_BUFF_SIZE > BUFF_SIZE_UPPER_BOUND (0)
  #error BUFF_SIZE_UPPER_BOUND must be at least as large as MIN_BUFF_SIZE!
#endif
4669
4670 /* Create a new allocation buffer.  Place the control block at the end
4671    of the buffer, so that buffer overflows will cause immediate chaos.  */
4672 static _cpp_buff *
4673 new_buff (size_t len)
4674 {
4675   _cpp_buff *result;
4676   unsigned char *base;
4677
4678   if (len < MIN_BUFF_SIZE)
4679     len = MIN_BUFF_SIZE;
4680   len = CPP_ALIGN (len);
4681
4682 #ifdef ENABLE_VALGRIND_ANNOTATIONS
4683   /* Valgrind warns about uses of interior pointers, so put _cpp_buff
4684      struct first.  */
4685   size_t slen = CPP_ALIGN2 (sizeof (_cpp_buff), 2 * DEFAULT_ALIGNMENT);
4686   base = XNEWVEC (unsigned char, len + slen);
4687   result = (_cpp_buff *) base;
4688   base += slen;
4689 #else
4690   base = XNEWVEC (unsigned char, len + sizeof (_cpp_buff));
4691   result = (_cpp_buff *) (base + len);
4692 #endif
4693   result->base = base;
4694   result->cur = base;
4695   result->limit = base + len;
4696   result->next = NULL;
4697   return result;
4698 }
4699
4700 /* Place a chain of unwanted allocation buffers on the free list.  */
4701 void
4702 _cpp_release_buff (cpp_reader *pfile, _cpp_buff *buff)
4703 {
4704   _cpp_buff *end = buff;
4705
4706   while (end->next)
4707     end = end->next;
4708   end->next = pfile->free_buffs;
4709   pfile->free_buffs = buff;
4710 }
4711
4712 /* Return a free buffer of size at least MIN_SIZE.  */
4713 _cpp_buff *
4714 _cpp_get_buff (cpp_reader *pfile, size_t min_size)
4715 {
4716   _cpp_buff *result, **p;
4717
4718   for (p = &pfile->free_buffs;; p = &(*p)->next)
4719     {
4720       size_t size;
4721
4722       if (*p == NULL)
4723         return new_buff (min_size);
4724       result = *p;
4725       size = result->limit - result->base;
4726       /* Return a buffer that's big enough, but don't waste one that's
4727          way too big.  */
4728       if (size >= min_size && size <= BUFF_SIZE_UPPER_BOUND (min_size))
4729         break;
4730     }
4731
4732   *p = result->next;
4733   result->next = NULL;
4734   result->cur = result->base;
4735   return result;
4736 }
4737
4738 /* Creates a new buffer with enough space to hold the uncommitted
4739    remaining bytes of BUFF, and at least MIN_EXTRA more bytes.  Copies
4740    the excess bytes to the new buffer.  Chains the new buffer after
4741    BUFF, and returns the new buffer.  */
4742 _cpp_buff *
4743 _cpp_append_extend_buff (cpp_reader *pfile, _cpp_buff *buff, size_t min_extra)
4744 {
4745   size_t size = EXTENDED_BUFF_SIZE (buff, min_extra);
4746   _cpp_buff *new_buff = _cpp_get_buff (pfile, size);
4747
4748   buff->next = new_buff;
4749   memcpy (new_buff->base, buff->cur, BUFF_ROOM (buff));
4750   return new_buff;
4751 }
4752
4753 /* Creates a new buffer with enough space to hold the uncommitted
4754    remaining bytes of the buffer pointed to by BUFF, and at least
4755    MIN_EXTRA more bytes.  Copies the excess bytes to the new buffer.
4756    Chains the new buffer before the buffer pointed to by BUFF, and
4757    updates the pointer to point to the new buffer.  */
4758 void
4759 _cpp_extend_buff (cpp_reader *pfile, _cpp_buff **pbuff, size_t min_extra)
4760 {
4761   _cpp_buff *new_buff, *old_buff = *pbuff;
4762   size_t size = EXTENDED_BUFF_SIZE (old_buff, min_extra);
4763
4764   new_buff = _cpp_get_buff (pfile, size);
4765   memcpy (new_buff->base, old_buff->cur, BUFF_ROOM (old_buff));
4766   new_buff->next = old_buff;
4767   *pbuff = new_buff;
4768 }
4769
4770 /* Free a chain of buffers starting at BUFF.  */
4771 void
4772 _cpp_free_buff (_cpp_buff *buff)
4773 {
4774   _cpp_buff *next;
4775
4776   for (; buff; buff = next)
4777     {
4778       next = buff->next;
4779 #ifdef ENABLE_VALGRIND_ANNOTATIONS
4780       free (buff);
4781 #else
4782       free (buff->base);
4783 #endif
4784     }
4785 }
4786
4787 /* Allocate permanent, unaligned storage of length LEN.  */
4788 unsigned char *
4789 _cpp_unaligned_alloc (cpp_reader *pfile, size_t len)
4790 {
4791   _cpp_buff *buff = pfile->u_buff;
4792   unsigned char *result = buff->cur;
4793
4794   if (len > (size_t) (buff->limit - result))
4795     {
4796       buff = _cpp_get_buff (pfile, len);
4797       buff->next = pfile->u_buff;
4798       pfile->u_buff = buff;
4799       result = buff->cur;
4800     }
4801
4802   buff->cur = result + len;
4803   return result;
4804 }
4805
4806 /* Allocate permanent, unaligned storage of length LEN from a_buff.
4807    That buffer is used for growing allocations when saving macro
4808    replacement lists in a #define, and when parsing an answer to an
4809    assertion in #assert, #unassert or #if (and therefore possibly
4810    whilst expanding macros).  It therefore must not be used by any
4811    code that they might call: specifically the lexer and the guts of
4812    the macro expander.
4813
4814    All existing other uses clearly fit this restriction: storing
4815    registered pragmas during initialization.  */
4816 unsigned char *
4817 _cpp_aligned_alloc (cpp_reader *pfile, size_t len)
4818 {
4819   _cpp_buff *buff = pfile->a_buff;
4820   unsigned char *result = buff->cur;
4821
4822   if (len > (size_t) (buff->limit - result))
4823     {
4824       buff = _cpp_get_buff (pfile, len);
4825       buff->next = pfile->a_buff;
4826       pfile->a_buff = buff;
4827       result = buff->cur;
4828     }
4829
4830   buff->cur = result + len;
4831   return result;
4832 }
4833
4834 /* Commit or allocate storage from a buffer.  */
4835
4836 void *
4837 _cpp_commit_buff (cpp_reader *pfile, size_t size)
4838 {
4839   void *ptr = BUFF_FRONT (pfile->a_buff);
4840
4841   if (pfile->hash_table->alloc_subobject)
4842     {
4843       void *copy = pfile->hash_table->alloc_subobject (size);
4844       memcpy (copy, ptr, size);
4845       ptr = copy;
4846     }
4847   else
4848     BUFF_FRONT (pfile->a_buff) += size;
4849
4850   return ptr;
4851 }
4852
4853 /* Say which field of TOK is in use.  */
4854
4855 enum cpp_token_fld_kind
4856 cpp_token_val_index (const cpp_token *tok)
4857 {
4858   switch (TOKEN_SPELL (tok))
4859     {
4860     case SPELL_IDENT:
4861       return CPP_TOKEN_FLD_NODE;
4862     case SPELL_LITERAL:
4863       return CPP_TOKEN_FLD_STR;
4864     case SPELL_OPERATOR:
4865       /* Operands which were originally spelled as ident keep around
4866          the node for the exact spelling.  */
4867       if (tok->flags & NAMED_OP)
4868         return CPP_TOKEN_FLD_NODE;
4869       else if (tok->type == CPP_PASTE)
4870         return CPP_TOKEN_FLD_TOKEN_NO;
4871       else
4872         return CPP_TOKEN_FLD_NONE;
4873     case SPELL_NONE:
4874       if (tok->type == CPP_MACRO_ARG)
4875         return CPP_TOKEN_FLD_ARG_NO;
4876       else if (tok->type == CPP_PADDING)
4877         return CPP_TOKEN_FLD_SOURCE;
4878       else if (tok->type == CPP_PRAGMA)
4879         return CPP_TOKEN_FLD_PRAGMA;
4880       /* fall through */
4881     default:
4882       return CPP_TOKEN_FLD_NONE;
4883     }
4884 }
4885
4886 /* All tokens lexed in R after calling this function will be forced to
4887    have their location_t to be P, until
4888    cpp_stop_forcing_token_locations is called for R.  */
4889
4890 void
4891 cpp_force_token_locations (cpp_reader *r, location_t loc)
4892 {
4893   r->forced_token_location = loc;
4894 }
4895
4896 /* Go back to assigning locations naturally for lexed tokens.  */
4897
4898 void
4899 cpp_stop_forcing_token_locations (cpp_reader *r)
4900 {
4901   r->forced_token_location = 0;
4902 }
4903
/* PEEK points at a backslash.  If it escapes an end of line (LF, CR
   or CR-LF), look past it, and keep going while the character after
   the line ending is itself a backslash.  Never advance to LIMIT or
   beyond: when the position after a line ending is not below LIMIT,
   the backslash reached so far is returned.  */

static const unsigned char *
do_peek_backslash (const unsigned char *peek, const unsigned char *limit)
{
  for (;;)
    {
      const unsigned char *after;

      if (__builtin_expect (peek[1] == '\n', true))
        after = peek + 2;
      else if (__builtin_expect (peek[1] == '\r', false))
        /* CR alone, or CR-LF as one line ending.  */
        after = peek + (peek[2] == '\n' ? 3 : 2);
      else
        /* Not an escaped line ending.  */
        return peek;

      if (!__builtin_expect (after < limit, true))
        return peek;

      peek = after;

      /* The user might be perverse and escape several line endings
         in a row.  */
      if (*peek != '\\')
        return peek;
    }
}
4933
4934 static const unsigned char *
4935 do_peek_next (const unsigned char *peek, const unsigned char *limit)
4936 {
4937   if (__builtin_expect (*peek == '\\', false))
4938     peek = do_peek_backslash (peek, limit);
4939   return peek;
4940 }
4941
/* Step back one character from PEEK without going before BOUND, and
   return a pointer to the character stepped onto.  Returns NULL if
   PEEK is already at BOUND.

   If the character stepped onto ends a line (LF, CR, or the LF of a
   CR-LF pair) and a backslash comes directly before that line
   ending, the line ending was escaped: step back again, starting
   from the backslash.  Otherwise the character stepped onto is
   returned, including when BOUND is reached while inspecting the
   line ending.  */
static const unsigned char *
do_peek_prev (const unsigned char *peek, const unsigned char *bound)
{
  if (peek == bound)
    return NULL;

  unsigned char c = *--peek;
  /* Test for the carriage-return character, not the letter 'r';
     otherwise a backslash followed by the letter r is mistaken for
     an escaped line ending, and an escaped CR is missed.  */
  if (__builtin_expect (c == '\n', false)
      || __builtin_expect (c == '\r', false))
    {
      if (peek == bound)
        return peek;

      /* Offset, relative to PEEK, of the character before the line
         ending.  */
      int ix = -1;
      if (c == '\n' && peek[ix] == '\r')
        {
          /* CR-LF counts as a single line ending.  */
          if (peek + ix == bound)
            return peek;
          ix--;
        }

      if (peek[ix] == '\\')
        return do_peek_prev (peek + ix, bound);

      return peek;
    }
  else
    return peek;
}
4970
4971 /* If PEEK[-1] is identifier MATCH, scan past it and trailing white
4972    space.  Otherwise return NULL.  */
4973
4974 static const unsigned char *
4975 do_peek_ident (const char *match, const unsigned char *peek,
4976                const unsigned char *limit)
4977 {
4978   for (; *++match; peek++)
4979     if (*peek != *match)
4980       {
4981         peek = do_peek_next (peek, limit);
4982         if (*peek != *match)
4983           return NULL;
4984       }
4985
4986   /* Must now not be looking at an identifier char.  */
4987   peek = do_peek_next (peek, limit);
4988   if (ISIDNUM (*peek))
4989     return NULL;
4990
4991   /* Skip control-line whitespace.  */
4992  ws:
4993   while (*peek == ' ' || *peek == '\t')
4994     peek++;
4995   if (__builtin_expect (*peek == '\\', false))
4996     {
4997       peek = do_peek_backslash (peek, limit);
4998       if (*peek != '\\')
4999         goto ws;
5000     }
5001
5002   return peek;
5003 }
5004
5005 /* Are we looking at a module control line starting as PEEK - 1?  */
5006
5007 static bool
5008 do_peek_module (cpp_reader *pfile, unsigned char c,
5009                 const unsigned char *peek, const unsigned char *limit)
5010 {
5011   bool import = false;
5012
5013   if (__builtin_expect (c == 'e', false))
5014     {
5015       if (!((peek[0] == 'x' || peek[0] == '\\')
5016             && (peek = do_peek_ident ("export", peek, limit))))
5017         return false;
5018
5019       /* export, peek for import or module.  No need to peek __import
5020          here.  */
5021       if (peek[0] == 'i')
5022         {
5023           if (!((peek[1] == 'm' || peek[1] == '\\')
5024                 && (peek = do_peek_ident ("import", peek + 1, limit))))
5025             return false;
5026           import = true;
5027         }
5028       else if (peek[0] == 'm')
5029         {
5030           if (!((peek[1] == 'o' || peek[1] == '\\')
5031                 && (peek = do_peek_ident ("module", peek + 1, limit))))
5032             return false;
5033         }
5034       else
5035         return false;
5036     }
5037   else if (__builtin_expect (c == 'i', false))
5038     {
5039       if (!((peek[0] == 'm' || peek[0] == '\\')
5040             && (peek = do_peek_ident ("import", peek, limit))))
5041         return false;
5042       import = true;
5043     }
5044   else if (__builtin_expect (c == '_', false))
5045     {
5046       /* Needed for translated includes.   */
5047       if (!((peek[0] == '_' || peek[0] == '\\')
5048             && (peek = do_peek_ident ("__import", peek, limit))))
5049         return false;
5050       import = true;
5051     }
5052   else if (__builtin_expect (c == 'm', false))
5053     {
5054       if (!((peek[0] == 'o' || peek[0] == '\\')
5055             && (peek = do_peek_ident ("module", peek, limit))))
5056         return false;
5057     }
5058   else
5059     return false;
5060
5061   /* Peek the next character to see if it's good enough.  We'll be at
5062      the first non-whitespace char, including skipping an escaped
5063      newline.  */
5064   /* ... import followed by identifier, ':', '<' or header-name
5065      preprocessing tokens, or module followed by identifier, ':' or
5066      ';' preprocessing tokens.  */
5067   unsigned char p = *peek++;
5068
5069   /* A character literal is ... single quotes, ... optionally preceded
5070      by u8, u, U, or L */
5071   /* A string-literal is a ... double quotes, optionally prefixed by
5072      R, u8, u8R, u, uR, U, UR, L, or LR */
5073   if (p == 'u')
5074     {
5075       peek = do_peek_next (peek, limit);
5076       if (*peek == '8')
5077         {
5078           peek++;
5079           goto peek_u8;
5080         }
5081       goto peek_u;
5082     }
5083   else if (p == 'U' || p == 'L')
5084     {
5085     peek_u8:
5086       peek = do_peek_next (peek, limit);
5087     peek_u:
5088       if (*peek == '\"' || *peek == '\'')
5089         return false;
5090
5091       if (*peek == 'R')
5092         goto peek_R;
5093       /* Identifier. Ok.  */
5094     }
5095   else if (p == 'R')
5096     {
5097     peek_R:
5098       if (CPP_OPTION (pfile, rliterals))
5099         {
5100           peek = do_peek_next (peek, limit);
5101           if (*peek == '\"')
5102             return false;
5103         }
5104       /* Identifier. Ok.  */
5105     }
5106   else if ('Z' - 'A' == 25
5107            ? ((p >= 'A' && p <= 'Z') || (p >= 'a' && p <= 'z') || p == '_')
5108            : ISIDST (p))
5109     {
5110       /* Identifier.  Ok. */
5111     }
5112   else if (p == '<')
5113     {
5114       /* Maybe angle header, ok for import.  Reject
5115          '<=', '<<' digraph:'<:'.  */
5116       if (!import)
5117         return false;
5118       peek = do_peek_next (peek, limit);
5119       if (*peek == '=' || *peek == '<'
5120           || (*peek == ':' && CPP_OPTION (pfile, digraphs)))
5121         return false;
5122     }
5123   else if (p == ';')
5124     {
5125       /* SEMICOLON, ok for module.  */
5126       if (import)
5127         return false;
5128     }
5129   else if (p == '"')
5130     {
5131       /* STRING, ok for import.  */
5132       if (!import)
5133         return false;
5134     }
5135   else if (p == ':')
5136     {
5137       /* Maybe COLON, ok.  Reject '::', digraph:':>'.  */
5138       peek = do_peek_next (peek, limit);
5139       if (*peek == ':' || (*peek == '>' && CPP_OPTION (pfile, digraphs)))
5140         return false;
5141     }
5142   else
5143     /* FIXME: Detect a unicode character, excluding those not
5144        permitted as the initial character. [lex.name]/1.  I presume
5145        we need to check the \[uU] spellings, and directly using
5146        Unicode in say UTF8 form?  Or perhaps we do the phase-1
5147        conversion of UTF8 to universal-character-names?  */
5148     return false;
5149
5150   return true;
5151 }
5152
5153 /* Directives-only scanning.  Somewhat more relaxed than correct
5154    parsing -- some ill-formed programs will not be rejected.  */
5155
5156 void
5157 cpp_directive_only_process (cpp_reader *pfile,
5158                             void *data,
5159                             void (*cb) (cpp_reader *, CPP_DO_task, void *, ...))
5160 {
5161   bool module_p = CPP_OPTION (pfile, module_directives);
5162
5163   do
5164     {
5165     restart:
5166       /* Buffer initialization, but no line cleaning. */
5167       cpp_buffer *buffer = pfile->buffer;
5168       buffer->cur_note = buffer->notes_used = 0;
5169       buffer->cur = buffer->line_base = buffer->next_line;
5170       buffer->need_line = false;
5171       /* Files always end in a newline or carriage return.  We rely on this for
5172          character peeking safety.  */
5173       gcc_assert (buffer->rlimit[0] == '\n' || buffer->rlimit[0] == '\r');
5174
5175       const unsigned char *base = buffer->cur;
5176       unsigned line_count = 0;
5177       const unsigned char *line_start = base;
5178
5179       bool bol = true;
5180       bool raw = false;
5181
5182       const unsigned char *lwm = base;
5183       for (const unsigned char *pos = base, *limit = buffer->rlimit;
5184            pos < limit;)
5185         {
5186           unsigned char c = *pos++;
5187           /* This matches the switch in _cpp_lex_direct.  */
5188           switch (c)
5189             {
5190             case ' ': case '\t': case '\f': case '\v':
5191               /* Whitespace, do nothing.  */
5192               break;
5193
5194             case '\r': /* MAC line ending, or Windows \r\n  */
5195               if (*pos == '\n')
5196                 pos++;
5197               /* FALLTHROUGH */
5198
5199             case '\n':
5200               bol = true;
5201
5202             next_line:
5203               CPP_INCREMENT_LINE (pfile, 0);
5204               line_count++;
5205               line_start = pos;
5206               break;
5207
5208             case '\\':
5209               /* <backslash><newline> is removed, and doesn't undo any
5210                  preceeding escape or whatnot.  */
5211               if (*pos == '\n')
5212                 {
5213                   pos++;
5214                   goto next_line;
5215                 }
5216               else if (*pos == '\r')
5217                 {
5218                   if (pos[1] == '\n')
5219                     pos++;
5220                   pos++;
5221                   goto next_line;
5222                 }
5223               goto dflt;
5224
5225             case '#':
5226               if (bol)
5227                 {
5228                   /* Line directive.  */
5229                   if (pos - 1 > base && !pfile->state.skipping)
5230                     cb (pfile, CPP_DO_print, data,
5231                         line_count, base, pos - 1 - base);
5232
5233                   /* Prep things for directive handling. */
5234                   buffer->next_line = pos;
5235                   buffer->need_line = true;
5236                   bool ok = _cpp_get_fresh_line (pfile);
5237                   gcc_checking_assert (ok);
5238
5239                   /* Ensure proper column numbering for generated
5240                      error messages. */
5241                   buffer->line_base -= pos - line_start;
5242
5243                   _cpp_handle_directive (pfile, line_start + 1 != pos);
5244
5245                   /* Sanitize the line settings.  Duplicate #include's can
5246                      mess things up. */
5247                   // FIXME: Necessary?
5248                   pfile->line_table->highest_location
5249                     = pfile->line_table->highest_line;
5250
5251                   if (!pfile->state.skipping
5252                       && pfile->buffer->next_line < pfile->buffer->rlimit)
5253                     cb (pfile, CPP_DO_location, data,
5254                         pfile->line_table->highest_line);
5255
5256                   goto restart;
5257                 }
5258               goto dflt;
5259
5260             case '/':
5261               {
5262                 const unsigned char *peek = do_peek_next (pos, limit);
5263                 if (!(*peek == '/' || *peek == '*'))
5264                   goto dflt;
5265
5266                 /* Line or block comment  */
5267                 bool is_block = *peek == '*';
5268                 bool star = false;
5269                 bool esc = false;
5270                 location_t sloc
5271                   = linemap_position_for_column (pfile->line_table,
5272                                                  pos - line_start);
5273
5274                 while (pos < limit)
5275                   {
5276                     char c = *pos++;
5277                     switch (c)
5278                       {
5279                       case '\\':
5280                         esc = true;
5281                         break;
5282
5283                       case '\r':
5284                         if (*pos == '\n')
5285                           pos++;
5286                         /* FALLTHROUGH  */
5287
5288                       case '\n':
5289                         {
5290                           CPP_INCREMENT_LINE (pfile, 0);
5291                           line_count++;
5292                           line_start = pos;
5293                           if (!esc && !is_block)
5294                             {
5295                               bol = true;
5296                               goto done_comment;
5297                             }
5298                         }
5299                         if (!esc)
5300                           star = false;
5301                         esc = false;
5302                         break;
5303
5304                       case '*':
5305                         if (pos > peek)
5306                           star = is_block;
5307                         esc = false;
5308                         break;
5309
5310                       case '/':
5311                         if (star)
5312                           goto done_comment;
5313                         /* FALLTHROUGH  */
5314
5315                       default:
5316                         star = false;
5317                         esc = false;
5318                         break;
5319                       }
5320                   }
5321                 if (pos < limit || is_block)
5322                   cpp_error_with_line (pfile, CPP_DL_ERROR, sloc, 0,
5323                                        "unterminated comment");
5324               done_comment:
5325                 lwm = pos;
5326                 break;
5327               }
5328
5329             case '\'':
5330               if (!CPP_OPTION (pfile, digit_separators))
5331                 goto delimited_string;
5332
5333               /* Possibly a number punctuator.  */
5334               if (!ISIDNUM (*do_peek_next (pos, limit)))
5335                 goto delimited_string;
5336
5337               goto quote_peek;
5338
5339             case '\"':
5340               if (!CPP_OPTION (pfile, rliterals))
5341                 goto delimited_string;
5342
5343             quote_peek:
5344               {
5345                 /* For ' see if it's a number punctuator
5346                    \.?<digit>(<digit>|<identifier-nondigit>
5347                    |'<digit>|'<nondigit>|[eEpP]<sign>|\.)* */
5348                 /* For " see if it's a raw string
5349                    {U,L,u,u8}R.  This includes CPP_NUMBER detection,
5350                    because that could be 0e+R.  */
5351                 const unsigned char *peek = pos - 1;
5352                 bool quote_first = c == '"';
5353                 bool quote_eight = false;
5354                 bool maybe_number_start = false;
5355                 bool want_number = false;
5356
5357                 while ((peek = do_peek_prev (peek, lwm)))
5358                   {
5359                     unsigned char p = *peek;
5360                     if (quote_first)
5361                       {
5362                         if (!raw)
5363                           {
5364                             if (p != 'R')
5365                               break;
5366                             raw = true;
5367                             continue;
5368                           }
5369
5370                         quote_first = false;
5371                         if (p == 'L' || p == 'U' || p == 'u')
5372                           ;
5373                         else if (p == '8')
5374                           quote_eight = true;
5375                         else
5376                           goto second_raw;
5377                       }
5378                     else if (quote_eight)
5379                       {
5380                         if (p != 'u')
5381                           {
5382                             raw = false;
5383                             break;
5384                           }
5385                         quote_eight = false;
5386                       }
5387                     else if (c == '"')
5388                       {
5389                       second_raw:;
5390                         if (!want_number && ISIDNUM (p))
5391                           {
5392                             raw = false;
5393                             break;
5394                           }
5395                       }
5396
5397                     if (ISDIGIT (p))
5398                       maybe_number_start = true;
5399                     else if (p == '.')
5400                       want_number = true;
5401                     else if (ISIDNUM (p))
5402                       maybe_number_start = false;
5403                     else if (p == '+' || p == '-')
5404                       {
5405                         if (const unsigned char *peek_prev
5406                             = do_peek_prev (peek, lwm))
5407                           {
5408                             p = *peek_prev;
5409                             if (p == 'e' || p == 'E'
5410                                 || p == 'p' || p == 'P')
5411                               {
5412                                 want_number = true;
5413                                 maybe_number_start = false;
5414                               }
5415                             else
5416                               break;
5417                           }
5418                         else
5419                           break;
5420                       }
5421                     else if (p == '\'' || p == '\"')
5422                       {
5423                         /* If this is lwm, this must be the end of a
5424                            previous string.  So this is a trailing
5425                            literal type, (a) if those are allowed,
5426                              and (b) maybe_number_start is false.  Otherwise
5427                              this must be a CPP_NUMBER because we've
5428                              met another ', and we'd have checked that
5429                              in its own right.  */
5430                         if (peek == lwm && CPP_OPTION (pfile, uliterals))
5431                           {
5432                             if  (!maybe_number_start && !want_number)
5433                               /* Must be a literal type.  */
5434                               raw = false;
5435                           }
5436                         else if (p == '\''
5437                                  && CPP_OPTION (pfile, digit_separators))
5438                           maybe_number_start = true;
5439                         break;
5440                       }
5441                     else if (c == '\'')
5442                       break;
5443                     else if (!quote_first && !quote_eight)
5444                       break;
5445                   }
5446
5447                 if (maybe_number_start)
5448                   {
5449                     if (c == '\'')
5450                       /* A CPP NUMBER.  */
5451                       goto dflt;
5452                     raw = false;
5453                   }
5454
5455                 goto delimited_string;
5456               }
5457
5458             delimited_string:
5459               {
5460                 /* (Possibly raw) string or char literal.  */
5461                 unsigned char end = c;
5462                 int delim_len = -1;
5463                 const unsigned char *delim = NULL;
5464                 location_t sloc = linemap_position_for_column (pfile->line_table,
5465                                                                pos - line_start);
5466                 int esc = 0;
5467
5468                 if (raw)
5469                   {
5470                     /* There can be no line breaks in the delimiter.  */
5471                     delim = pos;
5472                     for (delim_len = 0; (c = *pos++) != '('; delim_len++)
5473                       {
5474                         if (delim_len == 16)
5475                           {
5476                             cpp_error_with_line (pfile, CPP_DL_ERROR,
5477                                                  sloc, 0,
5478                                                  "raw string delimiter"
5479                                                  " longer than %d"
5480                                                  " characters",
5481                                                  delim_len);
5482                             raw = false;
5483                             pos = delim;
5484                             break;
5485                           }
5486                         if (strchr (") \\\t\v\f\n", c))
5487                           {
5488                             cpp_error_with_line (pfile, CPP_DL_ERROR,
5489                                                  sloc, 0,
5490                                                  "invalid character '%c'"
5491                                                  " in raw string"
5492                                                  " delimiter", c);
5493                             raw = false;
5494                             pos = delim;
5495                             break;
5496                           }
5497                         if (pos >= limit)
5498                           goto bad_string;
5499                       }
5500                   }
5501
5502                 while (pos < limit)
5503                   {
5504                     char c = *pos++;
5505                     switch (c)
5506                       {
5507                       case '\\':
5508                         if (!raw)
5509                           esc++;
5510                         break;
5511
5512                       case '\r':
5513                         if (*pos == '\n')
5514                           pos++;
5515                         /* FALLTHROUGH  */
5516
5517                       case '\n':
5518                         {
5519                           CPP_INCREMENT_LINE (pfile, 0);
5520                           line_count++;
5521                           line_start = pos;
5522                         }
5523                         if (esc)
5524                           esc--;
5525                         break;
5526
5527                       case ')':
5528                         if (raw
5529                             && pos + delim_len + 1 < limit
5530                             && pos[delim_len] == end
5531                             && !memcmp (delim, pos, delim_len))
5532                           {
5533                             pos += delim_len + 1;
5534                             raw = false;
5535                             goto done_string;
5536                           }
5537                         break;
5538
5539                       default:
5540                         if (!raw && !(esc & 1) && c == end)
5541                           goto done_string;
5542                         esc = 0;
5543                         break;
5544                       }
5545                   }
5546               bad_string:
5547                 cpp_error_with_line (pfile, CPP_DL_ERROR, sloc, 0,
5548                                      "unterminated literal");
5549
5550               done_string:
5551                 raw = false;
5552                 lwm = pos - 1;
5553               }
5554               goto dflt;
5555
5556             case '_':
5557             case 'e':
5558             case 'i':
5559             case 'm':
5560               if (bol && module_p && !pfile->state.skipping
5561                   && do_peek_module (pfile, c, pos, limit))
5562                 {
5563                   /* We've seen the start of a module control line.
5564                      Start up the tokenizer.  */
5565                   pos--; /* Backup over the first character.  */
5566
5567                   /* Backup over whitespace to start of line.  */
5568                   while (pos > line_start
5569                          && (pos[-1] == ' ' || pos[-1] == '\t'))
5570                     pos--;
5571
5572                   if (pos > base)
5573                     cb (pfile, CPP_DO_print, data, line_count, base, pos - base);
5574
5575                   /* Prep things for directive handling. */
5576                   buffer->next_line = pos;
5577                   buffer->need_line = true;
5578
5579                   /* Now get tokens until the PRAGMA_EOL.  */
5580                   do
5581                     {
5582                       location_t spelling;
5583                       const cpp_token *tok
5584                         = cpp_get_token_with_location (pfile, &spelling);
5585
5586                       gcc_assert (pfile->state.in_deferred_pragma
5587                                   || tok->type == CPP_PRAGMA_EOL);
5588                       cb (pfile, CPP_DO_token, data, tok, spelling);
5589                     }
5590                   while (pfile->state.in_deferred_pragma);
5591
5592                   if (pfile->buffer->next_line < pfile->buffer->rlimit)
5593                     cb (pfile, CPP_DO_location, data,
5594                         pfile->line_table->highest_line);
5595
5596                   pfile->mi_valid = false;
5597                   goto restart;
5598                 }
5599               goto dflt;
5600
5601             default:
5602             dflt:
5603               bol = false;
5604               pfile->mi_valid = false;
5605               break;
5606             }
5607         }
5608
5609       if (buffer->rlimit > base && !pfile->state.skipping)
5610         {
5611           const unsigned char *limit = buffer->rlimit;
5612           /* If the file was not newline terminated, add rlimit, which is
5613              guaranteed to point to a newline, to the end of our range.  */
5614           if (limit[-1] != '\n')
5615             {
5616               limit++;
5617               CPP_INCREMENT_LINE (pfile, 0);
5618               line_count++;
5619             }
5620           cb (pfile, CPP_DO_print, data, line_count, base, limit - base);
5621         }
5622
5623       _cpp_pop_buffer (pfile);
5624     }
5625   while (pfile->buffer);
5626 }